In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing as prproc
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import svm
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report
from sklearn import svm

In [2]:
data = pd.read_csv('winequality-white.dat', sep = ",")
data.head()

Unnamed: 0,FixedAcidity,VolatileAcidity,CitricAcid,ResidualSugar,Chlorides,FreeSulfurDioxide,TotalSulfurDioxide,Density,PH,Sulphates,Alcohol,Quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
scalerStd = prproc.StandardScaler()
for i in data.columns[:-1]:
    x = scalerStd.fit_transform(data[[i]])
    data[i] = x[0:]
data.head()

Unnamed: 0,FixedAcidity,VolatileAcidity,CitricAcid,ResidualSugar,Chlorides,FreeSulfurDioxide,TotalSulfurDioxide,Density,PH,Sulphates,Alcohol,Quality
0,0.172097,-0.08177,0.21328,2.821349,-0.035355,0.569932,0.744565,2.331512,-1.246921,-0.349184,-1.393152,6
1,-0.657501,0.215896,0.048001,-0.944765,0.147747,-1.253019,-0.149685,-0.009154,0.740029,0.001342,-0.824276,6
2,1.475751,0.017452,0.543838,0.100282,0.193523,-0.312141,-0.973336,0.358665,0.475102,-0.436816,-0.336667,6
3,0.409125,-0.478657,-0.117278,0.415768,0.559727,0.687541,1.121091,0.525855,0.01148,-0.787342,-0.499203,6
4,0.409125,-0.478657,-0.117278,0.415768,0.559727,0.687541,1.121091,0.525855,0.01148,-0.787342,-0.499203,6


In [4]:
data_train, data_test_val = train_test_split(data, test_size=0.50, stratify = data['Quality'])
data_test, data_val = train_test_split(data_test_val, test_size=0.40, stratify = data_test_val['Quality'])

x_train = data_train[["FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar", "Chlorides", "FreeSulfurDioxide", "TotalSulfurDioxide", "Density", "PH", "Sulphates", "Alcohol"]]
y_train = data_train['Quality']

x_test = data_test[["FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar", "Chlorides", "FreeSulfurDioxide", "TotalSulfurDioxide", "Density", "PH", "Sulphates", "Alcohol"]]
y_test = data_test['Quality']

x_val = data_val[["FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar", "Chlorides", "FreeSulfurDioxide", "TotalSulfurDioxide", "Density", "PH", "Sulphates", "Alcohol"]]
y_val = data_val['Quality']

In [5]:
data["Quality"].unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [5]:
# Нахождение лучших параметров для метода опорных векторов
 #20, 0.01
gammas = ['scale', 'auto', 1, 10, 0.1]
#0.1, 0.01
coef0s = [0, 1, 2, 5, 10]
#4, 5
degrees = [2, 3]
#1, 2, 10, 20
Cs = [0.001, 0.01, 0.1, 0.5]

best = [0]
for krl in ['linear', 'sigmoid', 'rbf']:
    for gamma in gammas:
        for coef0 in coef0s:
            for degree in degrees:
                for C in Cs:
                    clf = svm.SVC(kernel = krl, gamma = gamma, coef0 = coef0, degree = degree, C = C)

                    clf.fit(x_train, y_train)

                    res = clf.predict(x_test)
                    score = clf.score(x_test, y_test)

                    if score > best[0]:
                        best = [score, gamma, coef0, degree, C]
    print(krl, best)

linear [0.517358747447243, 'scale', 0, 2, 0.1]
sigmoid [0.517358747447243, 'scale', 0, 2, 0.1]
rbf [0.5629680054458815, 0.1, 0, 2, 0.5]


In [6]:
# Нахождение лучших параметров для дерева
criterions = ['gini', 'entropy']
alphas = [0.005, 0.01, 0.015, 0.02, 0.25, 0.03, 0.035, 0.2, 0.8]

best = [0]
for criterion in criterions:
    for alpha in alphas:
        clf = tree.DecisionTreeClassifier(criterion = criterion, ccp_alpha = alpha)
        clf.fit(x_train, y_train)
        score = clf.score(x_test, y_test)
                    
        if score > best[0]:
            best = [score, criterion, alpha]
#         tree.plot_tree(clf, filled = True, rounded = True)
print(best)

[0.5248468345813478, 'entropy', 0.005]


In [7]:
clf = svm.SVC(kernel = 'rbf', gamma = 0.1, coef0 = 0, degree = 2, C = 0.5, probability=True)
clf.fit(x_train, y_train)

clf_tree = tree.DecisionTreeClassifier(criterion = 'entropy', ccp_alpha = 0.005)
clf_tree.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.005, criterion='entropy')

In [17]:
# Проверка лучшей SVC с тестовой выборкой
res = clf.predict(x_test)
print(classification_report(y_test, res, zero_division=0))

print(roc_auc_score(y_test, clf.predict_proba(x_test), multi_class='ovo', average='macro'))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        49
           5       0.59      0.61      0.60       438
           6       0.55      0.75      0.63       659
           7       0.58      0.24      0.34       264
           8       0.00      0.00      0.00        52
           9       0.00      0.00      0.00         1

    accuracy                           0.56      1469
   macro avg       0.25      0.23      0.23      1469
weighted avg       0.53      0.56      0.52      1469

0.7823523040459468


In [15]:
# Проверка лучшего дерева с тестовой выборкой
res = clf_tree.predict(x_test)
print(classification_report(y_test, res, zero_division=0))

print(roc_auc_score(y_test, clf_tree.predict_proba(x_test), multi_class='ovo', average='weighted'))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         6
           4       0.33      0.02      0.04        49
           5       0.54      0.65      0.59       438
           6       0.54      0.59      0.56       659
           7       0.44      0.36      0.40       264
           8       0.57      0.08      0.14        52
           9       0.00      0.00      0.00         1

    accuracy                           0.52      1469
   macro avg       0.35      0.24      0.25      1469
weighted avg       0.51      0.52      0.51      1469

0.7201687935176676


In [10]:
# Проверка лучшей SVC с валидационной выборкой
res = clf.predict(x_val)
print(classification_report(y_val, res, zero_division=0))

print(roc_auc_score(y_val, clf.predict_proba(x_val), multi_class='ovo', average='macro'))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00        33
           5       0.61      0.60      0.61       291
           6       0.55      0.79      0.65       440
           7       0.64      0.22      0.33       176
           8       0.00      0.00      0.00        35
           9       0.00      0.00      0.00         1

    accuracy                           0.57       980
   macro avg       0.26      0.23      0.23       980
weighted avg       0.54      0.57      0.53       980

0.7783196211729985


In [11]:
# Проверка лучшего дерева с валидационной выборкой
res = clf_tree.predict(x_val)
print(classification_report(y_val, res, zero_division=0))

print(roc_auc_score(y_val, clf_tree.predict_proba(x_val), multi_class='ovo', average='weighted'))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00        33
           5       0.53      0.69      0.60       291
           6       0.54      0.60      0.57       440
           7       0.44      0.26      0.33       176
           8       0.45      0.14      0.22        35
           9       0.00      0.00      0.00         1

    accuracy                           0.53       980
   macro avg       0.28      0.24      0.24       980
weighted avg       0.50      0.53      0.50       980

0.7352806018087157
