In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline

In [4]:
df = pd.read_csv('winequality-red.csv', sep=';')


In [5]:
#  Все хорошо?
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [6]:
# Проверяю себя
df.shape

(1599, 12)

In [7]:
# Считаю, сколько пропусков в каждом столбце.
# Если пропуски найдутся, данные надо будет лечить
NAs = pd.concat([df.isnull().sum()], axis=1)
NAs[NAs.sum(axis=1) > 0]

Unnamed: 0,0


In [8]:
NAs

Unnamed: 0,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0
sulphates,0


In [9]:
df.describe(include='all')

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [10]:
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [11]:
df['quality'] = df['quality'].replace(to_replace=3, value=4).replace(9, 8)


In [12]:
df['quality'].value_counts()

5    681
6    638
7    199
4     63
8     18
Name: quality, dtype: int64

In [13]:
# Разделим предикоторы и отклики
X = df.iloc[:, :-1].values  
y = df.iloc[:, -1].values

In [15]:
#  разделяю на обучающую и тестовую выборку

from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1234)  

In [19]:
#  Задаю параметры модели
from sklearn.ensemble import RandomForestClassifier

# Инициализируем модель
model = RandomForestClassifier(random_state=13,
                               n_estimators=80,
                               criterion='gini',
                               max_depth=5,
                               max_features='auto', 
                               max_leaf_nodes=None,
                               min_impurity_decrease=0.001, 
                               min_impurity_split=None,
                               bootstrap=True, 
                               min_samples_leaf=10, 
                               min_samples_split=5,
                               min_weight_fraction_leaf=0.0, 
                               verbose=1, 
                               oob_score=True,
                               warm_start=False,
                               class_weight=None)





In [20]:
# Определим тестируемые значения гиперпараметров
params_set = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [2, 4, 8, 16],
    'min_samples_split': [3, 5, 8, 10, 15],
    'n_estimators': [30, 40, 100, 200],
    'bootstrap': [True, False],
    'criterion': ["gini", "entropy"]
}

In [21]:
# Активируем поиск на решетке GridSearchCV
from sklearn.model_selection import GridSearchCV
#   Проводим 10-fold кросс-валидацию
#   Качество моделей сравниваем с помощью  balanced_accuracy, так как классы не сбалансрованы
grid_CV = GridSearchCV(estimator=model,
                       param_grid=params_set,
                       scoring='balanced_accuracy',
                       cv=10,
                       n_jobs=-1)

# Обучим на тренировочной выборке
grid_CV.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished


GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=5,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.001,
                                              min_impurity_split=None,
                                              min_samples_leaf=10,
                                              min_samples_split=5,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=80, n_jobs=None,
                                              oob_score=True, random_state=13,
                                   

In [22]:
# Выберем лучшую модель
best_model = grid_CV.best_estimator_
print('Лучшие значения гиперпараметров:\n', grid_CV.best_params_)

Лучшие значения гиперпараметров:
 {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 15, 'n_estimators': 30}


In [25]:
from sklearn import metrics
# Строим предсказания модели
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

print(metrics.classification_report(y_pred_test, y_test))

              precision    recall  f1-score   support

           4       0.00      0.00      0.00         0
           5       0.79      0.73      0.76       249
           6       0.66      0.60      0.63       239
           7       0.30      0.45      0.36        40
           8       0.00      0.00      0.00         0

    accuracy                           0.65       528
   macro avg       0.35      0.36      0.35       528
weighted avg       0.70      0.65      0.67       528



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
from sklearn.calibration import CalibratedClassifierCV
# Калибруем модель
model_sigmoid = CalibratedClassifierCV(grid_CV, cv=2, method='sigmoid')
model_sigmoid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.0s finished


CalibratedClassifierCV(base_estimator=GridSearchCV(cv=10, error_score=nan,
                                                   estimator=RandomForestClassifier(bootstrap=True,
                                                                                    ccp_alpha=0.0,
                                                                                    class_weight=None,
                                                                                    criterion='gini',
                                                                                    max_depth=5,
                                                                                    max_features='auto',
                                                                                    max_leaf_nodes=None,
                                                                                    max_samples=None,
                                                                                    min_impurity_decrease=0.001,

In [27]:
# Строим предсказания модели
y_pred_train = model_sigmoid.predict(X_train)
y_pred_test = model_sigmoid.predict(X_test)

print(metrics.classification_report(y_pred_test, y_test))

              precision    recall  f1-score   support

           4       0.00      0.00      0.00         0
           5       0.78      0.70      0.74       256
           6       0.61      0.59      0.60       225
           7       0.37      0.47      0.41        47
           8       0.00      0.00      0.00         0

    accuracy                           0.63       528
   macro avg       0.35      0.35      0.35       528
weighted avg       0.67      0.63      0.65       528



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.0s finished
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# Строим предсказание модели
y_pred_test_probs = model_sigmoid.predict_proba(X_test)

# Оценим долю наблюдений в тестовой выборке, для которых есть класс, вероятность принадлежать которому больше 0.8
(y_pred_test_probs > 0.8).sum() / len(y_pred_test_probs)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.0s finished


0.045454545454545456