In [2]:
import os
import sys
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

Загрузим данные

In [4]:
# Locate the wine-quality CSV in the sibling "data" directory and load it.
# NOTE(review): sys.path[4] depends on how the kernel/interpreter was started,
# so this path may not resolve in another environment — confirm which directory
# it is expected to be.
data_path = os.path.join(
    sys.path[4],
    "..",
    "data",
    "winequality-red.csv",
)
df = pd.read_csv(filepath_or_buffer=data_path)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Уберем пробелы в названиях столбцов

In [5]:
# Replace spaces with underscores in the multi-word column names.
# Only the columns listed here are renamed; all others are left untouched.
df = df.rename(
    columns={
        spaced_name: spaced_name.replace(" ", "_")
        for spaced_name in (
            "fixed acidity",
            "volatile acidity",
            "citric acid",
            "residual sugar",
            "free sulfur dioxide",
            "total sulfur dioxide",
        )
    }
)
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


Разобьем на трейн и тест

In [6]:
# Target is the "quality" column; features are all remaining columns.
y = df["quality"]
X = df.drop(columns=["quality"])

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Подготовим функцию для оценки качества

In [8]:
def test_model(model, X, y):
    y_pred = model.predict(X)
    y_score = model.predict_proba(X)

    accuracy = accuracy_score(y,y_pred)
    f1 = f1_score(y,y_pred, average="weighted")
    roc_auc = roc_auc_score(y, y_score, multi_class="ovo")

    return accuracy, f1, roc_auc

Обучим модели с дефолтными параметрами и посмотрим на их качество

In [9]:
# All hyperparameters stay at their defaults except where noted. A fixed
# random_state is set on every estimator so that the metrics below are
# reproducible across kernel restarts (each of these models is stochastic).
# The solver was switched to "sag" because the default lbfgs emitted warnings.
lr_model = LogisticRegression(solver="sag", random_state=42)
rf_model = RandomForestClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
gd_model = GradientBoostingClassifier(random_state=42)

In [10]:
# Model quality is collected into one table for easy comparison:
# one row per (metric, data split) pair, one column per model (added later).
results = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [
            ["Accuracy", "F1", "ROCAUC"],
            ["train", "test"],
        ]
    ),
)

In [11]:
# Fit each model on the training split and record its train/test metrics.
for model in [lr_model, rf_model, dt_model, gd_model]:
    model.fit(X_train, y_train)
    accuracy_train, f1_train, rocauc_train = test_model(model, X_train, y_train)
    accuracy_test, f1_test, rocauc_test = test_model(model, X_test, y_test)
    # Key the column by the estimator's class name instead of the estimator
    # object itself: the label is readable, and re-running this cell
    # overwrites the same four columns rather than appending new ones.
    # Value order must match the (metric, split) rows of `results`.
    results[type(model).__name__] = [
        accuracy_train,
        accuracy_test,
        f1_train,
        f1_test,
        rocauc_train,
        rocauc_test,
    ]



In [78]:
# Label the columns with each model's class name. The names are derived from
# the model objects, listed in the order they were evaluated, so the labels
# cannot drift out of sync if a model is swapped for another one.
results.columns = [
    type(model).__name__
    for model in [lr_model, rf_model, dt_model, gd_model]
]
results

Unnamed: 0,Unnamed: 1,LogisticRegression,RandomForestClassifier,DecisionTreeClassifier,GradientBoostingClassifier
Accuracy,train,0.528538,1.0,1.0,0.891321
Accuracy,test,0.503125,0.6625,0.575,0.65
F1,train,0.481507,1.0,1.0,0.891411
F1,test,0.45442,0.644129,0.570405,0.639232
ROCAUC,train,0.628113,1.0,1.0,0.993812
ROCAUC,test,0.631132,0.755216,0.593753,0.710766


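Три из четырёх моделей (Random Forest, Decision Tree и Gradient Boosting) заметно переобучаются: на трейне их метрики близки к 1.0 (у Gradient Boosting accuracy ≈ 0.89), а на тесте accuracy не превышает 0.66. С этим переобучением можно поработать.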

Для дальнейших экспериментов выберем `Random Forest` и `Logistic Regression`. У первой лучшее качество на тесте, а вторая не переобучается. У первой будем бороться с переобучением, у второй повышать качество.