<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Load the wine dataset from the working directory and preview the first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a saved
# row index rather than a measurement — confirm before treating it as a feature.
df = pd.read_csv(filepath_or_buffer="wine_data.csv")
df.head(n=5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,11.6,0.58,0.66,2.2,0.074,10.0,47.0,1.0008,3.25,0.57,9.0,3
1,10.4,0.61,0.49,2.1,0.2,5.0,16.0,0.9994,3.16,0.63,8.4,3
2,7.4,1.185,0.0,4.25,0.097,5.0,14.0,0.9966,3.63,0.54,10.7,3
3,10.4,0.44,0.42,1.5,0.145,34.0,48.0,0.99832,3.38,0.86,9.9,3
4,8.3,1.02,0.02,3.4,0.084,6.0,11.0,0.99892,3.48,0.49,11.0,3


In [4]:
# Split features/target.
# The frame carries an "Unnamed: 0" column (visible in the df.head() preview) that
# looks like a row index saved into the CSV rather than a measured wine property.
# It is excluded from X so the model is trained on the physicochemical measurements
# only and cannot split on row position.
index_artifacts = [col for col in df.columns if str(col).startswith("Unnamed")]
X = df.drop(columns=[*index_artifacts, "quality"])
y = df["quality"]

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Candidate decision-tree hyperparameters for the grid search:
# 2 criteria x 4 depths x 3 split thresholds = 24 combinations.
param_grid = dict(
    criterion=["gini", "entropy"],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
)

In [12]:
# Search param_grid for a seeded decision tree using 3-fold cross-validation
# on the training split; verbose=1 prints the number of fits.
model = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [14]:
# Show the hyperparameter combination the grid search selected.
print(grid_search.best_params_)

{'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}


In [15]:
# Take the best estimator found by the search and predict on the held-out test rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Fraction of test rows whose predicted quality matches the true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.5954761904761905


In [17]:
# Per-class precision, recall, F1 and support for the test-set predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           3       0.60      0.65      0.62       566
           4       0.61      0.58      0.59       627
           5       0.59      0.54      0.56       621
           6       0.63      0.65      0.64       580
           7       0.56      0.50      0.53       625
           8       0.59      0.63      0.61       608
           9       0.59      0.64      0.61       573

    accuracy                           0.60      4200
   macro avg       0.60      0.60      0.60      4200
weighted avg       0.59      0.60      0.59      4200



In [22]:
# Two-step pipeline: standardise the features, then fit a seeded decision tree.
# The step names ("scaler", "model") are what the "model__..." grid keys refer to.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", DecisionTreeClassifier(random_state=42)),
    ]
)


In [25]:
# Parameter grid for the pipeline: each tree hyperparameter is prefixed with
# "model__" so it is addressed to the pipeline step named "model".
param_grid = {
    f"model__{name}": candidates
    for name, candidates in (
        ("criterion", ["gini", "entropy"]),
        ("max_depth", [3, 5, 10, None]),
        ("min_samples_split", [2, 5, 10]),
    )
}

In [26]:
# Repeat the 3-fold grid search, this time over the scaler + tree pipeline.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
)
grid_search.fit(X_train, y_train)

In [29]:
# Show the hyperparameter combination selected for the pipeline search.
print(grid_search.best_params_)

{'model__criterion': 'gini', 'model__max_depth': None, 'model__min_samples_split': 2}


In [30]:
# Take the best pipeline found by the search, predict on the held-out test rows,
# and report the fraction of correct predictions.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.5942857142857143
