In [28]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [13]:
df = pd.read_csv('winequality-red.csv')

In [14]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [19]:
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [20]:
df['quality'] = ['Good' if x>=7 else 'bad' for x in df['quality']]
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,bad
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad


In [22]:
X = df.drop('quality', axis=1)
y = df['quality']

In [23]:
X.shape, y.shape

((1599, 11), (1599,))

In [24]:
from sklearn.model_selection import train_test_split, cross_val_score

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17)

In [26]:
X_train.shape, X_test.shape

((1119, 11), (480, 11))

In [29]:
first_tree = DecisionTreeClassifier(random_state=17)

In [30]:
cross_val_score(first_tree, X_train, y_train, cv=5)

array([0.82142857, 0.86607143, 0.86607143, 0.85267857, 0.84304933])

In [31]:
from sklearn.neighbors import KNeighborsClassifier

In [32]:
first_KNN = KNeighborsClassifier()

In [33]:
cross_val_score(first_KNN, X_train, y_train, cv=5)

array([0.84821429, 0.87946429, 0.85714286, 0.84375   , 0.82511211])

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
tree_params = {"max_depth": np.arange(1, 11), "max_features": [0.5, 0.7, 1]}

In [36]:
tree_grid = GridSearchCV(first_tree, tree_params, cv=5, n_jobs=-1)

In [37]:
%%time
tree_grid.fit(X_train, y_train)

Wall time: 4.04 s


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=17), n_jobs=-1,
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
                         'max_features': [0.5, 0.7, 1]})

In [38]:
tree_grid.best_score_, tree_grid.best_params_

(0.8829356181934658, {'max_depth': 7, 'max_features': 0.7})

In [39]:
knn_params = {"n_neighbors": range(5, 30, 5)}  # + list(range(50, 100, 10))}

In [47]:
knn_grid = GridSearchCV(first_KNN, knn_params, cv=5)

In [48]:
%%time
knn_grid.fit(X_train, y_train);

Wall time: 584 ms


GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(5, 30, 5)})

In [49]:
knn_grid.best_score_, knn_grid.best_params_

(0.8641615951313261, {'n_neighbors': 20})

In [40]:
tree_valid_pred = tree_grid.predict(X_test)

In [41]:
from sklearn.metrics import accuracy_score

In [42]:
accuracy_score(y_test, tree_valid_pred)

0.8875

In [46]:
second_tree = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
second_tree.score(X_test, y_test)

0.89375

In [50]:
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier


In [51]:
model = RandomForestClassifier()

In [52]:
# evaluate the model 
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

In [54]:
# report model performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), std(n_scores)))

Accuracy: 0.914 (0.021)
