In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [2]:
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ';')
y = df.pop('quality')

In [3]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [4]:
y.value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [5]:
for i in df.columns:
    df[i] = df[i].fillna(np.mean(df[i]))
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2)

In [6]:
lr = LogisticRegression()
lr.fit(train, y_train)
y_pred = lr.predict(test)
print('Accuracy score baseline:', accuracy_score(y_test, y_pred))

Accuracy score baseline: 0.45918367346938777


In [7]:
def fit_predict(train, test, y_train, y_test, scaler, max_depth, 
                criterion = 'entropy', max_features = 1, min_samples_split = 4):
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)        
    dt = DecisionTreeClassifier(criterion = criterion, max_depth=max_depth, 
                                random_state=42, max_features=max_features,
                               min_samples_split=min_samples_split)
    dt.fit(train_scaled, y_train)
    y_pred = dt.predict(test_scaled)
    print(accuracy_score(y_test, y_pred))

In [8]:
dt = DecisionTreeClassifier()
dt.fit(train, y_train)
y_pred = dt.predict(test)
print(accuracy_score(y_test, y_pred))

0.6112244897959184


In [9]:
from sklearn.metrics import classification_report,confusion_matrix

In [10]:
print(confusion_matrix(y_test,y_pred))

[[  0   1   3   1   0   0   0]
 [  1   8   6   8   1   1   0]
 [  0  14 183  83   6   5   0]
 [  0   9  82 272  56  12   1]
 [  0   2   6  53 120  11   0]
 [  1   0   0   8   9  16   1]
 [  0   0   0   0   0   0   0]]


In [11]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.24      0.32      0.27        25
           5       0.65      0.63      0.64       291
           6       0.64      0.63      0.63       432
           7       0.62      0.62      0.62       192
           8       0.36      0.46      0.40        35
           9       0.00      0.00      0.00         0

    accuracy                           0.61       980
   macro avg       0.36      0.38      0.37       980
weighted avg       0.62      0.61      0.61       980



### Max depth tuning

In [12]:
for i in range(1, 20):
    print('Accuracy score using max_depth =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), i)

Accuracy score using max_depth = 1: 0.44081632653061226
Accuracy score using max_depth = 2: 0.44081632653061226
Accuracy score using max_depth = 3: 0.4530612244897959
Accuracy score using max_depth = 4: 0.4602040816326531
Accuracy score using max_depth = 5: 0.48673469387755103
Accuracy score using max_depth = 6: 0.45918367346938777
Accuracy score using max_depth = 7: 0.49795918367346936
Accuracy score using max_depth = 8: 0.503061224489796
Accuracy score using max_depth = 9: 0.5183673469387755
Accuracy score using max_depth = 10: 0.4969387755102041
Accuracy score using max_depth = 11: 0.5142857142857142
Accuracy score using max_depth = 12: 0.49183673469387756
Accuracy score using max_depth = 13: 0.576530612244898
Accuracy score using max_depth = 14: 0.5714285714285714
Accuracy score using max_depth = 15: 0.5336734693877551
Accuracy score using max_depth = 16: 0.5489795918367347
Accuracy score using max_depth = 17: 0.5520408163265306
Accuracy score using max_depth = 18: 0.57755102040816

### Max features tuning

In [13]:
for i in np.arange(0.1, 1.0, 0.1):
    print('Accuracy score using max features =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth = 18, max_features=i)

Accuracy score using max features = 0.1: 0.5775510204081633
Accuracy score using max features = 0.2: 0.6112244897959184
Accuracy score using max features = 0.30000000000000004: 0.6071428571428571
Accuracy score using max features = 0.4: 0.5959183673469388
Accuracy score using max features = 0.5: 0.6030612244897959
Accuracy score using max features = 0.6: 0.5826530612244898
Accuracy score using max features = 0.7000000000000001: 0.6020408163265306
Accuracy score using max features = 0.8: 0.5724489795918367
Accuracy score using max features = 0.9: 0.573469387755102


### Min samples split tuning

In [14]:
for i in range(2, 10):
    print('Accuracy score using min samples split =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 18, max_features=0.3, min_samples_split=i)

Accuracy score using min samples split = 2: 0.6153061224489796
Accuracy score using min samples split = 3: 0.5928571428571429
Accuracy score using min samples split = 4: 0.6071428571428571
Accuracy score using min samples split = 5: 0.5612244897959183
Accuracy score using min samples split = 6: 0.5795918367346938
Accuracy score using min samples split = 7: 0.563265306122449
Accuracy score using min samples split = 8: 0.5938775510204082
Accuracy score using min samples split = 9: 0.5540816326530612


### Criterion tuning

In [15]:
for i in ['gini', 'entropy']:
    print('Accuracy score using criterion =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')

Accuracy score using criterion = gini: 0.6153061224489796
Accuracy score using criterion = entropy: 0.6153061224489796


In [16]:
def create_poly(train,test,degree):
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    test_poly = poly.fit_transform(test)
    return train_poly,test_poly

In [22]:
for degree in [1,2,3,4]:
    train_poly, test_poly = create_poly(train, test, degree)
    print("Train and Test samples shapes before degreed",train.shape,test.shape)
    print('Polynomial degree',degree)
    print("Train and Test samples shapes after degreed",train_poly.shape,test_poly.shape)
    fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')
    print(10*'-')
    
train_poly, test_poly = create_poly(train, test, 2) 

Train and Test samples shapes before degreed (3918, 15) (980, 15)
Polynomial degree 1
Train and Test samples shapes after degreed (3918, 16) (980, 16)
0.5948979591836735
----------
Train and Test samples shapes before degreed (3918, 15) (980, 15)
Polynomial degree 2
Train and Test samples shapes after degreed (3918, 136) (980, 136)
0.6173469387755102
----------
Train and Test samples shapes before degreed (3918, 15) (980, 15)
Polynomial degree 3
Train and Test samples shapes after degreed (3918, 816) (980, 816)
0.5826530612244898
----------
Train and Test samples shapes before degreed (3918, 15) (980, 15)
Polynomial degree 4
Train and Test samples shapes after degreed (3918, 3876) (980, 3876)
0.5979591836734693
----------


In [18]:
def feat_eng(df):
    df['eng1'] = df['fixed acidity'] * df['pH']
    df['eng2'] = df['total sulfur dioxide'] / df['free sulfur dioxide']
    df['eng3'] = df['sulphates'] / df['chlorides']
    df['eng4'] = df['chlorides'] / df['sulphates']
    return df

train = feat_eng(train)
test = feat_eng(test)

print('Additional feature engineering:')

fit_predict(train, test, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')

train_poly, test_poly = create_poly(train, test, 2)

fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')


Additional feature engineering:
0.5938775510204082
0.6173469387755102


In [19]:
original_score = 0.514285714286
best_score = 0.625510204082
improvement = np.abs(np.round(100*(original_score - best_score)/original_score,2))
print('overall improvement is {} %'.format(improvement))

overall improvement is 21.63 %
