In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [2]:
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep=";")
y = df.pop("quality")

In [3]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
dtypes: float64(11)
memory usage: 421.0 KB


In [5]:
df.columns.value_counts()

chlorides               1
pH                      1
fixed acidity           1
density                 1
volatile acidity        1
alcohol                 1
total sulfur dioxide    1
free sulfur dioxide     1
sulphates               1
residual sugar          1
citric acid             1
dtype: int64

In [6]:
y.value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [7]:
for i in df.columns:
    df[i] = df[i].fillna(np.mean(df[i]))

In [8]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8


In [9]:
train, test, y_train, y_test = train_test_split(
     df, y, test_size=0.2, random_state=101)

In [10]:
lr = LogisticRegression()

In [11]:
lr.fit(train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
y_pred = lr.predict(test)
print("Accuracy score baseline", accuracy_score(y_test, y_pred))

Accuracy score baseline 0.47346938775510206


In [13]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,roc_auc_score

In [14]:
print('Confusion Matrix:',confusion_matrix(y_test,y_pred))
print('Classification Report:',classification_report(y_test,y_pred))

Confusion Matrix: [[  0   0   1   4   0   0]
 [  1   0  15  25   0   0]
 [  0   0 103 201   2   0]
 [  3   0  67 361   2   0]
 [  0   0  17 141   0   0]
 [  0   0   3  33   1   0]]
Classification Report:               precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00        41
           5       0.50      0.34      0.40       306
           6       0.47      0.83      0.60       433
           7       0.00      0.00      0.00       158
           8       0.00      0.00      0.00        37

    accuracy                           0.47       980
   macro avg       0.16      0.20      0.17       980
weighted avg       0.36      0.47      0.39       980



In [15]:
def fit_predict(train, test, y_train, y_test, scaler, max_depth, 
                criterion = 'entropy', max_features = 1, min_samples_split = 4):
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)        
    dt = DecisionTreeClassifier(criterion = criterion, max_depth=max_depth, 
                                random_state=42, max_features=max_features,
                               min_samples_split=min_samples_split)
    dt.fit(train_scaled, y_train)
    y_pred = dt.predict(test_scaled)
    print(accuracy_score(y_test, y_pred))


In [16]:
dt = DecisionTreeClassifier()
dt.fit(train, y_train)
y_pred = dt.predict(test)
print(accuracy_score(y_test, y_pred))

0.5989795918367347


### Max depth tuning

In [17]:
for i in range(1, 30):
    print("Accuracy score using max_depth = ", i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), i)

Accuracy score using max_depth =  1: 0.4418367346938776
Accuracy score using max_depth =  2: 0.4418367346938776
Accuracy score using max_depth =  3: 0.4326530612244898
Accuracy score using max_depth =  4: 0.4489795918367347
Accuracy score using max_depth =  5: 0.45918367346938777
Accuracy score using max_depth =  6: 0.4704081632653061
Accuracy score using max_depth =  7: 0.4806122448979592
Accuracy score using max_depth =  8: 0.47653061224489796
Accuracy score using max_depth =  9: 0.48673469387755103
Accuracy score using max_depth =  10: 0.4928571428571429
Accuracy score using max_depth =  11: 0.4857142857142857
Accuracy score using max_depth =  12: 0.5346938775510204
Accuracy score using max_depth =  13: 0.5346938775510204
Accuracy score using max_depth =  14: 0.5469387755102041
Accuracy score using max_depth =  15: 0.5448979591836735
Accuracy score using max_depth =  16: 0.5428571428571428
Accuracy score using max_depth =  17: 0.5418367346938775
Accuracy score using max_depth =  18:

### Max feature tuning

In [18]:
for i in np.arange(0.1,1.0,0.1):
    print("Accuracy score using max_features = ", i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth=25, max_features=i)

Accuracy score using max_features =  0.1: 0.5255102040816326
Accuracy score using max_features =  0.2: 0.6173469387755102
Accuracy score using max_features =  0.30000000000000004: 0.5989795918367347
Accuracy score using max_features =  0.4: 0.5928571428571429
Accuracy score using max_features =  0.5: 0.576530612244898
Accuracy score using max_features =  0.6: 0.5846938775510204
Accuracy score using max_features =  0.7000000000000001: 0.5979591836734693
Accuracy score using max_features =  0.8: 0.5683673469387756
Accuracy score using max_features =  0.9: 0.5887755102040816


### Min Samples split tunning

In [19]:
for i in range(2,10):
    print("Accuracy score using max_features = ", i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth=25, max_features=0.3, min_samples_split=i)

Accuracy score using max_features =  2: 0.5938775510204082
Accuracy score using max_features =  3: 0.5836734693877551
Accuracy score using max_features =  4: 0.5989795918367347
Accuracy score using max_features =  5: 0.5714285714285714
Accuracy score using max_features =  6: 0.5744897959183674
Accuracy score using max_features =  7: 0.5693877551020409
Accuracy score using max_features =  8: 0.5683673469387756
Accuracy score using max_features =  9: 0.55


###  Criterion Tunning

In [20]:
for i in ['gini', 'entropy']:
    print("Accuracy score using criterion = ", i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 25, max_features=0.3, min_samples_split=2, criterion=i) 

Accuracy score using criterion =  gini: 0.5938775510204082
Accuracy score using criterion =  entropy: 0.5938775510204082


In [21]:
def create_poly(train, test, degree):
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    test_poly = poly.fit_transform(test)
    return train_poly, test_poly

In [22]:
for degree in [1,2,3,4]:
    train_poly, test_poly = create_poly(train, test, degree)
    print("Polynominal degree", degree)
    fit_predict(train, test, y_train, y_test, StandardScaler(), 25, max_features=0.3, min_samples_split=2, criterion='entropy')
    print(10*'-')

Polynominal degree 1
0.5938775510204082
----------
Polynominal degree 2
0.5938775510204082
----------
Polynominal degree 3
0.5938775510204082
----------
Polynominal degree 4
0.5938775510204082
----------


In [23]:
def feat_eng(df):
    df['eng1'] = df['fixed acidity'] * df['pH']
    df['eng2'] = df['total sulfur dioxide'] * df['free sulfur dioxide']
    df['eng3'] = df['sulphates'] * df['chlorides']
    df['eng4'] = df['chlorides'] * df['sulphates']
    return df

train = feat_eng(train)
test = feat_eng(test)
print("Additional Feature Engineering:")
                
fit_predict(train, test, y_train, y_test, StandardScaler(), 25, max_features=0.3, min_samples_split=2, criterion='entropy')
train_poly, test_poly = create_poly(train, test, degree)             
fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), 25, max_features=0.3, min_samples_split=2, criterion='entropy')

Additional Feature Engineering:
0.6122448979591837
0.5918367346938775


In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
rf = RandomForestClassifier()

In [26]:
rf.fit(train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
pred_rf = rf.predict(test)

In [28]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, pred_rf))

0.6928571428571428


In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
params = {'n_estimators': [200,500,700], 'max_depth': [10,15,18,20], 'min_samples_leaf': [3,5,7]}

In [31]:
gs = GridSearchCV(rf, params, verbose=3)

In [None]:
gs.fit(train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.617, total=   1.3s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.597, total=   1.3s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.5s remaining:    0.0s


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.619, total=   1.3s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.621, total=   1.3s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.613, total=   1.2s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.622, total=   4.6s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.603, total=   4.7s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.619, total=   3.6s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=

[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.653, total=   5.2s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.639, total=   5.0s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.642, total=   7.4s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.654, total=   5.3s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.641, total=   5.7s
[CV] max_depth=15, min_samples_leaf=5, n_estimators=200 ..............
[CV]  max_depth=15, min_samples_leaf=5, n_estimators=200, score=0.633, total=   2.7s
[CV] max_depth=15, min_samples_leaf=5, n_estimators=200 ..............
[CV]  max_depth=15, min_samples_leaf=5, n_estimators=200, score=

[CV]  max_depth=18, min_samples_leaf=5, n_estimators=200, score=0.630, total=   1.3s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=200 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=200, score=0.628, total=   1.4s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.648, total=   3.4s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.617, total=   3.5s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.635, total=   3.2s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.633, total=   3.3s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=

In [None]:
gs.best_estimator_

In [None]:
gs.best_params_

In [None]:
rf1 = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
rf1.fit(train,y_train)

In [None]:
pred_rf1 = rf1.predict(test)

In [None]:
print(accuracy_score(y_test, pred_rf1))

In [None]:
rf1.oob_score

In [None]:
rf1.feature_importances_

In [None]:
sorted(list(zip(rf1.feature_importances_, df.columns)), reverse=True)

In [None]:
df.corr()