In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Seed NumPy's global random state.
np.random.seed(42)

In [3]:
# Load the semicolon-delimited CSV and display its column names.
df = pd.read_csv("winequality-white.csv", sep = ';')
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [4]:
# Reload from disk so this cell can be re-run safely: `pop` below removes the
# target column from `df` in place.
df = pd.read_csv("winequality-white.csv", sep=';')
y = df.pop('quality')  # target; `df` now holds only the remaining columns
# Impute missing values with the mean of their own column.
df = df.fillna(df.mean())
# Hold out 20% of the rows for evaluation. random_state is pinned so the split
# does not depend on how much of the global NumPy random state was consumed
# before this cell ran (e.g. when cells are re-executed out of order).
train, test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

In [4]:
# Bare name as the cell's last expression: the notebook displays what `LogisticRegression` resolves to.
LogisticRegression

sklearn.linear_model.logistic.LogisticRegression

In [5]:
# Baseline: logistic regression with default settings, fitted on `train`
# and scored on the held-out `test` rows.
lr = LogisticRegression().fit(train, y_train)
y_pred = lr.predict(test)
baseline_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy score baseline:', baseline_accuracy)

Accuracy score baseline: 0.5153061224489796


In [7]:
# Random forest with 20 trees; oob_score=True requests an out-of-bag score during fit.
# NOTE(review): no random_state is passed here, unlike in fit_predict — confirm that is intended.
rf = RandomForestClassifier(n_estimators = 20, oob_score=True)
rf.fit(train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

In [8]:
# Display the out-of-bag score of the fitted forest `rf`.
rf.oob_score_

0.6345074017355794

In [18]:
def fit_predict(train, test, y_train, y_test, max_depth=None,
                n_estimators=10, max_features='auto', min_samples_split=2, scaler=None):
    """Fit a random forest on the training data and print its scores.

    Parameters
    ----------
    train, test
        Feature data used for fitting and for evaluation, respectively.
    y_train, y_test
        Targets matching ``train`` and ``test``.
    max_depth, n_estimators, max_features, min_samples_split
        Forwarded to ``RandomForestClassifier``. The string ``'auto'`` for
        ``max_features`` is translated to ``'sqrt'`` before forwarding.
    scaler
        Optional transformer exposing ``fit_transform`` / ``transform``.
        When given, it is fitted on ``train`` only and then applied to
        ``test``.

    Returns
    -------
    None
        Two lines are printed: the accuracy on ``test``, then the forest's
        out-of-bag score. Callers typically print a label with ``end=': '``
        first, so the accuracy completes that line.
    """
    if scaler:
        # Fit the scaler on the training rows only; the test rows are just transformed.
        train = scaler.fit_transform(train)
        test = scaler.transform(test)

    # 'auto' meant sqrt(n_features) for classifiers and is no longer accepted by
    # recent scikit-learn releases; 'sqrt' selects the same behaviour everywhere.
    if isinstance(max_features, str) and max_features == 'auto':
        max_features = 'sqrt'

    forest = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                    random_state=42, max_features=max_features,
                                    min_samples_split=min_samples_split, oob_score=True)
    forest.fit(train, y_train)
    y_pred = forest.predict(test)
    print(accuracy_score(y_test, y_pred))
    print(forest.oob_score_)

In [19]:
# Untuned forest, once on the raw features and once with standardisation.
for label, scaler in (('baseline accuracy score', None),
                      ('baseline accuracy score with scaler', StandardScaler())):
    print(label, end=': ')
    fit_predict(train, test, y_train, y_test, scaler=scaler)

baseline accuracy score: 0.6448979591836734
0.5890760592138846
baseline accuracy score with scaler: 0.6418367346938776
0.5872894333843798


In [8]:
# Sweep the number of trees (20, 40, ..., 180); every other setting stays at
# fit_predict's defaults.
# NOTE(review): the printed accuracy is measured on `test`; picking settings from
# it would tune on the evaluation data — confirm this is acceptable here.
n_estimators_grid = range(20, 200, 20)
for n_estimators in n_estimators_grid:
    print('Accuracy score using n_estimators =', n_estimators, end=': ')
    fit_predict(train, test, y_train, y_test, n_estimators=n_estimators)


Accuracy score using n_estimators = 20: 0.6612244897959184
Accuracy score using n_estimators = 40: 0.6775510204081633
Accuracy score using n_estimators = 60: 0.6846938775510204
Accuracy score using n_estimators = 80: 0.6877551020408164
Accuracy score using n_estimators = 100: 0.689795918367347
Accuracy score using n_estimators = 120: 0.6979591836734694
Accuracy score using n_estimators = 140: 0.6908163265306122
Accuracy score using n_estimators = 160: 0.6979591836734694
Accuracy score using n_estimators = 180: 0.6948979591836735


In [9]:
# Sweep tree depth over 1..19 with the forest size fixed at 160 trees.
max_depth_grid = range(1, 20)
for max_depth in max_depth_grid:
    print('Accuracy score using max_depth =', max_depth, end=': ')
    fit_predict(train, test, y_train, y_test, max_depth=max_depth, n_estimators=160)


Accuracy score using max_depth = 1: 0.44081632653061226
Accuracy score using max_depth = 2: 0.4897959183673469
Accuracy score using max_depth = 3: 0.49387755102040815
Accuracy score using max_depth = 4: 0.5051020408163265
Accuracy score using max_depth = 5: 0.5244897959183673
Accuracy score using max_depth = 6: 0.536734693877551
Accuracy score using max_depth = 7: 0.563265306122449
Accuracy score using max_depth = 8: 0.5826530612244898
Accuracy score using max_depth = 9: 0.5948979591836735
Accuracy score using max_depth = 10: 0.6091836734693877
Accuracy score using max_depth = 11: 0.6469387755102041
Accuracy score using max_depth = 12: 0.6744897959183673
Accuracy score using max_depth = 13: 0.6816326530612244
Accuracy score using max_depth = 14: 0.6979591836734694
Accuracy score using max_depth = 15: 0.7
Accuracy score using max_depth = 16: 0.6938775510204082
Accuracy score using max_depth = 17: 0.6989795918367347
Accuracy score using max_depth = 18: 0.6989795918367347
Accuracy score u

In [10]:
# Sweep max_features over ten evenly spaced float values from 0.1 to 1.0,
# keeping n_estimators=160 and max_depth=18 fixed.
max_features_grid = np.linspace(0.1, 1, 10)
for max_features in max_features_grid:
    print('Accuracy score using max_features =', max_features, end=': ')
    fit_predict(train, test, y_train, y_test,
                max_depth=18, n_estimators=160, max_features=max_features)


Accuracy score using max_features = 0.1: 0.6979591836734694
Accuracy score using max_features = 0.2: 0.7030612244897959
Accuracy score using max_features = 0.30000000000000004: 0.6989795918367347
Accuracy score using max_features = 0.4: 0.6969387755102041
Accuracy score using max_features = 0.5: 0.6989795918367347
Accuracy score using max_features = 0.6: 0.6938775510204082
Accuracy score using max_features = 0.7000000000000001: 0.6989795918367347
Accuracy score using max_features = 0.8: 0.7010204081632653
Accuracy score using max_features = 0.9: 0.6979591836734694
Accuracy score using max_features = 1.0: 0.7051020408163265


In [11]:
# Sweep min_samples_split over 2..9 with n_estimators=160, max_features=0.2
# and max_depth=18 held fixed.
min_samples_split_grid = range(2, 10)
for min_samples_split in min_samples_split_grid:
    print('Accuracy score using min_samples_split =', min_samples_split, end=': ')
    fit_predict(
        train, test, y_train, y_test,
        max_depth=18, n_estimators=160, max_features=0.2,
        min_samples_split=min_samples_split,
    )


Accuracy score using min_samples_split = 2: 0.7030612244897959
Accuracy score using min_samples_split = 3: 0.7193877551020408
Accuracy score using min_samples_split = 4: 0.7030612244897959
Accuracy score using min_samples_split = 5: 0.6979591836734694
Accuracy score using min_samples_split = 6: 0.6928571428571428
Accuracy score using min_samples_split = 7: 0.6816326530612244
Accuracy score using min_samples_split = 8: 0.6775510204081633
Accuracy score using min_samples_split = 9: 0.6724489795918367


In [12]:
# Settings shared by both tuned runs, written once so they cannot drift apart.
tuned_params = dict(n_estimators=160, max_features=0.2, min_samples_split=3, max_depth=18)

print('tuned accuracy score', end=': ')
fit_predict(train, test, y_train, y_test, **tuned_params)

print('tuned accuracy score with scaler', end=': ')
fit_predict(train, test, y_train, y_test, scaler=StandardScaler(), **tuned_params)

tuned accuracy score: 0.7193877551020408
tuned accuracy score with scaler: 0.7193877551020408


In [13]:
# Relative change from `original_score` to `best_score`, in percent.
# NOTE(review): both scores are literals, presumably copied from earlier cell
# outputs. The logistic-regression baseline output shown in this notebook reads
# 0.5153061224489796, which differs from the value below — confirm which is current.
original_score = 0.514285714286
best_score = 0.7193877551020408
# Signed relative change: positive means best_score is higher than original_score.
# (Taking the absolute value instead would report a drop in accuracy as an improvement.)
improvement = np.round(100 * (best_score - original_score) / original_score, 2)
print('overall improvement is {} %'.format(improvement))

overall improvement is 39.88 %


In [14]:
# Relative change from `original_score` to `best_score`, in percent.
# NOTE(review): both scores are literals, presumably copied from earlier cell
# outputs. The untuned-forest output shown in this notebook reads
# 0.6448979591836734, which differs from the value below — confirm which is current.
original_score = 0.6428571428571429
best_score = 0.7193877551020408
# Signed relative change: positive means best_score is higher than original_score.
# (Taking the absolute value instead would report a drop in accuracy as an improvement.)
improvement = np.round(100 * (best_score - original_score) / original_score, 2)
print('overall improvement compare to non tuned model is {} %'.format(improvement))

overall improvement compare to non tuned model is 11.9 %
