# Model Benchmarking

In this notebook, we will work through a simple dataset to classify wine types in two classes i.e. 1 or 0. In the process we will use mutiple models for classification to compare model performance across difference feature space and look at ROC curves.

In [1]:
# Import important libraries
import numpy as np
import pandas as pd
import matplotlib as plt

# Cross Validation - K-Fold imports

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# import different models

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

# Model Selection Specific imports

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, make_scorer

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


%matplotlib inline

  from numpy.core.umath_tests import inner1d


# Load Dataset

In [2]:
#Adding Columns, because there is no columns in the dataset 
columns = [  'name'
                 ,'alcohol'
                 ,'malicAcid'
                 ,'ash'
                 ,'ashalcalinity'
                 ,'magnesium'
                 ,'totalPhenols'
                 ,'flavanoids'
                 ,'nonFlavanoidPhenols'
                 ,'proanthocyanins'
                 ,'colorIntensity'
                 ,'hue'
                 ,'od280_od315'
                 ,'proline'
                ]

df = pd.read_csv('../data/wine.csv', names=columns)

# Analyze Train Dataset

In [3]:
df.head()

Unnamed: 0,name,alcohol,malicAcid,ash,ashalcalinity,magnesium,totalPhenols,flavanoids,nonFlavanoidPhenols,proanthocyanins,colorIntensity,hue,od280_od315,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
df.shape

(178, 14)

In [5]:
df.isnull().sum()

name                   0
alcohol                0
malicAcid              0
ash                    0
ashalcalinity          0
magnesium              0
totalPhenols           0
flavanoids             0
nonFlavanoidPhenols    0
proanthocyanins        0
colorIntensity         0
hue                    0
od280_od315            0
proline                0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,name,alcohol,malicAcid,ash,ashalcalinity,magnesium,totalPhenols,flavanoids,nonFlavanoidPhenols,proanthocyanins,colorIntensity,hue,od280_od315,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


# Create Test and Train Splits

In [7]:
X= df.drop(['name'], axis=1)

X.head()

Unnamed: 0,alcohol,malicAcid,ash,ashalcalinity,magnesium,totalPhenols,flavanoids,nonFlavanoidPhenols,proanthocyanins,colorIntensity,hue,od280_od315,proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [8]:
y = df['name']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: name, dtype: int64

In [9]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.2)

print(train_X.shape, valid_X.shape)

(142, 13) (36, 13)


# Scaling the train_X and test_x 

In [20]:
fsc = StandardScaler()
X_train = fsc.fit_transform(train_X)
X_valid = fsc.transform(valid_X)

In [21]:
models = []

models.append(("Logistic Regression:",LogisticRegression()))
models.append(("Naive Bayes:",GaussianNB()))
models.append(("K-Nearest Neighbour:",KNeighborsClassifier(n_neighbors=3)))
models.append(("Random Forest:",RandomForestClassifier(n_estimators=20)))
# models.append(("MLP:", MLPClassifier(hidden_layer_sizes=(45,30,15), solver='sgd',learning_rate_init=0.01,max_iter=500)))
models.append(("GradientBoostingClassifier:",GradientBoostingClassifier()))
models.append(("SVC:", SVC(kernel = 'rbf', random_state = 0)))

print('Models appended...')

Models appended...


In [22]:
def run_models():
    results = []
    names = []
    for name, model in models:
        kfold = KFold(n_splits=4, random_state=0)
        cv_result = cross_val_score(model, train_X, train_y.values.ravel(), cv = kfold, scoring = "accuracy")
        names.append(name)
        results.append(cv_result)

    for i in range(len(names)):
        print(names[i],results[i].mean()*100)

In [23]:
run_models()

Logistic Regression: 95.07936507936508
Naive Bayes: 96.44841269841271
K-Nearest Neighbour: 69.08730158730158
Random Forest: 97.16269841269842
MLP: 40.09920634920635
GradientBoostingClassifier: 92.99603174603175
SVC: 38.73015873015873


## Simple GridSearchCV with parameter set

In [24]:
models_gs = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes:':GaussianNB(),
    'K-Nearest Neighbour': KNeighborsClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

params_gs = { 
    'Logistic Regression': {'C':[0.01, 1.0, 10.0]},
    'K-Nearest Neighbour': {'n_neighbors':[3, 5, 8]},
    'K-Nearest Neighbour': {'n_neighbors':[3, 5, 8]},
    'RandomForestClassifier': { 'n_estimators': [32, 64, 128] },
    'GradientBoostingClassifier': { 'n_estimators': [128, 256, 512], 'learning_rate': [0.05, 0.1, 0.3, 0.9] },
    'SVC': [
#         {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
        {'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.3, 0.7, 0.9, 1.0]},
    ]
}

In [33]:
def run_models_with_GS(models_gs, params_gs):
    results = []
    for model in models_gs:
        grid_search = GridSearchCV(estimator = models_gs[model],
                                   param_grid = params_gs[model],
                                   scoring = 'accuracy',
                                   cv = 4, n_jobs = 6)
        grid_search.fit(train_X, train_y)
        best_accuracy = grid_search.best_score_
        best_parameters = grid_search.best_params_
        # here is the best accuracy
        results.append(( model, best_accuracy, best_parameters ))
    return results

# Define custom scorer

In [17]:
original_classes = []
predicted_classes = []
def accuracy_with_classification_report(y_true, y_pred):
#     Cache the original classes and predicted classes for avg
    original_classes.extend(y_true)
    predicted_classes.extend(y_pred)
#     print(classification_report(y_true, y_pred)) # print classification report
    return accuracy_score(y_true, y_pred) # return accuracy score

In [18]:
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

# Non_nested parameter search and scoring
clf = GridSearchCV(estimator=svr, param_grid=p_grid, cv=inner_cv)

def run_models():
    results = []
    names = []
    for name, model in models:
        kfold = KFold(n_splits=4, random_state=0)
        cv_result = cross_val_score(model, train_X, train_y.values.ravel(), cv = kfold, scoring = make_scorer(accuracy_with_classification_report))
        names.append(name)
        results.append(cv_result)
        print(classification_report(original_classes, predicted_classes))

    for i in range(len(names)):
        print(names[i],results[i].mean()*100)

#     print(classification_report(original_classes, predicted_classes))

# ROC Curve for each model

In [None]:
print(__doc__)

import numpy as np
from scipy import interp
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold


# #############################################################################
# Classification and ROC analysis

# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel='linear', probability=True,
                     random_state=random_state)
interp?
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

i = 0
for train, test in cv.split(X, y):
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [19]:
run_models()

             precision    recall  f1-score   support

          1       0.98      0.94      0.96        50
          2       0.93      0.95      0.94        55
          3       0.95      0.97      0.96        37

avg / total       0.95      0.95      0.95       142

             precision    recall  f1-score   support

          1       0.97      0.96      0.96       100
          2       0.95      0.94      0.94       110
          3       0.95      0.99      0.97        74

avg / total       0.96      0.96      0.96       284

             precision    recall  f1-score   support

          1       0.92      0.93      0.93       150
          2       0.86      0.84      0.85       165
          3       0.81      0.83      0.82       111

avg / total       0.87      0.87      0.87       426

             precision    recall  f1-score   support

          1       0.94      0.94      0.94       200
          2       0.88      0.87      0.88       220
          3       0.85      0.87    

# GridSearch

In [None]:
models_gs = {
    'K-Nearest Neighbour': KNeighborsClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

params_gs = { 
    'K-Nearest Neighbour': {'n_neighbors':[3, 5, 8]},
    'RandomForestClassifier': { 'n_estimators': [32, 64, 128] },
    'GradientBoostingClassifier': { 'n_estimators': [128, 256, 512], 'learning_rate': [0.05, 0.1, 0.3, 0.9] },
    'SVC': [
#         {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
        {'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.3, 0.7, 0.9, 1.0]},
    ]
}

In [None]:
def run_models_with_GS(models_gs, params_gs):
    results = []
    for model in models_gs:
        grid_search = GridSearchCV(estimator = models_gs[model],
                                   param_grid = params_gs[model],
                                   scoring = 'accuracy',
                                   cv = 4, n_jobs = 6)
        grid_search.fit(train_X, train_y)
        best_accuracy = grid_search.best_score_
        best_parameters = grid_search.best_params_
        # here is the best accuracy
        results.append(( model, best_accuracy, best_parameters ))
    return results

In [None]:
results = run_models_with_GS(models_gs, params_gs)
for model, accuracy, params in results:
    print(model, accuracy * 100, params)

# RandomSearchCV

In [None]:
models_gs = {
    'K-Nearest Neighbour': KNeighborsClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

params_gs = { 
    'K-Nearest Neighbour': {'n_neighbors':[3, 5, 8]},
    'RandomForestClassifier': { 'n_estimators': [32, 64, 128] },
    'GradientBoostingClassifier': { 'n_estimators': [128, 256, 512], 'learning_rate': [0.05, 0.1, 0.3, 0.9] },
    'SVC': [
#         {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
        {'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.3, 0.7, 0.9, 1.0]},
    ]
}

In [None]:
def run_models_with_GS(models_gs, params_gs):
    results = []
    for model in models_gs:
        grid_search = GridSearchCV(estimator = models_gs[model],
                                   param_grid = params_gs[model],
                                   scoring = 'accuracy',
                                   cv = 4, n_jobs = 6)
        grid_search.fit(train_X, train_y)
        best_accuracy = grid_search.best_score_
        best_parameters = grid_search.best_params_
        # here is the best accuracy
        results.append(( model, best_accuracy, best_parameters ))
    return results

# Resources

https://stackoverflow.com/questions/42562146/classification-report-with-nested-cross-validation-in-sklearn