# Build and Compare the Models

In [1]:
#Create a function to print out the results for each model
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [2]:
import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV

tr_features = pd.read_csv('train_features.csv')
tr_labels = pd.read_csv('train_labels.csv')

### Logistic Regression: Fit and evaluate a model 

In [3]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=500)
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

lrcv = GridSearchCV(lr, parameters, cv=5)
lrcv.fit(tr_features, tr_labels.values.ravel())

print_results(lrcv)

BEST PARAMS: {'C': 1}

0.672 (+/-0.06) for {'C': 0.001}
0.706 (+/-0.064) for {'C': 0.01}
0.788 (+/-0.09) for {'C': 0.1}
0.813 (+/-0.063) for {'C': 1}
0.813 (+/-0.063) for {'C': 10}
0.813 (+/-0.063) for {'C': 100}
0.813 (+/-0.063) for {'C': 1000}


In [4]:
#Save best model
joblib.dump(lrcv.best_estimator_, 'LR_model.pkl')

['LR_model.pkl']

### Support Vector Machines: Fit and evaluate a model

In [5]:
from sklearn.svm import SVC

svc = SVC()
parameters = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10]
}

svmcv = GridSearchCV(svc, parameters, cv=5)
svmcv.fit(tr_features, tr_labels.values.ravel())

print_results(svmcv)

BEST PARAMS: {'C': 0.1, 'kernel': 'linear'}

0.809 (+/-0.032) for {'C': 0.1, 'kernel': 'linear'}
0.665 (+/-0.06) for {'C': 0.1, 'kernel': 'rbf'}
0.809 (+/-0.032) for {'C': 1, 'kernel': 'linear'}
0.672 (+/-0.073) for {'C': 1, 'kernel': 'rbf'}
0.809 (+/-0.032) for {'C': 10, 'kernel': 'linear'}
0.695 (+/-0.069) for {'C': 10, 'kernel': 'rbf'}


In [6]:
#Save best model
joblib.dump(svmcv.best_estimator_, 'SVM_model.pkl')

['SVM_model.pkl']

### Multilayer Perceptron: Fit and evaluate a model

In [7]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(max_iter=1000)
parameters = {
    'hidden_layer_sizes': [(10,), (50,), (100,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

mlpcv = GridSearchCV(mlp, parameters, cv=5)
mlpcv.fit(tr_features, tr_labels.values.ravel())

print_results(mlpcv)

BEST PARAMS: {'activation': 'tanh', 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling'}

0.788 (+/-0.082) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'constant'}
0.76 (+/-0.179) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling'}
0.802 (+/-0.067) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive'}
0.798 (+/-0.082) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
0.777 (+/-0.106) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling'}
0.798 (+/-0.073) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive'}
0.764 (+/-0.098) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
0.786 (+/-0.076) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling'}
0.792 (+/-0.097) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learn



In [8]:
#Save best model
joblib.dump(mlpcv.best_estimator_, 'MLP_model.pkl')

['MLP_model.pkl']

### Random Forest: Fit and evaluate a model

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}

rfcv = GridSearchCV(rf, parameters, cv=5)
rfcv.fit(tr_features, tr_labels.values.ravel())

print_results(rfcv)

BEST PARAMS: {'max_depth': 4, 'n_estimators': 250}

0.788 (+/-0.067) for {'max_depth': 2, 'n_estimators': 5}
0.815 (+/-0.037) for {'max_depth': 2, 'n_estimators': 50}
0.818 (+/-0.057) for {'max_depth': 2, 'n_estimators': 250}
0.813 (+/-0.073) for {'max_depth': 4, 'n_estimators': 5}
0.848 (+/-0.038) for {'max_depth': 4, 'n_estimators': 50}
0.852 (+/-0.039) for {'max_depth': 4, 'n_estimators': 250}
0.803 (+/-0.06) for {'max_depth': 8, 'n_estimators': 5}
0.83 (+/-0.027) for {'max_depth': 8, 'n_estimators': 50}
0.831 (+/-0.039) for {'max_depth': 8, 'n_estimators': 250}
0.811 (+/-0.084) for {'max_depth': 16, 'n_estimators': 5}
0.822 (+/-0.031) for {'max_depth': 16, 'n_estimators': 50}
0.83 (+/-0.014) for {'max_depth': 16, 'n_estimators': 250}
0.8 (+/-0.03) for {'max_depth': 32, 'n_estimators': 5}
0.83 (+/-0.014) for {'max_depth': 32, 'n_estimators': 50}
0.824 (+/-0.021) for {'max_depth': 32, 'n_estimators': 250}
0.805 (+/-0.048) for {'max_depth': None, 'n_estimators': 5}
0.82 (+/-0.035) for

In [10]:
#Save best model
joblib.dump(rfcv.best_estimator_, "RF_model.pkl")

['RF_model.pkl']

### Boosting: Fit and evaluate a model

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
parameters = {
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 1, 10, 100]
}

gbcv = GridSearchCV(gb, parameters, cv=5)
gbcv.fit(tr_features, tr_labels.values.ravel())

print_results(gbcv)

BEST PARAMS: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}

0.618 (+/-0.005) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 5}
0.809 (+/-0.032) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}
0.809 (+/-0.032) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 250}
0.826 (+/-0.032) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 500}
0.618 (+/-0.005) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5}
0.818 (+/-0.042) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
0.845 (+/-0.065) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}
0.85 (+/-0.057) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
0.618 (+/-0.005) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 5}
0.82 (+/-0.069) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
0.841 (+/-0.04) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 250}
0.835 (+/-0.055) for {'learning_rat

In [12]:
#Save best model
joblib.dump(gbcv.best_estimator_, "GB_model.pkl")

['GB_model.pkl']

## Compare the models

In [13]:
#Create a function to output the evaluation metrics
def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(labels, pred), 3)
    precision = round(precision_score(labels, pred), 3)
    recall = round(recall_score(labels, pred), 3)
    print('{} -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}ms'.format(name,
                                                                                   accuracy,
                                                                                   precision,
                                                                                   recall,
                                                                                   round((end - start)*1000, 1)))

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from time import time

val_features = pd.read_csv('val_features.csv')
val_labels = pd.read_csv('val_labels.csv')

te_features = pd.read_csv('test_features.csv')
te_labels = pd.read_csv('test_labels.csv')

In [15]:
#Load the models to compare
models = {} 

for mdl in ['LR', 'SVM', 'MLP', 'RF', 'GB']:
    models[mdl] = joblib.load("{}_model.pkl".format(mdl))

In [16]:
#Test the models on the validation data
for name, mdl in models.items():
    evaluate_model(name, mdl, val_features, val_labels)

LR -- Accuracy: 0.803 / Precision: 0.82 / Recall: 0.676 / Latency: 1.6ms
SVM -- Accuracy: 0.747 / Precision: 0.746 / Recall: 0.595 / Latency: 1.7ms
MLP -- Accuracy: 0.809 / Precision: 0.823 / Recall: 0.689 / Latency: 1.6ms
RF -- Accuracy: 0.798 / Precision: 0.828 / Recall: 0.649 / Latency: 31.8ms
GB -- Accuracy: 0.787 / Precision: 0.833 / Recall: 0.608 / Latency: 2.4ms


In [17]:
#Evaluate the best model on the test data
evaluate_model('Multilayer Perceptron', models['MLP'], te_features, te_labels)

Multilayer Perceptron -- Accuracy: 0.771 / Precision: 0.683 / Recall: 0.672 / Latency: 1.7ms
