# ML Models

This notebook will create Machine Learning models and test their performance.

In [None]:
import pandas as pd
import numpy as np
import pickle
import os

import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import joblib

import model_utils as M

%matplotlib inline

Import the data and split into Training and Testing sets.

In [None]:
# Folder that holds the engineered feature CSV files.
DATA_DIR = '../6_feature_engineering/Feature_Dataset/'

# CSV file names in lexicographic order, so the list order is stable between runs.
data_files = sorted(name for name in os.listdir(DATA_DIR) if name.endswith('.csv'))

# Convert booleans to int: these columns are cast to int64 in every frame.
int64_columns = dict.fromkeys(
    ['IS_IN_1ST_QUARTILE', 'IS_IN_2ND_QUARTILE', 'IS_IN_3RD_QUARTILE', 'EDGE_IN_SOL'],
    'int64',
)

# One DataFrame per CSV file, each with a fresh 0..n-1 index.
df_TSP_list = [
    pd.read_csv(DATA_DIR + name).astype(int64_columns).reset_index(drop=True)
    for name in data_files
]

df_TSP_list[0].head()

`1` is True and `0` is False.

In [None]:
# Number of leading frames (in sorted file order) used for training;
# every remaining frame is held out for testing.
N_TRAIN_GRAPHS = 5

train_set = df_TSP_list[:N_TRAIN_GRAPHS]
test_set = df_TSP_list[N_TRAIN_GRAPHS:]

# Fail fast with a clear message: an empty list here would otherwise only
# surface later as an opaque error when the frames are concatenated.
assert train_set and test_set, (
    "Need more than {} data files to build both sets, found {}".format(
        N_TRAIN_GRAPHS, len(df_TSP_list)))

print("Train Data Graphs:\t{}".format(len(train_set)))
print("Test Data Graphs:\t{}".format(len(test_set)))

In [None]:
# Stack the per-file frames into one training frame and one testing frame,
# renumbering the rows from 0.
df_train = pd.concat(train_set, ignore_index=True)
df_test = pd.concat(test_set, ignore_index=True)

# pop() removes the target column, so df_train / df_test keep only the features.
y_train = df_train.pop('EDGE_IN_SOL').to_numpy()
y_test = df_test.pop('EDGE_IN_SOL').to_numpy()
X_train = df_train.to_numpy()
X_test = df_test.to_numpy()

# Report the array shapes as a quick sanity check.
for label, array in (("X_train shape: ", X_train),
                     ("X_test shape: ", X_test),
                     ("y_train shape: ", y_train),
                     ("y_test shape: ", y_test)):
    print(label, array.shape)

The data is split by graph: the first five graphs are used for training and the remaining graphs for testing (a 50%/50% split when the dataset contains ten graphs).

## Naïve Bayes

In [None]:
# Make sure the output directory exists; joblib.dump does not create it and
# would raise FileNotFoundError on a fresh checkout.
os.makedirs('Models', exist_ok=True)

# Fit Gaussian Naive Bayes on the training data and predict the test labels.
gnb = GaussianNB()
TSP_NB = gnb.fit(X_train, y_train)  # fit() returns the fitted estimator itself
y_dash_NB = TSP_NB.predict(X_test)
joblib.dump(TSP_NB, 'Models/TSP_NB_model.pkl') # Save Model

### Accuracy & Confusion Matrix
With the confusion matrix, rows are actual and columns are predicted.   
If 0 is negative and 1 is positive `C(0,0)` is TN and `C(1,1)` is TP.  
TN, FP  
FN, TP

In [None]:
# Score the Naive Bayes predictions against the held-out labels.
acc = accuracy_score(y_test, y_dash_NB)
confusion = confusion_matrix(y_test, y_dash_NB)
f1 = f1_score(y_test, y_dash_NB)
precision = precision_score(y_test, y_dash_NB)
recall = recall_score(y_test, y_dash_NB)

# Print each metric with its template; scalar scores are shown to two decimals.
report_lines = [
    ("Accuracy: {0:.2f}", acc),
    ("Confusion matrix:\n{}", confusion),
    ("F1 score: {0:.2f}", f1),
    ("Precision score: {0:.2f}", precision),
    ("Recall score: {0:.2f}", recall),
]
for template, value in report_lines:
    print(template.format(value))

## Logistic Regression Classifier

In [None]:
# Fit a seeded logistic-regression classifier (L-BFGS solver) on the training
# data; fit() returns the estimator, so construction and fitting are chained.
TSP_LR = LogisticRegression(solver='lbfgs', random_state=0).fit(X_train, y_train)

# Predict the test labels and persist the fitted model.
y_dash_LR = TSP_LR.predict(X_test)
joblib.dump(TSP_LR, 'Models/TSP_LR_model.pkl') # Save Model

In [None]:
# Score the Logistic Regression predictions against the held-out labels.
acc = accuracy_score(y_test, y_dash_LR)
confusion = confusion_matrix(y_test, y_dash_LR)
f1 = f1_score(y_test, y_dash_LR)
precision = precision_score(y_test, y_dash_LR)
recall = recall_score(y_test, y_dash_LR)

# Print each metric with its template; scalar scores are shown to two decimals.
report_lines = [
    ("Accuracy: {0:.2f}", acc),
    ("Confusion matrix:\n{}", confusion),
    ("F1 score: {0:.2f}", f1),
    ("Precision score: {0:.2f}", precision),
    ("Recall score: {0:.2f}", recall),
]
for template, value in report_lines:
    print(template.format(value))

## Random Forest Classifier

#### Grid Search
Look at current parameters.

In [None]:
# Baseline random forest with library defaults and a fixed seed; it is the
# estimator handed to the hyperparameter search further down.
TSP_RF = RandomForestClassifier(random_state=0)
# Display the full parameter dictionary to see which knobs can be tuned.
TSP_RF.get_params()

Start with random search.

In [None]:
# Candidate values for the randomized hyperparameter search.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers and is rejected by
# scikit-learn >= 1.3, so two distinct, supported options are searched instead.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited depth)
max_depth = np.linspace(10, 110, num=11).astype(int).tolist()
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
# Map each RandomForestClassifier parameter name to its list of candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
random_grid

In [None]:
# Try 100 random parameter combinations, each scored by F1 with 2-fold
# cross-validation, using all available CPU cores.
rf_random = RandomizedSearchCV(
    estimator=TSP_RF,
    param_distributions=random_grid,
    n_iter=100,
    scoring='f1',
    cv=2,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
# Run the search on the training data.
rf_random.fit(X_train, y_train)

Best parameters found by the randomized search.

In [None]:
# Parameter combination with the best cross-validated F1 score in the randomized search.
rf_random.best_params_

Train and Save Model.

In [None]:
# Retrain on the full training set with the best parameters from the
# randomized search. random_state=0 matches the estimator that was searched,
# so the saved model is reproducible between runs.
TSP_RF = RandomForestClassifier(random_state=0, **rf_random.best_params_)
TSP_RF.fit(X_train, y_train)
joblib.dump(TSP_RF, 'Models/TSP_RF_model.pkl') # Save Model

In [None]:
# Reload the saved model from disk and display it.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load model files produced by a trusted source.
TSP_RF = joblib.load('Models/TSP_RF_model.pkl')
TSP_RF

Local grid search around the parameters found by randomized search.

In [None]:
# Create the parameter grid based on the results of random search 
param_grid = {'n_estimators': [rf_random.best_params_['n_estimators']-100,rf_random.best_params_['n_estimators'],rf_random.best_params_['n_estimators']+100],
              'min_samples_split': [rf_random.best_params_['min_samples_split']-1, rf_random.best_params_['min_samples_split'], rf_random.best_params_['min_samples_split']+1],
              'min_samples_leaf': [rf_random.best_params_['min_samples_leaf']-1, rf_random.best_params_['min_samples_leaf'], rf_random.best_params_['min_samples_leaf']+1],
              'max_features': [rf_random.best_params_['max_features']],
              'max_depth': [rf_random.best_params_['max_depth']-1, rf_random.best_params_['max_depth'], rf_random.best_params_['max_depth']+1],
              'bootstrap': [rf_random.best_params_['bootstrap']]
             }


# Create a base model
TSP_RF = RandomForestClassifier(random_state=0)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=TSP_RF, param_grid=param_grid, 
                          cv=2, n_jobs=-1, verbose=2, return_train_score=True, scoring='f1')

param_grid

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fit and save final Random Forest model.

In [None]:
# Train the final forest on the full training set with the best parameters
# from the grid search. random_state=0 matches the estimator that was
# searched, so the saved model is reproducible between runs.
TSP_RF = RandomForestClassifier(random_state=0, **grid_search.best_params_)
TSP_RF.fit(X_train, y_train)
joblib.dump(TSP_RF, 'Models/TSP_RF_model.pkl') # Save Model

#### Feature Importance

In [None]:
# Table of feature importances, highest first, rounded to 7 decimals.
# df_train holds exactly the columns X_train was built from (the target was
# popped off earlier), so its column labels line up with the importances
# without assuming where the target column sits in the raw files.
FI_df = (
    pd.DataFrame(TSP_RF.feature_importances_,
                 index=df_train.columns,
                 columns=['Feature Importance Score'])
    .sort_values('Feature Importance Score', ascending=False)
    .round(7)
)
FI_df

In [None]:
# Predict the test labels with the final random forest and score them.
y_dash_RF = TSP_RF.predict(X_test)
acc = accuracy_score(y_test, y_dash_RF)
confusion = confusion_matrix(y_test, y_dash_RF)
f1 = f1_score(y_test, y_dash_RF)
precision = precision_score(y_test, y_dash_RF)
recall = recall_score(y_test, y_dash_RF)

# Print each metric with its template; scalar scores are shown to two decimals.
report_lines = [
    ("Accuracy: {0:.2f}", acc),
    ("Confusion matrix:\n{}", confusion),
    ("F1 score: {0:.2f}", f1),
    ("Precision score: {0:.2f}", precision),
    ("Recall score: {0:.2f}", recall),
]
for template, value in report_lines:
    print(template.format(value))