# ML Models

This notebook will create Machine Learning models and test their performance.

In [1]:
import pandas as pd
import numpy as np
import pickle
import os

os.environ['PROJ_LIB'] = os.environ['CONDA_PREFIX'] + '\\Library\\share' # bug fix with anaconda and basemap
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import joblib

import model_utils as M

%matplotlib inline

Import the data and split into Training and Testing sets.

In [2]:
DATA_DIR = '../6_feature_engineering/Feature_Dataset/'

# Every CSV in the feature directory is one graph; sorting the names keeps the
# graph order (and therefore the later train/test split) stable between runs.
data_files = sorted(name for name in os.listdir(DATA_DIR) if name.endswith('.csv'))

# Read each graph, convert its boolean columns to int and give it a clean 0..n-1 index.
df_TSP_list = [
    (
        pd.read_csv(DATA_DIR + name)
        .astype({'IS_IN_1ST_QUARTILE': 'int64',
                 'IS_IN_2ND_QUARTILE': 'int64',
                 'IS_IN_3RD_QUARTILE': 'int64',
                 'EDGE_IN_SOL': 'int64'})
        .reset_index(drop=True)
    )
    for name in data_files
]

# Preview the first graph.
df_TSP_list[0].head()

Unnamed: 0,DISTANCE_KM,Local Rank Incident to Node1,Local Rank Incident to Node2,GLOBAL_RANK,IS_IN_1ST_QUARTILE,IS_IN_2ND_QUARTILE,IS_IN_3RD_QUARTILE,EDGE_IN_SOL
0,108,183,246,83596,1,1,1,0
1,384,926,780,396911,0,0,0,0
2,11,3,2,2752,1,1,1,0
3,168,346,461,156459,0,1,1,0
4,402,954,771,406968,0,0,0,0


`1` is True and `0` is False.

In [3]:
# The first five graphs are used for training, every remaining graph for testing.
train_set, test_set = df_TSP_list[:5], df_TSP_list[5:]

# Report how many graphs ended up on each side of the split.
print("Train Data Graphs:\t{}".format(len(train_set)))
print("Test Data Graphs:\t{}".format(len(test_set)))

Train Data Graphs:	5
Test Data Graphs:	5


In [4]:
# Stack the per-graph frames into one training frame and one testing frame.
df_train = pd.concat(train_set).reset_index(drop=True)
df_test = pd.concat(test_set).reset_index(drop=True)

# pop() removes the target column from each frame, so what is left behind in
# df_train / df_test are the feature columns only.
y_train, y_test = (frame.pop('EDGE_IN_SOL').values for frame in (df_train, df_test))
X_train, X_test = df_train.values, df_test.values

# Sanity check: features and labels must have the same number of rows.
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (2497500, 7)
X_test shape:  (2497500, 7)
y_train shape:  (2497500,)
y_test shape:  (2497500,)


The data is split by graph: the first 5 graphs (50% of the edges) are used for training and the remaining 5 graphs (50%) for testing.

## Naïve Bayes

In [5]:
# Fit a Gaussian Naive Bayes classifier on the training edges.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
TSP_NB = gnb  # fit() returns the estimator itself, so both names refer to the fitted model

# Predict the test edges, then persist the fitted model.
y_dash_NB = TSP_NB.predict(X_test)
joblib.dump(TSP_NB, 'Models/TSP_NB_model.pkl') # Save Model

['Models/TSP_NB_model.pkl']

### Accuracy & Confusion Matrix
In the confusion matrix, rows are the actual classes and columns are the predicted classes.  
With 0 as the negative class and 1 as the positive class, `C(0,0)` is TN and `C(1,1)` is TP:  
TN, FP  
FN, TP

In [6]:
# Score the Naive Bayes predictions against the held-out labels.
# NOTE: in the recorded run only 5,000 of the ~2.5M test edges are positive, so the
# accuracy is dominated by the negative class; read it together with the matrix.
confusion = confusion_matrix(y_test, y_dash_NB)
acc = accuracy_score(y_test, y_dash_NB)

print("Accuracy: {0:.2f}".format(acc))
print("Confusion matrix:\n{}".format(confusion))

Accuracy: 0.98
Confusion matrix:
[[2454465   38035]
 [     72    4928]]


## Random Forest Classifier

#### Grid Search
Look at current parameters.

In [None]:
# Untuned forest with a fixed seed; it is the base estimator passed to the randomized search below.
TSP_RF = RandomForestClassifier(random_state=0)
# Show every hyper-parameter of the estimator together with its current value.
TSP_RF.get_params()

Start with random search.

In [None]:
# Candidate values for the randomized hyper-parameter search.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split.
# 'auto' is not used: for RandomForestClassifier it was only an alias of 'sqrt'
# (so ['auto', 'sqrt'] listed the same setting twice) and it was removed in
# scikit-learn 1.3. None means "consider all features".
max_features = ['sqrt', None]
# Maximum number of levels in tree: 10, 20, ..., 110, plus None for no depth limit
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
# Search space for RandomizedSearchCV: one entry per hyper-parameter, each mapping
# to the list of candidate values defined above.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
random_grid

In [None]:
# Randomized search: 100 sampled hyper-parameter combinations, each scored with
# 2-fold cross-validation on F1, using all available cores.
rf_random = RandomizedSearchCV(
    estimator=TSP_RF,
    param_distributions=random_grid,
    n_iter=100,
    cv=2,
    scoring='f1',
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
# Run the search on the training data.
rf_random.fit(X_train, y_train)

Best parameters found by the randomized search.

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score (scoring='f1') in the randomized search.
rf_random.best_params_

Train and Save Model.

In [None]:
# Re-train on the full training set with the best hyper-parameters from the randomized search.
# best_params_ only holds the searched hyper-parameters, so the seed has to be passed
# explicitly: random_state=0 matches the base estimator used during the search and
# makes the saved model reproducible between runs.
TSP_RF = RandomForestClassifier(random_state=0, **rf_random.best_params_)
TSP_RF.fit(X_train, y_train)
joblib.dump(TSP_RF, 'Models/TSP_RF_model.pkl') # Save Model

In [7]:
# Reload the previously saved forest from disk instead of re-training it.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
TSP_RF = joblib.load('Models/TSP_RF_model.pkl')
# Display the loaded estimator and its hyper-parameters.
TSP_RF

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=11,
                       min_weight_fraction_leaf=0.0, n_estimators=1100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Local grid search around the parameters found by randomized search.

In [None]:
# Create the parameter grid based on the results of random search
best_random = rf_random.best_params_


def _neighbours(value, step, minimum):
    """Return the local-search candidates [value - step, value, value + step].

    Candidates below `minimum` are dropped because scikit-learn rejects them
    (for example min_samples_leaf=0 or min_samples_split=1). A value of None
    (e.g. an unlimited max_depth) has no numeric neighbours, so it is returned
    as the only candidate instead of raising a TypeError on `None - step`.
    """
    if value is None:
        return [None]
    return [candidate
            for candidate in (value - step, value, value + step)
            if candidate >= minimum]


param_grid = {'n_estimators': _neighbours(best_random['n_estimators'], 100, 1),
              'min_samples_split': _neighbours(best_random['min_samples_split'], 1, 2),
              'min_samples_leaf': _neighbours(best_random['min_samples_leaf'], 1, 1),
              'max_features': [best_random['max_features']],
              'max_depth': _neighbours(best_random['max_depth'], 1, 1),
              'bootstrap': [best_random['bootstrap']]
             }


# Create a base model
# NOTE: this rebinds TSP_RF to an unfitted estimator; it is fitted again by the
# "Fit and save final Random Forest model" cell.
TSP_RF = RandomForestClassifier(random_state=0)

# Instantiate the grid search model (2-fold CV, scored on F1, all cores)
grid_search = GridSearchCV(estimator=TSP_RF, param_grid=param_grid,
                          cv=2, n_jobs=-1, verbose=2, return_train_score=True, scoring='f1')

param_grid

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fit and save final Random Forest model.

In [None]:
# Train the final forest on the full training set with the best grid-search parameters.
# best_params_ only holds the searched hyper-parameters, so the seed has to be passed
# explicitly: random_state=0 matches the base estimator used during the search and
# makes the saved model reproducible between runs.
TSP_RF = RandomForestClassifier(random_state=0, **grid_search.best_params_)
TSP_RF.fit(X_train, y_train)
joblib.dump(TSP_RF, 'Models/TSP_RF_model.pkl') # Save Model

#### Feature Importance

In [11]:
# Table of feature importances of the fitted forest, most important feature first.
# The labels come from df_train, which holds exactly the columns X_train was built
# from (EDGE_IN_SOL was popped off by name), so they do not rely on the target
# being the last column of the raw frames.
FI_df = (
    pd.DataFrame({'Feature Importance Score': TSP_RF.feature_importances_},
                 index=df_train.columns)
    .sort_values('Feature Importance Score', ascending=False)
    .round(7)  # keep 7 decimals for display
)
FI_df

Unnamed: 0,Feature Importance Score
Local Rank Incident to Node2,0.41163
Local Rank Incident to Node1,0.393103
GLOBAL_RANK,0.124089
DISTANCE_KM,0.069772
IS_IN_1ST_QUARTILE,0.001231
IS_IN_2ND_QUARTILE,0.000175
IS_IN_3RD_QUARTILE,0.0


In [12]:
# Predict the test edges with the random forest and score them against the true labels.
y_dash_RF = TSP_RF.predict(X_test)

confusion = confusion_matrix(y_test, y_dash_RF)
acc = accuracy_score(y_test, y_dash_RF)

print("Accuracy: {0:.2f}".format(acc))
print("Confusion matrix:\n{}".format(confusion))

Accuracy: 1.00
Confusion matrix:
[[2491744     756]
 [   2068    2932]]


## Logistic Regression Classifier

In [13]:
# Logistic regression with the L-BFGS solver and a fixed seed, trained on the
# same features as the other models.
TSP_LR = LogisticRegression(solver='lbfgs', random_state=0)
TSP_LR.fit(X_train, y_train)

# Persist the fitted model next to the other saved models.
joblib.dump(TSP_LR, 'Models/TSP_LR_model.pkl') # Save Model

['Models/TSP_LR_model.pkl']

In [14]:
# Predict the test edges with logistic regression and score them against the true labels.
y_dash_LR = TSP_LR.predict(X_test)

confusion = confusion_matrix(y_test, y_dash_LR)
acc = accuracy_score(y_test, y_dash_LR)

print("Accuracy: {0:.2f}".format(acc))
print("Confusion matrix:\n{}".format(confusion))

Accuracy: 1.00
Confusion matrix:
[[2491582     918]
 [   2084    2916]]


- Implement the thresholding
- Experiment with hyperparameters
- Trade off between pruning and number of ground truth edges that you keep (whilst changing threshold) **IMPORTANT**
- Find optimal threshold
- Integrate with exact algorithm (complete the tour)
- WRITE THE THESIS