In [1]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import PowerTransformer

from imblearn.over_sampling import SMOTE

## models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.svm import LinearSVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, make_scorer, roc_auc_score

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_rows = 50
pd.set_option('display.float_format', lambda x: '%9.8f' % x)

In [2]:
## load data, split Xy, upsample with SMOTE, split train/test
data = pd.read_csv('data/Restaurant20To50_data_encoded.csv')

## define X and y 
X = data.drop('Y', axis=1).reset_index(drop=True)
y = data.Y

## Handle imbalance
smote = SMOTE()

orig_vc = y.value_counts()
print(f"Orig: {100* orig_vc[1]/(orig_vc.sum()):.2f}% Yes")

X_sm, y_sm = smote.fit_resample(X, y)
sm_vc = y_sm.value_counts()
print(f"After SMOTE: {100* sm_vc[1]/(sm_vc.sum()):.2f}% Yes")

## Data splitting train/test
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.20, random_state=42)

Orig: 44.10% Yes
After SMOTE: 50.00% Yes


In [3]:
X_train.shape

(1334, 55)

## Testing RandomForestClassifier with Grid Search Cross-validation


In [4]:
roc_auc_scorer = make_scorer(roc_auc_score, average='weighted')

In [5]:
cols_to_view = ['rank_test_score', 'param_max_depth', 'param_max_features', 'param_min_samples_split', 
                'param_min_samples_leaf', 'param_n_estimators', 
                'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']

In [6]:
param_grid={'max_depth': [None, 5, 10, 15],
            'max_features': [5, 6, 7, 8, 9, 10],
            'min_samples_split': [2, 3, 4, 5],
            'min_samples_leaf': [1, 3, 5],
            'n_estimators': [100, 150, 200]}

clf = RandomForestClassifier(oob_score=True, random_state=42)

grid_search1 = GridSearchCV(clf, param_grid, scoring=roc_auc_scorer, cv=5, return_train_score=True, n_jobs=4)
grid_search1.fit(X_train.values, y_train.values)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(oob_score=True, random_state=42),
             n_jobs=4,
             param_grid={'max_depth': [None, 5, 10, 15],
                         'max_features': [5, 6, 7, 8, 9, 10],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [2, 3, 4, 5],
                         'n_estimators': [100, 150, 200]},
             return_train_score=True,
             scoring=make_scorer(roc_auc_score, average=weighted))

In [7]:
cv_results1 = pd.DataFrame(grid_search1.cv_results_)
cv_results1.loc[:, cols_to_view].sort_values('rank_test_score').head(10)

Unnamed: 0,rank_test_score,param_max_depth,param_max_features,param_min_samples_split,param_min_samples_leaf,param_n_estimators,mean_test_score,std_test_score,mean_train_score,std_train_score
39,1,,6,3,1,100,0.71055988,0.0051231,0.99698778,0.00109772
190,2,,10,5,1,150,0.70831554,0.01546154,0.99266006,0.00233655
184,3,,10,3,1,150,0.70757455,0.01585656,0.99735952,0.00109729
835,4,15.0,10,4,1,150,0.70687385,0.00754695,0.99267269,0.00069672
800,5,15.0,9,4,1,200,0.70679486,0.01116598,0.99211824,0.00128023
10,6,,5,5,1,150,0.70671605,0.01769672,0.98552496,0.00318266
4,7,,5,3,1,150,0.70608787,0.01830177,0.99773724,0.00095988
838,8,15.0,10,5,1,150,0.70608771,0.00543729,0.98760781,0.00247269
5,9,,5,3,1,200,0.70605998,0.00625786,0.99774285,0.00074858
837,10,15.0,10,5,1,100,0.70603151,0.00625385,0.98480604,0.00276145


In [8]:
grid_search1.best_estimator_

RandomForestClassifier(max_features=6, min_samples_split=3, oob_score=True,
                       random_state=42)

In [9]:
best_model = grid_search1.best_estimator_
best_model.oob_score_

0.684407796101949

In [10]:
cross_val_scores = cross_val_score(best_model, X_train, y_train, scoring=roc_auc_scorer, cv=5)
print(f"(weighted avg) AUC: {np.mean(cross_val_scores)} {np.std(cross_val_scores)}")

(weighted avg) AUC: 0.7105598762472288 0.005123099516093643


In [12]:
## fit best model on full train and test on test data
best_model = best_model.fit(X_train, y_train)

## use trained model to get predicted target var
# y_train_pred = best_model.predict(X_train)   
y_test_pred = best_model.predict(X_test) 

# print(f"(weighted avg) AUC: train {roc_auc_score(y_train, y_train_pred, average='weighted')}")
print(f"(weighted avg) AUC: TEST {roc_auc_score(y_test, y_test_pred, average='weighted')}")

(weighted avg) AUC: TEST 0.7226528854435831


# KNN

In [13]:
param_grid={'n_neighbors': [x for x in range(1,12)]}

clf = KNeighborsClassifier()

grid_search_knn = GridSearchCV(clf, param_grid, scoring=roc_auc_scorer, cv=5, return_train_score=True, n_jobs=4)
grid_search_knn.fit(X_train.values, y_train.values)

knn_cols_to_view = ['rank_test_score', 'param_n_neighbors', 
                    'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']

cv_results_knn = pd.DataFrame(grid_search_knn.cv_results_)
cv_results_knn.loc[:, knn_cols_to_view].sort_values('rank_test_score').head(5)

best_model_knn = grid_search_knn.best_estimator_

## fit best model on full train and test on test data
best_model_knn = best_model_knn.fit(X_train, y_train)

## use trained model to get predicted target var
y_test_pred = best_model_knn.predict(X_test) 

print(f"(weighted avg) AUC: TEST {roc_auc_score(y_test, y_test_pred, average='weighted')}")

GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=4,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]},
             return_train_score=True,
             scoring=make_scorer(roc_auc_score, average=weighted))

Unnamed: 0,rank_test_score,param_n_neighbors,mean_test_score,std_test_score,mean_train_score,std_train_score
0,1,1,0.66397018,0.02579237,0.99830925,0.00092033
2,2,3,0.64672166,0.03322447,0.82963105,0.00610641
3,3,4,0.64331951,0.03406148,0.79517494,0.00520299
5,4,6,0.64147418,0.04286132,0.76393665,0.00755106
1,5,2,0.64070271,0.03118021,0.8634829,0.00416732


(weighted avg) AUC: TEST 0.6694659776055125


# Attempting to look at the resulting decision tree nodes

In [11]:
featimp_df = pd.DataFrame({'feature_name': X.columns, 'importance': best_model.feature_importances_})
featimp_df.sort_values('importance', ascending=False).head(10)

Unnamed: 0,feature_name,importance
10,income,0.07906705
17,Restaurant20To50,0.069917
8,age,0.06844166
14,CoffeeHouse,0.06045186
9,education,0.05420729
7,time,0.05411841
16,RestaurantLessThan20,0.04998435
15,CarryAway,0.04935573
13,Bar,0.0490134
25,expiration_2h,0.04244311
