In [1]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import PowerTransformer

from imblearn.over_sampling import SMOTE

## models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.svm import LinearSVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, make_scorer, roc_auc_score

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_rows = 50
pd.set_option('display.float_format', lambda x: '%9.8f' % x)

In [6]:
## load data, split Xy, upsample with SMOTE, split train/test
data = pd.read_csv('data/Bar_data_encoded.csv')

## define X and y 
X = data.drop('Y', axis=1).reset_index(drop=True)
y = data.Y

## Handle imbalance
smote = SMOTE(random_state=42)

orig_vc = y.value_counts()
print(f"Orig: {100* orig_vc[1]/(orig_vc.sum()):.2f}% Yes")

X_sm, y_sm = smote.fit_resample(X, y)

## Data splitting train/test
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.20, random_state=42)

print("After SMOTE upsampling")
train_vc = y_train.value_counts()
print(f"\tNum training samples: {X_train.shape[0]:,} ({100* train_vc[1]/(train_vc.sum()):.2f}% yes)")
test_vc = y_test.value_counts()
print(f"\tNum testing samples:  {X_test.shape[0]:,} ({100* test_vc[1]/(test_vc.sum()):.2f}% yes)")

Orig: 41.00% Yes
After SMOTE upsampling
	Num training samples: 1,904 (50.26% yes)
	Num testing samples:  476 (48.95% yes)


## Testing RandomForestClassifier with Grid Search Cross-validation

Based on initial testing, the following params will be left out of the search (their default values was always the best setting):
* min_samples_leaf (1)
* max_depth (None)
* max_leaf_nodes (None)

In [7]:
cols_to_view = ['rank_test_score', 'param_max_features', 'param_min_samples_split', 'param_n_estimators', 
                'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']

In [11]:
param_grid={'max_features': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
            'min_samples_split': [2, 3, 4, 5],
            'n_estimators': [100, 150, 200]}

clf = RandomForestClassifier(oob_score=True, random_state=42)

grid_search1 = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=5, return_train_score=True, n_jobs=4)
grid_search1.fit(X_train.values, y_train.values)

cv_results1 = pd.DataFrame(grid_search1.cv_results_)
best_model = grid_search1.best_estimator_
best_model

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(oob_score=True, random_state=42),
             n_jobs=4,
             param_grid={'max_features': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                          15],
                         'min_samples_split': [2, 3, 4, 5],
                         'n_estimators': [100, 150, 200]},
             return_train_score=True, scoring='roc_auc')

RandomForestClassifier(max_features=11, n_estimators=200, oob_score=True,
                       random_state=42)

In [12]:
print(f"Best model oob score: {best_model.oob_score_}")
y_test_score = best_model.predict_proba(X_test)[:, 1]
print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score):.6f}\n")

Best model oob score: 0.8182773109243697
ROC AUC: TEST 0.888474



In [13]:
cv_results1.loc[:, cols_to_view].sort_values('rank_test_score').head(10)

Unnamed: 0,rank_test_score,param_max_features,param_min_samples_split,param_n_estimators,mean_test_score,std_test_score,mean_train_score,std_train_score
74,1,11,2,200,0.88961793,0.0054089,1.0,0.0
100,2,13,3,150,0.88924768,0.00179532,0.99998845,8.47e-06
101,3,13,3,200,0.88922709,0.00236674,0.99999414,4.16e-06
73,4,11,2,150,0.88902086,0.00439781,1.0,0.0
72,5,11,2,100,0.88850504,0.00417416,1.0,0.0
86,6,12,2,200,0.88848466,0.00602779,1.0,0.0
62,7,10,2,200,0.88841336,0.00531802,1.0,0.0
122,8,15,2,200,0.88829411,0.00512427,1.0,0.0
125,9,15,3,200,0.88828225,0.00374955,0.99999931,1.38e-06
113,10,14,3,200,0.88822064,0.00498912,0.99999966,6.9e-07


In [14]:
featimp_df = pd.DataFrame({'feature_name': X.columns, 'importance': best_model.feature_importances_})
featimp_df.sort_values('importance', ascending=False).head(10)

Unnamed: 0,feature_name,importance
13,Bar,0.17023029
8,age,0.0639216
10,income,0.05954362
9,education,0.04773957
14,CoffeeHouse,0.04738741
15,CarryAway,0.04294969
17,Restaurant20To50,0.0404628
7,time,0.03988751
16,RestaurantLessThan20,0.03983949
11,temperature,0.03713355


# Attempting to look at the resulting decision tree nodes