In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import couponProjectUtils as proj_utils

from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

## models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_rows = 50
pd.set_option('display.float_format', lambda x: '%9.8f' % x)

In [2]:
## load data, split Xy, upsample with SMOTE, split train/test
data = pd.read_csv('data/Restaurant20To50_data_encoded.csv')

## define X and y 
X = data.drop('Y', axis=1).reset_index(drop=True)
y = data.Y

## Handle imbalance
smote = SMOTE(random_state=42)

orig_vc = y.value_counts()
print(f"Orig: {100* orig_vc[1]/(orig_vc.sum()):.2f}% Yes")

X_sm, y_sm = smote.fit_resample(X, y)

## Data splitting train/test
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.20, random_state=42)

print("After SMOTE upsampling")
train_vc = y_train.value_counts()
print(f"\tNum training samples: {X_train.shape[0]:,} ({100* train_vc[1]/(train_vc.sum()):.2f}% yes)")
test_vc = y_test.value_counts()
print(f"\tNum testing samples:  {X_test.shape[0]:,} ({100* test_vc[1]/(test_vc.sum()):.2f}% yes)")

Orig: 44.15% Yes
After SMOTE upsampling
	Num training samples: 1,329 (51.17% yes)
	Num testing samples:  333 (45.35% yes)


In [3]:
## Replace the unknown values (previously encoded as 0.0) using the most frequent value seen in the TRAINING data
X_train, X_test, train_modes = proj_utils.replace_unknown_with_mode(X_train, X_test, outfile_prefix='Restaurant20To50')

## Testing RandomForestClassifier with Grid Search Cross-validation


In [4]:
cols_to_view = ['rank_test_score', 'param_max_features', 'param_min_samples_split', 'param_n_estimators', 
                'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']

param_grid={'max_features': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
            'min_samples_split': [2, 3, 4, 5],
            'n_estimators': [100, 150, 200]}

clf = RandomForestClassifier(oob_score=True, random_state=42)

grid_search1 = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=5, return_train_score=True, n_jobs=4)
grid_search1.fit(X_train.values, y_train.values)

cv_results1 = pd.DataFrame(grid_search1.cv_results_)
best_model = grid_search1.best_estimator_
best_model

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(oob_score=True, random_state=42),
             n_jobs=4,
             param_grid={'max_features': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                          13, 14, 15],
                         'min_samples_split': [2, 3, 4, 5],
                         'n_estimators': [100, 150, 200]},
             return_train_score=True, scoring='roc_auc')

RandomForestClassifier(max_features=4, min_samples_split=3, n_estimators=150,
                       oob_score=True, random_state=42)

In [5]:
print(f"Best model oob score: {best_model.oob_score_}")
y_test_score = best_model.predict_proba(X_test)[:, 1]
print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score):.6f}\n")

Best model oob score: 0.7065462753950339
ROC AUC: TEST 0.762645



In [6]:
cv_results1.loc[:, cols_to_view].sort_values('rank_test_score').head(10)

Unnamed: 0,rank_test_score,param_max_features,param_min_samples_split,param_n_estimators,mean_test_score,std_test_score,mean_train_score,std_train_score
28,1,4,3,150,0.77808582,0.01054487,0.99998266,7.53e-06
29,2,4,3,200,0.7771783,0.01445501,0.99998407,6.22e-06
2,3,2,2,200,0.77564111,0.01431242,0.99999327,5.18e-06
35,4,4,5,200,0.77510277,0.00908054,0.99952385,0.0001556
31,5,4,4,150,0.77497606,0.0169898,0.99986014,6.398e-05
32,6,4,4,200,0.77466905,0.01650555,0.99988139,5.929e-05
1,7,2,2,150,0.77457807,0.0129728,0.99999327,5.18e-06
41,8,5,3,200,0.77413321,0.01087393,0.99998973,4.4e-06
27,9,4,3,100,0.77390179,0.01303132,0.99996778,1.377e-05
0,10,2,2,100,0.77360702,0.01392495,0.99999327,5.18e-06


In [7]:
featimp_df = pd.DataFrame({'feature_name': X.columns, 'importance': best_model.feature_importances_})
featimp_df.sort_values('importance', ascending=False).head(10)

Unnamed: 0,feature_name,importance
10,income,0.0764265
8,age,0.0681742
14,CoffeeHouse,0.05993261
17,Restaurant20To50,0.05992031
9,education,0.05410223
15,CarryAway,0.05358184
16,RestaurantLessThan20,0.05140987
13,Bar,0.05055678
7,time,0.04579567
25,expiration_2h,0.04277222


In [8]:
import pickle

with open('model/phase1/Restaurant20To50_GridSearchCV_RFC.pickle', "wb") as f:
    pickle.dump(grid_search1,f)
    
with open('model/phase1/Restaurant20To50_best_model_RFC.pickle', "wb") as f:
    pickle.dump(best_model,f)

# KNN

In [9]:
param_grid={'n_neighbors': [x for x in range(1,12)]}

clf = KNeighborsClassifier()

grid_search_knn = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=5, return_train_score=True, n_jobs=4)
grid_search_knn.fit(X_train.values, y_train.values)

knn_cols_to_view = ['rank_test_score', 'param_n_neighbors', 
                    'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']

cv_results_knn = pd.DataFrame(grid_search_knn.cv_results_)
cv_results_knn.loc[:, knn_cols_to_view].sort_values('rank_test_score').head(5)

best_model_knn = grid_search_knn.best_estimator_
best_model
y_test_score_knn = best_model_knn.predict_proba(X_test)[:, 1]
print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score_knn):.6f}\n")

GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=4,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]},
             return_train_score=True, scoring='roc_auc')

Unnamed: 0,rank_test_score,param_n_neighbors,mean_test_score,std_test_score,mean_train_score,std_train_score
2,1,3,0.68268341,0.01168804,0.90799181,0.00431916
1,2,2,0.6766333,0.01456749,0.9464559,0.00137559
4,3,5,0.67623189,0.01599181,0.85623598,0.00532108
3,4,4,0.67422568,0.01745958,0.87838293,0.00357059
6,5,7,0.67115341,0.01111712,0.82685577,0.00705345


RandomForestClassifier(max_features=4, min_samples_split=3, n_estimators=150,
                       oob_score=True, random_state=42)

ROC AUC: TEST 0.692744



In [10]:
with open('model/phase1/Restaurant20To50_GridSearchCV_KNN.pickle', "wb") as f:
    pickle.dump(grid_search_knn,f)
    
with open('model/phase1/Restaurant20To50_best_model_KNN.pickle', "wb") as f:
    pickle.dump(best_model_knn,f)