In [1]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import PowerTransformer

from imblearn.over_sampling import SMOTE

## models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.svm import LinearSVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, make_scorer, roc_auc_score

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_rows = 50
pd.set_option('display.float_format', lambda x: '%9.8f' % x)

In [2]:
## load data, split Xy, upsample with SMOTE, split train/test
data = pd.read_csv('data/CarryAway_data_encoded.csv')

## define X and y 
X = data.drop('Y', axis=1).reset_index(drop=True)
y = data.Y

## Handle imbalance
smote = SMOTE(random_state=42)

orig_vc = y.value_counts()
print(f"Orig: {100* orig_vc[1]/(orig_vc.sum()):.2f}% Yes")

X_sm, y_sm = smote.fit_resample(X, y)
sm_vc = y_sm.value_counts()
print(f"After SMOTE: {100* sm_vc[1]/(sm_vc.sum()):.2f}% Yes")

## Data splitting train/test
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.20, random_state=42)

Orig: 73.55% Yes
After SMOTE: 50.00% Yes


In [3]:
X_train.shape

(2816, 55)

## Testing RandomForestClassifier with Grid Search Cross-validation


## RandomForestClassifier Parameters

The main parameters to adjust when using these methods:
* n_estimators
* max_features (sqrt is best default for classification) 

Try starting with:
* max_depth=None
* min_samples_split=2 
* oob_score=True (with bootstrapping)

Also investigate parameters: 
1. max_leaf_nodes
1. min_samples_leaf

In [4]:
cols_to_view = ['rank_test_score', 'param_max_depth', 'param_max_features', 'param_min_samples_split', 
                'param_min_samples_leaf', 'param_n_estimators', 'param_max_leaf_nodes',
                'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']

In [7]:
param_grid={'max_depth': [None, 15],
            'max_features': [5, 6, 7, 8, 9, 10],
            'max_leaf_nodes': [None, 50, 100, 150],
            'min_samples_split': [2, 3, 4, 5],
            'min_samples_leaf': [1, 3],
            'n_estimators': [100, 150, 200]}

clf = RandomForestClassifier(oob_score=True, random_state=42)

grid_search1 = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=5, return_train_score=True, n_jobs=4)
grid_search1.fit(X_train.values, y_train.values)

cv_results1 = pd.DataFrame(grid_search1.cv_results_)
best_model = grid_search1.best_estimator_
best_model

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(oob_score=True, random_state=42),
             n_jobs=4,
             param_grid={'max_depth': [None, 15],
                         'max_features': [5, 6, 7, 8, 9, 10],
                         'max_leaf_nodes': [None, 50, 100, 150],
                         'min_samples_leaf': [1, 3],
                         'min_samples_split': [2, 3, 4, 5],
                         'n_estimators': [100, 150, 200]},
             return_train_score=True, scoring='roc_auc')

RandomForestClassifier(max_features=5, n_estimators=150, oob_score=True,
                       random_state=42)

In [8]:
print(f"Best model oob score: {best_model.oob_score_}")
y_test_score = best_model.predict_proba(X_test)[:, 1]
print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score):.6f}\n")

Best model oob score: 0.86328125
ROC AUC: TEST 0.930397



In [11]:
cv_results1.loc[:, cols_to_view].sort_values('rank_test_score').head(10)

Unnamed: 0,rank_test_score,param_max_depth,param_max_features,param_min_samples_split,param_min_samples_leaf,param_n_estimators,param_max_leaf_nodes,mean_test_score,std_test_score,mean_train_score,std_train_score
1,1,,5,2,1,150,,0.9275425,0.0099624,0.99998952,4.62e-06
2,2,,5,2,1,200,,0.92732463,0.00986402,0.99998952,4.62e-06
5,3,,5,3,1,200,,0.92696915,0.00955805,0.99996965,1.339e-05
8,4,,5,4,1,200,,0.92674606,0.00817474,0.99986056,4.082e-05
4,5,,5,3,1,150,,0.92658816,0.00978333,0.99997233,1.206e-05
104,6,,6,4,1,200,,0.92646999,0.01000667,0.99988499,5.138e-05
7,7,,5,4,1,150,,0.92631026,0.00754393,0.99985347,4.212e-05
103,8,,6,4,1,150,,0.92619479,0.01020585,0.99987948,5.294e-05
197,9,,7,3,1,200,,0.92612284,0.00798019,0.99997091,1.274e-05
581,10,15.0,5,3,1,200,,0.92581099,0.00952455,0.99983614,5.803e-05


In [10]:
featimp_df = pd.DataFrame({'feature_name': X.columns, 'importance': best_model.feature_importances_})
featimp_df.sort_values('importance', ascending=False).head(10)

Unnamed: 0,feature_name,importance
10,income,0.07010677
14,CoffeeHouse,0.06478373
9,education,0.05812116
8,age,0.05801979
7,time,0.05597261
16,RestaurantLessThan20,0.05587273
15,CarryAway,0.05552131
13,Bar,0.05463156
11,temperature,0.04995737
17,Restaurant20To50,0.04956038


# KNN

In [12]:
param_grid={'n_neighbors': [x for x in range(1,26)]}

clf = KNeighborsClassifier()

grid_search_knn = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=5, return_train_score=True, n_jobs=4)
grid_search_knn.fit(X_train.values, y_train.values)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=4,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25]},
             return_train_score=True, scoring='roc_auc')

In [13]:
knn_cols_to_view = ['rank_test_score', 'param_n_neighbors', 
                    'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']

cv_results_knn = pd.DataFrame(grid_search_knn.cv_results_)
cv_results_knn.loc[:, knn_cols_to_view].sort_values('rank_test_score').head(3)

Unnamed: 0,rank_test_score,param_n_neighbors,mean_test_score,std_test_score,mean_train_score,std_train_score
2,1,3,0.85676188,0.01455993,0.9757366,0.00077729
1,2,2,0.85648387,0.01258463,0.98878988,0.00066234
3,3,4,0.8521576,0.01676664,0.9631625,0.00172038


In [16]:
best_model_knn = grid_search_knn.best_estimator_
best_model
y_test_score_knn = best_model_knn.predict_proba(X_test)[:, 1]
print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score_knn):.6f}\n")

RandomForestClassifier(max_features=5, n_estimators=150, oob_score=True,
                       random_state=42)

ROC AUC: TEST 0.869875



# Logistic regression

In [18]:
## standardization
std_scaler=StandardScaler().fit(X_train) ## first fit the model on the training data

## now std scaler transform
X_train_scaled = std_scaler.transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

## apply regression
liblinear_model = LogisticRegression(solver='liblinear', random_state=42)
liblinear_model.fit(X_train_scaled, y_train)   # train model by fitting to training data

# use trained model to get predicted target var
y_test_score_ll = liblinear_model.predict_proba(X_test_scaled)[:, 1]

print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score_ll):.6f}\n")

## apply regression again, using a different solver
lbfgs_model = LogisticRegression(solver='lbfgs', random_state=42)
lbfgs_model.fit(X_train_scaled, y_train)   # train model by fitting to training data

# use trained model to get predicted target var
y_test_score_lb = lbfgs_model.predict_proba(X_test_scaled)[:, 1]

print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score_lb):.6f}\n")

LogisticRegression(random_state=42, solver='liblinear')

ROC AUC: TEST 0.816650



LogisticRegression(random_state=42)

ROC AUC: TEST 0.816650



# Attempting to look at the resulting decision tree nodes

In [None]:
len(best_model.estimators_)
best_model.estimators_[0]

In [None]:
# fig, ax = plt.subplots(1,1,figsize = (20,20))
plt.figure()

plot_tree(best_model.estimators_[0], filled=True, rounded=True, max_depth=5, feature_names=X_train.columns)
# plt.show()

plt.savefig('plots/test-tree.eps',format='eps',bbox_inches = "tight")

In [None]:
#Extracting the decision path for instance i = 12
i_data = X_train.iloc[12].values.reshape(1,-1)
d_path = best_model.decision_path(i_data)

print(d_path)