In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import couponProjectUtils as proj_utils

from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

## models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_rows = 50
pd.set_option('display.float_format', lambda x: '%9.8f' % x)

# Use best encoding for coupon type: Carry out & Take away

In [2]:
## load data, split Xy, upsample with SMOTE, split train/test
data = pd.read_csv('data/CarryAway_data_encoded-temperature_nominal.csv')

## define X and y 
X = data.drop('Y', axis=1).reset_index(drop=True)
y = data.Y

## Handle imbalance
smote = SMOTE(random_state=42)

orig_vc = y.value_counts()
print(f"Orig: {100* orig_vc[1]/(orig_vc.sum()):.2f}% Yes")

X_sm, y_sm = smote.fit_resample(X, y)

## Data splitting train/test
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.20, random_state=42)

print("After SMOTE upsampling")
train_vc = y_train.value_counts()
print(f"\tNum training samples: {X_train.shape[0]:,} ({100* train_vc[1]/(train_vc.sum()):.2f}% yes)")
test_vc = y_test.value_counts()
print(f"\tNum testing samples:  {X_test.shape[0]:,} ({100* test_vc[1]/(test_vc.sum()):.2f}% yes)")

Orig: 73.38% Yes
After SMOTE upsampling
	Num training samples: 2,752 (49.38% yes)
	Num testing samples:  688 (52.47% yes)


In [3]:
## Replace the unknown values (previously encoded as 0.0) using the most frequent value seen in the TRAINING data
X_train, X_test, train_modes = proj_utils.replace_unknown_with_mode(X_train, X_test, 
                                                                    outfile_prefix='phase2-CarryAway')

## Testing RandomForestClassifier with Grid Search Cross-validation


## RandomForestClassifier Parameters

The main parameters to adjust when using these methods:
* n_estimators
* max_features (sqrt is generally best default for classification. I have 55 features, so try values around 8) 

Make sure to include params:
* max_depth=None
* min_samples_split=2 
* oob_score=True (with bootstrapping)

In [4]:
cols_to_view = ['rank_test_score', 'param_max_depth', 'param_max_features', 'param_min_samples_split', 
                'param_n_estimators', 'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']

In [5]:
param_grid={'max_depth': [None, 15],
            'max_features': [5, 6, 7, 8, 9, 10],
            'min_samples_split': [2, 3, 4, 5],
            'n_estimators': [100, 150, 200]}

clf = RandomForestClassifier(oob_score=True, random_state=42)

grid_search1 = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=5, return_train_score=True, n_jobs=4)
grid_search1.fit(X_train.values, y_train.values)

cv_results1 = pd.DataFrame(grid_search1.cv_results_)
best_model = grid_search1.best_estimator_
best_model

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(oob_score=True, random_state=42),
             n_jobs=4,
             param_grid={'max_depth': [None, 15],
                         'max_features': [5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 3, 4, 5],
                         'n_estimators': [100, 150, 200]},
             return_train_score=True, scoring='roc_auc')

RandomForestClassifier(max_features=5, min_samples_split=3, n_estimators=200,
                       oob_score=True, random_state=42)

In [6]:
print(f"Best model oob score: {best_model.oob_score_}")
y_test_score = best_model.predict_proba(X_test)[:, 1]
print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score):.6f}\n")

Best model oob score: 0.8680959302325582
ROC AUC: TEST 0.921896



In [7]:
cv_results1.loc[:, cols_to_view].sort_values('rank_test_score').head(10)

Unnamed: 0,rank_test_score,param_max_depth,param_max_features,param_min_samples_split,param_n_estimators,mean_test_score,std_test_score,mean_train_score,std_train_score
5,1,,5,3,200,0.91746437,0.01348041,0.99986342,8.681e-05
4,2,,5,3,150,0.91668096,0.01379106,0.99985549,9.445e-05
8,3,,5,4,200,0.91617189,0.01534445,0.99974555,8.862e-05
2,4,,5,2,200,0.91577793,0.00995565,0.99995882,2.717e-05
11,5,,5,5,200,0.91545111,0.01627077,0.99951494,5.743e-05
3,6,,5,3,100,0.91542063,0.01568872,0.99983833,9.815e-05
20,7,,6,4,200,0.91539747,0.01392357,0.99975611,7.058e-05
1,8,,5,2,150,0.91523979,0.00941765,0.99994776,3.696e-05
29,9,,7,3,200,0.91513751,0.01572976,0.99988157,7.181e-05
7,10,,5,4,150,0.91456016,0.01634508,0.99971881,9.394e-05


In [8]:
featimp_df = pd.DataFrame({'feature_name': X.columns, 'importance': best_model.feature_importances_})
featimp_df.sort_values('importance', ascending=False).head(10)

Unnamed: 0,feature_name,importance
10,income,0.07344673
7,time,0.06294384
8,age,0.06260279
13,CoffeeHouse,0.06221789
9,education,0.05941928
15,RestaurantLessThan20,0.05600129
11,minsToCouponDest,0.05510411
16,Restaurant20To50,0.05236327
14,CarryAway,0.05224499
12,Bar,0.05079347


In [9]:
import pickle

with open('model/phase2/CarryAway_GridSearchCV_RFC.pickle', "wb") as f:
    pickle.dump(grid_search1,f)
    
with open('model/phase2/CarryAway_best_model_RFC.pickle', "wb") as f:
    pickle.dump(best_model,f)

In [16]:
## save train/test info to file
Xy_tt = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}

for k in Xy_tt.keys():
    fname = f'data/CarryAway_{k}_phase2.csv'
    print(f'Saving file: {fname}')
    Xy_tt[k].to_csv(fname, index=False)

Saving file: data/CarryAway_X_train_phase2.csv
Saving file: data/CarryAway_X_test_phase2.csv
Saving file: data/CarryAway_y_train_phase2.csv
Saving file: data/CarryAway_y_test_phase2.csv


# KNN

In [10]:
param_grid={'n_neighbors': [x for x in range(1,26)]}

clf = KNeighborsClassifier()

grid_search_knn = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=5, return_train_score=True, n_jobs=4)
grid_search_knn.fit(X_train.values, y_train.values)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=4,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25]},
             return_train_score=True, scoring='roc_auc')

In [11]:
knn_cols_to_view = ['rank_test_score', 'param_n_neighbors', 
                    'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']

cv_results_knn = pd.DataFrame(grid_search_knn.cv_results_)
cv_results_knn.loc[:, knn_cols_to_view].sort_values('rank_test_score').head(3)

Unnamed: 0,rank_test_score,param_n_neighbors,mean_test_score,std_test_score,mean_train_score,std_train_score
2,1,3,0.84204642,0.00339922,0.97565922,0.00130962
3,2,4,0.83953618,0.00312299,0.96037401,0.00146195
1,3,2,0.83883446,0.00798575,0.98804292,0.00063673


In [12]:
best_model_knn = grid_search_knn.best_estimator_
best_model_knn
y_test_score_knn = best_model_knn.predict_proba(X_test)[:, 1]
print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score_knn):.6f}\n")

KNeighborsClassifier(n_neighbors=3)

ROC AUC: TEST 0.871644



In [13]:
with open('model/phase2/CarryAway_GridSearchCV_KNN.pickle', "wb") as f:
    pickle.dump(grid_search_knn,f)
    
with open('model/phase2/CarryAway_best_model_KNN.pickle', "wb") as f:
    pickle.dump(best_model_knn,f)

# Logistic regression

In [17]:
## standardization
std_scaler=StandardScaler().fit(X_train) ## first fit the model on the training data

## now std scaler transform
X_train_scaled = std_scaler.transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

## apply regression
liblinear_model = LogisticRegression(solver='liblinear', random_state=42)
liblinear_model.fit(X_train_scaled, y_train)   # train model by fitting to training data

## use cv to assess generalization
cross_val_scores = cross_val_score(liblinear_model, X_train, y_train, scoring='roc_auc', cv=5)
print(f"5-fold CV ROC AUC: mean: {np.mean(cross_val_scores):.6f} std dev:{np.std(cross_val_scores):.6f}")
    
# use trained model to get predicted target var
y_test_score_ll = liblinear_model.predict_proba(X_test_scaled)[:, 1]

print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score_ll)}\n")

## apply regression again, using a different solver
lbfgs_model = LogisticRegression(solver='lbfgs', random_state=42)
lbfgs_model.fit(X_train_scaled, y_train)   # train model by fitting to training data

## use cv to assess generalization
cross_val_scores = cross_val_score(lbfgs_model, X_train, y_train, scoring='roc_auc', cv=5)
print(f"5-fold CV ROC AUC: mean: {np.mean(cross_val_scores):.6f} std dev:{np.std(cross_val_scores):.6f}")

# use trained model to get predicted target var
y_test_score_lb = lbfgs_model.predict_proba(X_test_scaled)[:, 1]

print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score_lb)}\n")

LogisticRegression(random_state=42, solver='liblinear')

5-fold CV ROC AUC: mean: 0.835316 std dev:0.023199
ROC AUC: TEST 0.8208044253560023



LogisticRegression(random_state=42)

5-fold CV ROC AUC: mean: 0.835117 std dev:0.021572
ROC AUC: TEST 0.820855252568892



In [18]:
## both LR models have essentially the same score, just save one
with open('model/phase2/CarryAway_best_model_LogisticRegression.pickle', "wb") as f:
    pickle.dump(liblinear_model,f)