In [60]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, make_scorer, roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Assessment Overview

Investigation into the effect of encoding 'time', 'age', 'education', 'income', and/or 'temperature' as nominal instead of ordinal.

I'm running this assessment using the 'best model' params from the initial testing (with the 5 features encoded as ordinal) to assess whether the nominal encoding improves anything.

For all coupon types the comparison includes:
* data_encoded.csv: all 5 as ordinal
* data_encoded-all5_nominal.csv: all 5 as nominal
* 5 x data_encoded-\<featureName>\_nominal.csv: encoding only one of the features as nominal


Plus an encoding of 2-4 features as nominal (feature subset selected based on the single feature as nominal test scores for that specific coupon type):
* data_encoded-\<letterCode>\_nominal.csv

using letterCodes:
* A: age
* E: education
* I: income
* M: time
* P: temperature

## Best Result
based on test roc auc score

```
Restaurant(20-50): data/Restaurant20To50_data_encoded-AEIP_nominal.csv
    5-fold CV ROC AUC: mean: 0.783592 std dev:0.018793
    ROC AUC: TEST 0.814635

Restaurant(<20): data/RestaurantLessThan20_data_encoded-all5_nominal.csv
    5-fold CV ROC AUC: mean: 0.927935 std dev:0.005978
    ROC AUC: TEST 0.940948

Coffee House: data/CoffeeHouse_data_encoded-age_nominal.csv
    5-fold CV ROC AUC: mean: 0.838130 std dev:0.010756
    ROC AUC: TEST 0.864536

Bar: data/Bar_data_encoded-all5_nominal.csv
    5-fold CV ROC AUC: mean: 0.893960 std dev:0.003912
    ROC AUC: TEST 0.905959

Carry out & Take away: data/CarryAway_data_encoded-time_nominal.csv
    5-fold CV ROC AUC: mean: 0.923258 std dev:0.009514
    ROC AUC: TEST 0.933429
```             

In [73]:
def eval_roc_auc_cv_scores(infilename, model, verbose=False):
    print(infilename)
    ## load data, split Xy, upsample with SMOTE, split train/test
    data = pd.read_csv(infilename)

    ## define X and y 
    X = data.drop('Y', axis=1).reset_index(drop=True)
    y = data.Y

    ## Handle imbalance
    smote = SMOTE(random_state=42)

    if verbose:
        orig_vc = y.value_counts()
        print(f"Orig: {100* orig_vc[1]/(orig_vc.sum()):.2f}% Yes")

    X_sm, y_sm = smote.fit_resample(X, y)

    ## Data splitting train/test
    X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.20, random_state=42)

    if verbose:
        print("After SMOTE upsampling")
        train_vc = y_train.value_counts()
        print(f"\tNum training samples: {X_train.shape[0]:,} ({100* train_vc[1]/(train_vc.sum()):.2f}% yes)")
        test_vc = y_test.value_counts()
        print(f"\tNum testing samples:  {X_test.shape[0]:,} ({100* test_vc[1]/(test_vc.sum()):.2f}% yes)")
    
    ## fit model on full training data, cross validate, then assess score on test data
    model.fit(X_train, y_train)
    cross_val_scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5)
    print(f"5-fold CV ROC AUC: mean: {np.mean(cross_val_scores):.6f} std dev:{np.std(cross_val_scores):.6f}")

    y_test_score = model.predict_proba(X_test)[:, 1]
    print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score):.6f}\n")
    
    return


In [79]:
filename_endings = ['', '-all5_nominal', '-age_nominal', '-education_nominal', '-income_nominal', 
                    '-temperature_nominal', '-time_nominal']

## Expensive Restaurant

In [88]:
## testings coupon-specific combo created based on iterative output below
eval_roc_auc_cv_scores('data/Restaurant20To50_data_encoded-AEIP_nominal.csv',
                     RandomForestClassifier(max_features=5, min_samples_split=3, n_estimators=150, random_state=42))  

data/Restaurant20To50_data_encoded-AEIP_nominal.csv
5-fold CV ROC AUC: mean: 0.783592 std dev:0.018793
ROC AUC: TEST 0.814635



In [80]:
for fne in filename_endings:
    eval_roc_auc_cv_scores(f'data/Restaurant20To50_data_encoded{fne}.csv', 
               RandomForestClassifier(max_features=5, min_samples_split=3, n_estimators=150, random_state=42))

data/Restaurant20To50_data_encoded.csv
5-fold CV ROC AUC: mean: 0.783701 std dev:0.014206
ROC AUC: TEST 0.794125

data/Restaurant20To50_data_encoded-all5_nominal.csv
5-fold CV ROC AUC: mean: 0.770888 std dev:0.012906
ROC AUC: TEST 0.797355

data/Restaurant20To50_data_encoded-age_nominal.csv
5-fold CV ROC AUC: mean: 0.774159 std dev:0.016381
ROC AUC: TEST 0.797427

data/Restaurant20To50_data_encoded-education_nominal.csv
5-fold CV ROC AUC: mean: 0.770946 std dev:0.016234
ROC AUC: TEST 0.806560

data/Restaurant20To50_data_encoded-income_nominal.csv
5-fold CV ROC AUC: mean: 0.781243 std dev:0.013589
ROC AUC: TEST 0.802810

data/Restaurant20To50_data_encoded-temperature_nominal.csv
5-fold CV ROC AUC: mean: 0.776397 std dev:0.011976
ROC AUC: TEST 0.795291

data/Restaurant20To50_data_encoded-time_nominal.csv
5-fold CV ROC AUC: mean: 0.763534 std dev:0.018145
ROC AUC: TEST 0.791631



## Cheap Restaurant

In [87]:
## testings coupon-specific combo created based on iterative output below
eval_roc_auc_cv_scores('data/RestaurantLessThan20_data_encoded-EM_nominal.csv',
                       RandomForestClassifier(max_features=2, n_estimators=200, random_state=42))

data/RestaurantLessThan20_data_encoded-EM_nominal.csv
5-fold CV ROC AUC: mean: 0.926073 std dev:0.006593
ROC AUC: TEST 0.936222



In [81]:
for fne in filename_endings:
    eval_roc_auc_cv_scores(f'data/RestaurantLessThan20_data_encoded{fne}.csv', 
               RandomForestClassifier(max_features=2, n_estimators=200, random_state=42))

data/RestaurantLessThan20_data_encoded.csv
5-fold CV ROC AUC: mean: 0.922418 std dev:0.005040
ROC AUC: TEST 0.937237

data/RestaurantLessThan20_data_encoded-all5_nominal.csv
5-fold CV ROC AUC: mean: 0.927935 std dev:0.005978
ROC AUC: TEST 0.940948

data/RestaurantLessThan20_data_encoded-age_nominal.csv
5-fold CV ROC AUC: mean: 0.922395 std dev:0.005139
ROC AUC: TEST 0.935986

data/RestaurantLessThan20_data_encoded-education_nominal.csv
5-fold CV ROC AUC: mean: 0.924275 std dev:0.006254
ROC AUC: TEST 0.937331

data/RestaurantLessThan20_data_encoded-income_nominal.csv
5-fold CV ROC AUC: mean: 0.922398 std dev:0.001559
ROC AUC: TEST 0.935419

data/RestaurantLessThan20_data_encoded-temperature_nominal.csv
5-fold CV ROC AUC: mean: 0.920618 std dev:0.007301
ROC AUC: TEST 0.937195

data/RestaurantLessThan20_data_encoded-time_nominal.csv
5-fold CV ROC AUC: mean: 0.925179 std dev:0.006403
ROC AUC: TEST 0.937650



## Coffee House

In [86]:
## testings coupon-specific combo created based on iterative output below
eval_roc_auc_cv_scores('data/CoffeeHouse_data_encoded-AEIM_nominal.csv',
                       RandomForestClassifier(max_features=12, n_estimators=200, random_state=42))

data/CoffeeHouse_data_encoded-AEIM_nominal.csv
5-fold CV ROC AUC: mean: 0.836473 std dev:0.016430
ROC AUC: TEST 0.861901



In [82]:
for fne in filename_endings:
    eval_roc_auc_cv_scores(f'data/CoffeeHouse_data_encoded{fne}.csv', 
               RandomForestClassifier(max_features=12, n_estimators=200, random_state=42))

data/CoffeeHouse_data_encoded.csv
5-fold CV ROC AUC: mean: 0.841308 std dev:0.013106
ROC AUC: TEST 0.857360

data/CoffeeHouse_data_encoded-all5_nominal.csv
5-fold CV ROC AUC: mean: 0.838216 std dev:0.014396
ROC AUC: TEST 0.858125

data/CoffeeHouse_data_encoded-age_nominal.csv
5-fold CV ROC AUC: mean: 0.838130 std dev:0.010756
ROC AUC: TEST 0.864536

data/CoffeeHouse_data_encoded-education_nominal.csv
5-fold CV ROC AUC: mean: 0.838859 std dev:0.011429
ROC AUC: TEST 0.857800

data/CoffeeHouse_data_encoded-income_nominal.csv
5-fold CV ROC AUC: mean: 0.840551 std dev:0.012217
ROC AUC: TEST 0.861343

data/CoffeeHouse_data_encoded-temperature_nominal.csv
5-fold CV ROC AUC: mean: 0.837986 std dev:0.013033
ROC AUC: TEST 0.857145

data/CoffeeHouse_data_encoded-time_nominal.csv
5-fold CV ROC AUC: mean: 0.834738 std dev:0.014890
ROC AUC: TEST 0.861518



## Bar

In [None]:
## best coupon-specific combo is all 5, so there is no new subset to test here

In [83]:
for fne in filename_endings:
    eval_roc_auc_cv_scores(f'data/Bar_data_encoded{fne}.csv', 
               RandomForestClassifier(max_features=11, n_estimators=200, random_state=42))

data/Bar_data_encoded.csv
5-fold CV ROC AUC: mean: 0.889618 std dev:0.005409
ROC AUC: TEST 0.888474

data/Bar_data_encoded-all5_nominal.csv
5-fold CV ROC AUC: mean: 0.893960 std dev:0.003912
ROC AUC: TEST 0.905959

data/Bar_data_encoded-age_nominal.csv
5-fold CV ROC AUC: mean: 0.885884 std dev:0.004736
ROC AUC: TEST 0.891574

data/Bar_data_encoded-education_nominal.csv
5-fold CV ROC AUC: mean: 0.889386 std dev:0.009265
ROC AUC: TEST 0.893384

data/Bar_data_encoded-income_nominal.csv
5-fold CV ROC AUC: mean: 0.888137 std dev:0.002029
ROC AUC: TEST 0.895804

data/Bar_data_encoded-temperature_nominal.csv
5-fold CV ROC AUC: mean: 0.888907 std dev:0.003771
ROC AUC: TEST 0.899477

data/Bar_data_encoded-time_nominal.csv
5-fold CV ROC AUC: mean: 0.885075 std dev:0.004668
ROC AUC: TEST 0.891565



## CarryAway

In [85]:
## testings coupon-specific combo created based on iterative output below
eval_roc_auc_cv_scores('data/CarryAway_data_encoded-APM_nominal.csv',
                       RandomForestClassifier(max_features=5, n_estimators=150, random_state=42))

data/CarryAway_data_encoded-APM_nominal.csv
5-fold CV ROC AUC: mean: 0.922662 std dev:0.010854
ROC AUC: TEST 0.932795



In [84]:
for fne in filename_endings:
    eval_roc_auc_cv_scores(f'data/CarryAway_data_encoded{fne}.csv', 
               RandomForestClassifier(max_features=5, n_estimators=150, random_state=42))

data/CarryAway_data_encoded.csv
5-fold CV ROC AUC: mean: 0.927542 std dev:0.009962
ROC AUC: TEST 0.930397

data/CarryAway_data_encoded-all5_nominal.csv
5-fold CV ROC AUC: mean: 0.924179 std dev:0.010087
ROC AUC: TEST 0.931305

data/CarryAway_data_encoded-age_nominal.csv
5-fold CV ROC AUC: mean: 0.930741 std dev:0.009582
ROC AUC: TEST 0.932270

data/CarryAway_data_encoded-education_nominal.csv
5-fold CV ROC AUC: mean: 0.925852 std dev:0.007429
ROC AUC: TEST 0.925349

data/CarryAway_data_encoded-income_nominal.csv
5-fold CV ROC AUC: mean: 0.930701 std dev:0.009271
ROC AUC: TEST 0.923088

data/CarryAway_data_encoded-temperature_nominal.csv
5-fold CV ROC AUC: mean: 0.927841 std dev:0.007716
ROC AUC: TEST 0.933344

data/CarryAway_data_encoded-time_nominal.csv
5-fold CV ROC AUC: mean: 0.923258 std dev:0.009514
ROC AUC: TEST 0.933429

