In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import couponProjectUtils as proj_utils

from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Assessment Overview

Investigation into the effect of encoding 'time', 'age', 'education', 'income', and/or 'temperature' as nominal instead of ordinal.

I'm running this assessment using the 'best model' params from the initial testing (with the 5 features encoded as ordinal) to assess whether the nominal encoding improves anything.

For all coupon types the comparison includes:
* data_encoded.csv: all 5 as ordinal
* data_encoded-all5_nominal.csv: all 5 as nominal
* 5 x data_encoded-\<featureName>\_nominal.csv: encoding only one of the features as nominal


Plus an encoding of 2-4 features as nominal (feature subset selected based on the single feature as nominal test scores for that specific coupon type):
* data_encoded-\<letterCode>\_nominal.csv

using letterCodes:
* A: age
* E: education
* I: income
* M: time
* P: temperature

## Best Result
based on test roc auc score

```
Restaurant(20-50): data/Restaurant20To50_data_encoded-age_nominal.csv
    5-fold CV ROC AUC: mean: 0.764292 std dev:0.014494
    ROC AUC: TEST 0.776617

Restaurant(<20): data/RestaurantLessThan20_data_encoded-time_nominal.csv
    5-fold CV ROC AUC: mean: 0.920792 std dev:0.010820
    ROC AUC: TEST 0.946881

Coffee House: data/CoffeeHouse_data_encoded-AI_nominal.csv
    5-fold CV ROC AUC: mean: 0.848034 std dev:0.014650
    ROC AUC: TEST 0.861322

Bar: data/Bar_data_encoded-all5_nominal.csv
    5-fold CV ROC AUC: mean: 0.894461 std dev:0.012684
    ROC AUC: TEST 0.910159

Carry out & Take away: data/CarryAway_data_encoded-temperature_nominal.csv
    5-fold CV ROC AUC: mean: 0.917464 std dev:0.013480
    ROC AUC: TEST 0.921896
```             

In [2]:
def eval_roc_auc_cv_scores(infilename, model, verbose=False):
    print(infilename)
    ## load data, split Xy, upsample with SMOTE, split train/test
    data = pd.read_csv(infilename)

    ## define X and y 
    X = data.drop('Y', axis=1).reset_index(drop=True)
    y = data.Y

    ## Handle imbalance
    smote = SMOTE(random_state=42)

    if verbose:
        orig_vc = y.value_counts()
        print(f"Orig: {100* orig_vc[1]/(orig_vc.sum()):.2f}% Yes")

    X_sm, y_sm = smote.fit_resample(X, y)

    ## Data splitting train/test
    X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.20, random_state=42)

    if verbose:
        print("After SMOTE upsampling")
        train_vc = y_train.value_counts()
        print(f"\tNum training samples: {X_train.shape[0]:,} ({100* train_vc[1]/(train_vc.sum()):.2f}% yes)")
        test_vc = y_test.value_counts()
        print(f"\tNum testing samples:  {X_test.shape[0]:,} ({100* test_vc[1]/(test_vc.sum()):.2f}% yes)")
    
    ## Replace the unknown values (previously encoded as 0.0) using the most frequent value seen in the TRAINING data
    ## Do NOT save the mode info to file
    X_train, X_test, _ = proj_utils.replace_unknown_with_mode(X_train, X_test, outfile_prefix=None)

    ## fit model on full training data, cross validate, then assess score on test data
    model.fit(X_train, y_train)
    cross_val_scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5)
    print(f"5-fold CV ROC AUC: mean: {np.mean(cross_val_scores):.6f} std dev:{np.std(cross_val_scores):.6f}")

    y_test_score = model.predict_proba(X_test)[:, 1]
    print(f"ROC AUC: TEST {roc_auc_score(y_test, y_test_score):.6f}\n")
    
    return


In [3]:
filename_endings = ['', '-all5_nominal', '-age_nominal', '-education_nominal', '-income_nominal', 
                    '-temperature_nominal', '-time_nominal']

## Expensive Restaurant

In [4]:
for fne in filename_endings:
    eval_roc_auc_cv_scores(f'data/Restaurant20To50_data_encoded{fne}.csv', 
               RandomForestClassifier(max_features=4, min_samples_split=3, n_estimators=150, random_state=42))

data/Restaurant20To50_data_encoded.csv
5-fold CV ROC AUC: mean: 0.778086 std dev:0.010545
ROC AUC: TEST 0.762645

data/Restaurant20To50_data_encoded-all5_nominal.csv
5-fold CV ROC AUC: mean: 0.770785 std dev:0.012229
ROC AUC: TEST 0.768048

data/Restaurant20To50_data_encoded-age_nominal.csv
5-fold CV ROC AUC: mean: 0.764292 std dev:0.014494
ROC AUC: TEST 0.776617

data/Restaurant20To50_data_encoded-education_nominal.csv
5-fold CV ROC AUC: mean: 0.768880 std dev:0.010679
ROC AUC: TEST 0.765756

data/Restaurant20To50_data_encoded-income_nominal.csv
5-fold CV ROC AUC: mean: 0.774084 std dev:0.020151
ROC AUC: TEST 0.757678

data/Restaurant20To50_data_encoded-temperature_nominal.csv
5-fold CV ROC AUC: mean: 0.771956 std dev:0.023648
ROC AUC: TEST 0.770031

data/Restaurant20To50_data_encoded-time_nominal.csv
5-fold CV ROC AUC: mean: 0.768588 std dev:0.013430
ROC AUC: TEST 0.763700



In [9]:
## testings coupon-specific combo created based on iterative output above
eval_roc_auc_cv_scores('data/Restaurant20To50_data_encoded-AEPM_nominal.csv',
                     RandomForestClassifier(max_features=4, min_samples_split=3, n_estimators=150, random_state=42))  

data/Restaurant20To50_data_encoded-AEPM_nominal.csv
5-fold CV ROC AUC: mean: 0.762304 std dev:0.007365
ROC AUC: TEST 0.770814



## Cheap Restaurant

In [5]:
for fne in filename_endings:
    eval_roc_auc_cv_scores(f'data/RestaurantLessThan20_data_encoded{fne}.csv', 
               RandomForestClassifier(max_features=2, min_samples_split=4, n_estimators=200, random_state=42))

data/RestaurantLessThan20_data_encoded.csv
5-fold CV ROC AUC: mean: 0.923149 std dev:0.011154
ROC AUC: TEST 0.944040

data/RestaurantLessThan20_data_encoded-all5_nominal.csv
5-fold CV ROC AUC: mean: 0.925323 std dev:0.012389
ROC AUC: TEST 0.944175

data/RestaurantLessThan20_data_encoded-age_nominal.csv
5-fold CV ROC AUC: mean: 0.922121 std dev:0.011896
ROC AUC: TEST 0.940158

data/RestaurantLessThan20_data_encoded-education_nominal.csv
5-fold CV ROC AUC: mean: 0.924542 std dev:0.008746
ROC AUC: TEST 0.946713

data/RestaurantLessThan20_data_encoded-income_nominal.csv
5-fold CV ROC AUC: mean: 0.922050 std dev:0.009540
ROC AUC: TEST 0.945118

data/RestaurantLessThan20_data_encoded-temperature_nominal.csv
5-fold CV ROC AUC: mean: 0.921174 std dev:0.011974
ROC AUC: TEST 0.944808

data/RestaurantLessThan20_data_encoded-time_nominal.csv
5-fold CV ROC AUC: mean: 0.920792 std dev:0.010820
ROC AUC: TEST 0.946881



In [10]:
## testings coupon-specific combo created based on iterative output above
eval_roc_auc_cv_scores('data/RestaurantLessThan20_data_encoded-EIPM_nominal.csv',
                       RandomForestClassifier(max_features=2, min_samples_split=4, n_estimators=200, random_state=42))

data/RestaurantLessThan20_data_encoded-EIPM_nominal.csv
5-fold CV ROC AUC: mean: 0.922972 std dev:0.010381
ROC AUC: TEST 0.945725



## Coffee House

In [6]:
for fne in filename_endings:
    eval_roc_auc_cv_scores(f'data/CoffeeHouse_data_encoded{fne}.csv', 
               RandomForestClassifier(max_features=8, min_samples_split=3, n_estimators=200, random_state=42))

data/CoffeeHouse_data_encoded.csv
5-fold CV ROC AUC: mean: 0.846149 std dev:0.015823
ROC AUC: TEST 0.853249

data/CoffeeHouse_data_encoded-all5_nominal.csv
5-fold CV ROC AUC: mean: 0.841285 std dev:0.015081
ROC AUC: TEST 0.858299

data/CoffeeHouse_data_encoded-age_nominal.csv
5-fold CV ROC AUC: mean: 0.845051 std dev:0.017853
ROC AUC: TEST 0.858067

data/CoffeeHouse_data_encoded-education_nominal.csv
5-fold CV ROC AUC: mean: 0.843837 std dev:0.017810
ROC AUC: TEST 0.851808

data/CoffeeHouse_data_encoded-income_nominal.csv
5-fold CV ROC AUC: mean: 0.845588 std dev:0.014781
ROC AUC: TEST 0.855072

data/CoffeeHouse_data_encoded-temperature_nominal.csv
5-fold CV ROC AUC: mean: 0.842123 std dev:0.014466
ROC AUC: TEST 0.849674

data/CoffeeHouse_data_encoded-time_nominal.csv
5-fold CV ROC AUC: mean: 0.841135 std dev:0.015717
ROC AUC: TEST 0.848788



In [11]:
## testings coupon-specific combo created based on iterative output above
eval_roc_auc_cv_scores('data/CoffeeHouse_data_encoded-AI_nominal.csv',
                       RandomForestClassifier(max_features=8, min_samples_split=3, n_estimators=200, random_state=42))

data/CoffeeHouse_data_encoded-AI_nominal.csv
5-fold CV ROC AUC: mean: 0.848034 std dev:0.014650
ROC AUC: TEST 0.861322



## Bar

In [7]:
for fne in filename_endings:
    eval_roc_auc_cv_scores(f'data/Bar_data_encoded{fne}.csv', 
               RandomForestClassifier(max_features=5, n_estimators=200, random_state=42))

data/Bar_data_encoded.csv
5-fold CV ROC AUC: mean: 0.892287 std dev:0.012258
ROC AUC: TEST 0.904195

data/Bar_data_encoded-all5_nominal.csv
5-fold CV ROC AUC: mean: 0.894461 std dev:0.012684
ROC AUC: TEST 0.910159

data/Bar_data_encoded-age_nominal.csv
5-fold CV ROC AUC: mean: 0.889955 std dev:0.013174
ROC AUC: TEST 0.907854

data/Bar_data_encoded-education_nominal.csv
5-fold CV ROC AUC: mean: 0.888909 std dev:0.014218
ROC AUC: TEST 0.903554

data/Bar_data_encoded-income_nominal.csv
5-fold CV ROC AUC: mean: 0.892277 std dev:0.016613
ROC AUC: TEST 0.901889

data/Bar_data_encoded-temperature_nominal.csv
5-fold CV ROC AUC: mean: 0.888717 std dev:0.013092
ROC AUC: TEST 0.901773

data/Bar_data_encoded-time_nominal.csv
5-fold CV ROC AUC: mean: 0.886190 std dev:0.012988
ROC AUC: TEST 0.898079



In [None]:
## iterative output above only has one improved feature, so no new combo to test

## CarryAway

In [8]:
for fne in filename_endings:
    eval_roc_auc_cv_scores(f'data/CarryAway_data_encoded{fne}.csv', 
               RandomForestClassifier(max_features=5, min_samples_split=3, n_estimators=200, random_state=42))

data/CarryAway_data_encoded.csv
5-fold CV ROC AUC: mean: 0.922661 std dev:0.012814
ROC AUC: TEST 0.920629

data/CarryAway_data_encoded-all5_nominal.csv
5-fold CV ROC AUC: mean: 0.919068 std dev:0.014706
ROC AUC: TEST 0.915165

data/CarryAway_data_encoded-age_nominal.csv
5-fold CV ROC AUC: mean: 0.922134 std dev:0.013009
ROC AUC: TEST 0.918956

data/CarryAway_data_encoded-education_nominal.csv
5-fold CV ROC AUC: mean: 0.917932 std dev:0.012564
ROC AUC: TEST 0.917300

data/CarryAway_data_encoded-income_nominal.csv
5-fold CV ROC AUC: mean: 0.919183 std dev:0.010639
ROC AUC: TEST 0.917385

data/CarryAway_data_encoded-temperature_nominal.csv
5-fold CV ROC AUC: mean: 0.917464 std dev:0.013480
ROC AUC: TEST 0.921896

data/CarryAway_data_encoded-time_nominal.csv
5-fold CV ROC AUC: mean: 0.917471 std dev:0.013042
ROC AUC: TEST 0.914780



In [None]:
## iterative output above only has one improved feature, so no new combo to test