In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import pickle

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## render the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# pandas display: cap printed rows at 50 and show floats with 8 decimals (min width 9)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', '{:9.8f}'.format)

# Overview of feature encodings used for phase 2

1. Restaurant20To50: age_nominal
1. RestaurantLessThan20: time_nominal
1. CoffeeHouse: AI_nominal (age and income)
1. Bar: all5_nominal (age, education, income, temperature, time)
1. CarryAway: temperature_nominal

## Create data structure with the info for each coupon type

In [5]:
# One record per coupon type: the pickled model, its test feature matrix, and the
# feature importances ranked from most to least important.
coupon_types = ['Restaurant20To50', 'RestaurantLessThan20', 'CoffeeHouse', 'Bar', 'CarryAway']
info = {name: {'best_model': None, 'X_test': None, 'ranked_fi': None} for name in coupon_types}

for coupon_type in info:
    entry = info[coupon_type]

    with open(f'model/phase2/{coupon_type}_best_model_RFC.pickle', "rb") as f:
        entry['best_model'] = pickle.load(f)
    entry['X_test'] = pd.read_csv(f'data/{coupon_type}_X_test_phase2.csv')

    # pair each X_test column with the model's importance for it, then rank;
    # after reset_index the row label is the feature's rank (0 = most important)
    importances = pd.DataFrame({
        'feature_name': entry['X_test'].columns,
        'importance': entry['best_model'].feature_importances_,
    })
    ranked = importances.sort_values('importance', ascending=False)
    entry['ranked_fi'] = ranked.reset_index(drop=True)

## get top 10 most important features for each coupon type


In [21]:
# Show the ten highest-ranked features for each coupon type, with a caption underneath.
for coupon_type, coupon_info in info.items():
    sorted_features = coupon_info['ranked_fi']
    top_ten = sorted_features.head(10)
    display(top_ten)
    print(f"table above: {coupon_type}")


Unnamed: 0,feature_name,importance
0,income,0.07564337
1,CoffeeHouse,0.06104137
2,Restaurant20To50,0.06088167
3,education,0.05403117
4,CarryAway,0.05053496
5,RestaurantLessThan20,0.05007116
6,Bar,0.04973657
7,time,0.04624255
8,expiration_2h,0.04099355
9,temperature,0.03400992


table above: Restaurant20To50


Unnamed: 0,feature_name,importance
0,income,0.06064292
1,age,0.05777058
2,minsToCouponDest,0.05605809
3,CoffeeHouse,0.05391411
4,education,0.05053328
5,RestaurantLessThan20,0.05028352
6,CarryAway,0.04885656
7,Bar,0.04666874
8,Restaurant20To50,0.04591961
9,destination_No Urgent Place,0.04321239


table above: RestaurantLessThan20


Unnamed: 0,feature_name,importance
0,CoffeeHouse,0.13890557
1,time,0.048372
2,education,0.04589422
3,minsToCouponDest,0.04527118
4,Bar,0.04289073
5,RestaurantLessThan20,0.04072185
6,CarryAway,0.04011278
7,Restaurant20To50,0.03898276
8,expiration_2h,0.03769892
9,temperature,0.026212


table above: CoffeeHouse


Unnamed: 0,feature_name,importance
0,Bar,0.13280221
1,CoffeeHouse,0.04675748
2,Restaurant20To50,0.04235856
3,CarryAway,0.04228904
4,RestaurantLessThan20,0.04108017
5,minsToCouponDest,0.03058365
6,has_children,0.02981448
7,gender_Male,0.02129778
8,maritalStatus_Single,0.02032457
9,maritalStatus_Married partner,0.01893958


table above: Bar


Unnamed: 0,feature_name,importance
0,income,0.07344673
1,time,0.06294384
2,age,0.06260279
3,CoffeeHouse,0.06221789
4,education,0.05941928
5,RestaurantLessThan20,0.05600129
6,minsToCouponDest,0.05510411
7,Restaurant20To50,0.05236327
8,CarryAway,0.05224499
9,Bar,0.05079347


table above: CarryAway


## get rank of the 'visiting habits unknown' indicator features
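Result: all of them rank very low in every listing (importance below 0.005), so the "unknown" indicators carry little predictive signal. This is consistent with the values being missing at random, although low importance alone does not prove it.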

In [11]:
# For each coupon type, list the "unknown" indicator features together with their
# rank (the row label of the importance-sorted table, 0 = most important).
for coupon_type in info.keys():
    sorted_features = info[coupon_type]['ranked_fi']
    print(f'{coupon_type} ({sorted_features.shape[0]} features)')
    is_unknown_indicator = sorted_features.feature_name.str.contains("unknown")
    # Call display() explicitly: a bare expression inside a loop body is only
    # rendered while InteractiveShell.ast_node_interactivity is set to "all".
    display(sorted_features[is_unknown_indicator])

Restaurant20To50 (61 features)


Unnamed: 0,feature_name,importance
45,CoffeeHouse_freq_unknown,0.00438089
49,Restaurant20To50_freq_unknown,0.00368918
55,CarryAway_freq_unknown,0.00192155
57,Bar_freq_unknown,0.00154546
58,RestaurantLessThan20_freq_unknown,0.00153755


RestaurantLessThan20 (58 features)


Unnamed: 0,feature_name,importance
47,Restaurant20To50_freq_unknown,0.00240152
48,CoffeeHouse_freq_unknown,0.00220422
51,Bar_freq_unknown,0.00176762
54,CarryAway_freq_unknown,0.0015341
55,RestaurantLessThan20_freq_unknown,0.00140986


CoffeeHouse (68 features)


Unnamed: 0,feature_name,importance
55,RestaurantLessThan20_freq_unknown,0.00323045
59,CoffeeHouse_freq_unknown,0.00239232
61,Restaurant20To50_freq_unknown,0.00229931
62,CarryAway_freq_unknown,0.00214266
64,Bar_freq_unknown,0.00097667


Bar (76 features)


Unnamed: 0,feature_name,importance
60,Restaurant20To50_freq_unknown,0.00316368
62,CoffeeHouse_freq_unknown,0.00280625
64,CarryAway_freq_unknown,0.00228757
68,Bar_freq_unknown,0.00200864
70,RestaurantLessThan20_freq_unknown,0.00186293


CarryAway (56 features)


Unnamed: 0,feature_name,importance
46,Bar_freq_unknown,0.00254688
47,CoffeeHouse_freq_unknown,0.00220079
49,Restaurant20To50_freq_unknown,0.00175716
52,CarryAway_freq_unknown,0.00111994
53,RestaurantLessThan20_freq_unknown,0.00091664


## get rank of the actually ordinal features encoded as nominal

To interpret these ranks properly, they need to be compared with the position of the ordinal version of each feature in the original phase 1 results. There is no time to cover this in the presentation anyway, so the analysis stops here.

In [18]:
# Column-name prefixes of the ordinal features that were encoded as nominal for
# each coupon type. One table of ranks is shown per prefix.
# skip Bar, it has all 5
nominal_prefixes = {
    'Restaurant20To50': ['age_'],
    'RestaurantLessThan20': ['time_'],
    'CoffeeHouse': ['age_', 'income_'],
    'CarryAway': ['temperature_'],
}

for coupon_type, prefixes in nominal_prefixes.items():
    sorted_features = info[coupon_type]['ranked_fi']
    print(f'{coupon_type} ({sorted_features.shape[0]} features)')
    for prefix in prefixes:
        # Call display() explicitly so the tables render regardless of the
        # InteractiveShell.ast_node_interactivity setting.
        display(sorted_features[sorted_features.feature_name.str.startswith(prefix)])

# Restaurant20To50: age_nominal
# RestaurantLessThan20: time_nominal
# CoffeeHouse: AI_nominal (age and income)
# Bar: all5_nominal (age, education, income, temperature, time)
# CarryAway: temperature_nominal

Restaurant20To50 (61 features)


Unnamed: 0,feature_name,importance
18,age_26,0.01670875
19,age_31,0.01576602
20,age_50plus,0.01538345
25,age_36,0.01215619
32,age_41,0.00972303
38,age_46,0.00767682
39,age_below21,0.00752991


RestaurantLessThan20 (58 features)


Unnamed: 0,feature_name,importance
11,time_6PM,0.03722272
15,time_2PM,0.02521209
22,time_10PM,0.01279601
27,time_7AM,0.01029913


CoffeeHouse (68 features)


Unnamed: 0,feature_name,importance
13,age_26,0.01657366
20,age_31,0.01434854
28,age_50plus,0.01127121
30,age_36,0.0105483
31,age_41,0.00988853
42,age_below21,0.00724163
43,age_46,0.00702806


Unnamed: 0,feature_name,importance
15,income_$50000 - $62499,0.01610371
17,income_$37500 - $49999,0.01505573
21,income_$25000 - $37499,0.01415994
24,income_$12500 - $24999,0.01300526
29,income_$62500 - $74999,0.01081028
35,income_$75000 - $87499,0.00873133
37,income_$87500 - $99999,0.00808891
40,income_Less than $12500,0.00758338


CarryAway (56 features)


Unnamed: 0,feature_name,importance
12,temperature_55,0.01977904
19,temperature_80,0.01678155
