In [1]:
import copy
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import OrdinalEncoder

## show the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## pandas display: print at most 50 rows and format floats with 8 decimals
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', lambda value: '%9.8f' % value)

## Read original csv and drop/transform features

In [2]:
file_name = "data/in-vehicle-coupon-recommendation.csv"

## columns that are not needed:
##   car           - mostly null
##   direction_opp - the inverse of direction_same
unneeded_cols = ['car', 'direction_opp']

## read the original csv and drop the unneeded columns in one step
data = pd.read_csv(file_name).drop(columns=unneeded_cols)

In [3]:
## build the row masks from the two threshold flags
is_25_or_more = data['toCoupon_GEQ25min'] == 1
is_under_25 = data['toCoupon_GEQ25min'] == 0
is_15_or_more = data['toCoupon_GEQ15min'] == 1
is_under_15 = data['toCoupon_GEQ15min'] == 0

## populate new column minsToCouponDest with values ['5-14', '15-24', '25plus']
data.loc[is_25_or_more, 'minsToCouponDest'] = '25plus'
data.loc[is_under_25 & is_15_or_more, 'minsToCouponDest'] = '15-24'
data.loc[is_under_25 & is_under_15, 'minsToCouponDest'] = '5-14'

## the separate flag columns are no longer needed once minsToCouponDest exists
flag_cols = ['toCoupon_GEQ5min', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min']
data = data.drop(columns=flag_cols)

In [4]:
## store temperature as text (the encoders further down list its categories as '30', '55', '80')
data['temperature'] = data['temperature'].astype(str)

In [5]:
## row count before and after removing duplicate rows shows how many records were dropped;
## the index is rebuilt as 0..n-1 afterwards
len(data)
data = data.drop_duplicates().reset_index(drop=True)
len(data)

12684

12610

## Handle missing values in 5 driver-habit columns

In [6]:
## names of the columns that contain at least one missing value
has_missing = data.isna().any()
has_missing[has_missing].index.tolist()

['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']

In [7]:
## manually add one indicator column per habit column (1 = value was missing, 0 = value present)
unknown_flag_for = {'Bar': 'Bar_freq_unknown',
                    'CoffeeHouse': 'CoffeeHouse_freq_unknown',
                    'CarryAway': 'CarryAway_freq_unknown',
                    'RestaurantLessThan20': 'RestaurantLessThan20_freq_unknown',
                    'Restaurant20To50': 'Restaurant20To50_freq_unknown'}

for habit_col, flag_col in unknown_flag_for.items():
    data[flag_col] = np.where(data[habit_col].isna(), 1, 0)

## with the indicators in place, replace the missing values themselves with the string 'unknown'
data = data.fillna(value='unknown')

# Split by coupon type

In [8]:
def _export_coupon_subset(frame, coupon_label, out_path):
    """Select the rows of `frame` whose 'coupon' equals `coupon_label`,
    drop the 'coupon' column, write the result to `out_path` (no index column)
    and return the subset."""
    subset = frame[frame['coupon'] == coupon_label].drop('coupon', axis=1)
    subset.to_csv(out_path, index=False)
    return subset


## one un-encoded file per coupon type
Bar_data = _export_coupon_subset(data, 'Bar', 'data/Bar_data.csv')
CoffeeHouse_data = _export_coupon_subset(data, 'Coffee House', 'data/CoffeeHouse_data.csv')
CarryAway_data = _export_coupon_subset(data, 'Carry out & Take away', 'data/CarryAway_data.csv')
RestaurantLessThan20_data = _export_coupon_subset(data, 'Restaurant(<20)',
                                                  'data/RestaurantLessThan20_data.csv')
Restaurant20To50_data = _export_coupon_subset(data, 'Restaurant(20-50)',
                                              'data/Restaurant20To50_data.csv')

# Encode features

In [9]:
## list the columns present after cleaning, as a reference for grouping them by encoding type
data.columns

Index(['destination', 'passanger', 'weather', 'temperature', 'time', 'coupon',
       'expiration', 'gender', 'age', 'maritalStatus', 'has_children',
       'education', 'occupation', 'income', 'Bar', 'CoffeeHouse', 'CarryAway',
       'RestaurantLessThan20', 'Restaurant20To50', 'direction_same', 'Y',
       'minsToCouponDest', 'Bar_freq_unknown', 'CoffeeHouse_freq_unknown',
       'CarryAway_freq_unknown', 'RestaurantLessThan20_freq_unknown',
       'Restaurant20To50_freq_unknown'],
      dtype='object')

### Binary features already encoded as numeric

`['has_children', 'direction_same', 'Bar_freq_unknown', 'CoffeeHouse_freq_unknown',
  'CarryAway_freq_unknown', 'RestaurantLessThan20_freq_unknown', 'Restaurant20To50_freq_unknown', 'Y']`

### Ordinal

`['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50', 'minsToCouponDest']`


### Ordinal features where the order may or may not matter. 
(Encode as Ordinal for now)

`['time', 'age', 'education', 'income', 'temperature']`


### Nominal (unordered)

`['destination', 'passanger', 'weather', 'expiration', 'gender', 'maritalStatus', 'occupation']`


In [10]:
## binary features that are already numeric are carried over as they are
binary_numeric_colnames = ['has_children', 'direction_same',
                           'Bar_freq_unknown', 'CoffeeHouse_freq_unknown', 'CarryAway_freq_unknown',
                           'RestaurantLessThan20_freq_unknown', 'Restaurant20To50_freq_unknown',
                           'Y']
binary_numeric_data = data[binary_numeric_colnames]

In [12]:
## Note: encoding cannot handle missing values
## ('unknown' is listed first for the habit columns, so it receives the lowest code)
habit_levels = ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8']

## each ordinal column mapped to its categories, listed from lowest to highest code.
## Keeping column and categories together in one dict prevents the column list and
## the category list from drifting out of step when a column is added or moved.
ordinal_categories = {
    'time': ['7AM', '10AM', '2PM', '6PM', '10PM'],
    'age': ['below21', '21', '26', '31', '36', '41', '46', '50plus'],
    'education': ['Some High School', 'High School Graduate', 'Some college - no degree',
                  'Associates degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)'],
    'income': ['Less than $12500', '$12500 - $24999', '$25000 - $37499', '$37500 - $49999',
               '$50000 - $62499', '$62500 - $74999', '$75000 - $87499', '$87500 - $99999',
               '$100000 or More'],
    'temperature': ['30', '55', '80'],
    'minsToCouponDest': ['5-14', '15-24', '25plus'],
    'Bar': list(habit_levels),
    'CoffeeHouse': list(habit_levels),
    'CarryAway': list(habit_levels),
    'RestaurantLessThan20': list(habit_levels),
    'Restaurant20To50': list(habit_levels),
}
ordinal_colnames = list(ordinal_categories)
ordinal_categories_list = list(ordinal_categories.values())

ordinal_data = data[ordinal_colnames]
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories_list)

## run the encoding
## fit_transform returns a bare array, so the row index of `data` is re-attached here;
## without it the later pd.concat(axis=1) only lines rows up while `data` has a 0..n-1 index
encoded_ordinal_data = pd.DataFrame(ordinal_encoder.fit_transform(ordinal_data),
                                    columns=ordinal_data.columns,
                                    index=ordinal_data.index)

In [16]:
## unordered features
nominal_colnames = ['destination', 'passanger', 'weather', 'expiration',
                    'gender', 'maritalStatus', 'occupation']
nominal_data = data.loc[:, nominal_colnames]

## dummy-encode them, leaving out the first level of every feature
encoded_nominal_data = pd.get_dummies(nominal_data, drop_first=True)

In [17]:
## put the three encoded groups side by side, plus the raw coupon label used for splitting
encoded_parts = [binary_numeric_data, encoded_ordinal_data, encoded_nominal_data, data.coupon]
recombined_data = pd.concat(encoded_parts, axis=1)

In [18]:
## save the full encoded dataset (coupon column included, no index column)
full_encoded_path = 'data/full_dataset_encoded_plus_coupon.csv'
recombined_data.to_csv(full_encoded_path, index=False)

## Split encoded data by coupon type

In [19]:
## coupon label -> output file for its encoded subset
encoded_outfile_for = {'Bar': 'data/Bar_data_encoded.csv',
                       'Coffee House': 'data/CoffeeHouse_data_encoded.csv',
                       'Carry out & Take away': 'data/CarryAway_data_encoded.csv',
                       'Restaurant(<20)': 'data/RestaurantLessThan20_data_encoded.csv',
                       'Restaurant(20-50)': 'data/Restaurant20To50_data_encoded.csv'}

## filter to one coupon type at a time, drop the coupon column and write the file
encoded_subset_for = {}
for coupon_label, out_path in encoded_outfile_for.items():
    coupon_rows = recombined_data[recombined_data['coupon'] == coupon_label]
    coupon_rows = coupon_rows.drop(columns='coupon')
    coupon_rows.to_csv(out_path, index=False)
    encoded_subset_for[coupon_label] = coupon_rows

## keep the per-coupon frames available under their usual names
Bar_data = encoded_subset_for['Bar']
CoffeeHouse_data = encoded_subset_for['Coffee House']
CarryAway_data = encoded_subset_for['Carry out & Take away']
RestaurantLessThan20_data = encoded_subset_for['Restaurant(<20)']
Restaurant20To50_data = encoded_subset_for['Restaurant(20-50)']

# Phase II: Exploring encoding 5 variables as nominal instead of ordinal
('time', 'age', 'education', 'income', 'temperature')


## Nominal encode each of the 5 features individually 

In [21]:
## full ordinal set: column names and, in the same order, their categories (lowest to highest code)
ordinal_colnames = ['time', 'age', 'education', 'income', 'temperature', 'minsToCouponDest',
                   'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']

ordinal_categories_list = [['7AM', '10AM', '2PM', '6PM', '10PM'],
                           ['below21', '21', '26', '31', '36', '41', '46', '50plus'],
                           ['Some High School', 'High School Graduate', 'Some college - no degree',
                            'Associates degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)'],
                           ['Less than $12500', '$12500 - $24999', '$25000 - $37499', '$37500 - $49999',
                            '$50000 - $62499', '$62500 - $74999', '$75000 - $87499', '$87500 - $99999',
                            '$100000 or More'],
                           ['30', '55', '80'],
                           ['5-14', '15-24', '25plus'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8']]
nominal_colnames = ['destination', 'passanger', 'weather', 'expiration', 'gender', 'maritalStatus', 'occupation']

## coupon label -> prefix used in the per-coupon output file names
coupon_type_outfile_name = {'Bar': 'Bar',
                           'Coffee House': 'CoffeeHouse',
                           'Carry out & Take away': 'CarryAway',
                           'Restaurant(<20)': 'RestaurantLessThan20',
                           'Restaurant(20-50)': 'Restaurant20To50'}

## the features to switch from ordinal to nominal, one per iteration
## (named explicitly instead of relying on them being the first 5 entries of ordinal_colnames)
nominal_candidates = ['time', 'age', 'education', 'income', 'temperature']

for nom_feature in nominal_candidates:
    print(f"{nom_feature}\n")

    ## everything except the current candidate stays ordinal; filtering the zipped pairs
    ## removes the column and its category list together, so the two cannot get out of step
    still_ordinal = [(col, cats) for col, cats in zip(ordinal_colnames, ordinal_categories_list)
                     if col != nom_feature]
    my_ordinal_colnames = [col for col, cats in still_ordinal]
    my_ordinal_categories_list = [cats for col, cats in still_ordinal]

    my_ordinal_data = data[my_ordinal_colnames]
    my_ordinal_encoder = OrdinalEncoder(categories=my_ordinal_categories_list)

    ## fit_transform returns a bare array; re-attach the row index of `data` so the
    ## pd.concat(axis=1) below lines rows up by label whatever that index looks like
    my_encoded_ordinal_data = pd.DataFrame(my_ordinal_encoder.fit_transform(my_ordinal_data),
                                           columns=my_ordinal_data.columns,
                                           index=my_ordinal_data.index)

    ## the candidate is dummy-encoded together with the regular nominal features
    my_nominal_colnames = nominal_colnames + [nom_feature]
    my_nominal_data = data[my_nominal_colnames]
    my_encoded_nominal_data = pd.get_dummies(my_nominal_data, drop_first=True)

    my_recombined_data = pd.concat([binary_numeric_data,
                                    my_encoded_ordinal_data,
                                    my_encoded_nominal_data, data.coupon], axis=1)
    my_recombined_data.to_csv(f'data/full_dataset_encoded_plus_coupon-{nom_feature}_nominal.csv', index=False)

    ## one file per coupon type, with the coupon column removed
    for coupon_label, outfile_prefix in coupon_type_outfile_name.items():
        subset_data = my_recombined_data[my_recombined_data['coupon'] == coupon_label]
        subset_data = subset_data.drop('coupon', axis=1)
        subset_data.to_csv(f'data/{outfile_prefix}_data_encoded-{nom_feature}_nominal.csv', index=False)

time

age

education

income

temperature



## Encode all 5 as nominal (together)

In [22]:
## only the travel-time bucket and the five habit columns remain ordinal here
my_ordinal_colnames = ['minsToCouponDest', 'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20',
                       'Restaurant20To50']

## categories in the same order as my_ordinal_colnames, each listed from lowest to highest code
my_ordinal_categories_list = [['5-14', '15-24', '25plus'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8']]

## all 5 candidate features join the regular nominal features
my_nominal_colnames = ['time', 'age', 'education', 'income', 'temperature',
                    'destination', 'passanger', 'weather', 'expiration', 'gender', 'maritalStatus', 'occupation']

## coupon label -> prefix used in the per-coupon output file names
coupon_type_outfile_name = {'Bar': 'Bar',
                           'Coffee House': 'CoffeeHouse',
                           'Carry out & Take away': 'CarryAway',
                           'Restaurant(<20)': 'RestaurantLessThan20',
                           'Restaurant(20-50)': 'Restaurant20To50'}

my_ordinal_data = data[my_ordinal_colnames]
my_ordinal_encoder = OrdinalEncoder(categories=my_ordinal_categories_list)
## fit_transform returns a bare array; re-attach the row index of `data` so the
## pd.concat(axis=1) below lines rows up by label whatever that index looks like
my_encoded_ordinal_data = pd.DataFrame(my_ordinal_encoder.fit_transform(my_ordinal_data),
                                       columns=my_ordinal_data.columns,
                                       index=my_ordinal_data.index)

my_nominal_data = data[my_nominal_colnames]
my_encoded_nominal_data = pd.get_dummies(my_nominal_data, drop_first=True)

my_recombined_data = pd.concat([binary_numeric_data,
                                my_encoded_ordinal_data,
                                my_encoded_nominal_data, data.coupon], axis=1)
## plain string: the path has no placeholders, so no f-string is needed
my_recombined_data.to_csv('data/full_dataset_encoded_plus_coupon-all5_nominal.csv', index=False)

## one file per coupon type, with the coupon column removed
for coupon_label, outfile_prefix in coupon_type_outfile_name.items():
    subset_data = my_recombined_data[my_recombined_data['coupon'] == coupon_label]
    subset_data = subset_data.drop('coupon', axis=1)
    subset_data.to_csv(f'data/{outfile_prefix}_data_encoded-all5_nominal.csv', index=False)

## Encode subsets of 2-4 for each coupon

Choice of subsets is based on the results seen in `assess_nominal_vs_ordinal_encoding.ipynb`

For each coupon type, encode as nominal the subset of the 5 features whose individual nominal encoding gave a better test score than the original all-ordinal encoding.

Note: Bar and CarryAway each have only one feature that scores better, so they don't need a new subset.

Output file naming (each letter in a file suffix marks a feature that is encoded as nominal):
* A: age
* E: education
* I: income
* M: time
* P: temperature

### Expensive Restaurant

In [23]:
### Expensive Restaurant
## income stays ordinal; time, age, education and temperature are encoded as nominal
my_ordinal_colnames = ['income', 'minsToCouponDest',
                   'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']

## categories in the same order as my_ordinal_colnames, each listed from lowest to highest code
my_ordinal_categories_list = [['Less than $12500', '$12500 - $24999', '$25000 - $37499', '$37500 - $49999',
                            '$50000 - $62499', '$62500 - $74999', '$75000 - $87499', '$87500 - $99999',
                            '$100000 or More'],
                           ['5-14', '15-24', '25plus'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8']]

my_nominal_colnames = ['time', 'age', 'education', 'temperature',
                       'destination', 'passanger', 'weather', 'expiration', 'gender', 'maritalStatus', 'occupation']

my_ordinal_data = data[my_ordinal_colnames]
my_ordinal_encoder = OrdinalEncoder(categories=my_ordinal_categories_list)
## fit_transform returns a bare array; re-attach the row index of `data` so the
## pd.concat(axis=1) below lines rows up by label whatever that index looks like
my_encoded_ordinal_data = pd.DataFrame(my_ordinal_encoder.fit_transform(my_ordinal_data),
                                       columns=my_ordinal_data.columns,
                                       index=my_ordinal_data.index)

my_nominal_data = data[my_nominal_colnames]
my_encoded_nominal_data = pd.get_dummies(my_nominal_data, drop_first=True)

my_recombined_data = pd.concat([binary_numeric_data,
                                my_encoded_ordinal_data,
                                my_encoded_nominal_data, data.coupon], axis=1)

## only the Restaurant(20-50) rows are written for this combination
subset_data = my_recombined_data[my_recombined_data['coupon'] == 'Restaurant(20-50)']
subset_data = subset_data.drop('coupon', axis=1)
subset_data.to_csv('data/Restaurant20To50_data_encoded-AEPM_nominal.csv', index=False)

### Cheap Restaurant

In [24]:
### Cheap Restaurant
## age stays ordinal; time, education, income and temperature are encoded as nominal
my_ordinal_colnames = ['age', 'minsToCouponDest',
                   'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']

## categories in the same order as my_ordinal_colnames, each listed from lowest to highest code
my_ordinal_categories_list = [['below21', '21', '26', '31', '36', '41', '46', '50plus'],
                           ['5-14', '15-24', '25plus'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8']]

my_nominal_colnames = ['time', 'education', 'income', 'temperature',
                       'destination', 'passanger', 'weather', 'expiration', 'gender', 'maritalStatus', 'occupation']

my_ordinal_data = data[my_ordinal_colnames]
my_ordinal_encoder = OrdinalEncoder(categories=my_ordinal_categories_list)
## fit_transform returns a bare array; re-attach the row index of `data` so the
## pd.concat(axis=1) below lines rows up by label whatever that index looks like
my_encoded_ordinal_data = pd.DataFrame(my_ordinal_encoder.fit_transform(my_ordinal_data),
                                       columns=my_ordinal_data.columns,
                                       index=my_ordinal_data.index)

my_nominal_data = data[my_nominal_colnames]
my_encoded_nominal_data = pd.get_dummies(my_nominal_data, drop_first=True)

my_recombined_data = pd.concat([binary_numeric_data,
                                my_encoded_ordinal_data,
                                my_encoded_nominal_data, data.coupon], axis=1)

## only the Restaurant(<20) rows are written for this combination
subset_data = my_recombined_data[my_recombined_data['coupon'] == 'Restaurant(<20)']
subset_data = subset_data.drop('coupon', axis=1)
subset_data.to_csv('data/RestaurantLessThan20_data_encoded-EIPM_nominal.csv', index=False)

### Coffee House

In [25]:
### Coffee House
## time, education and temperature stay ordinal; age and income are encoded as nominal
my_ordinal_colnames = ['time', 'education', 'temperature', 'minsToCouponDest',
                   'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']

## categories in the same order as my_ordinal_colnames, each listed from lowest to highest code
my_ordinal_categories_list = [['7AM', '10AM', '2PM', '6PM', '10PM'],
                           ['Some High School', 'High School Graduate', 'Some college - no degree',
                            'Associates degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)'],
                           ['30', '55', '80'],
                           ['5-14', '15-24', '25plus'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8'],
                           ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8']]

my_nominal_colnames = ['age', 'income',
                       'destination', 'passanger', 'weather', 'expiration', 'gender', 'maritalStatus', 'occupation']

my_ordinal_data = data[my_ordinal_colnames]
my_ordinal_encoder = OrdinalEncoder(categories=my_ordinal_categories_list)
## fit_transform returns a bare array; re-attach the row index of `data` so the
## pd.concat(axis=1) below lines rows up by label whatever that index looks like
my_encoded_ordinal_data = pd.DataFrame(my_ordinal_encoder.fit_transform(my_ordinal_data),
                                       columns=my_ordinal_data.columns,
                                       index=my_ordinal_data.index)

my_nominal_data = data[my_nominal_colnames]
my_encoded_nominal_data = pd.get_dummies(my_nominal_data, drop_first=True)

my_recombined_data = pd.concat([binary_numeric_data,
                                my_encoded_ordinal_data,
                                my_encoded_nominal_data, data.coupon], axis=1)

## only the Coffee House rows are written for this combination
subset_data = my_recombined_data[my_recombined_data['coupon'] == 'Coffee House']
subset_data = subset_data.drop('coupon', axis=1)
subset_data.to_csv('data/CoffeeHouse_data_encoded-AI_nominal.csv', index=False)