In [33]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import pickle

from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

## show the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## pandas display settings: max_rows of 50, floats printed with 8 decimal places (minimum width 9)
pd.options.display.max_rows = 50
pd.set_option('display.float_format', lambda x: '%9.8f' % x)

In [2]:
## read the raw coupon dataset and discard the 'car' and 'direction_opp' columns
file_name = "data/in-vehicle-coupon-recommendation.csv"
data = pd.read_csv(file_name).drop(columns=['car', 'direction_opp'])

In [3]:
## collapse the toCoupon_GEQ*min flags into one column, minsToCouponDest,
## holding one of ['5-14', '15-24', '25plus']
at_least_25 = data.toCoupon_GEQ25min == 1
under_25 = data.toCoupon_GEQ25min == 0
at_least_15 = data.toCoupon_GEQ15min == 1
under_15 = data.toCoupon_GEQ15min == 0

data.loc[at_least_25, 'minsToCouponDest'] = '25plus'
data.loc[under_25 & at_least_15, 'minsToCouponDest'] = '15-24'
data.loc[under_25 & under_15, 'minsToCouponDest'] = '5-14'

## the individual flag columns are no longer needed
data = data.drop(columns=['toCoupon_GEQ5min', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min'])

In [8]:
## store temperature as text; the ordinal encoding further down lists its
## categories as the strings '30', '55' and '80'
data['temperature'] = data['temperature'].astype(str)

# Split by coupon type

In [9]:
## For each coupon type: keep only that coupon's rows, drop the (now constant)
## 'coupon' column and write the subset to data/<name>_data.csv.
coupon_label_to_name = {
    'Bar': 'Bar',
    'Coffee House': 'CoffeeHouse',
    'Carry out & Take away': 'CarryAway',
    'Restaurant(<20)': 'RestaurantLessThan20',
    'Restaurant(20-50)': 'Restaurant20To50',
}

subsets_by_name = {}
for coupon_label, subset_name in coupon_label_to_name.items():
    coupon_subset = data[data['coupon'] == coupon_label].drop('coupon', axis=1)
    coupon_subset.to_csv(f'data/{subset_name}_data.csv', index=False)
    subsets_by_name[subset_name] = coupon_subset

## expose each subset under its usual variable name
Bar_data = subsets_by_name['Bar']
CoffeeHouse_data = subsets_by_name['CoffeeHouse']
CarryAway_data = subsets_by_name['CarryAway']
RestaurantLessThan20_data = subsets_by_name['RestaurantLessThan20']
Restaurant20To50_data = subsets_by_name['Restaurant20To50']

## Handle missing values

In [12]:
## names of the columns that contain at least one missing value
[column for column in data.columns if data[column].isna().any()]

['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']

In [29]:
## Add a 0/1 "<column>_freq_unknown" indicator for each visit-frequency column,
## set to 1 where the value is missing. This has to run before the imputation
## below, because no missing values remain to flag once they have been filled in.
visit_frequency_columns = ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']
for frequency_column in visit_frequency_columns:
    is_unknown = data[frequency_column].isna()
    data[f'{frequency_column}_freq_unknown'] = np.where(is_unknown, 1, 0)

In [30]:
## replace NaN entries with the most frequent value of their column
imputer = SimpleImputer(
    strategy='most_frequent',
    missing_values=np.nan,
)

In [31]:
## Fit the imputer on the entire dataset (every column currently in `data`)
## and fill in the missing values.
## transform() hands back a plain array, so the column dtypes and the row index
## are remembered here and re-applied when the DataFrame is rebuilt.
column_dtypes = data.dtypes.to_dict()

imputer.fit(data)
imputed_values = imputer.transform(data)

## Without index= and astype() the rebuilt frame would get a fresh 0..n-1 index
## and every column (including the 0/1 columns) would end up with dtype object.
data = pd.DataFrame(imputed_values, columns=data.columns, index=data.index).astype(column_dtypes)

SimpleImputer(strategy='most_frequent')

In [34]:
## save the fitted imputer to disk
filename = "model/imputer.pickle"

## open() does not create missing folders, so make sure model/ exists first
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "wb") as f:
    pickle.dump(imputer, f)

In [35]:
## display the column names currently in data
data.columns

Index(['destination', 'passanger', 'weather', 'temperature', 'time', 'coupon',
       'expiration', 'gender', 'age', 'maritalStatus', 'has_children',
       'education', 'occupation', 'income', 'Bar', 'CoffeeHouse', 'CarryAway',
       'RestaurantLessThan20', 'Restaurant20To50', 'direction_same', 'Y',
       'minsToCouponDest', 'Bar_freq_unknown', 'CoffeeHouse_freq_unknown',
       'CarryAway_freq_unknown', 'RestaurantLessThan20_freq_unknown',
       'Restaurant20To50_freq_unknown'],
      dtype='object')

# Encode features

### Binary features already encoded as numeric

`['has_children', 'direction_same', 'Bar_freq_unknown', 'CoffeeHouse_freq_unknown',
  'CarryAway_freq_unknown', 'RestaurantLessThan20_freq_unknown', 'Restaurant20To50_freq_unknown', 'Y']`

### Ordinal

`['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50', 'minsToCouponDest']`


### Ordinal features where the order may or may not matter. 
(Encode as Ordinal for now)

`['time', 'age', 'education', 'income', 'temperature']`


### Nominal (unordered)

`['destination', 'passanger', 'weather', 'expiration', 'gender', 'maritalStatus', 'occupation']`


In [36]:
## columns that already hold 0/1 values and need no further encoding
binary_numeric_colnames = [
    'has_children',
    'direction_same',
    'Bar_freq_unknown',
    'CoffeeHouse_freq_unknown',
    'CarryAway_freq_unknown',
    'RestaurantLessThan20_freq_unknown',
    'Restaurant20To50_freq_unknown',
    'Y',
]
binary_numeric_data = data[binary_numeric_colnames]

In [39]:
## Note: encoding cannot handle missing values
## The order of these names has to stay in step with ordinal_categories_list.
ordinal_colnames = [
    'time',
    'age',
    'education',
    'income',
    'temperature',
    'minsToCouponDest',
    'Bar',
    'CoffeeHouse',
    'CarryAway',
    'RestaurantLessThan20',
    'Restaurant20To50',
]
ordinal_data = data[ordinal_colnames]

## Category orderings for the ordinal encoder, lowest level first.
## Entries are listed in the same order as the columns in ordinal_colnames.
time_levels = ['7AM', '10AM', '2PM', '6PM', '10PM']
age_levels = ['below21', '21', '26', '31', '36', '41', '46', '50plus']
education_levels = [
    'Some High School',
    'High School Graduate',
    'Some college - no degree',
    'Associates degree',
    'Bachelors degree',
    'Graduate degree (Masters or Doctorate)',
]
income_levels = [
    'Less than $12500',
    '$12500 - $24999',
    '$25000 - $37499',
    '$37500 - $49999',
    '$50000 - $62499',
    '$62500 - $74999',
    '$75000 - $87499',
    '$87500 - $99999',
    '$100000 or More',
]
temperature_levels = ['30', '55', '80']
mins_to_coupon_levels = ['5-14', '15-24', '25plus']

## the five visit-frequency columns all share one scale; each gets its own copy
visit_frequency_levels = ['never', 'less1', '1~3', '4~8', 'gt8']

ordinal_categories_list = [
    time_levels,
    age_levels,
    education_levels,
    income_levels,
    temperature_levels,
    mins_to_coupon_levels,
] + [list(visit_frequency_levels) for _ in range(5)]

## encoder that maps each ordinal column onto the positions given in ordinal_categories_list
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories_list)

## run the encoding
encoded_ordinal_values = ordinal_encoder.fit_transform(ordinal_data)

## Keep the row index of the input: the encoded frame is later joined column-wise
## with other frames via pd.concat(axis=1), which aligns rows on the index.
encoded_ordinal_data = pd.DataFrame(
    encoded_ordinal_values,
    columns=ordinal_data.columns,
    index=ordinal_data.index,
)

In [38]:
## unordered categorical columns, turned into dummy columns
## (drop_first=True leaves out the first level of each column)
nominal_colnames = [
    'destination',
    'passanger',
    'weather',
    'expiration',
    'gender',
    'maritalStatus',
    'occupation',
]
nominal_data = data[nominal_colnames]

encoded_nominal_data = pd.get_dummies(nominal_data, drop_first=True)

In [44]:
## put the encoded pieces side by side and keep 'coupon' for the per-coupon split
pieces_to_join = [binary_numeric_data, encoded_ordinal_data, encoded_nominal_data, data.coupon]
recombined_data = pd.concat(pieces_to_join, axis='columns')

In [47]:
## write the full encoded dataset (coupon column included), without the row index
encoded_dataset_path = 'data/full_dataset_encoded_plus_coupon.csv'
recombined_data.to_csv(encoded_dataset_path, index=False)

## Split encoded data by coupon type

In [46]:
## For each coupon type: keep only that coupon's rows of the encoded data, drop the
## 'coupon' column and write the subset to data/<name>_data_encoded.csv.
encoded_label_to_name = {
    'Bar': 'Bar',
    'Coffee House': 'CoffeeHouse',
    'Carry out & Take away': 'CarryAway',
    'Restaurant(<20)': 'RestaurantLessThan20',
    'Restaurant(20-50)': 'Restaurant20To50',
}

encoded_subsets = {}
for coupon_label, subset_name in encoded_label_to_name.items():
    encoded_subset = recombined_data[recombined_data['coupon'] == coupon_label].drop('coupon', axis=1)
    encoded_subset.to_csv(f'data/{subset_name}_data_encoded.csv', index=False)
    encoded_subsets[subset_name] = encoded_subset

## expose each encoded subset under its usual variable name
Bar_data = encoded_subsets['Bar']
CoffeeHouse_data = encoded_subsets['CoffeeHouse']
CarryAway_data = encoded_subsets['CarryAway']
RestaurantLessThan20_data = encoded_subsets['RestaurantLessThan20']
Restaurant20To50_data = encoded_subsets['Restaurant20To50']