In [1]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import PowerTransformer

from imblearn.over_sampling import SMOTE

from sklearn.impute import KNNImputer

## models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.svm import LinearSVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate

import statsmodels.api as sm
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## notebook-wide pandas display settings: show at most 50 rows, floats with 8 decimals
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', '{:9.8f}'.format)

In [2]:
def run_logistic_regression_trials(X_train, y_train, cv_values=(5, 10), solvers=('liblinear', 'lbfgs')):
    """Cross-validate LogisticRegression for every (fold count, solver) pair.

    NOTE: only the solver is set on the estimator; every other
    LogisticRegression setting (including the penalty) is left at its default.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, handed unchanged to cross_val_score.
    cv_values : iterable of int
        Fold counts to try. Defaults to (5, 10).
    solvers : iterable of str
        LogisticRegression solver names to try. Defaults to ('liblinear', 'lbfgs').

    Returns
    -------
    reg_mean_scores_df : pandas.DataFrame
        One row per run with columns cvk, solver, init_model (the estimator
        instance that was passed to cross_val_score) and mean_score.
    reg_results : pandas.DataFrame
        One row per fold score with columns score, cvk and solver.
    """
    # materialise once so a generator can be re-used for every fold count
    solvers = tuple(solvers)

    mean_score_records = []  # one dict per run -> reg_mean_scores_df
    fold_score_frames = []   # one small frame per run -> reg_results

    for cvk in cv_values:
        for solver_name in solvers:
            reg_model = LogisticRegression(solver=solver_name)
            scores_list = cross_val_score(reg_model, X_train, y_train, cv=cvk)

            # scalars broadcast against the score array, so no manual repetition is needed
            fold_score_frames.append(pd.DataFrame({'score': scores_list, 'cvk': cvk, 'solver': solver_name}))
            mean_score_records.append({'cvk': cvk, 'solver': solver_name, 'init_model': reg_model,
                                       'mean_score': np.mean(scores_list)})

    # build each result frame once instead of concatenating onto an empty frame inside the loop
    reg_mean_scores_df = pd.DataFrame(mean_score_records,
                                      columns=['cvk', 'solver', 'init_model', 'mean_score'])
    if fold_score_frames:
        reg_results = pd.concat(fold_score_frames)
    else:
        # nothing was run (empty cv_values or solvers): return an empty, correctly shaped frame
        reg_results = pd.DataFrame(columns=['score', 'cvk', 'solver'])

    return reg_mean_scores_df, reg_results

# Load Data

In [3]:
## read the raw coupon dataset; the relative path is presumably resolved
## against the notebook's working directory
file_name = "data/in-vehicle-coupon-recommendation.csv"
data = pd.read_csv(file_name)

In [4]:
## (number of rows, number of columns) of the loaded frame
data.shape

(12684, 26)

## Check for imbalance
Full dataset: 57% Yes  

### Coupon type subsets

Restaurant(<20) 		70.71% Yes
Coffee House 		    49.92% Yes
Carry out & Take away 	73.55% Yes
Bar  		            41.00% Yes
Restaurant(20-50)  		44.10% Yes

In [5]:
## how many records fall into each class of the target column Y
data.Y.value_counts()

1    7210
0    5474
Name: Y, dtype: int64

In [6]:
## fraction of records with Y == 1, computed from the data rather than from
## counts copied by hand out of the previous cell's output
data.Y.mean()

0.5684326710816777

In [30]:
## percentage of Y == 1 per coupon type, listed in order of first appearance.
## A single groupby replaces one boolean scan per type and cannot fail with a
## KeyError when a subset happens to contain only one class.
yes_rate_by_coupon = data.groupby('coupon', sort=False)['Y'].mean()
for coupon_type, yes_rate in yes_rate_by_coupon.items():
    print(f'{coupon_type} :\t\t{100*yes_rate:.2f}% Yes')


Restaurant(<20) :		70.71% Yes
Coffee House :		49.92% Yes
Carry out & Take away :		73.55% Yes
Bar :		41.00% Yes
Restaurant(20-50) :		44.10% Yes


## How many records do we have for each coupon type?
Note: The per-coupon sample sizes are fairly small, especially for the expensive restaurants (Restaurant(20-50)), which have only 1,492 records.

In [7]:
## number of records available for each coupon type
data.coupon.value_counts()

Coffee House             3996
Restaurant(<20)          2786
Carry out & Take away    2393
Bar                      2017
Restaurant(20-50)        1492
Name: coupon, dtype: int64

## Cleaning: Drop mostly null feature 'car'
Only 108/12684 records include this feature, and it's not mentioned in the feature descriptions.

In [31]:
## distribution of the values present in 'car'; the counts add up to far fewer
## rows than the dataset has, i.e. the column is mostly missing
data.car.value_counts()

Scooter and motorcycle                      22
Mazda5                                      22
do not drive                                22
crossover                                   21
Car that is too old to install Onstar :D    21
Name: car, dtype: int64

In [32]:
## drop the mostly-null 'car' column; errors='ignore' keeps this cell safe to
## re-run after 'data' has already been overwritten without the column
data = data.drop(columns='car', errors='ignore')

## Extract Numerical variables

In [33]:
## keep only the numeric-dtype columns, then list which ones those are
numeric_data = data.select_dtypes(include=['number'])
numeric_data.columns

Index(['temperature', 'has_children', 'toCoupon_GEQ5min', 'toCoupon_GEQ15min',
       'toCoupon_GEQ25min', 'direction_same', 'direction_opp', 'Y'],
      dtype='object')

### Check for Null values 
Result: no missing values

In [34]:
## names of the numeric columns that contain at least one missing value
nan_colnames = [colname for colname in numeric_data.columns
                if numeric_data[colname].isna().any()]
nan_colnames

[]

## Define X,y train/test

In [35]:
## target vector, and feature matrix made of every numeric column except Y
y = numeric_data['Y']
X = numeric_data.drop(columns='Y').reset_index(drop=True)

## hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Evaluate using only numeric features

## Logistic regression trials

In [36]:
## run the cross-validation trials, then show the per-run means from best to worst
reg_mean_scores_df, reg_results = run_logistic_regression_trials(X_train=X_train, y_train=y_train)
display(reg_mean_scores_df.sort_values(by='mean_score', ascending=False))

Unnamed: 0,cvk,solver,init_model,mean_score
0,5,liblinear,LogisticRegression(solver='liblinear'),0.5854935
1,5,lbfgs,LogisticRegression(),0.5854935
2,10,liblinear,LogisticRegression(solver='liblinear'),0.58460868
3,10,lbfgs,LogisticRegression(),0.58460868


## Single RandomForest

In [37]:
## 5-fold cross-validation of one depth-limited random forest;
## prints the mean and the standard deviation of the fold scores
clf = RandomForestClassifier(max_depth=5, random_state=42)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5)
print("{} {}".format(np.mean(cross_val_scores), np.std(cross_val_scores)))

0.5910123893203718 0.0063071195794977206


# Add Categoricals

In [38]:
## keep only the object-dtype columns, then list which ones those are
categorical_data = data.select_dtypes(include=['object'])
categorical_data.columns

Index(['destination', 'passanger', 'weather', 'time', 'coupon', 'expiration',
       'gender', 'age', 'maritalStatus', 'education', 'occupation', 'income',
       'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20',
       'Restaurant20To50'],
      dtype='object')

## Replace missing values with 'unknown'

In [39]:
## names of the categorical columns that contain at least one missing value
nan_colnames = [colname for colname in categorical_data.columns
                if categorical_data[colname].isna().any()]
nan_colnames

['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']

In [40]:
## replace missing categorical answers with the explicit label 'unknown';
## reassigning (instead of inplace=True) avoids mutating a frame derived from
## 'data' and keeps the cell a plain, repeatable transformation
categorical_data = categorical_data.fillna('unknown')

## Ordinal features

In [41]:
## visit-frequency scale shared by the five "how often do you go to ..." columns;
## 'unknown' (the label used for missing answers) is ranked lowest
visit_frequency_levels = ['unknown', 'never', 'less1', '1~3', '4~8', 'gt8']

## ordered categories per ordinal column (lowest first). Keeping names and
## levels in one mapping guarantees the two lists below cannot drift apart.
ordinal_levels = {
    'income': ['Less than $12500', '$12500 - $24999', '$25000 - $37499', '$37500 - $49999',
               '$50000 - $62499', '$62500 - $74999', '$75000 - $87499', '$87500 - $99999',
               '$100000 or More'],
    'time': ['7AM', '10AM', '2PM', '6PM', '10PM'],
    'education': ['Some High School', 'High School Graduate', 'Some college - no degree',
                  'Associates degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)'],
}
for frequency_colname in ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']:
    ordinal_levels[frequency_colname] = list(visit_frequency_levels)

ordinal_colnames = list(ordinal_levels)
ordinal_data = categorical_data[ordinal_colnames]

## one category list per entry of ordinal_colnames, in the same order
ordinal_categories_list = [ordinal_levels[colname] for colname in ordinal_colnames]

ordinal_encoder = OrdinalEncoder(categories=ordinal_categories_list)

## run the encoding; carrying over the original index keeps the rows aligned
## when this frame is later concatenated column-wise with the other features
ordinal_encoded_data = pd.DataFrame(ordinal_encoder.fit_transform(ordinal_data),
                                    columns=ordinal_data.columns,
                                    index=ordinal_data.index)

## Nominal features

In [42]:
## whatever is categorical but not in the ordinal list is treated as nominal
nominal_data = categorical_data.drop(columns=ordinal_colnames)

In [43]:
## names of the nominal columns, followed by how many there are
## (both values are displayed because ast_node_interactivity is set to "all")
nominal_data.columns
nominal_data.shape[1]

Index(['destination', 'passanger', 'weather', 'coupon', 'expiration', 'gender',
       'age', 'maritalStatus', 'occupation'],
      dtype='object')

9

## Encode nominals

In [44]:
## one-hot encode the nominal columns, dropping the first level of each,
## then report how many indicator columns that produced
encoded_nominal_data = pd.get_dummies(data=nominal_data, drop_first=True)
len(encoded_nominal_data.columns)

48

## All features: define X/y train/test

In [45]:
## glue the numeric, ordinal-encoded and one-hot-encoded blocks together column-wise
feature_blocks = [numeric_data, ordinal_encoded_data, encoded_nominal_data]
recombined_data = pd.concat(feature_blocks, axis='columns')

## target vector, and feature matrix made of every column except Y
y = recombined_data['Y']
X = recombined_data.drop(columns='Y').reset_index(drop=True)

## hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

## Single RandomForest

In [46]:
## same depth-limited random forest as in the numeric-only section, now
## cross-validated (5 folds) on the full feature set; prints mean and std
clf = RandomForestClassifier(random_state=42, max_depth=5)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5)
print("{} {}".format(np.mean(cross_val_scores), np.std(cross_val_scores)))

0.6780327614127175 0.0075288597072811665


# Split and analyse by coupon type

In [50]:
## one-hot encode every nominal column except 'coupon', which is kept as a raw
## label column so the rows can be partitioned by coupon type
encoded_nominal_data_no_coupon = pd.get_dummies(nominal_data.drop(columns='coupon'), drop_first=True)
data_to_split = pd.concat(
    [numeric_data, ordinal_encoded_data, encoded_nominal_data_no_coupon, nominal_data['coupon']],
    axis='columns',
)

In [56]:
## every column except the raw 'coupon' label, which is only used to partition the rows
cols_to_keep = [colname for colname in data_to_split.columns if colname != 'coupon']

In [58]:
def _rows_for_coupon(coupon_label):
    """Return the rows of data_to_split whose coupon equals coupon_label, limited to cols_to_keep."""
    is_this_coupon = data_to_split['coupon'] == coupon_label
    return data_to_split.loc[is_this_coupon, cols_to_keep]


## one modelling frame per coupon type
Bar_data = _rows_for_coupon('Bar')
CoffeeHouse_data = _rows_for_coupon('Coffee House')
CarryAway_data = _rows_for_coupon('Carry out & Take away')
RestaurantLessThan20_data = _rows_for_coupon('Restaurant(<20)')
Restaurant20To50_data = _rows_for_coupon('Restaurant(20-50)')