# In Vehicle Coupon Recommendation

## Reading the dataset 

In [277]:
import warnings
import pandas as pd

warnings.filterwarnings("ignore")

# Reading data from the source file
vehicle_coupon_df = pd.read_csv('in-vehicle-coupon-recommendation.csv')
print(vehicle_coupon_df.shape)
vehicle_coupon_df

(12684, 26)


Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,Home,Partner,Rainy,55,6PM,Carry out & Take away,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,1,0,1
12680,Work,Alone,Rainy,55,7AM,Carry out & Take away,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,0,1,1
12681,Work,Alone,Snowy,30,7AM,Coffee House,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,1,0,0
12682,Work,Alone,Snowy,30,7AM,Bar,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,1,1,0,1,0


## Checking the dataset for any null values and cleaning the dataset 

In [278]:
# Checking if null values exist in the dataframe
vehicle_coupon_df.isnull().any()

destination             False
passanger               False
weather                 False
temperature             False
time                    False
coupon                  False
expiration              False
gender                  False
age                     False
maritalStatus           False
has_children            False
education               False
occupation              False
income                  False
car                      True
Bar                      True
CoffeeHouse              True
CarryAway                True
RestaurantLessThan20     True
Restaurant20To50         True
toCoupon_GEQ5min        False
toCoupon_GEQ15min       False
toCoupon_GEQ25min       False
direction_same          False
direction_opp           False
Y                       False
dtype: bool

In [279]:
# Count of rows with missing values
vehicle_coupon_df.isnull().sum()

destination                 0
passanger                   0
weather                     0
temperature                 0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
has_children                0
education                   0
occupation                  0
income                      0
car                     12576
Bar                       107
CoffeeHouse               217
CarryAway                 151
RestaurantLessThan20      130
Restaurant20To50          189
toCoupon_GEQ5min            0
toCoupon_GEQ15min           0
toCoupon_GEQ25min           0
direction_same              0
direction_opp               0
Y                           0
dtype: int64

### Dropping the car column as it has a lot of null values and cannot be used and, removing other null values.

In [280]:
vehicle_coupon_df = vehicle_coupon_df.drop(['car'], axis = 1)
vehicle_coupon_df = vehicle_coupon_df.dropna()
vehicle_coupon_df.isnull().any()

destination             False
passanger               False
weather                 False
temperature             False
time                    False
coupon                  False
expiration              False
gender                  False
age                     False
maritalStatus           False
has_children            False
education               False
occupation              False
income                  False
Bar                     False
CoffeeHouse             False
CarryAway               False
RestaurantLessThan20    False
Restaurant20To50        False
toCoupon_GEQ5min        False
toCoupon_GEQ15min       False
toCoupon_GEQ25min       False
direction_same          False
direction_opp           False
Y                       False
dtype: bool

## Transforming the dataset 

In [281]:
# Categorizing the attributes of the dataset into respective types 

categorical_attributes = ['destination', 'passanger', 'weather', 'coupon', 'gender', 'maritalStatus', 'occupation']
ratio_attributes = ['temperature', 'age', 'expiration', 'has_children', 'toCoupon_GEQ5min', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min', 'direction_same', 'direction_opp']
ordinal_attributes = ['time', 'income', 'education', 'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']

### Modifying the categorical attributes 

In [282]:
vehicle_coupon_df[categorical_attributes]

Unnamed: 0,destination,passanger,weather,coupon,gender,maritalStatus,occupation
22,No Urgent Place,Alone,Sunny,Restaurant(<20),Male,Single,Architecture & Engineering
23,No Urgent Place,Friend(s),Sunny,Coffee House,Male,Single,Architecture & Engineering
24,No Urgent Place,Friend(s),Sunny,Bar,Male,Single,Architecture & Engineering
25,No Urgent Place,Friend(s),Sunny,Carry out & Take away,Male,Single,Architecture & Engineering
26,No Urgent Place,Friend(s),Sunny,Coffee House,Male,Single,Architecture & Engineering
...,...,...,...,...,...,...,...
12679,Home,Partner,Rainy,Carry out & Take away,Male,Single,Sales & Related
12680,Work,Alone,Rainy,Carry out & Take away,Male,Single,Sales & Related
12681,Work,Alone,Snowy,Coffee House,Male,Single,Sales & Related
12682,Work,Alone,Snowy,Bar,Male,Single,Sales & Related


In [283]:
for category in categorical_attributes:
    print(vehicle_coupon_df[category].unique().tolist())

['No Urgent Place', 'Home', 'Work']
['Alone', 'Friend(s)', 'Kid(s)', 'Partner']
['Sunny', 'Rainy', 'Snowy']
['Restaurant(<20)', 'Coffee House', 'Bar', 'Carry out & Take away', 'Restaurant(20-50)']
['Male', 'Female']
['Single', 'Married partner', 'Unmarried partner', 'Divorced', 'Widowed']
['Architecture & Engineering', 'Student', 'Education&Training&Library', 'Unemployed', 'Healthcare Support', 'Healthcare Practitioners & Technical', 'Sales & Related', 'Management', 'Arts Design Entertainment Sports & Media', 'Computer & Mathematical', 'Life Physical Social Science', 'Personal Care & Service', 'Office & Administrative Support', 'Construction & Extraction', 'Legal', 'Retired', 'Community & Social Services', 'Installation Maintenance & Repair', 'Transportation & Material Moving', 'Business & Financial', 'Protective Service', 'Food Preparation & Serving Related', 'Production Occupations', 'Building & Grounds Cleaning & Maintenance', 'Farming Fishing & Forestry']


In [284]:
# Defining a new empty data frame and converting categorical attributes using one-hot encoding method

mod_vehicle_coupon_df = None
for category in categorical_attributes:
    column = pd.get_dummies(vehicle_coupon_df[category])
    mod_vehicle_coupon_df = pd.concat([mod_vehicle_coupon_df, column], axis = 1)
mod_vehicle_coupon_df

Unnamed: 0,Home,No Urgent Place,Work,Alone,Friend(s),Kid(s),Partner,Rainy,Snowy,Sunny,...,Management,Office & Administrative Support,Personal Care & Service,Production Occupations,Protective Service,Retired,Sales & Related,Student,Transportation & Material Moving,Unemployed
22,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
23,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
24,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
25,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
26,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
12680,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
12681,0,0,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
12682,0,0,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


### Modifying the ratio attributes 

In [285]:
vehicle_coupon_df[ratio_attributes]

Unnamed: 0,temperature,age,expiration,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp
22,55,21,1d,0,1,0,0,0,1
23,80,21,2h,0,1,0,0,0,1
24,80,21,1d,0,1,0,0,0,1
25,80,21,2h,0,1,1,0,0,1
26,80,21,1d,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...
12679,55,26,1d,0,1,0,0,1,0
12680,55,26,1d,0,1,0,0,0,1
12681,30,26,1d,0,1,0,0,1,0
12682,30,26,1d,0,1,1,1,0,1


In [286]:
# Mapping the ratio attributes to the modified data frame and changing any data if required.

mod_vehicle_coupon_df['temperature'] = vehicle_coupon_df['temperature']
vehicle_coupon_df['expiration'] = vehicle_coupon_df['expiration'].replace('1d', 24)
vehicle_coupon_df['expiration'] = vehicle_coupon_df['expiration'].replace('2h', 2)
mod_vehicle_coupon_df['expiration'] = vehicle_coupon_df['expiration']
vehicle_coupon_df['age'] = vehicle_coupon_df['age'].replace('50plus', '51')
vehicle_coupon_df['age'] = vehicle_coupon_df['age'].replace('below21', '20')
mod_vehicle_coupon_df['age'] = vehicle_coupon_df['age']
mod_vehicle_coupon_df['has_children'] = vehicle_coupon_df['has_children']
mod_vehicle_coupon_df['toCoupon_GEQ5min'] = vehicle_coupon_df['toCoupon_GEQ5min']
mod_vehicle_coupon_df['toCoupon_GEQ15min'] = vehicle_coupon_df['toCoupon_GEQ15min']
mod_vehicle_coupon_df['toCoupon_GEQ25min'] = vehicle_coupon_df['toCoupon_GEQ25min']
mod_vehicle_coupon_df['direction_same'] = vehicle_coupon_df['direction_same']
mod_vehicle_coupon_df['direction_opp'] = vehicle_coupon_df['direction_opp']
mod_vehicle_coupon_df

Unnamed: 0,Home,No Urgent Place,Work,Alone,Friend(s),Kid(s),Partner,Rainy,Snowy,Sunny,...,Unemployed,temperature,expiration,age,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp
22,0,1,0,1,0,0,0,0,0,1,...,0,55,24,21,0,1,0,0,0,1
23,0,1,0,0,1,0,0,0,0,1,...,0,80,2,21,0,1,0,0,0,1
24,0,1,0,0,1,0,0,0,0,1,...,0,80,24,21,0,1,0,0,0,1
25,0,1,0,0,1,0,0,0,0,1,...,0,80,2,21,0,1,1,0,0,1
26,0,1,0,0,1,0,0,0,0,1,...,0,80,24,21,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,1,0,0,0,0,0,1,1,0,0,...,0,55,24,26,0,1,0,0,1,0
12680,0,0,1,1,0,0,0,1,0,0,...,0,55,24,26,0,1,0,0,0,1
12681,0,0,1,1,0,0,0,0,1,0,...,0,30,24,26,0,1,0,0,1,0
12682,0,0,1,1,0,0,0,0,1,0,...,0,30,24,26,0,1,1,1,0,1


### Modifying ordinal attributes 

In [287]:
vehicle_coupon_df[ordinal_attributes]

Unnamed: 0,time,income,education,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50
22,2PM,$62500 - $74999,Bachelors degree,never,less1,4~8,4~8,less1
23,10AM,$62500 - $74999,Bachelors degree,never,less1,4~8,4~8,less1
24,10AM,$62500 - $74999,Bachelors degree,never,less1,4~8,4~8,less1
25,10AM,$62500 - $74999,Bachelors degree,never,less1,4~8,4~8,less1
26,2PM,$62500 - $74999,Bachelors degree,never,less1,4~8,4~8,less1
...,...,...,...,...,...,...,...,...
12679,6PM,$75000 - $87499,Bachelors degree,never,never,1~3,4~8,1~3
12680,7AM,$75000 - $87499,Bachelors degree,never,never,1~3,4~8,1~3
12681,7AM,$75000 - $87499,Bachelors degree,never,never,1~3,4~8,1~3
12682,7AM,$75000 - $87499,Bachelors degree,never,never,1~3,4~8,1~3


In [288]:
for value in ordinal_attributes:
    print(vehicle_coupon_df[value].unique().tolist())

['2PM', '10AM', '6PM', '7AM', '10PM']
['$62500 - $74999', '$12500 - $24999', '$75000 - $87499', '$50000 - $62499', '$37500 - $49999', '$25000 - $37499', '$100000 or More', '$87500 - $99999', 'Less than $12500']
['Bachelors degree', 'Some college - no degree', 'Associates degree', 'High School Graduate', 'Graduate degree (Masters or Doctorate)', 'Some High School']
['never', 'less1', '1~3', 'gt8', '4~8']
['less1', '4~8', '1~3', 'gt8', 'never']
['4~8', '1~3', 'gt8', 'less1', 'never']
['4~8', '1~3', 'less1', 'gt8', 'never']
['less1', 'never', '1~3', 'gt8', '4~8']


In [289]:
# Mapping the ordinal attributes with maps defined in respective order of heirarchy

time_map = {
'7AM' : 1,
'10AM' : 2,
'2PM' : 3,
'6PM' : 4,
'10PM' : 5
}
income_map = {
'Less than $12500' : 1,
'$12500 - $24999' : 2,
'$25000 - $37499' : 3,
'$37500 - $49999' : 4,
'$50000 - $62499' : 5,
'$62500 - $74999' : 6,
'$75000 - $87499' : 7,
'$87500 - $99999' : 8,
'$100000 or More' : 9
}
bar_map = {
'never' : 1,
'less1' : 2,
'1~3' : 3,
'4~8' : 4,
'gt8' : 5
}
coffeehouse_map = {
'never' : 1,
'less1' : 2,
'1~3' : 3,
'4~8' : 4,
'gt8' : 5
}
carryaway_map = {
'never' : 1,
'less1' : 2,
'1~3' : 3,
'4~8' : 4,
'gt8' : 5
}
edu = {
'Some High School' : 1,
'High School Graduate' : 2,
'Some college - no degree' : 3,
'Associates degree' : 4,
'Bachelors degree' : 5,
'Graduate degree (Masters or Doctorate)' : 6
}
vehicle_coupon_df['time'] = vehicle_coupon_df['time'].map(time_map)
vehicle_coupon_df['income'] = vehicle_coupon_df['income'].map(income_map)
vehicle_coupon_df['CoffeeHouse'] = vehicle_coupon_df['CoffeeHouse'].map(coffeehouse_map)
vehicle_coupon_df['Bar'] = vehicle_coupon_df['Bar'].map(bar_map)
vehicle_coupon_df['CarryAway'] = vehicle_coupon_df['CarryAway'].map(carryaway_map)
vehicle_coupon_df['RestaurantLessThan20'] = vehicle_coupon_df['RestaurantLessThan20'].map(carryaway_map)
vehicle_coupon_df['Restaurant20To50'] = vehicle_coupon_df['Restaurant20To50'].map(carryaway_map)
vehicle_coupon_df['education'] = vehicle_coupon_df['education'].map(edu)
vehicle_coupon_df[ordinal_attributes]

Unnamed: 0,time,income,education,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50
22,3,6,5,1,2,4,4,2
23,2,6,5,1,2,4,4,2
24,2,6,5,1,2,4,4,2
25,2,6,5,1,2,4,4,2
26,3,6,5,1,2,4,4,2
...,...,...,...,...,...,...,...,...
12679,4,7,5,1,1,3,4,3
12680,1,7,5,1,1,3,4,3
12681,1,7,5,1,1,3,4,3
12682,1,7,5,1,1,3,4,3


In [290]:
mod_vehicle_coupon_df['time'] = vehicle_coupon_df['time']
mod_vehicle_coupon_df['income'] = vehicle_coupon_df['income']
mod_vehicle_coupon_df['CoffeeHouse'] = vehicle_coupon_df['CoffeeHouse']
mod_vehicle_coupon_df['Bar'] = vehicle_coupon_df['Bar']
mod_vehicle_coupon_df['CarryAway'] = vehicle_coupon_df['CarryAway']
mod_vehicle_coupon_df['RestaurantLessThan20'] = vehicle_coupon_df['RestaurantLessThan20']
mod_vehicle_coupon_df['Restaurant20To50'] = vehicle_coupon_df['Restaurant20To50']
mod_vehicle_coupon_df['education'] = vehicle_coupon_df['education']
mod_vehicle_coupon_df

Unnamed: 0,Home,No Urgent Place,Work,Alone,Friend(s),Kid(s),Partner,Rainy,Snowy,Sunny,...,toCoupon_GEQ25min,direction_same,direction_opp,time,income,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,education
22,0,1,0,1,0,0,0,0,0,1,...,0,0,1,3,6,2,4,4,2,5
23,0,1,0,0,1,0,0,0,0,1,...,0,0,1,2,6,2,4,4,2,5
24,0,1,0,0,1,0,0,0,0,1,...,0,0,1,2,6,2,4,4,2,5
25,0,1,0,0,1,0,0,0,0,1,...,0,0,1,2,6,2,4,4,2,5
26,0,1,0,0,1,0,0,0,0,1,...,0,0,1,3,6,2,4,4,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,1,0,0,0,0,0,1,1,0,0,...,0,1,0,4,7,1,3,4,3,5
12680,0,0,1,1,0,0,0,1,0,0,...,0,0,1,1,7,1,3,4,3,5
12681,0,0,1,1,0,0,0,0,1,0,...,0,1,0,1,7,1,3,4,3,5
12682,0,0,1,1,0,0,0,0,1,0,...,1,0,1,1,7,1,3,4,3,5


## Mapping the class label and modified data set

In [295]:
class_label_df = vehicle_coupon_df.Y
class_label_df

22       1
23       0
24       1
25       0
26       0
        ..
12679    1
12680    1
12681    0
12682    0
12683    0
Name: Y, Length: 12079, dtype: int64

## Splitting data into k folds and applying classification model 

In [297]:
from sklearn.model_selection import StratifiedKFold

# initializing cross-validation object

no_of_splits = 5
stratified_k_fold = StratifiedKFold(n_splits=no_of_splits, shuffle = True, random_state = 42)
stratified_k_fold.get_n_splits(mod_vehicle_coupon_df, class_label_df)
stratified_k_fold

StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [300]:
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Initializing decision tree classifier and calculating accuracies for gini and entorpy parameters

gini_acc_vals = []
entropy_acc_vals = []
for train_set, test_set in stratified_k_fold.split(mod_vehicle_coupon_df, class_label_df):
    print(f"Training batch = {len(train_set)}, Testing set = {len(test_set)}")
    print(f"Training set = {train_set}, Testing set = {test_set}")
    mod_vehicle_coupon_df_train, mod_vehicle_coupon_df_test = mod_vehicle_coupon_df.iloc[train_set], mod_vehicle_coupon_df.iloc[test_set]
    class_label_df_train, class_label_df_test = class_label_df.iloc[train_set], class_label_df.iloc[test_set]
    
    # Calculating gini index
    decision_tree_gini = DecisionTreeClassifier(criterion = 'gini', max_depth = len(mod_vehicle_coupon_df.columns)*10, random_state = 42)
    decision_tree_gini = decision_tree_gini.fit(mod_vehicle_coupon_df_train, class_label_df_train)
    class_label_df_pred = decision_tree_gini.predict(mod_vehicle_coupon_df_test)
    gini_acc_vals.append(accuracy_score(class_label_df_test, class_label_df_pred))
    
    # Calculating entropy index
    decision_tree_entropy = DecisionTreeClassifier(criterion = 'entropy', max_depth = len(mod_vehicle_coupon_df.columns)*10, random_state = 42)
    decision_tree_entropy = decision_tree_entropy.fit(mod_vehicle_coupon_df_train, class_label_df_train)
    class_label_df_pred = decision_tree_entropy.predict(mod_vehicle_coupon_df_test)
    entropy_acc_vals.append(accuracy_score(class_label_df_test, class_label_df_pred))

print(f"\nGini accuracy values : {gini_acc_vals}")
print(f"Entropy accuracy values : {entropy_acc_vals}")

Training batch = 9663, Testing set = 2416
Training set = [    1     2     3 ... 12074 12077 12078], Testing set = [    0    12    19 ... 12070 12075 12076]
Training batch = 9663, Testing set = 2416
Training set = [    0     2     3 ... 12075 12076 12077], Testing set = [    1    26    27 ... 12058 12065 12078]
Training batch = 9663, Testing set = 2416
Training set = [    0     1     2 ... 12075 12076 12078], Testing set = [    4     5     6 ... 12066 12073 12077]
Training batch = 9663, Testing set = 2416
Training set = [    0     1     2 ... 12076 12077 12078], Testing set = [    3     8     9 ... 12071 12072 12074]
Training batch = 9664, Testing set = 2415
Training set = [    0     1     3 ... 12076 12077 12078], Testing set = [    2    11    15 ... 12062 12063 12067]

Gini accuracy values : [0.7044701986754967, 0.668046357615894, 0.6705298013245033, 0.6920529801324503, 0.6795031055900621]
Entropy accuracy values : [0.6982615894039735, 0.679635761589404, 0.6771523178807947, 0.68625827

## Calculating Accuracy in Gini and Entropy 

In [301]:
import numpy as np

print(f"Overall Gini Accuracy is : {np.average(gini_acc_vals)}")
print(f"Overall Entropy Accuracy is : {np.average(entropy_acc_vals)}")

Overall Gini Accuracy is : 0.6829204886676813
Overall Entropy Accuracy is : 0.6820918171472447
