# Imports

In [1]:
import sys
# Fail fast on interpreters older than the notebook supports.
assert sys.version_info >= (3, 5)


import sklearn
# Compare versions numerically. Comparing the raw strings is lexicographic,
# so e.g. "0.9" >= "0.20" is True and an outdated install would slip through.
# The floor is 1.2 because the preprocessing section of this notebook calls
# `ColumnTransformer(...).set_output(transform='pandas')`, introduced in 1.2.
_sklearn_version = tuple(int(part) for part in sklearn.__version__.split(".")[:2])
assert _sklearn_version >= (1, 2), f"scikit-learn >= 1.2 is required, found {sklearn.__version__}"

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.utils import class_weight
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from imblearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

from scipy.stats import alpha, randint, uniform

from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE

from tabulate import tabulate
import numpy as np
np.random.seed(42)
import pandas as pd

import tensorflow as tf
import keras

import os

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels in every figure.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

from google.colab import drive
import joblib

# Dataset

## Features

1. destination[No Urgent Place, Home, Work]
2. passenger[Alone, Friend(s), Kid(s), Partner]
3. weather[Sunny, Rainy, Snowy]
4. temperature[55, 80, 30]
5. time[2PM, 10AM, 6PM, 7AM, 10PM]
6. coupon[Restaurant(<20), Coffee House, Carry out & Take away, Bar, Restaurant(20-50)]
7. expiration[1d, 2h]
8. gender[Female, Male]
9. age[21, 46, 26, 31, 41, 50plus, 36, below21]
10. maritalStatus[Unmarried partner, Single, Married partner, Divorced, Widowed]
11. has_children[0, 1]
12. education[ Some college - no degree, Bachelors degree, Associates degree, High School Graduate, Graduate degree (Masters or Doctorate), Some High School]
13. occupation[Unemployed, Architecture & Engineering, Student,
Education&Training&Library, Healthcare Support,
Healthcare Practitioners & Technical, Sales & Related, Management,
Arts Design Entertainment Sports & Media, Computer & Mathematical,
Life Physical Social Science, Personal Care & Service,
Community & Social Services, Office & Administrative Support,
Construction & Extraction, Legal, Retired,
Installation Maintenance & Repair, Transportation & Material Moving,
Business & Financial, Protective Service,
Food Preparation & Serving Related, Production Occupations,
Building & Grounds Cleaning & Maintenance, Farming Fishing & Forestry]
14. income[$37500 - $49999, $62500 - $74999, $12500 - $24999, $75000 - $87499,
$50000 - $62499, $25000 - $37499, $100000 or More, $87500 - $99999, Less than $12500]
15. car[]
16. Bar[never, less1, 1~3, gt8, nan, 4~8]: how many times do you go to a bar every month
17. CoffeeHouse[never, less1, 4~8, 1~3, gt8, nan]: how many times do you go to a coffee house every month
18. CarryAway[nan, 4~8, 1~3, gt8, less1, never]: how many times do you get take-away food every month
19. RestaurantLessThan20[4~8, 1~3, less1, gt8, never]: how many times do you go to a restaurant with an average expense per person of less than \$20 every month
20. Restaurant20To50[1~3, less1, never, gt8, 4~8, nan]: how many times do you go to a restaurant with an average expense per person of \$20 - \$50 every month
21. toCoupon_GEQ5min[0, 1]: driving distance to the restaurant/bar for using the coupon is greater than 5 minutes
22. toCoupon_GEQ15min[0,1]: driving distance to the restaurant/bar for using the coupon is greater than 15 minutes
23. toCoupon_GEQ25min[0, 1]: driving distance to the restaurant/bar for using the coupon is greater than 25 minutes
24. direction_same[0,1]: whether the restaurant/bar is in the same direction as your current destination
25. direction_opp[0, 1]: whether the restaurant/bar is in the opposite direction from your current destination
26. Y[0, 1]: whether the coupon is accepted

## Looking at the Dataset

In [34]:
# Raw coupon-recommendation CSV, read straight from the project's GitHub repository.
DATASET_URL = (
    'https://raw.githubusercontent.com/NikitasThermos/'
    'In-Vehicle-Recommendation-System/main/Dataset/'
    'in-vehicle-coupon-recommendation.csv'
)
dataset = pd.read_csv(DATASET_URL)

In [35]:
# Preview the first rows of the raw dataset (head() defaults to 5 rows).
dataset.head()

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0


In [4]:
# (rows, columns) of the raw dataset; the target column 'Y' is still included here.
dataset.shape

(12684, 26)

In [5]:
# Report the dataset size and how the target 'Y' splits between the two classes.
print(f'Number of coupons: {len(dataset)}')
for class_name, class_value in (('negative', 0), ('positive', 1)):
    print(f'Number of {class_name} class: {(dataset["Y"] == class_value).sum()}')

Number of coupons: 12684
Number of negative class: 5474
Number of positive class: 7210


In [36]:
# Separate the target column 'Y' from the feature columns.
labels = dataset['Y']
training_df = dataset.drop(columns='Y')

In [37]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    training_df,
    labels,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Share of missing values per feature in the training split, as a percentage.
missing_percentage = X_train.isna().mean().mul(100)
for col, per in missing_percentage.items():
    print(f'{col}, {per:.2f}%')

destination, 0.00%
passanger, 0.00%
weather, 0.00%
temperature, 0.00%
time, 0.00%
coupon, 0.00%
expiration, 0.00%
gender, 0.00%
age, 0.00%
maritalStatus, 0.00%
has_children, 0.00%
education, 0.00%
occupation, 0.00%
income, 0.00%
car, 99.23%
Bar, 0.83%
CoffeeHouse, 1.77%
CarryAway, 1.17%
RestaurantLessThan20, 1.01%
Restaurant20To50, 1.48%
toCoupon_GEQ5min, 0.00%
toCoupon_GEQ15min, 0.00%
toCoupon_GEQ25min, 0.00%
direction_same, 0.00%
direction_opp, 0.00%


## Test

In [38]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer


# Explicit low-to-high ordering for every ordinal feature, keyed by column name.
category_order = {'temperature': [30, 55, 80],
                  'time': ['7AM', '10AM', '2PM', '6PM', '10PM'],
                  'age': ['below21', '21', '26', '31', '36', '41', '46', '50plus'],
                  'education': ['Some High School', 'High School Graduate', 'Some college - no degree', 'Associates degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)'],
                  'income': ['Less than $12500', '$12500 - $24999', '$25000 - $37499','$37500 - $49999', '$50000 - $62499', '$62500 - $74999', '$75000 - $87499', '$87500 - $99999', '$100000 or More'],
                  'Bar': ['never', 'less1', '1~3', '4~8', 'gt8'],
                  'CoffeeHouse': ['never', 'less1', '1~3', '4~8', 'gt8'],
                  'CarryAway': ['never', 'less1', '1~3', '4~8', 'gt8'],
                  'RestaurantLessThan20': ['never', 'less1', '1~3', '4~8', 'gt8'],
                  'Restaurant20To50': ['never', 'less1', '1~3', '4~8', 'gt8']}

# Columns that are ordinally encoded, in the order the encoder receives them.
order_category_names = ['temperature', 'time', 'age',  'education', 'income', 'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20',  'Restaurant20To50']

# Every other column is forwarded unchanged by `remainder='passthrough'`.
pass_columns = [c for c in training_df.columns if c not in order_category_names]

# Per-column category lists, aligned with `order_category_names`.
# Without `categories=...` the encoder sorts the observed values, which for
# strings is lexicographic ('10AM' < '10PM' < '2PM', 'gt8' < 'less1' < 'never')
# and silently discards the ordering declared in `category_order`.
# Columns that contain missing values get NaN appended as the LAST category
# (the only position scikit-learn accepts); otherwise fitting with explicit
# categories raises "Found unknown categories [nan]". Such entries stay NaN in
# the encoded output, so a later imputation step can still see them.
ordinal_categories = [
    list(category_order[col]) + [np.nan] if training_df[col].isna().any() else list(category_order[col])
    for col in order_category_names
]

# Ordinal-encode the ordered features and return a pandas DataFrame that keeps
# the original column names (no 'ordinal__' / 'remainder__' prefixes).
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(categories=ordinal_categories), order_category_names),],
    remainder='passthrough',
    verbose_feature_names_out=False).set_output(transform='pandas')

In [39]:
# Fit the preprocessor on the training split and transform that same split.
test = preprocessor.fit_transform(X_train)
# Bare last expression: the transformed frame is shown as the cell output.
test