In [1]:
import warnings
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

##module for adding all EDA features in the dataset
from ImportData_EDAFeatures import CreateDF_UntilEDA

warnings.filterwarnings("ignore")

In [2]:
# Load the coupon-recommendation data through the project's EDA helper.
# The helper's result is unpacked into four names; judging by those names it
# presumably returns the frame, the target column and the numerical /
# categorical feature lists -- TODO confirm against ImportData_EDAFeatures.
eda_outputs = CreateDF_UntilEDA('CouponRecommendation')
dataset, dependent_variable, numerical_features, categorical_features = eda_outputs

# Preview the first rows of the loaded frame.
dataset.head()

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,Restaurant20To50,toCouponGEQ5min,toCouponGEQ15min,toCouponGEQ25min,directionsame,directionopp,Y,couponstatus,acceptedcoupon,rejectedcoupon
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Male,21,Single,...,less1,1,0,0,0,1,1,accepted,0,0
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Male,21,Single,...,less1,1,0,0,0,1,0,not accepted,0,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Bar,1d,Male,21,Single,...,less1,1,0,0,0,1,1,accepted,0,0
3,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Male,21,Single,...,less1,1,1,0,0,1,0,not accepted,0,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Male,21,Single,...,less1,1,0,0,0,1,0,not accepted,0,0


# Examples of encodings for the coupon recommendation dataset
### Dummy Encoding or One-Hot Encoding

In [3]:
# Choose the categorical feature used in the one-hot examples below and
# report how many distinct categories it has and what they are.
categorical_column_index = 2
selected_feature = categorical_features[categorical_column_index]
selected_values = dataset[selected_feature]
print(f"There are {selected_values.nunique()} categories in the feature '{selected_feature}'. These are: {selected_values.unique()}")

There are 3 categories in the feature 'weather'. These are: ['Sunny' 'Rainy' 'Snowy']


In [4]:
# One-hot (dummy) encoding: expand the selected categorical feature into one
# indicator column per category; the source column itself is dropped from the result.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return True/False boolean columns).
dataset2 = pd.get_dummies(
    data=dataset,
    columns=[categorical_features[categorical_column_index]],
    dtype=int,
)
dataset2.head()

Unnamed: 0,destination,passanger,temperature,time,coupon,expiration,gender,age,maritalStatus,haschildren,...,toCouponGEQ25min,directionsame,directionopp,Y,couponstatus,acceptedcoupon,rejectedcoupon,weather_Rainy,weather_Snowy,weather_Sunny
0,No Urgent Place,Alone,55,2PM,Restaurant(<20),1d,Male,21,Single,0,...,0,0,1,1,accepted,0,0,0,0,1
1,No Urgent Place,Friend(s),80,10AM,Coffee House,2h,Male,21,Single,0,...,0,0,1,0,not accepted,0,0,0,0,1
2,No Urgent Place,Friend(s),80,10AM,Bar,1d,Male,21,Single,0,...,0,0,1,1,accepted,0,0,0,0,1
3,No Urgent Place,Friend(s),80,10AM,Carry out & Take away,2h,Male,21,Single,0,...,0,0,1,0,not accepted,0,0,0,0,1
4,No Urgent Place,Friend(s),80,2PM,Coffee House,1d,Male,21,Single,0,...,0,0,1,0,not accepted,0,0,0,0,1


In [5]:
# Same one-hot encoding, but drop the first indicator column to avoid the
# 'dummy variable trap': with k categories, k-1 indicators already determine
# the k-th, so keeping all k makes the columns perfectly collinear.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return True/False boolean columns).
dataset2 = pd.get_dummies(
    data=dataset,
    columns=[categorical_features[categorical_column_index]],
    drop_first=True,
    dtype=int,
)
dataset2.head()

Unnamed: 0,destination,passanger,temperature,time,coupon,expiration,gender,age,maritalStatus,haschildren,...,toCouponGEQ15min,toCouponGEQ25min,directionsame,directionopp,Y,couponstatus,acceptedcoupon,rejectedcoupon,weather_Snowy,weather_Sunny
0,No Urgent Place,Alone,55,2PM,Restaurant(<20),1d,Male,21,Single,0,...,0,0,0,1,1,accepted,0,0,0,1
1,No Urgent Place,Friend(s),80,10AM,Coffee House,2h,Male,21,Single,0,...,0,0,0,1,0,not accepted,0,0,0,1
2,No Urgent Place,Friend(s),80,10AM,Bar,1d,Male,21,Single,0,...,0,0,0,1,1,accepted,0,0,0,1
3,No Urgent Place,Friend(s),80,10AM,Carry out & Take away,2h,Male,21,Single,0,...,1,0,0,1,0,not accepted,0,0,0,1
4,No Urgent Place,Friend(s),80,2PM,Coffee House,1d,Male,21,Single,0,...,0,0,0,1,0,not accepted,0,0,0,1


### Label Encoding

In [6]:
# Encoder instance used by the label-encoding example below.
# (LabelEncoder is imported with the other dependencies in the first cell.)
label_encoder = LabelEncoder()

In [7]:
# Label-encode one categorical feature: each distinct category is replaced by an integer code.
# LabelEncoder numbers categories by sorted label order, so the codes carry no
# chronological meaning (e.g. '10AM' -> 0 while '2PM' -> 2).
categorical_column_index = 3
label_source_column = categorical_features[categorical_column_index]

# Derive the output name from the selected feature so it always matches the
# column being encoded (for the 'time' feature this is 'timeLabelEncoded').
label_encoded_column = label_source_column + 'LabelEncoded'

dataset[label_encoded_column] = label_encoder.fit_transform(dataset[label_source_column])
dataset[[label_source_column, label_encoded_column]].head(10)

Unnamed: 0,time,timeLabelEncoded
0,2PM,2
1,10AM,0
2,10AM,0
3,10AM,0
4,2PM,2
5,2PM,2
6,2PM,2
7,2PM,2
8,6PM,3
9,6PM,3


### Count Encoding

In [8]:
# Count encoding: replace every category with the number of rows that carry it.
column = 'passanger'
count_encoding_dictionary = dict(Counter(dataset[column]))
print(count_encoding_dictionary)

# Percent encoding: the same counts expressed as a percentage of all rows,
# rounded to two decimals.
total_rows = len(dataset)
percent_encoding_dictionary = {
    category: round((count / total_rows) * 100, 2)
    for category, count in count_encoding_dictionary.items()
}
print(percent_encoding_dictionary)

# Series.map performs a direct category -> value lookup and yields a numeric
# column. Series.replace would only get there through implicit object -> numeric
# downcasting, which pandas has deprecated. Every category in the column is a
# key of both dictionaries (they were built from this very column), so no value
# is left unmapped.
dataset[column+'_countEncoded'] = dataset[column].map(count_encoding_dictionary)
dataset[column+'_percentEncoded'] = dataset[column].map(percent_encoding_dictionary)
dataset[[column, column+'_countEncoded', column+'_percentEncoded']].head()

{'Alone': 6969, 'Friend(s)': 3148, 'Kid(s)': 938, 'Partner': 1024}
{'Alone': 57.7, 'Friend(s)': 26.06, 'Kid(s)': 7.77, 'Partner': 8.48}


Unnamed: 0,passanger,passanger_countEncoded,passanger_percentEncoded
0,Alone,6969,57.7
1,Friend(s),3148,26.06
2,Friend(s),3148,26.06
3,Friend(s),3148,26.06
4,Friend(s),3148,26.06


### Rank of Counts

In [9]:
# Rank-of-counts encoding: replace every category with the rank of its
# frequency, where rank 1 is the rarest category.
categorical_column_index = 1
column = categorical_features[categorical_column_index]

count_encoding_dictionary = dict(Counter(dataset[column]))

# Categories ordered from least to most frequent. sorted() is stable, so
# categories with equal counts keep their first-seen order and still receive
# distinct consecutive ranks.
rank_list = sorted(count_encoding_dictionary, key=count_encoding_dictionary.get)

rank_count_encoding_dictionary = {
    category: rank for rank, category in enumerate(rank_list, start=1)
}

print(rank_count_encoding_dictionary)

# Series.map performs a direct category -> rank lookup and yields an integer
# column, without relying on the deprecated implicit downcasting of Series.replace.
dataset[column+'_countRankEncoded'] = dataset[column].map(rank_count_encoding_dictionary)
dataset[[column, column+'_countRankEncoded']].head()

{'Kid(s)': 1, 'Partner': 2, 'Friend(s)': 3, 'Alone': 4}


Unnamed: 0,passanger,passanger_countRankEncoded
0,Alone,4
1,Friend(s),3
2,Friend(s),3
3,Friend(s),3
4,Friend(s),3


### Percent Encoding

In [10]:
# Percent encoding: replace every category with its share of all rows,
# expressed as a percentage rounded to two decimals.
# `column` is the feature selected in an earlier cell. The counts are taken
# from dataset[column] right here, so the percentages always describe that
# column rather than whatever count dictionary an earlier cell left behind.
category_counts = Counter(dataset[column])
total_rows = len(dataset)
percent_encoding_dictionary = {
    category: round((count / total_rows) * 100, 2)
    for category, count in category_counts.items()
}

# Series.map performs a direct category -> percentage lookup and yields a float column.
dataset[column+'_percentEncoded'] = dataset[column].map(percent_encoding_dictionary)
dataset[[column, column+'_percentEncoded']].head()

Unnamed: 0,passanger,passanger_percentEncoded
0,Alone,57.7
1,Friend(s),26.06
2,Friend(s),26.06
3,Friend(s),26.06
4,Friend(s),26.06


### Mean Encoding

In [11]:
# Mean (target) encoding: replace every occupation with the mean of the
# target column 'Y' over the rows that share that occupation.
# NOTE(review): the means are computed over the whole of `dataset`. If part of
# it is later held out for validation, fit this mapping on the training rows
# only to avoid leaking the target -- confirm against the modelling code.
mean_groupby = dataset[['occupation','Y']].groupby('occupation').mean()

# occupation -> mean of Y, taken straight from the grouped result.
mean_encoding = mean_groupby['Y'].to_dict()

##apply encoding
# Series.map performs a direct occupation -> mean lookup and yields a float column.
dataset['occupation_MeanEncoded'] = dataset['occupation'].map(mean_encoding)
dataset[['occupation','occupation_MeanEncoded']].head()

Unnamed: 0,occupation,occupation_MeanEncoded
0,Architecture & Engineering,0.634286
1,Architecture & Engineering,0.634286
2,Architecture & Engineering,0.634286
3,Architecture & Engineering,0.634286
4,Architecture & Engineering,0.634286
