In [1]:
import pandas as pd
import category_encoders as ce
from patsy.contrasts import Poly

##module for adding all EDA features in the dataset
from ImportData_EDAFeatures import CreateDF_UntilEDA

import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset,dependent_variable,numerical_features,categorical_features = CreateDF_UntilEDA('CouponRecommendation')

# Examples of encodings for coupon recommendation dataset
### Rank Encoding

In [3]:
income_ranking_dictionary = {'Less than $12500':1,
'$12500 - $24999':2,
'$25000 - $37499':3,
'$37500 - $49999':4,
'$50000 - $62499':5,
'$62500 - $74999':6,
'$75000 - $87499':7,
'$87500 - $99999':8,
'$100000 or More':9}

dataset['income_Ranking'] = dataset['income'].replace(income_ranking_dictionary)
dataset[['income','income_Ranking']].head()

Unnamed: 0,income,income_Ranking
0,$62500 - $74999,6
1,$62500 - $74999,6
2,$62500 - $74999,6
3,$62500 - $74999,6
4,$62500 - $74999,6


### Polynomial Encoding

In [4]:
#save index for joining

dataset['Saveindex'] = dataset.index

#encoding will create N-1 columns. We nee to merge these back with the original data frame
keep_columns = ['Saveindex','income_0', 'income_1', 'income_2', 'income_3', 'income_4', 'income_5', 'income_6', 'income_7']

column_rename = {'income_0':'Poly_incm_0',
 'income_1':'Poly_incm_1',
 'income_2':'Poly_incm_2',
 'income_3':'Poly_incm_3',
 'income_4':'Poly_incm_4',
 'income_5':'Poly_incm_5',
 'income_6':'Poly_incm_6',
 'income_7':'Poly_incm_7'}
display_columns = ['income','Poly_incm_0','Poly_incm_1','Poly_incm_2','Poly_incm_3','Poly_incm_4','Poly_incm_5','Poly_incm_6','Poly_incm_7']

encoder = ce.PolynomialEncoder(cols=["income"])
data2 = encoder.fit_transform(dataset, verbose=1)
#remove the intercept column
data2 = data2[keep_columns]
data2.rename(columns=column_rename,inplace=True)
data2.head()
dataset = dataset.merge(data2)

dataset[display_columns].head()

Unnamed: 0,income,Poly_incm_0,Poly_incm_1,Poly_incm_2,Poly_incm_3,Poly_incm_4,Poly_incm_5,Poly_incm_6,Poly_incm_7
0,$62500 - $74999,-0.516398,0.531816,-0.444949,0.312893,-0.1849,0.089893,-0.034139,0.008815
1,$62500 - $74999,-0.516398,0.531816,-0.444949,0.312893,-0.1849,0.089893,-0.034139,0.008815
2,$62500 - $74999,-0.516398,0.531816,-0.444949,0.312893,-0.1849,0.089893,-0.034139,0.008815
3,$62500 - $74999,-0.516398,0.531816,-0.444949,0.312893,-0.1849,0.089893,-0.034139,0.008815
4,$62500 - $74999,-0.516398,0.531816,-0.444949,0.312893,-0.1849,0.089893,-0.034139,0.008815


### Backward Difference Encoder

In [5]:
#encoding will create N-1 columns. We nee to merge these back with the original data frame
keep_columns = ['Saveindex','income_0', 'income_1', 'income_2', 'income_3', 'income_4', 'income_5', 'income_6', 'income_7']

column_rename = {'income_0':'BackDif_incm_0',
 'income_1':'BackDif_incm_1',
 'income_2':'BackDif_incm_2',
 'income_3':'BackDif_incm_3',
 'income_4':'BackDif_incm_4',
 'income_5':'BackDif_incm_5',
 'income_6':'BackDif_incm_6',
 'income_7':'BackDif_incm_7'}

display_columns = ['income','BackDif_incm_0','BackDif_incm_1','BackDif_incm_2','BackDif_incm_3','BackDif_incm_4','BackDif_incm_5','BackDif_incm_6','BackDif_incm_7']

encoder = ce.BackwardDifferenceEncoder(cols=["income"])
data2 = encoder.fit_transform(dataset, verbose=1)
#remove the intercept column
data2 = data2[keep_columns]
data2.rename(columns=column_rename,inplace=True)
data2.head()
dataset = dataset.merge(data2)
del dataset['Saveindex']
dataset[display_columns].head()

Unnamed: 0,income,BackDif_incm_0,BackDif_incm_1,BackDif_incm_2,BackDif_incm_3,BackDif_incm_4,BackDif_incm_5,BackDif_incm_6,BackDif_incm_7
0,$62500 - $74999,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
1,$62500 - $74999,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
2,$62500 - $74999,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
3,$62500 - $74999,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
4,$62500 - $74999,-0.888889,-0.777778,-0.666667,-0.555556,-0.444444,-0.333333,-0.222222,-0.111111
