# Categorical Variable Encoding

Type of categorical variables:
 * Nominal (no perticular order)
 * Ordinal (some ordered)

Methods of encoding:
   1. One Hot Encoding
   2. Label Encoding
   3. Ordinal Encoding
   4. Helmert Encoding
   5. Binary Encoding
   6. Frequency Encoding
   7. Mean Encoding
   8. Weight of Evidence Encoding
   9. Probability ration Encoding
   10. Hashing Encoding
   11. Backward Difference Encoding
   12. Leave one out Encoding
   13. James-Stein Encoding
   14. M-estimator Encoding
   15. Thermometer Encoder
    

In [1]:
import pandas as pd
import numpy as np

In [2]:
data={'Temperature':['Hot','Cold','Very Hot','Warm','Hot','Warm','Warm','Hot','Hot','Cold'],
     'Color':['Red','Yellow','Blue','Blue','Red','Yellow','Red','Yellow','Yellow','Yellow'],
     'Target':[1,1,1,0,1,0,1,0,1,1]}

df=pd.DataFrame(data,columns=['Temperature','Color','Target'])

In [3]:
df

Unnamed: 0,Temperature,Color,Target
0,Hot,Red,1
1,Cold,Yellow,1
2,Very Hot,Blue,1
3,Warm,Blue,0
4,Hot,Red,1
5,Warm,Yellow,0
6,Warm,Red,1
7,Hot,Yellow,0
8,Hot,Yellow,1
9,Cold,Yellow,1


### 1. One Hot Encoding

In [4]:
df_1=df.copy(deep=True)
df_1=pd.get_dummies(df,prefix=['Temp'],columns=['Temperature'])
df_1

Unnamed: 0,Color,Target,Temp_Cold,Temp_Hot,Temp_Very Hot,Temp_Warm
0,Red,1,0,1,0,0
1,Yellow,1,1,0,0,0
2,Blue,1,0,0,1,0
3,Blue,0,0,0,0,1
4,Red,1,0,1,0,0
5,Yellow,0,0,0,0,1
6,Red,1,0,0,0,1
7,Yellow,0,0,1,0,0
8,Yellow,1,0,1,0,0
9,Yellow,1,1,0,0,0


In [5]:
from sklearn.preprocessing import OneHotEncoder

ohc=OneHotEncoder()
ohe=ohc.fit_transform(df.Temperature.values.reshape(-1,1)).toarray()
dfOneHot=pd.DataFrame(ohe,columns=['Temp'+str(ohc.categories_[0][i]) for i in range(len(ohc.categories_[0]))])

dfh=pd.concat([df,dfOneHot],axis=1)
dfh

Unnamed: 0,Temperature,Color,Target,TempCold,TempHot,TempVery Hot,TempWarm
0,Hot,Red,1,0.0,1.0,0.0,0.0
1,Cold,Yellow,1,1.0,0.0,0.0,0.0
2,Very Hot,Blue,1,0.0,0.0,1.0,0.0
3,Warm,Blue,0,0.0,0.0,0.0,1.0
4,Hot,Red,1,0.0,1.0,0.0,0.0
5,Warm,Yellow,0,0.0,0.0,0.0,1.0
6,Warm,Red,1,0.0,0.0,0.0,1.0
7,Hot,Yellow,0,0.0,1.0,0.0,0.0
8,Hot,Yellow,1,0.0,1.0,0.0,0.0
9,Cold,Yellow,1,1.0,0.0,0.0,0.0


### 2. Label Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder

df['Temp_label_encoder']=LabelEncoder().fit_transform(df.Temperature)
df

Unnamed: 0,Temperature,Color,Target,Temp_label_encoder
0,Hot,Red,1,1
1,Cold,Yellow,1,0
2,Very Hot,Blue,1,2
3,Warm,Blue,0,3
4,Hot,Red,1,1
5,Warm,Yellow,0,3
6,Warm,Red,1,3
7,Hot,Yellow,0,1
8,Hot,Yellow,1,1
9,Cold,Yellow,1,0


In [11]:
#pandas factorize alos perform same
df.loc[:,'Temp_factorize_encode']=pd.factorize(df['Temperature'])[0].reshape(-1,1)
df

Unnamed: 0,Temperature,Color,Target,Temp_label_encoder,Temp_factorize_encode
0,Hot,Red,1,1,0
1,Cold,Yellow,1,0,1
2,Very Hot,Blue,1,2,2
3,Warm,Blue,0,3,3
4,Hot,Red,1,1,0
5,Warm,Yellow,0,3,3
6,Warm,Red,1,3,3
7,Hot,Yellow,0,1,0
8,Hot,Yellow,1,1,0
9,Cold,Yellow,1,0,1


### 3. Ordinal Encoding

In [12]:
Temp_dict={'Cold':1,'Warm':2,'Hot':3,'Very Hot':4}

df['Temp_Ordinal']=df.Temperature.map(Temp_dict)
df

Unnamed: 0,Temperature,Color,Target,Temp_label_encoder,Temp_factorize_encode,Temp_Ordinal
0,Hot,Red,1,1,0,3
1,Cold,Yellow,1,0,1,1
2,Very Hot,Blue,1,2,2,4
3,Warm,Blue,0,3,3,2
4,Hot,Red,1,1,0,3
5,Warm,Yellow,0,3,3,2
6,Warm,Red,1,3,3,2
7,Hot,Yellow,0,1,0,3
8,Hot,Yellow,1,1,0,3
9,Cold,Yellow,1,0,1,1


### 4. Helmert Encoding

In [16]:
import category_encoders as ce

In [17]:
encoder=ce.HelmertEncoder(cols=['Temperature'],drop_invariant=True)
dfh=encoder.fit_transform(df['Temperature'])
df=pd.concat([df,dfh],axis=1)
df



Unnamed: 0,Temperature,Color,Target,Temp_label_encoder,Temp_factorize_encode,Temp_Ordinal,Temperature_0,Temperature_1,Temperature_2
0,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0
1,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0
2,Very Hot,Blue,1,2,2,4,0.0,2.0,-1.0
3,Warm,Blue,0,3,3,2,0.0,0.0,3.0
4,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0
5,Warm,Yellow,0,3,3,2,0.0,0.0,3.0
6,Warm,Red,1,3,3,2,0.0,0.0,3.0
7,Hot,Yellow,0,1,0,3,-1.0,-1.0,-1.0
8,Hot,Yellow,1,1,0,3,-1.0,-1.0,-1.0
9,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0


### 5. Binary Encoding

In [18]:
encoder=ce.BinaryEncoder(cols=['Temperature'])
dfbin=encoder.fit_transform(df['Temperature'])
df=pd.concat([df,dfbin],axis=1)
df

Unnamed: 0,Temperature,Color,Target,Temp_label_encoder,Temp_factorize_encode,Temp_Ordinal,Temperature_0,Temperature_1,Temperature_2,Temperature_0.1,Temperature_1.1,Temperature_2.1
0,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1
1,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0
2,Very Hot,Blue,1,2,2,4,0.0,2.0,-1.0,0,1,1
3,Warm,Blue,0,3,3,2,0.0,0.0,3.0,1,0,0
4,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1
5,Warm,Yellow,0,3,3,2,0.0,0.0,3.0,1,0,0
6,Warm,Red,1,3,3,2,0.0,0.0,3.0,1,0,0
7,Hot,Yellow,0,1,0,3,-1.0,-1.0,-1.0,0,0,1
8,Hot,Yellow,1,1,0,3,-1.0,-1.0,-1.0,0,0,1
9,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0


### 6. Frequency Encoder

In [23]:
fe=df.groupby('Temperature').size()/len(df)
df.loc[:,'Temp_freq_encode']=df['Temperature'].map(fe)
df

Unnamed: 0,Temperature,Color,Target,Temp_label_encoder,Temp_factorize_encode,Temp_Ordinal,Temperature_0,Temperature_1,Temperature_2,Temperature_0.1,Temperature_1.1,Temperature_2.1,Temp_freq_encode
0,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4
1,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0,0.2
2,Very Hot,Blue,1,2,2,4,0.0,2.0,-1.0,0,1,1,0.1
3,Warm,Blue,0,3,3,2,0.0,0.0,3.0,1,0,0,0.3
4,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4
5,Warm,Yellow,0,3,3,2,0.0,0.0,3.0,1,0,0,0.3
6,Warm,Red,1,3,3,2,0.0,0.0,3.0,1,0,0,0.3
7,Hot,Yellow,0,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4
8,Hot,Yellow,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4
9,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0,0.2


In [22]:
df.groupby('Temperature').size()/len(df)

Temperature
Cold        0.2
Hot         0.4
Very Hot    0.1
Warm        0.3
dtype: float64

### 7. Mean Encoding

In [24]:
mean_encode=df.groupby('Temperature')['Target'].mean()
print(mean_encode)

df['mean_encode']=df['Temperature'].map(mean_encode)
df

Temperature
Cold        1.000000
Hot         0.750000
Very Hot    1.000000
Warm        0.333333
Name: Target, dtype: float64


Unnamed: 0,Temperature,Color,Target,Temp_label_encoder,Temp_factorize_encode,Temp_Ordinal,Temperature_0,Temperature_1,Temperature_2,Temperature_0.1,Temperature_1.1,Temperature_2.1,Temp_freq_encode,mean_encode
0,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75
1,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0,0.2,1.0
2,Very Hot,Blue,1,2,2,4,0.0,2.0,-1.0,0,1,1,0.1,1.0
3,Warm,Blue,0,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333
4,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75
5,Warm,Yellow,0,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333
6,Warm,Red,1,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333
7,Hot,Yellow,0,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75
8,Hot,Yellow,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75
9,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0,0.2,1.0


In [25]:
#compute the global mean
mean=df['Target'].mean()

#compute number of values and mean of each group
agg=df.groupby('Temperature')['Target'].agg(['count','mean'])
counts=agg['count']
means=agg['mean']
weight=100

#compute the smoothed mean
smooth=(counts * means + weight * mean)/(counts + weight)
print(smooth)

#soomth_encoding
df['smooth_encode']=df['Temperature'].map(smooth)
df

Temperature
Cold        0.705882
Hot         0.701923
Very Hot    0.702970
Warm        0.689320
dtype: float64


Unnamed: 0,Temperature,Color,Target,Temp_label_encoder,Temp_factorize_encode,Temp_Ordinal,Temperature_0,Temperature_1,Temperature_2,Temperature_0.1,Temperature_1.1,Temperature_2.1,Temp_freq_encode,mean_encode,smooth_encode
0,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923
1,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0,0.2,1.0,0.705882
2,Very Hot,Blue,1,2,2,4,0.0,2.0,-1.0,0,1,1,0.1,1.0,0.70297
3,Warm,Blue,0,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333,0.68932
4,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923
5,Warm,Yellow,0,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333,0.68932
6,Warm,Red,1,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333,0.68932
7,Hot,Yellow,0,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923
8,Hot,Yellow,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923
9,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0,0.2,1.0,0.705882


### 8. Weight of Evidence Encoding

In [26]:
#we calculate probability of target=1 i.e Good=1 for each category
woe_df=df.groupby('Temperature')['Target'].mean()
woe_df=pd.DataFrame(woe_df)

#rename target column to Good
woe_df=woe_df.rename(columns={'Target':'Good'})

#calculate the probability of bad which is 1-probability of Good
woe_df['Bad']=1-woe_df['Good']

#add small value to avoid denominator as zero
woe_df['Bad']=np.where(woe_df['Bad']==0,0.0000001,woe_df['Bad'])

#compute the woe
woe_df['WoE']=np.log(woe_df['Good']/woe_df['Bad'])
woe_df

Unnamed: 0_level_0,Good,Bad,WoE
Temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cold,1.0,1e-07,16.118096
Hot,0.75,0.25,1.098612
Very Hot,1.0,1e-07,16.118096
Warm,0.333333,0.6666667,-0.693147


In [27]:
#map WoE value to dataframe
df['WoE_encode']=df['Temperature'].map(woe_df['WoE'])
df

Unnamed: 0,Temperature,Color,Target,Temp_label_encoder,Temp_factorize_encode,Temp_Ordinal,Temperature_0,Temperature_1,Temperature_2,Temperature_0.1,Temperature_1.1,Temperature_2.1,Temp_freq_encode,mean_encode,smooth_encode,WoE_encode
0,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923,1.098612
1,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0,0.2,1.0,0.705882,16.118096
2,Very Hot,Blue,1,2,2,4,0.0,2.0,-1.0,0,1,1,0.1,1.0,0.70297,16.118096
3,Warm,Blue,0,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333,0.68932,-0.693147
4,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923,1.098612
5,Warm,Yellow,0,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333,0.68932,-0.693147
6,Warm,Red,1,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333,0.68932,-0.693147
7,Hot,Yellow,0,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923,1.098612
8,Hot,Yellow,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923,1.098612
9,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0,0.2,1.0,0.705882,16.118096


### 9. Probability Ration Encoding

In [29]:
#calculate probability of target=1 i.e 'Good=1' for each category
pr_df=df.groupby('Temperature')['Target'].mean()
pr_df=pd.DataFrame(pr_df)

#rename column name Target as Good
pr_df=pr_df.rename(columns={'Target':'Good'})

#calculate bad probability which is 1-good
pr_df['Bad']=1-pr_df['Good']

#add small value to avoid divide by zero error
pr_df['Bad']=np.where(pr_df['Bad']==0,0.000001,pr_df['Bad'])

#compute the probability ratio
pr_df['PR']=pr_df.Good/pr_df.Bad
pr_df

Unnamed: 0_level_0,Good,Bad,PR
Temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cold,1.0,1e-06,1000000.0
Hot,0.75,0.25,3.0
Very Hot,1.0,1e-06,1000000.0
Warm,0.333333,0.666667,0.5


In [31]:
df['PR_encode']=df['Temperature'].map(pr_df['PR'])
df

Unnamed: 0,Temperature,Color,Target,Temp_label_encoder,Temp_factorize_encode,Temp_Ordinal,Temperature_0,Temperature_1,Temperature_2,Temperature_0.1,Temperature_1.1,Temperature_2.1,Temp_freq_encode,mean_encode,smooth_encode,WoE_encode,PR_encode
0,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923,1.098612,3.0
1,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0,0.2,1.0,0.705882,16.118096,1000000.0
2,Very Hot,Blue,1,2,2,4,0.0,2.0,-1.0,0,1,1,0.1,1.0,0.70297,16.118096,1000000.0
3,Warm,Blue,0,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333,0.68932,-0.693147,0.5
4,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923,1.098612,3.0
5,Warm,Yellow,0,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333,0.68932,-0.693147,0.5
6,Warm,Red,1,3,3,2,0.0,0.0,3.0,1,0,0,0.3,0.333333,0.68932,-0.693147,0.5
7,Hot,Yellow,0,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923,1.098612,3.0
8,Hot,Yellow,1,1,0,3,-1.0,-1.0,-1.0,0,0,1,0.4,0.75,0.701923,1.098612,3.0
9,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0,0,1,0,0.2,1.0,0.705882,16.118096,1000000.0


### 10. M-estimator encoding

In [36]:
df_1=df.copy(deep=True)
df_1=df_1[['Temperature','Color','Target']]

In [39]:
#count the frequency of each category
category_counts=df_1['Temperature'].value_counts()

#calculate the weight for each category based on deviation from the mean frequency
weights =(category_counts-category_counts.mean()).abs()
weights =weights / weights.sum()

#create a dictionary to map each category to its weight
mapping = dict(zip(weights.index,weights.values))

#map the category to their weight
df_1['weights']=df_1['Temperature'].map(mapping)

#calculate weighted encoding for each category
encoded=df_1.groupby('Temperature')['weights'].sum()
encoded=encoded/encoded.sum()

#map the weighted encoding back to categories
df_1['encoded_temp']=df_1['Temperature'].map(encoded)

In [40]:
df_1

Unnamed: 0,Temperature,Color,Target,weights,encoded_temp
0,Hot,Red,1,0.375,0.6
1,Cold,Yellow,1,0.125,0.1
2,Very Hot,Blue,1,0.375,0.15
3,Warm,Blue,0,0.125,0.15
4,Hot,Red,1,0.375,0.6
5,Warm,Yellow,0,0.125,0.15
6,Warm,Red,1,0.125,0.15
7,Hot,Yellow,0,0.375,0.6
8,Hot,Yellow,1,0.375,0.6
9,Cold,Yellow,1,0.125,0.1


### 11. Thermometer Encoder

In [44]:
data = {'color': ['Red','Yellow','Blue','Blue','Red','Yellow','Red','Yellow','Yellow','Yellow','Green']}
cl_df = pd.DataFrame(data)

#get the unique categories and their ranks
categories = cl_df['color'].unique()
ranks = range(len(categories))

#create a dictionary to map each category to its rank
mapping=dict(zip(categories,ranks))

#map the categories to their rank
cl_df['ranks']=cl_df['color'].map(mapping)

#create themometer encoding for each category
encoded = pd.get_dummies(cl_df['ranks'])
encoded = encoded.add_prefix('thermo_')

#merge the encoding with original data
cl_df=pd.concat([cl_df,encoded],axis=1)
cl_df

Unnamed: 0,color,ranks,thermo_0,thermo_1,thermo_2,thermo_3
0,Red,0,1,0,0,0
1,Yellow,1,0,1,0,0
2,Blue,2,0,0,1,0
3,Blue,2,0,0,1,0
4,Red,0,1,0,0,0
5,Yellow,1,0,1,0,0
6,Red,0,1,0,0,0
7,Yellow,1,0,1,0,0
8,Yellow,1,0,1,0,0
9,Yellow,1,0,1,0,0


In [None]:
#end