## One Hot Encoding - Variable with many categories

In [1]:
import pandas as pd
import numpy as np

# Load only the columns X1..X6 from the training file and display them
data = pd.read_csv(
    'train.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)
data

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d
...,...,...,...,...,...,...
4204,s,as,c,d,aa,d
4205,o,t,d,d,aa,h
4206,v,r,a,d,aa,g
4207,r,e,f,d,aa,l


In [2]:
# Report the number of distinct labels found in each column
for col in data.columns:
    n_labels = len(data[col].unique())
    print(col, ':', n_labels, 'labels')

X1 : 27 labels
X2 : 44 labels
X3 : 7 labels
X4 : 4 labels
X5 : 29 labels
X6 : 12 labels


In [3]:
# One-hot encode every column of `data`: each label gets its own
# indicator column named <column>_<label>
pd.get_dummies(data)

Unnamed: 0,X1_a,X1_aa,X1_ab,X1_b,X1_c,X1_d,X1_e,X1_f,X1_g,X1_h,...,X6_c,X6_d,X6_e,X6_f,X6_g,X6_h,X6_i,X6_j,X6_k,X6_l
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4205,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4207,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Let's examine how many columns we obtain after one-hot encoding these variables
pd.get_dummies(data).shape # we get 123 features instead of the original 6

(4209, 123)

In [4]:
# To resolve the above problem, they took only the most frequent categories of
# each feature and ignored the others.
# NOTE(review): the original note calls this "ensemble selection"; that looks
# like the title of the KDD Cup 2009 winning paper that used this trick rather
# than the name of the technique itself -- confirm.
# It is one way in which we can handle features that have many categories.
# Let's look at the 20 most frequent categories of the variable X2
data.X2.value_counts().sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [4]:
# Collect the 10 most frequent X2 categories into a plain Python list

top_10 = list(data['X2'].value_counts().sort_values(ascending=False).head(10).index)
top_10  # label

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [8]:
# Build one 0/1 indicator column for each of the 10 most frequent labels

for label in top_10:
    data[label] = np.where(data['X2'].eq(label), 1, 0)

data[['X2', *top_10]].head(40)
# Show the first 40 rows of X2 next to its 10 indicator columns;
# rows whose label is outside the top 10 are all zeros

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [9]:
# Now we have to apply this to all the columns:
# get the whole set of dummy variables for all the categorical variables

def one_hot_top_x(df,variable,top_x_labels):
    """Add 0/1 indicator columns to ``df`` for the given labels of one column.

    For every ``label`` in ``top_x_labels`` a new column named
    ``<variable>_<label>`` is written into ``df``; it holds 1 on the rows
    where ``df[variable] == label`` and 0 everywhere else. The number of
    labels passed in decides how many of the most frequent categories get
    encoded.

    Parameters
    ----------
    df : the data frame to modify (changed in place)
    variable : name of the column to encode
    top_x_labels : the labels to encode, e.g. the 10 most frequent ones

    Returns
    -------
    None; the new columns are added to ``df`` directly.
    """
    for label in top_x_labels:
        # Read from the frame that was passed in, not from a notebook-level
        # global, so the function works on any data frame.
        # str() keeps the column name well-formed for non-string labels.
        df[variable+'_'+str(label)] = np.where(df[variable]==label,1,0)
        
# Reload the data so that we start again from the original columns
data = pd.read_csv(
    'train.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

# Encode X2 using only its 10 most frequent categories
one_hot_top_x(data, 'X2', top_10)
data.head()
# Only the top 10 labels are encoded.
# Next: apply this to each and every feature and drop the original columns.

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0


In [None]:
# Advantages
# easy to implement
# does not require hours of variable exploration
# does not massively expand the feature space

# Disadvantages
# does not add any information that may make the variable more predictive
# does not keep the information of the ignored labels (it is treated as noise)