## One Hot Encoding - variables with many categories

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load only the six categorical columns studied in this notebook
df = pd.read_csv(
    'Mercedes.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [3]:
# Report how many distinct labels each column contains
for col, values in df.items():
    n_labels = len(values.unique())
    print(col, ': ', n_labels, ' labels')

X1 :  27  labels
X2 :  44  labels
X3 :  7  labels
X4 :  4  labels
X5 :  29  labels
X6 :  12  labels


In [4]:
# Shape before encoding, as (rows, columns)
df.shape

(4209, 6)

In [5]:
# Shape after one-hot encoding every column; drop_first=True keeps
# k-1 dummy columns for a variable with k labels
(
    pd.get_dummies(df, drop_first=True)
    .shape
)

(4209, 117)

One-hot encoding expands the 6 original columns into 117 features, which can lead to the curse of dimensionality.

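To limit the number of new features, we will follow the technique described in this paper and one-hot encode only the most frequent categories of each variable:
http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf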

In [6]:
# Start with a single variable: the ten most frequent labels of X2
(
    df['X2']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
Name: X2, dtype: int64

In [7]:
# Store the ten most frequent X2 labels in a plain Python list
top_10_labels = list(
    df['X2'].value_counts().sort_values(ascending=False).head(10).index
)
top_10_labels

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [8]:
# Helper that one-hot encodes ONE categorical variable, restricted to a given
# list of labels (typically its most frequent ones)

def one_hot_encoding_top_x(df, variable, top_x_labels):
    """Add one 0/1 indicator column to ``df`` for each label in ``top_x_labels``.

    For every label a new column named ``<variable>_<label>`` is created; it
    holds 1 on the rows where ``df[variable] == label`` and 0 everywhere else.
    Rows whose value is not in ``top_x_labels`` therefore get 0 in all of the
    new columns. The number of encoded labels is controlled by the caller
    through the length of ``top_x_labels``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``variable``. It is modified in place.
    variable : str
        Name of the column to encode.
    top_x_labels : iterable
        Labels to create indicator columns for. Labels do not have to be
        strings; they are converted to text when building the column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the new columns added.
    """
    for label in top_x_labels:
        # format() stringifies the label, so numeric categories no longer
        # raise TypeError when the column name is built
        column_name = '{}_{}'.format(variable, label)
        df[column_name] = np.where(df[variable] == label, 1, 0)
    return df

In [9]:
# df is still unchanged here: one_hot_encoding_top_x has only been defined, not called
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [10]:
# Reload the data, keeping only the two columns needed for this demonstration
df = pd.read_csv('Mercedes.csv', usecols=['X1', 'X2'])

# Add one indicator column per label for the 10 most frequent X2 labels
# (the helper adds the columns to df in place)
one_hot_encoding_top_x(df, variable='X2', top_x_labels=top_10_labels)
df.head()

Unnamed: 0,X1,X2,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,0,0,0,0,0,0,0,0,0,0
1,t,av,0,0,0,0,0,0,0,0,0,0
2,w,n,0,0,0,0,0,0,1,0,0,0
3,t,n,0,0,0,0,0,0,1,0,0,0
4,v,n,0,0,0,0,0,0,1,0,0,0
