# One Hot Encoding - Variables with many categories

In [2]:
import pandas as pd
import numpy as np

# Load only the six categorical columns used throughout this notebook.
categorical_cols = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']
data = pd.read_csv('mercedesbenz.csv', usecols=categorical_cols)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [6]:
# Count the distinct categories (labels) present in every column.
for col in data.columns:
    # dropna=False so that a missing value is counted as a label too,
    # exactly like len(Series.unique()) does.
    n_labels = data[col].nunique(dropna=False)
    print(col, ':', n_labels, ' labels')

X1 : 27  labels
X2 : 44  labels
X3 : 7  labels
X4 : 4  labels
X5 : 29  labels
X6 : 12  labels


In [10]:
# How many columns would we end up with if we one-hot encoded every label?
dummies_full = pd.get_dummies(data, drop_first=True)
dummies_full.shape

(4209, 117)

In [14]:
# Shape of the raw frame, for comparison with the fully one-hot encoded
# version above: 6 columns here versus 117 dummy columns there.
data.shape

(4209, 6)

In [16]:
# Take the 10 most frequent categories of each feature; all remaining categories are ignored (they get 0 in every dummy column)

In [18]:
# Show the 20 most common X2 labels together with their frequencies.
x2_counts = data['X2'].value_counts()
x2_counts.sort_values(ascending=False).head(20)

X2
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
Name: count, dtype: int64

In [20]:
# Keep the labels of the 10 most frequent X2 categories as a plain list.
x2_frequencies = data['X2'].value_counts().sort_values(ascending=False)
top_10_X2 = x2_frequencies.head(10).index.tolist()
top_10_X2

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [22]:
# Build one 0/1 indicator column for each of the 10 most frequent X2 labels.
x2_values = data['X2']
for label in top_10_X2:
    data[label] = np.where(x2_values == label, 1, 0)

data[['X2'] + top_10_X2].head(40)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [58]:
## Create a function that turns the most frequent labels of a column into binary variables
def one_hot_top_x(df, variable, top_x=10):
    """One-hot encode only the most frequent labels of a categorical column.

    For each of the ``top_x`` most frequent labels of ``df[variable]`` a new
    0/1 column named ``<variable>_<label>`` is added to ``df`` in place.
    Rows whose label is not among the selected ones get 0 in every new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is modified in place.
    variable : str
        Name of the categorical column in ``df``.
    top_x : int, default 10
        Number of most frequent labels that receive their own column.

    Returns
    -------
    None
        The selected labels are printed and ``df`` is mutated.
    """
    # Rank the labels on the frame that was passed in, not on the
    # notebook-level ``data`` global, so the function also works on any other
    # frame. value_counts() already returns the counts in descending order,
    # so no extra sort is needed.
    top_labels = df[variable].value_counts().head(top_x).index.tolist()
    print(top_labels)

    for label in top_labels:
        # The f-string keeps this working when labels are not strings (e.g. ints).
        df[f"{variable}_{label}"] = np.where(df[variable] == label, 1, 0)

In [60]:
# Reload the raw columns so that the un-prefixed dummy columns added to
# `data` by the manual loop earlier are discarded before using one_hot_top_x.
data=pd.read_csv('mercedesbenz.csv',usecols=['X1','X2','X3','X4','X5','X6'])

In [62]:
# X2 has 44 labels; only the 10 most frequent get an indicator column,
# so rows with any other label (e.g. 'at', 'av') are all zeros.
one_hot_top_x(data,'X2')
data.head()

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']


Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0


In [64]:
# X3 has only 7 labels (fewer than 10), so every label gets its own column.
one_hot_top_x(data,'X3')
data.head()

['c', 'f', 'a', 'd', 'g', 'e', 'b']


Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X2_s,X2_f,X2_e,X3_c,X3_f,X3_a,X3_d,X3_g,X3_e,X3_b
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [66]:
# X4 has only 4 labels (fewer than 10), so every label gets its own column.
one_hot_top_x(data,'X4')
data.head()

['d', 'a', 'b', 'c']


Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X3_f,X3_a,X3_d,X3_g,X3_e,X3_b,X4_d,X4_a,X4_b,X4_c
0,v,at,a,d,u,j,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [68]:
# X5 has 29 labels; only the 10 most frequent get an indicator column.
one_hot_top_x(data,'X5')
data.head()

['w', 'v', 'q', 'r', 's', 'd', 'n', 'p', 'm', 'i']


Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X5_w,X5_v,X5_q,X5_r,X5_s,X5_d,X5_n,X5_p,X5_m,X5_i
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
# X6 has 12 labels; only the 10 most frequent get an indicator column.
one_hot_top_x(data,'X6')
data.head()

['g', 'j', 'd', 'i', 'l', 'a', 'h', 'k', 'c', 'b']


Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,v,at,a,d,u,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [72]:
# 47 columns = 6 original + 41 dummies (X2: 10, X3: 7, X4: 4, X5: 10, X6: 10),
# compared with 117 for the full one-hot encoding.
# NOTE(review): X1 (27 labels) is never passed to one_hot_top_x in the cells
# shown here - confirm whether that omission is intentional.
data.shape

(4209, 47)

In [36]:
# NOTE(review): this cell carries an older execution count than the cells
# above and repeats the X2 encoding already performed; re-running it only
# overwrites the existing X2_* columns. Consider removing it so the notebook
# reads cleanly under Restart & Run All.
one_hot_top_x(data,'X2')

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']
