# One Hot Encoding - Variables with many categories

In [7]:
import numpy as np
import pandas as pd

# Read only the six feature columns X1..X6 from the training file
feature_cols = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']
data = pd.read_csv('train.csv', usecols=feature_cols)

# Preview the first rows
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [11]:
# Report how many distinct labels (categories) each column holds.
# dropna=False keeps a missing value counted as its own label, which is
# what len(series.unique()) does as well.
label_counts = data.nunique(dropna=False)

for col, n_labels in label_counts.items():
    print(col, ':', n_labels, 'labels')

X1 : 27 labels
X2 : 44 labels
X3 : 7 labels
X4 : 4 labels
X5 : 29 labels
X6 : 12 labels


In [14]:
# Shape of the frame we would get after one-hot encoding the data
# (drop_first=True drops the first level of each encoded variable)
encoded = pd.get_dummies(data, drop_first=True)
encoded.shape

(4209, 117)

- We can see that from 6 initial variables, we end up with 117 variables after one-hot encoding

In [20]:
# Count how often each category of feature X2 occurs, then show the
# 20 most frequent categories in descending order of frequency
x2_counts = data['X2'].value_counts()
x2_counts.sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [22]:
# Collect the labels of the 10 most frequent X2 categories into a plain list
x2_ranked = data['X2'].value_counts().sort_values(ascending=False)
top10 = list(x2_ranked.head(10).index)
top10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [30]:
# Add one 0/1 indicator column per frequent X2 category; the new column is
# named after the category itself. Rows whose X2 value is not in top10 get
# 0 in every indicator column.
x2_values = data['X2']
for category in top10:
    is_category = x2_values == category
    data[category] = np.where(is_category, 1, 0)

# Show X2 side by side with its new indicator columns
preview_cols = ['X2', *top10]
data[preview_cols].head(40)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0
