# One Hot Encoding (Columns having many categories)

In [14]:
import pandas as pd
import numpy as np

In [17]:
# Load only the six categorical feature columns X1..X6 from the dataset
ds = pd.read_csv('mercedesbenz.csv', usecols=['X1', 'X2','X3', 'X4','X5', 'X6'])

In [18]:
# Preview the first rows to confirm that only the selected columns were loaded
ds.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [20]:
# Report the cardinality (number of distinct values) of every column
for column in ds.columns:
  distinct_values = ds[column].unique()
  print(f"Column {column} has {len(distinct_values)} categories")

Column X1 has 27 categories
Column X2 has 44 categories
Column X3 has 7 categories
Column X4 has 4 categories
Column X5 has 29 categories
Column X6 has 12 categories


Since there are many categories, performing one-hot encoding with `get_dummies` will increase the number of columns to 117 (123 distinct categories in total, minus one dropped per feature because of `drop_first=True`).

In [21]:
# Keep a separate copy of the frame for the full one-hot encoding experiment
ds_copy = ds.copy()

In [22]:
# One-hot encode every column; drop_first=True drops one dummy per feature,
# so the 123 categories counted above become 123 - 6 = 117 columns
pd.get_dummies(ds_copy, drop_first=True).shape

(4209, 117)

Because of this huge number of columns (features), we run into the "Curse of Dimensionality", and the accuracy of our model may be compromised.

How do we solve this? We take the 10 most frequent categories of each column (feature) and perform one-hot encoding only on them. Rows whose category is not among the top 10 get '0' (zero) in every indicator column of that feature.

In [27]:
# Inspect the 10 most frequent categories of a single feature (X2) with their row counts
(
    ds['X2']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
Name: X2, dtype: int64

In [30]:
# Build a plain list holding the 10 most frequent categories of one column (X2).
# The index of the value_counts() result holds the category labels themselves
# (not the column name), so converting that index to a list yields the labels.
X2_top_10 = list(
    ds["X2"].value_counts().sort_values(ascending=False).head(10).index
)
X2_top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [31]:
# For every top-10 X2 category, add an indicator column named after the label:
# 1 where the row's X2 value equals that label, 0 otherwise. Rows whose X2 value
# is outside the top 10 therefore get 0 in all ten new columns.
for label in X2_top_10:
  matches_label = ds['X2'] == label
  ds[label] = np.where(matches_label, 1, 0)

In [32]:
# Inspect the result: one new 0/1 indicator column per top-10 X2 label
ds.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,as,ae,ai,m,ak,r,n,s,f,e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0


Now let's reload the fresh data and repeat these steps for all the features.

In [65]:
# Reload the raw columns so the indicator columns added during the X2 try-out are discarded
ds = pd.read_csv('mercedesbenz.csv', usecols=['X1', 'X2','X3', 'X4','X5', 'X6'])

In [66]:
# creating a function to get the top-n (10 by default) most frequent categories
def get_10(col_name, data=None, n=10):
  """Return the labels of the `n` most frequent categories of a column.

  Parameters
  ----------
  col_name : label of the column to inspect.
  data : DataFrame to read the column from. When omitted (None), the
      notebook-level `ds` frame is used, which keeps existing
      `get_10('X1')` calls working unchanged.
  n : maximum number of categories to return (default 10).

  Returns
  -------
  list
      Category labels ordered from most to least frequent. The list is
      shorter than `n` when the column has fewer distinct categories.
  """
  # Fall back to the global frame only when no frame is passed explicitly
  frame = ds if data is None else data
  counts = frame[col_name].value_counts().sort_values(ascending=False)
  return list(counts.head(n).index)


In [67]:
# Generate the top-10 category list for each of the six features in one pass;
# the results are unpacked into the same six names, in feature order
X1_top_10, X2_top_10, X3_top_10, X4_top_10, X5_top_10, X6_top_10 = (
    get_10(feature) for feature in ('X1', 'X2', 'X3', 'X4', 'X5', 'X6')
)

In [68]:
# Display the most frequent categories found for the first two features
print("Top to features of X1 is: ", X1_top_10)
print("Top to features of X2 is: ", X2_top_10)

Top to features of X1 is:  ['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']
Top to features of X2 is:  ['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']


In [69]:
# Now creating a function to perform one-hot encoding on the top 10 categories
def one_hot(ds, col_name, top_10_cats):
  """Add one 0/1 indicator column per listed category to `ds`, in place.

  Parameters
  ----------
  ds : DataFrame that holds `col_name`; it is modified in place.
  col_name : label of the column to encode.
  top_10_cats : iterable of category labels to create indicator columns for.

  Each new column is named "<col_name>_<category>" and holds 1 where the row's
  value in `col_name` equals the category and 0 otherwise. Rows whose value is
  not in `top_10_cats` get 0 in every new column. Nothing is returned.
  """
  for cat in top_10_cats:
    # f-string formatting keeps the naming working for non-string labels
    # (e.g. integer categories), where `col_name + '_' + cat` raises TypeError
    ds[f"{col_name}_{cat}"] = np.where(ds[col_name] == cat, 1, 0)

In [70]:
# Perform one-hot encoding of the 10 most frequent categories for every feature,
# pairing each feature name with its previously computed top-10 list
for feature, top_cats in (
    ('X1', X1_top_10),
    ('X2', X2_top_10),
    ('X3', X3_top_10),
    ('X4', X4_top_10),
    ('X5', X5_top_10),
    ('X6', X6_top_10),
):
  one_hot(ds, feature, top_cats)


In [71]:
# Let's check the dataset: the original features followed by the new <feature>_<category> indicator columns
ds.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,v,at,a,d,u,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [72]:
# The indicator columns now carry the encoding, so remove the six original features in place
ds.drop(columns=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'], inplace=True)

In [73]:
# Final check: only the one-hot indicator columns remain
ds.head()

Unnamed: 0,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Advantages
-   Easy to implement and understand
-   Does not require time to explore the variables
- Won't increase the number of features to a large extent



### Disadvantages
- Doesn't add any information that may make the prediction strong
- Doesn't keep the information of the ignored labels
- Loss of information 