In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('mercedesbenz.csv',usecols=['X1','X2','X3','X4','X5','X6'])

In [3]:
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [4]:
# we count the numbers of unique categorical values in each column.
for col in data.columns:
    print(col,' :  ',len(data[col].unique()),'labels')

X1  :   27 labels
X2  :   44 labels
X3  :   7 labels
X4  :   4 labels
X5  :   29 labels
X6  :   12 labels


In [5]:
# we try to use one hot encoding 
pd.get_dummies(data,drop_first=True).shape

# In the output we see that after doing one hot encoding in these 5 columns 
# our number of columns increses to 117(i.e. 112 more columns)
# this Stratagy is not viable as it will start inducing curse of dimensionality

(4209, 117)

In [6]:
# here we find the top 20 most frequent columns in the dataset 
data.X2.value_counts().sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
i       25
k       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [7]:
# we find the index of top 10 most frequent values i.e. the most frrequent categories in the column

top_10 = [x for x in data.X6.value_counts().sort_values(ascending=False).head(10).index]
top_10

['g', 'j', 'd', 'i', 'l', 'a', 'h', 'k', 'c', 'b']

In [8]:
# now we replace the top 10 most frequent variable with their one hot encoding
for lable in top_10:
    data[lable] = np.where(data['X2']==lable,1,0)

In [9]:
data.head(20)

Unnamed: 0,X1,X2,X3,X4,X5,X6,g,j,d,i,l,a,h,k,c,b
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,0,0,0,0
5,b,e,c,d,g,h,0,0,0,0,0,0,0,0,0,0
6,r,e,f,d,f,h,0,0,0,0,0,0,0,0,0,0
7,l,as,f,d,f,j,0,0,0,0,0,0,0,0,0,0
8,s,as,e,d,f,i,0,0,0,0,0,0,0,0,0,0
9,b,aq,c,d,f,a,0,0,0,0,0,0,0,0,0,0


In [10]:
def oneHotEncoder(df,top_x,cols):
    for xCols in cols:
        top_x_lable = [x for x in df[xCols].value_counts().sort_values(ascending=False).head(top_x).index]
        for lable in top_x_lable:
            df[xCols+'_'+lable] = np.where(df[xCols]==lable,1,0)
    return df

In [11]:
data = oneHotEncoder(data,10,['X1','X2','X3','X4','X5','X6'])

In [12]:
data.describe()

Unnamed: 0,g,j,d,i,l,a,h,k,c,b,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,0.002851,0.000238,0.004277,0.00594,0.000238,0.011167,0.001426,0.00594,0.000238,0.004989,...,0.247565,0.246852,0.148491,0.115942,0.113566,0.048943,0.045141,0.010216,0.009028,0.006652
std,0.053325,0.015414,0.065263,0.076849,0.015414,0.105093,0.037734,0.076849,0.015414,0.070467,...,0.431649,0.431231,0.355629,0.320193,0.317321,0.215774,0.207639,0.10057,0.094599,0.0813
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
data.head(10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,g,j,d,i,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,v,at,a,d,u,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5,b,e,c,d,g,h,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,r,e,f,d,f,h,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,l,as,f,d,f,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,s,as,e,d,f,i,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,b,aq,c,d,f,a,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [15]:
data.shape

(4209, 67)