# One Hot Encoding - variables with many categories

In [2]:
import pandas as pd
import numpy as np
# Training CSV of the Mercedes-Benz Greener Manufacturing data, read straight from GitHub
dataset = "https://raw.githubusercontent.com/subhadipml/Mercedes-Benz-Greener-Manufacturing/master/train.csv"
# Load only the six columns X1..X6 that this notebook works with
data = pd.read_csv(
    filepath_or_buffer=dataset,
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

In [3]:
# Preview the first five rows of the loaded columns
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [4]:
# Report the cardinality (number of distinct labels) of every column
for column_name in data.columns:
  distinct_labels = data[column_name].unique()
  print(column_name,":",len(distinct_labels),' labels')

X1 : 27  labels
X2 : 44  labels
X3 : 7  labels
X4 : 4  labels
X5 : 29  labels
X6 : 12  labels


In [5]:
# get_dummies converts every categorical column into dummy (indicator) variables;
# drop_first=True keeps k-1 indicators for a column with k labels.
# Checking .shape shows how many columns a full one-hot encoding of the frame produces.
pd.get_dummies(data,drop_first=True).shape

(4209, 117)

In [6]:
# Strategy: limit one hot encoding to the 10 most frequent labels only.
# First, look at the 20 most frequent categories of the variable X2
# (counts are listed from most to least frequent).
data['X2'].value_counts().sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
i       25
k       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [7]:
# Collect the 10 most frequent categories of X2 into a plain Python list
top10 = list(
    data['X2'].value_counts().sort_values(ascending=False).head(10).index
)
top10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [8]:
# Add one 0/1 indicator column to `data` for each of the top-10 X2 labels;
# each new column is named after the label itself.
for category in top10:
  matches_category = data['X2']==category
  data[category] = np.where(matches_category,1,0)

In [9]:
# Display the frame: the original columns followed by the 10 new indicator columns
data

Unnamed: 0,X1,X2,X3,X4,X5,X6,as,ae,ai,m,ak,r,n,s,f,e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,1,0,0,0,0,0,0,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,0,0,0,0,0,0
4206,v,r,a,d,aa,g,0,0,0,0,0,1,0,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,0,0,0,0,0,1


In [14]:
data[['X2']+top10].head(30) # list concatenation: select X2 followed by its 10 indicator columns

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [15]:
# helper that builds the top-label dummy variables for any one categorical variable
def one_hot_top_x(df,variable, top_x_labels):
  """Add one 0/1 indicator column to ``df`` for each label in ``top_x_labels``.

  For every label a new column named ``"<variable> <label>"`` is created,
  holding 1 where ``df[variable]`` equals the label and 0 elsewhere. Rows whose
  value is not in ``top_x_labels`` get 0 in all of the new columns.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to extend. It is modified in place.
  variable : str
      Name of the categorical column in ``df`` to encode.
  top_x_labels : iterable
      Labels to encode; the caller decides how many frequent labels to pass.

  Returns
  -------
  None
      The new columns are written directly into ``df``.
  """
  for label in top_x_labels:
    # Compare against the frame that was passed in (not a notebook-level global),
    # so the helper works for any DataFrame. str() keeps the column-name
    # concatenation valid for non-string labels as well.
    df[variable+' '+str(label)] = np.where(df[variable]==label,1,0)

# Re-read the six raw columns so the indicator columns added earlier are discarded
data = pd.read_csv(
    filepath_or_buffer=dataset,
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

# Encode X2 using its 10 most frequent labels
one_hot_top_x(df=data, variable='X2', top_x_labels=top10)

In [16]:
# Check that the prefixed indicator columns (e.g. 'X2 as') were added
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2 as,X2 ae,X2 ai,X2 m,X2 ak,X2 r,X2 n,X2 s,X2 f,X2 e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0
