In [8]:
import pandas as pd
df = pd.read_csv('titanic.csv', usecols = ['Embarked'])

In [9]:
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [10]:
df.shape

(891, 1)

In [11]:
df.isnull().sum()

Embarked    2
dtype: int64

In [12]:
df.dropna()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S
...,...
886,S
887,S
888,S
889,C


In [14]:
pd.get_dummies(df, drop_first = True).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [21]:
df = pd.read_csv('mercedes.csv', usecols = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6'])

In [22]:
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [23]:
df.shape

(4209, 6)

#### Now, let's perform the one hot encoding for this dataset which has a numerous number of categories in a feature

In [24]:
df['X1'].unique()

array(['v', 't', 'w', 'b', 'r', 'l', 's', 'aa', 'c', 'a', 'e', 'h', 'z',
       'j', 'o', 'u', 'p', 'n', 'i', 'y', 'd', 'f', 'm', 'k', 'g', 'q',
       'ab'], dtype=object)

In [32]:
pd.get_dummies(df['X1'], drop_first = True).head() ### Shouldn't use one hot coding for these kind of problems as it creates a various number of features which is not possible

Unnamed: 0,aa,ab,b,c,d,e,f,g,h,i,...,p,q,r,s,t,u,v,w,y,z
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [28]:
for i in df.columns:
    print(df[i].value_counts())

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
y      23
f      23
j      22
n      19
k      17
p       9
g       6
ab      3
q       3
d       3
Name: X1, dtype: int64
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
i       25
k       25
b       21
ao      20
ag      19
z       19
d       18
ac      13
g       12
ap      11
y       11
x       10
aw       8
h        6
at       6
al       5
q        5
an       5
ah       4
p        4
av       4
au       3
l        1
af       1
am       1
o        1
aa       1
c        1
j        1
ar       1
Name: X2, dtype: int64
c    1942
f    1076
a     440
d     290
g     241
e     163
b      57
Name: X3, dtype: int64
d    4205
a       2
c       1
b       1
Name: X4, dtype: int64
v     231
w     231
q     220
r     215
d   

In [31]:
for i in df.columns:
    print(len(df[i].unique()))

27
44
7
4
29
12


#### For these kind of problems we can apply the technique which was used in kde ornge cup challenge kaggle competition

They have actually took the top 10 most frequent categories in every variable or feature and replace  the remaining categories with 1 if they are present in a list of top 10 orelse replace it with 0.

In [43]:
top_10 = df.X1.value_counts().sort_values(ascending = False).head(10).index

In [44]:
top_10

Index(['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'], dtype='object')

In [45]:
top = list(top_10)

In [46]:
top

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [47]:
import numpy as np
for categories in top:
    df[categories] = np.where(df['X1'] == categories, 1, 0)
     

In [50]:
df[top]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o
0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0
4205,0,0,0,0,0,0,0,0,0,1
4206,0,0,0,0,1,0,0,0,0,0
4207,0,0,0,0,0,1,0,0,0,0


In [52]:
top.append('X1')

In [54]:
df[top].head(20)

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1,X1.1
0,0,0,0,0,1,0,0,0,0,0,v,v
1,0,0,0,0,0,0,0,0,0,0,t,t
2,0,0,0,0,0,0,0,0,0,0,w,w
3,0,0,0,0,0,0,0,0,0,0,t,t
4,0,0,0,0,1,0,0,0,0,0,v,v
5,0,0,1,0,0,0,0,0,0,0,b,b
6,0,0,0,0,0,1,0,0,0,0,r,r
7,0,0,0,1,0,0,0,0,0,0,l,l
8,0,1,0,0,0,0,0,0,0,0,s,s
9,0,0,1,0,0,0,0,0,0,0,b,b
