## Handle Categorical Features

### One Hot Encoding

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("titanic.csv", usecols = ["Sex"])

In [3]:
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [4]:
# One-hot encode "Sex". drop_first=True keeps k-1 indicator columns, so the
# dropped level is implied by a row of zeros. dtype=int pins the 0/1 integer
# output; pandas >= 2.0 would otherwise return True/False booleans.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [5]:
df=pd.read_csv('titanic.csv',usecols=['Embarked'])

In [6]:
df["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [7]:
# Drop the rows whose Embarked value is missing. Calling
# dropna(inplace=True) on the selected column only affects a temporary
# Series and never removes rows from df, so the filtered frame is assigned
# back to df instead.
df = df.dropna(subset=["Embarked"])

In [8]:
# One-hot encode "Embarked" with k-1 indicator columns (the first level is
# implied by all zeros). dtype=int pins the 0/1 integer output; pandas >= 2.0
# would otherwise return True/False booleans.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


### One Hot Encoding with many categories in a feature

In [9]:
df=pd.read_csv('mercedes.csv',usecols=["X0","X1","X2","X3","X4","X5","X6"])

In [10]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [11]:
# Print the number of distinct values in each column, one line per column
# (a missing value, if present, counts as its own level).
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(len(distinct_values))

47
27
44
7
4
29
12


If we use one-hot encoding here, it will create a very large number of new columns (one per category), which becomes a serious problem. In this case, do not use plain one-hot encoding.

Instead, use the technique from the KDD Orange competition: one-hot encode only the most frequent categories.

In [12]:
# Frequency of each X1 label, largest first; show the ten most common.
x1_counts_desc = df["X1"].value_counts().sort_values(ascending=False)
x1_counts_desc.iloc[:10]

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

In [13]:
# Labels of the ten most frequent X1 categories, most frequent first,
# as a plain Python list.
x1_frequencies = df["X1"].value_counts().sort_values(ascending=False)
lst_10 = x1_frequencies.index[:10].tolist()

In [14]:
lst_10


['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

Now apply one-hot encoding only to these top 10 categories; all remaining categories are skipped (they get 0 in every new column).

In [15]:
import numpy as np

# Add one 0/1 indicator column per label in lst_10, named after the label.
# Rows whose X1 value is not in the list get 0 in every indicator column.
for top_label in lst_10:
    is_top_label = df["X1"] == top_label
    df[top_label] = np.where(is_top_label, 1, 0)

In [16]:
# Add the source column to the display list exactly once. An unconditional
# append adds another 'X1' entry every time this cell is re-run, which makes
# df[lst_10] show duplicated X1 columns.
if 'X1' not in lst_10:
    lst_10.append('X1')

In [17]:
df[lst_10].head(10)

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
5,0,0,1,0,0,0,0,0,0,0,b
6,0,0,0,0,0,1,0,0,0,0,r
7,0,0,0,1,0,0,0,0,0,0,l
8,0,1,0,0,0,0,0,0,0,0,s
9,0,0,1,0,0,0,0,0,0,0,b


### Ordinal Number Encoding

In [18]:
import datetime

In [19]:
# Current local date and time as a naive datetime (no tzinfo). The value
# changes on every run, so the dates shown below depend on the run date.
today_date = datetime.datetime.now()

In [20]:
today_date

datetime.datetime(2021, 9, 4, 17, 42, 14, 724914)

In [21]:
today_date - datetime.timedelta(2)

datetime.datetime(2021, 9, 2, 17, 42, 14, 724914)

In [22]:
# Fifteen timestamps stepping back one day at a time, starting at today_date
# (offset 0) and ending fourteen days earlier.
days = [today_date - datetime.timedelta(days=offset) for offset in range(15)]

In [23]:
import pandas as pd

# Single-column frame holding the generated timestamps under "Days".
data = pd.DataFrame(days, columns=["Days"])

In [24]:
data.head()

Unnamed: 0,Days
0,2021-09-04 17:42:14.724914
1,2021-09-03 17:42:14.724914
2,2021-09-02 17:42:14.724914
3,2021-09-01 17:42:14.724914
4,2021-08-31 17:42:14.724914


In [25]:
# Derive the weekday name of each timestamp through the .dt accessor and
# store it as a new text column.
data["Weekday"] = data["Days"].dt.day_name()
data.head()

Unnamed: 0,Days,Weekday
0,2021-09-04 17:42:14.724914,Saturday
1,2021-09-03 17:42:14.724914,Friday
2,2021-09-02 17:42:14.724914,Thursday
3,2021-09-01 17:42:14.724914,Wednesday
4,2021-08-31 17:42:14.724914,Tuesday


In [26]:
# Weekday name -> ordinal position, with Monday = 1 through Sunday = 7.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dictionary = {name: position for position, name in enumerate(weekday_names, start=1)}

In [27]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [28]:
# Replace each weekday name with its number from `dictionary`.
# Series.map yields NaN for any value that is not a key of the mapping.
data["Weekday_Ordinal"] = data["Weekday"].map(dictionary)


In [29]:
data

Unnamed: 0,Days,Weekday,Weekday_Ordinal
0,2021-09-04 17:42:14.724914,Saturday,6
1,2021-09-03 17:42:14.724914,Friday,5
2,2021-09-02 17:42:14.724914,Thursday,4
3,2021-09-01 17:42:14.724914,Wednesday,3
4,2021-08-31 17:42:14.724914,Tuesday,2
5,2021-08-30 17:42:14.724914,Monday,1
6,2021-08-29 17:42:14.724914,Sunday,7
7,2021-08-28 17:42:14.724914,Saturday,6
8,2021-08-27 17:42:14.724914,Friday,5
9,2021-08-26 17:42:14.724914,Thursday,4


### Count or Frequency encoding

##### Advantages
1) Easy To Use
2) Not increasing feature space 
##### Disadvantages
1) Categories that have the same frequency receive the same encoded value, so they can no longer be told apart

### Target Guided Ordinal Encoding

1) Ordering the labels according to the target
2) Replace the labels by the joint probability of being 1 or 0

In [30]:
df=pd.read_csv('titanic.csv', usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [31]:
# Label missing cabins explicitly with "Missing". The result is assigned back
# to the column rather than calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and leaves df unchanged
# under pandas copy-on-write.
df["Cabin"] = df["Cabin"].fillna("Missing")
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [32]:
# Keep only the first character of each cabin value (e.g. "C85" -> "C",
# "Missing" -> "M").
cabin_text = df["Cabin"].astype(str)
df["Cabin"] = cabin_text.str.get(0)

In [33]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [34]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [35]:
df.groupby(["Cabin"])["Survived"].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [36]:
df.groupby(["Cabin"])["Survived"].mean().sort_values().index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [37]:
# Cabin letters ordered from the lowest to the highest mean survival rate.
survival_rate_by_cabin = df.groupby(["Cabin"])["Survived"].mean()
ordinal_labels = survival_rate_by_cabin.sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [38]:
enumerate(ordinal_labels,0)

<enumerate at 0x271af7788c0>

In [39]:
# Map each cabin letter to its position in the survival-ordered index
# (0 = lowest mean survival rate).
ordinal_labels2 = dict(zip(ordinal_labels, range(len(ordinal_labels))))
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [40]:
# Encode each cabin letter as its rank from ordinal_labels2 and keep the
# result in a new column next to the original letter.
df["Cabin_Ordinal_labels"] = df["Cabin"].map(ordinal_labels2)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_Ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


Now we can drop Cabin column.

### Mean Encoding

In [41]:
# Mean of Survived for each cabin letter, as a plain {letter: mean} dict.
cabin_survival_rate = df.groupby(['Cabin'])['Survived'].mean()
mean_ordinal = cabin_survival_rate.to_dict()

In [42]:
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [43]:
# Replace each cabin letter with the mean survival rate of its group.
# NOTE(review): the means come from the same rows they are mapped onto, so
# the feature contains target information; for modelling, presumably the
# mapping should be computed on the training split only - confirm.
df['mean_ordinal_encode']=df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_Ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


### Probability Ratio Encoding

1) Probability of Survived based on Cabin--- Categorical Feature
2) Probability of Not Survived---1-pr(Survived)
3) pr(Survived)/pr(Not Survived)
4) Dictionary to map each cabin to its probability ratio
5) Replace the categorical feature with the mapped values

In [44]:
df = pd.read_csv("titanic.csv", usecols = ["Cabin", "Survived"])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [45]:
# Replace NaN cabin values with the explicit label "Missing". The result is
# assigned back to the column rather than calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and leaves df
# unchanged under pandas copy-on-write.
df["Cabin"] = df["Cabin"].fillna("Missing")

In [46]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [47]:
df["Cabin"].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [48]:
# Reduce each cabin value to its first character (e.g. "C85" -> "C",
# "Missing" -> "M"), then preview the result.
cabin_text = df["Cabin"].astype(str)
df["Cabin"] = cabin_text.str.get(0)
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [49]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [50]:
# Mean of Survived per cabin letter, i.e. the observed survival rate of each
# group (Survived holds 0/1 in the preview above).
prob_df = df.groupby(["Cabin"])["Survived"].mean()

In [51]:
# Wrap the per-cabin Series in a DataFrame so that further columns can be
# added next to "Survived" in the following cells.
prob_df=pd.DataFrame(prob_df)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [52]:
# Complement of the survival rate: the share of each cabin group that died.
survived_rate = prob_df["Survived"]
prob_df["Died"] = survived_rate.rsub(1)
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [53]:
# Ratio of survival rate to death rate for each cabin group.
# NOTE(review): a group whose Died rate is exactly 0 would produce an
# infinite ratio here - confirm whether that case needs special handling.
prob_df["Probability_ratio"] = prob_df["Survived"].div(prob_df["Died"])
prob_df.head()

Unnamed: 0_level_0,Survived,Died,Probability_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [54]:
# Turn the ratio column into a {cabin letter: ratio} lookup dict.
probability_encoded = prob_df["Probability_ratio"].to_dict()

In [55]:
# Encode each cabin letter with its survived/died probability ratio and keep
# the result in a new column alongside the original letter.
df["Cabin_encoded"] = df["Cabin"].map(probability_encoded)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274


In [56]:
df.head(20)

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
5,0,M,0.428274
6,0,E,3.0
7,0,M,0.428274
8,1,M,0.428274
9,1,M,0.428274


Now we can drop Cabin Column.