#                                Handle Categorical variable

## Target Guided Ordinal Encoding
1. Ordering the labels according to the target
2. Replace each label with its integer rank in that ordering; in a classification problem the labels are ordered by the mean of the target, i.e. the fraction of 1s observed for each label

In [3]:
import pandas as pd
# Read only the categorical feature (Cabin) and the target (Survived).
df = pd.read_csv(
    'train.csv',
    usecols=['Cabin', 'Survived'],
)
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [4]:
# Treat a missing cabin as its own category instead of leaving NaN.
# The result is assigned back to the column: calling
# fillna(..., inplace=True) on the df['Cabin'] selection is chained
# assignment, which pandas deprecates and which does not update df once
# Copy-on-Write is enabled.
df['Cabin'] = df['Cabin'].fillna('Missing')

In [5]:
# Inspect the frame: rows without a cabin should now read 'Missing'
df

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
...,...,...
886,0,Missing
887,1,B42
888,0,Missing
889,1,C148


In [6]:
# The first letter of a cabin code indicates which block it is in, so keep
# only that letter ('Missing' becomes 'M'). astype(str) makes sure the
# .str accessor is available on the column.
df['Cabin'] = df['Cabin'].astype(str).str[0]
# Display the stored column; slicing it a second time is unnecessary.
df['Cabin']

0      M
1      C
2      M
3      C
4      M
      ..
886    M
887    B
888    M
889    C
890    M
Name: Cabin, Length: 891, dtype: object

In [7]:
# Inspect the frame: Cabin now holds a single block letter per row
df

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M
...,...,...
886,0,M
887,1,B
888,0,M
889,1,C


In [8]:
# Distinct cabin blocks present after the reduction to one letter
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [9]:
# Survival rate per cabin block (mean of the 0/1 Survived column, so a
# fraction between 0 and 1), listed from highest to lowest.
(
    df.groupby(['Cabin'])['Survived']
    .mean()
    .sort_values(ascending=False)
)

Cabin
D    0.757576
E    0.750000
B    0.744681
F    0.615385
C    0.593220
G    0.500000
A    0.466667
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [12]:
# Cabin blocks ordered from lowest to highest survival rate
(
    df.groupby(['Cabin'])['Survived']
    .mean()
    .sort_values()
    .index
)

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [10]:
# For comparison: without sort_values() the blocks come back in group-key order
(
    df.groupby(['Cabin'])['Survived']
    .mean()
    .index
)

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'M', 'T'], dtype='object', name='Cabin')

In [11]:
# Keep the cabin blocks ordered by ascending survival rate; the position of
# a block in this index is what the ordinal encoding will use.
ordinal_labels = (
    df.groupby(['Cabin'])['Survived']
    .mean()
    .sort_values()
    .index
)
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [12]:
enumerate(ordinal_labels,0)
# enumerate is lazy, so only its repr is displayed here. Iterating over it
# yields (position, label) pairs, and the first position it assigns is 0.

<enumerate at 0x1a31eebe7f0>

In [13]:
# Build the label -> integer lookup. Positions follow the order of
# ordinal_labels, so the first label maps to 0 and the last label maps to
# the largest integer.
ordinal_labels2 = dict(zip(ordinal_labels, range(len(ordinal_labels))))
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [14]:
# Store, for every row, the integer assigned to its cabin block
df['Cabin_ordinal_labels'] = df['Cabin'].map(ordinal_labels2)

In [15]:
df # Cabin_ordinal_labels comes from ranking the cabin blocks by the mean of the target

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1
...,...,...,...
886,0,M,1
887,1,B,6
888,0,M,1
889,1,C,4


# Mean Encoding 

In [16]:
# Mean encoding: each category (e.g. 'A') will be replaced by the mean of
# Survived observed for that category.
(
    df.groupby(['Cabin'])['Survived']
    .mean()
)

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [17]:
# Same per-block means, converted to a plain dict keyed by cabin block
(
    df.groupby(['Cabin'])['Survived']
    .mean()
    .to_dict()
)

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [18]:
# Lookup table for mean encoding: cabin block -> mean of Survived.
# NOTE(review): the means are computed over every row of df, using the
# target itself; if this feature feeds a model, consider computing the table
# on the training split only to avoid target leakage.
mean_ordinal = (
    df.groupby(['Cabin'])['Survived']
    .mean()
    .to_dict()
)
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [19]:
# Replace every cabin block with the mean of Survived recorded for that block
df['mean_ordinal_encode'] = df['Cabin'].map(mean_ordinal)

In [20]:
# Inspect the frame with the ordinal and the mean encoding side by side
df

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.593220
2,1,M,1,0.299854
3,1,C,4,0.593220
4,0,M,1,0.299854
...,...,...,...,...
886,0,M,1,0.299854
887,1,B,6,0.744681
888,0,M,1,0.299854
889,1,C,4,0.593220
