# Probability Ratio Encoding 
## Steps
1. Compute the probability of Survived for each Cabin category (the categorical feature)
2. Compute the probability of not Survived, [1 - probability(Survived)], for each Cabin category
3. Take the ratio of the two, probability(Survived) / probability(not Survived), using steps 1 and 2
4. Create a dictionary which maps each Cabin category to its probability ratio
5. Replace each Cabin category (including the 'Missing' category used for missing values) with its mapped ratio

In [2]:
import pandas as pd
# Load only the two columns used below: Cabin (the categorical feature to
# encode) and Survived (the outcome the encoding is based on).
df = pd.read_csv(
    filepath_or_buffer='train.csv',
    usecols=['Cabin', 'Survived'],
)
df

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,
...,...,...
886,0,
887,1,B42
888,0,
889,1,C148


In [3]:
# Treat a missing cabin as its own category: replace NaN with the label
# 'Missing' so those rows can be probability-encoded like any other value.
# Assign the result back instead of calling fillna(inplace=True) on the
# df['Cabin'] selection: that chained in-place call raises a FutureWarning in
# recent pandas 2.x releases and leaves df unchanged once Copy-on-Write is
# enabled.
df['Cabin'] = df['Cabin'].fillna('Missing')

In [4]:
# Display the frame to check that the NaN cabins now show as 'Missing'.
df

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
...,...,...
886,0,Missing
887,1,B42
888,0,Missing
889,1,C148


In [6]:
# List the distinct Cabin values currently in the column.
df['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [7]:
# Collapse every cabin label to its leading character
# (e.g. 'C85' -> 'C', 'F G73' -> 'F', 'Missing' -> 'M').
df['Cabin'] = df['Cabin'].astype(str).str.get(0)

In [8]:
# Display the frame again; Cabin now holds only the first letter of each label.
df

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M
...,...,...
886,0,M
887,1,B
888,0,M
889,1,C


In [9]:
# The distinct Cabin categories that remain after keeping only the first letter.
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [10]:
# Probability encoding starts from the share of survivors within each Cabin
# category. Survived appears as 0/1 in the data shown, so the per-group mean
# is that share (the survival rate).
df.groupby('Cabin')['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [11]:
# Keep the per-category survival rate for the following steps
# (at this point prob_df is a Series indexed by Cabin).
prob_df = df['Survived'].groupby(df['Cabin']).mean()

In [12]:
# Convert the Series of survival rates into a one-column DataFrame so that
# more columns can be added to it.
prob_df=pd.DataFrame(prob_df)
prob_df
# The 'Survived' column holds, for each Cabin category, the probability of surviving.

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [13]:
# Complementary probability: the chance that a person in each Cabin category
# did not survive.
prob_df['Died'] = 1.0 - prob_df['Survived']
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [14]:
# Probability ratio = P(Survived) / P(Died) for each Cabin category.
# NOTE: a category whose survival rate is exactly 1.0 has Died == 0, so this
# division would yield inf; no such category appears in the output shown here.
prob_df['Probability_ratio'] = prob_df['Survived'].div(prob_df['Died'])
prob_df

Unnamed: 0_level_0,Survived,Died,Probability_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0
F,0.615385,0.384615,1.6
G,0.5,0.5,1.0
M,0.299854,0.700146,0.428274
T,0.0,1.0,0.0


In [15]:
# Turn the Probability_ratio column into a plain dict keyed by Cabin category.
prob_df['Probability_ratio'].to_dict()

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [16]:
# Store the category -> probability-ratio lookup; it is used to map the Cabin
# column of df.
Probability_encoded=prob_df['Probability_ratio'].to_dict()

In [17]:
# Preview the encoding: look up each row's Cabin category in the ratio dict.
df['Cabin'].map(Probability_encoded)

0      0.428274
1      1.458333
2      0.428274
3      1.458333
4      0.428274
         ...   
886    0.428274
887    2.916667
888    0.428274
889    1.458333
890    0.428274
Name: Cabin, Length: 891, dtype: float64

In [18]:
# Store the encoded values in a new column, leaving the original Cabin column in place.
df['Cabin_encoded']=df['Cabin'].map(Probability_encoded)

In [19]:
# Cabin_encoded (the last column) can now be used for model training in place
# of the categorical Cabin column.
df

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
...,...,...,...
886,0,M,0.428274
887,1,B,2.916667
888,0,M,0.428274
889,1,C,1.458333
