# Handling missing values using all techniques (Categorical features)

## Frequent category imputation

In [None]:
df = pd.read_csv("loan.csv")
df.head()

In [None]:
df.columns

In [None]:
df = pd.read_csv("loan.csv",usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
df.isnull().mean().sort_values(ascending = True)

In [None]:
df['BsmtQual'].value_counts().plot.bar()

In [None]:
df['FireplaceQu'].value_counts().plot.bar()

In [None]:
df['GarageType'].value_counts().plot.bar()

In [None]:
df['GarageType'].value_counts().index[0]

In [None]:
def impute_freq(df,variable):
    most_freq = df[variable].value_counts().index[0]
    df[variable] = df[variable].fillna(most_freq)

In [None]:
for feature in ['BsmtQual','FireplaceQu','GarageType']:
    impute_freq(df,feature)

In [None]:
df.head()

In [None]:
df.isnull().mean()

In [None]:
df.drop(['BsmtQual_freq','FireplaceQu_freq','GarageType_freq'],axis=1,inplace=True)

In [None]:
df.head()

## Adding a variable to capture NAN

In [None]:
df = pd.read_csv("loan.csv",usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])
df.head()

In [None]:
df['BsmtQual_var'] = np.where(df['BsmtQual'].isnull(),1,0)

In [None]:
df.head()

In [None]:
def impute_new_feature(df,variable):
    df[variable+"_newvar"] = np.where(df[variable].isnull(),1,0)
    most_freq = df[variable].value_counts().index[0]
    df[variable] = df[variable].fillna(most_freq)

In [None]:
for feature in ['BsmtQual','FireplaceQu','GarageType']:
    impute_new_feature(df,feature)

In [None]:
df.head()

## Suppose we have more frequent categories, just replace NAN with new category

In [None]:
df = pd.read_csv("loan.csv",usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])
df.head()

In [None]:
def impute_new_category(df,variable):
    df[variable+"_newvar"] = np.where(df[variable].isnull(),"Missing",df[variable])

In [None]:
for feature in ['BsmtQual','FireplaceQu','GarageType']:
    impute_new_category(df,feature)

In [None]:
df.head()

In [None]:
df.drop(['BsmtQual','FireplaceQu','GarageType'],axis=1,inplace=True)

In [None]:
df.head()

## One Hot Encoding (Nominal Features)

In [38]:
df = pd.read_csv("titanic.csv")
df.head(10)

<IPython.core.display.Javascript object>

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [39]:
df = pd.read_csv("titanic.csv", usecols = ["Sex","Embarked"])
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [40]:
pd.get_dummies(df["Sex"],drop_first= True).head()

<IPython.core.display.Javascript object>

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [None]:
df["Embarked"].dropna(inplace = True)

In [None]:
pd.get_dummies(df["Embarked"],drop_first= True).head(10)

In [None]:
df.head(10)

### If we have many categories under single feature we follow top 10 approach

In [41]:
df= pd.read_csv("mercedes.csv" ,usecols =["X0","X1","X2","X3","X4","X5","X6"])
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [42]:
df["X0"].value_counts().head(10)

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
Name: X0, dtype: int64

In [43]:
for category in df.columns:
    print(len(df[category].value_counts()))
    

47
27
44
7
4
29
12


In [None]:
df["X1"].value_counts().head(10)

In [None]:
#lst_10=df.X1.value_counts().sort_values(ascending=False).head(10).index
#lst_10=list(lst_10)

In [None]:
#or categories in lst_10:
 #  df[categories]=np.where(df['X1']==categories,1,0)

In [None]:
#print(type(df['X1']))
#print(type(categories))

In [None]:
#lst_10.append('X1')
#df[lst_10]

In [44]:
top_10_all=[]
for i in df.columns:
    top_10 = df[i].value_counts().head(10).index
    top_10= list(top_10)
    top_10_all.append(top_10)

In [45]:
top_10_all

[['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w'],
 ['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'],
 ['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e'],
 ['c', 'f', 'a', 'd', 'g', 'e', 'b'],
 ['d', 'a', 'c', 'b'],
 ['w', 'v', 'q', 'r', 'd', 's', 'n', 'm', 'p', 'i'],
 ['g', 'j', 'd', 'i', 'l', 'a', 'h', 'k', 'c', 'b']]

In [46]:
def multi_encode(variable,top_10_all):
        first_top10 = top_10_all[0]
        print(first_top10)

        for value in first_top10:
            df[value]= np.where(df[variable]== value,1,0)
        first_top10.append(variable)
        print(df[first_top10])
        top_10_all = top_10_all.pop(0)

In [47]:
for variable in ["X0","X1","X2","X3","X4","X5","X6"]:
    multi_encode(variable,top_10_all)

['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      z  ak  y  ay  t  x  o  f  n  w  X0
0     0   0  0   0  0  0  0  0  0  0   k
1     0   0  0   0  0  0  0  0  0  0   k
2     0   0  0   0  0  0  0  0  0  0  az
3     0   0  0   0  0  0  0  0  0  0  az
4     0   0  0   0  0  0  0  0  0  0  az
5     0   0  0   0  1  0  0  0  0  0   t
6     0   0  0   0  0  0  0  0  0  0  al
7     0   0  0   0  0  0  1  0  0  0   o
8     0   0  0   0  0  0  0  0  0  1   w
9     0   0  0   0  0  0  0  0  0  0   j
10    0   0  0   0  0  0  0  0  0  0   h
11    0   0  0   0  0  0  0  0  0  0  al
12    0   0  0   0  0  0  0  0  0  0   s
13    0   0  0   0  0  0  0  0  0  0  al
14    0   0  0   0  0  0  1  0  0  0   o
15    0   0  0   0  0  0  0  0  1  0   n
16    0   0  0   0  0  0  0  0  0  0  al
17    0   0  0   1  0  0  0  0  0  0  ay
18    0   0  0   0  0  0  0  1  0  0   f
19    0   0  0   0  0  0  0  0  1  0   n
20    0   0  0   0  0  0  0  1  0  0   f
21    0   0  0   0  1  0  0  0  0  0   t
22    0   0  0   0  0  1  0  0  0  0   x
23    0   0  0  

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      aa  s  b  l  v  r  i  a  c  o  X1
0      0  0  0  0  1  0  0  0  0  0   v
1      0  0  0  0  0  0  0  0  0  0   t
2      0  0  0  0  0  0  0  0  0  0   w
3      0  0  0  0  0  0  0  0  0  0   t
4      0  0  0  0  1  0  0  0  0  0   v
5      0  0  1  0  0  0  0  0  0  0   b
6      0  0  0  0  0  1  0  0  0  0   r
7      0  0  0  1  0  0  0  0  0  0   l
8      0  1  0  0  0  0  0  0  0  0   s
9      0  0  1  0  0  0  0  0  0  0   b
10     0  0  0  0  0  1  0  0  0  0   r
11     0  0  0  0  0  1  0  0  0  0   r
12     0  0  1  0  0  0  0  0  0  0   b
13     0  0  0  0  0  1  0  0  0  0   r
14     0  1  0  0  0  0  0  0  0  0   s
15     0  0  0  1  0  0  0  0  0  0   l
16     0  0  0  0  0  1  0  0  0  0   r
17     1  0  0  0  0  0  0  0  0  0  aa
18     0  0  0  0  0  0  0  0  1  0   c
19     0  0  0  0  0  0  0  1  0  0   a
20     0  1  0  0  0  0  0  0  0  0   s
21     1  0  0  0  0  0  0  0  0  0  aa
22     0  0  0  0  0  1  0  0  0  0   r
23     0  0  1  0  0  0  0  0  0  0   b


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      as  ae  ai  m  ak  r  n  s  f  e  X2
0      0   0   0  0   0  0  0  0  0  0  at
1      0   0   0  0   0  0  0  0  0  0  av
2      0   0   0  0   0  0  1  0  0  0   n
3      0   0   0  0   0  0  1  0  0  0   n
4      0   0   0  0   0  0  1  0  0  0   n
5      0   0   0  0   0  0  0  0  0  1   e
6      0   0   0  0   0  0  0  0  0  1   e
7      1   0   0  0   0  0  0  0  0  0  as
8      1   0   0  0   0  0  0  0  0  0  as
9      0   0   0  0   0  0  0  0  0  0  aq
10     0   0   0  0   0  1  0  0  0  0   r
11     0   0   0  0   0  0  0  0  0  1   e
12     0   0   1  0   0  0  0  0  0  0  ai
13     0   0   0  0   0  0  0  0  0  1   e
14     1   0   0  0   0  0  0  0  0  0  as
15     0   0   0  0   1  0  0  0  0  0  ak
16     0   0   0  0   0  0  0  0  0  1   e
17     1   0   0  0   0  0  0  0  0  0  as
18     0   0   0  1   0  0  0  0  0  0   m
19     0   0   0  0   1  0  0  0  0  0  ak
20     0   0   0  1   0  0  0  0  0  0   m
21     1   0   0  0   0  0  0  0  0  0  as
22     0   

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      c  f  a  d  g  e  b X3
0     0  0  1  0  0  0  0  a
1     0  0  0  0  0  1  0  e
2     1  0  0  0  0  0  0  c
3     0  1  0  0  0  0  0  f
4     0  1  0  0  0  0  0  f
5     1  0  0  0  0  0  0  c
6     0  1  0  0  0  0  0  f
7     0  1  0  0  0  0  0  f
8     0  0  0  0  0  1  0  e
9     1  0  0  0  0  0  0  c
10    0  1  0  0  0  0  0  f
11    0  1  0  0  0  0  0  f
12    1  0  0  0  0  0  0  c
13    0  1  0  0  0  0  0  f
14    0  0  0  0  0  1  0  e
15    0  1  0  0  0  0  0  f
16    0  1  0  0  0  0  0  f
17    1  0  0  0  0  0  0  c
18    1  0  0  0  0  0  0  c
19    0  1  0  0  0  0  0  f
20    1  0  0  0  0  0  0  c
21    0  0  0  1  0  0  0  d
22    0  0  0  0  0  0  1  b
23    1  0  0  0  0  0  0  c
24    0  0  0  1  0  0  0  d
25    1  0  0  0  0  0  0  c
26    0  1  0  0  0  0  0  f
27    1  0  0  0  0  0  0  c
28    1  0  0  0  0  0  0  c
29    1  0  0  0  0  0  0  c
...  .. .. .. .. .. .. .. ..
4179  0  1  0  0  0  0  0  f
4180  0  0  1  0  0  0  0  a
4181  0  1  0 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      d  a  c  b X4
0     1  0  0  0  d
1     1  0  0  0  d
2     1  0  0  0  d
3     1  0  0  0  d
4     1  0  0  0  d
5     1  0  0  0  d
6     1  0  0  0  d
7     1  0  0  0  d
8     1  0  0  0  d
9     1  0  0  0  d
10    1  0  0  0  d
11    1  0  0  0  d
12    1  0  0  0  d
13    1  0  0  0  d
14    1  0  0  0  d
15    1  0  0  0  d
16    1  0  0  0  d
17    1  0  0  0  d
18    1  0  0  0  d
19    1  0  0  0  d
20    1  0  0  0  d
21    1  0  0  0  d
22    1  0  0  0  d
23    1  0  0  0  d
24    1  0  0  0  d
25    1  0  0  0  d
26    1  0  0  0  d
27    1  0  0  0  d
28    1  0  0  0  d
29    1  0  0  0  d
...  .. .. .. .. ..
4179  1  0  0  0  d
4180  1  0  0  0  d
4181  1  0  0  0  d
4182  1  0  0  0  d
4183  1  0  0  0  d
4184  1  0  0  0  d
4185  1  0  0  0  d
4186  1  0  0  0  d
4187  1  0  0  0  d
4188  1  0  0  0  d
4189  1  0  0  0  d
4190  1  0  0  0  d
4191  1  0  0  0  d
4192  1  0  0  0  d
4193  1  0  0  0  d
4194  1  0  0  0  d
4195  1  0  0  0  d
4196  1  0  0  0  d


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      w  v  q  r  d  s  n  m  p  i  X5
0     0  0  0  0  0  0  0  0  0  0   u
1     0  0  0  0  0  0  0  0  0  0   y
2     0  0  0  0  0  0  0  0  0  0   x
3     0  0  0  0  0  0  0  0  0  0   x
4     0  0  0  0  0  0  0  0  0  0   h
5     0  0  0  0  0  0  0  0  0  0   g
6     0  0  0  0  0  0  0  0  0  0   f
7     0  0  0  0  0  0  0  0  0  0   f
8     0  0  0  0  0  0  0  0  0  0   f
9     0  0  0  0  0  0  0  0  0  0   f
10    0  0  0  0  0  0  0  0  0  0   f
11    0  0  0  0  0  0  0  0  0  0   f
12    0  0  0  0  0  0  0  0  0  0   f
13    0  0  0  0  0  0  0  0  0  0   j
14    0  0  0  0  0  0  0  0  0  0   j
15    0  0  0  0  0  0  0  0  0  0   j
16    0  0  0  0  0  0  0  0  0  0   j
17    0  0  0  0  0  0  0  0  0  0   j
18    0  0  0  0  0  0  0  0  0  0   j
19    0  0  0  0  0  0  0  0  0  0   j
20    0  0  0  0  0  0  0  0  0  0   j
21    0  0  0  0  0  0  0  0  0  0   j
22    0  0  0  0  0  0  0  0  0  0   j
23    0  0  0  0  0  0  0  0  0  0   j
24    0  0  0  0  0  0  0

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      g  j  d  i  l  a  h  k  c  b X6
0     0  1  0  0  0  0  0  0  0  0  j
1     0  0  0  0  1  0  0  0  0  0  l
2     0  1  0  0  0  0  0  0  0  0  j
3     0  0  0  0  1  0  0  0  0  0  l
4     0  0  1  0  0  0  0  0  0  0  d
5     0  0  0  0  0  0  1  0  0  0  h
6     0  0  0  0  0  0  1  0  0  0  h
7     0  1  0  0  0  0  0  0  0  0  j
8     0  0  0  1  0  0  0  0  0  0  i
9     0  0  0  0  0  1  0  0  0  0  a
10    0  0  0  0  0  0  1  0  0  0  h
11    0  0  0  0  0  0  1  0  0  0  h
12    1  0  0  0  0  0  0  0  0  0  g
13    0  0  0  0  0  0  1  0  0  0  h
14    1  0  0  0  0  0  0  0  0  0  g
15    0  0  1  0  0  0  0  0  0  0  d
16    0  0  0  0  0  0  1  0  0  0  h
17    0  0  0  0  0  0  0  0  1  0  c
18    0  0  1  0  0  0  0  0  0  0  d
19    0  0  0  1  0  0  0  0  0  0  i
20    0  1  0  0  0  0  0  0  0  0  j
21    0  1  0  0  0  0  0  0  0  0  j
22    0  1  0  0  0  0  0  0  0  0  j
23    0  1  0  0  0  0  0  0  0  0  j
24    0  0  1  0  0  0  0  0  0  0  d
25    0  0  

### Ordinal Number Encoding

In [None]:
import datetime

In [None]:
dt.time

In [None]:
today_date = datetime.datetime.today()

In [None]:
today_date

In [None]:
today_date - datetime.timedelta(2)

In [None]:
# list comprehension
new_day = [today_date - datetime.timedelta(i) for i in range(0,15)]

In [None]:
new_day

In [None]:
pd.DataFrame(new_day)

In [None]:
data= pd.DataFrame(new_day)
data.columns = ["Day"]
data.head()

In [None]:
data["Weekday"] =  data["Day"].dt.weekday_name
data.head(10)

In [None]:
dictionary = {'Monday':1, 'Tuesday':2, 'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}

In [None]:
data["Weekday_ordinal"] = data["Weekday"].map(dictionary)

In [None]:
data.head()

## Count or Frequency encoding

In [None]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None)
train_set.head()                                                         

In [None]:
columns = [1,3,5,6,7,8,9,13]

In [None]:
train_set = train_set[columns]

In [None]:
train_set.columns=['Employment','Degree','Status',
                 'Designation','family_job','Race','Sex','Country']

In [None]:
train_set.head()

In [None]:
train_set.shape

In [None]:
country_data = train_set['Country'].value_counts()
country_data

In [None]:
for feature in train_set.columns:
    print(feature,":",len(train_set[feature].value_counts()),"Labels")

In [None]:
country_data = train_set['Country'].value_counts()
country_data

In [None]:
country_data = train_set['Country'].value_counts().to_dict()

In [None]:
country_data

In [None]:
train_set['Country'] =train_set['Country'].map(country_data)
train_set.head(10)

### Target Guided Encoding

In [4]:
df = pd.read_csv("titanic.csv", usecols=['Cabin','Survived'])
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [5]:
df['Cabin'] = df['Cabin'].fillna('Missing')

In [6]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [7]:
df['Cabin']= df['Cabin'].astype(str).str[0]

In [8]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [9]:
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [11]:
df.groupby(df['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [12]:
df.groupby(df['Cabin'])['Survived'].mean().sort_values().index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [13]:
target_ordinal = df.groupby(df['Cabin'])['Survived'].mean().sort_values().index

In [14]:
enumerate(target_ordinal,0)

<enumerate at 0x197155ac288>

In [15]:
target_ordinal_dict = {k:v for v,k in enumerate(target_ordinal,0)}

In [16]:
target_ordinal_dict

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [17]:
df['Cabin_class'] = df['Cabin'].map(target_ordinal_dict)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_class
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


### Mean Encoding

In [18]:
df.groupby(df['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [20]:
mean_ordinal = df.groupby(df['Cabin'])['Survived'].mean().to_dict()

In [22]:
df['mean_ordinal'] = df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_class,mean_ordinal
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


## Probability Ratio Encoding

In [48]:
df = pd.read_csv("titanic.csv", usecols=['Cabin','Survived'])
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [49]:
df['Cabin'] = df['Cabin'].fillna('Missing')

In [25]:
df['Cabin']= df['Cabin'].astype(str).str[0]

In [26]:
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [28]:
mean_od = df.groupby(df['Cabin'])['Survived'].mean()

In [30]:
prob_df = pd.DataFrame(mean_od)
prob_df

<IPython.core.display.Javascript object>

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [31]:
prob_df['died'] = 1 - prob_df['Survived']
prob_df.head()

Unnamed: 0_level_0,Survived,died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [32]:
prob_df['ratio'] = prob_df['Survived'] / prob_df['died']
prob_df.head()

Unnamed: 0_level_0,Survived,died,ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [33]:
prob_dictionary = prob_df['ratio'].to_dict()

In [36]:
df['Prob_ratio']= df['Cabin'].map(prob_dictionary)
df.head()

Unnamed: 0,Survived,Cabin,Prob_ratio
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
