# **Titanic Dataset - Feature Engineering**

In [205]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [206]:
df=pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## **Checking for NULL values**

In [207]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [208]:
df.isnull().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

## **Data Missing not at random (MNAR)**, 
i.e there is a specific reason for the missing data,
for example, the data of dead people in the Titanic can't be easily
collected, so it may be missing

In [209]:
df['cabin_null']=np.where(df['Cabin'].isnull(),1,0)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,cabin_null
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


In [210]:
df.groupby(['Survived'])['cabin_null'].mean()

Survived
0    0.876138
1    0.602339
Name: cabin_null, dtype: float64

**87% of the 'not survived' category data is missing(much more then 'survived' category)(clear reason)**

In [211]:
df['age_null']=np.where(df['Age'].isnull(),1,0)
df.groupby(['Survived'])['age_null'].mean()

Survived
0    0.227687
1    0.152047
Name: age_null, dtype: float64

## **Data missing completely at random(MCAR)**
No obvious reason for the data to be missing

In [212]:
df['Embarked'].isnull().mean()

0.002244668911335578

In [213]:
df[df['Embarked'].isnull()] # only two values missing, which doesn't have obvious reason

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,cabin_null,age_null
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,0,0
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,0,0


## **Cardinality**
If a feature has very high cardinality (comparable to training set size), it may happen that a lot of unique values of that feature which are present in the training set are not present in the testing set. This may lead to poor performance of the model.

In [214]:
from sklearn.metrics import roc_auc_score

In [215]:
# Checking the cardinality for each feature

for col in df.columns:
  print(col,":",len(df[col].unique()))

PassengerId : 891
Survived : 2
Pclass : 3
Name : 891
Sex : 2
Age : 89
SibSp : 7
Parch : 7
Ticket : 681
Fare : 248
Cabin : 148
Embarked : 4
cabin_null : 2
age_null : 2


In [216]:
# Reducing the cardinality of the 'Cabin' column

df['Cabin'].str[0].unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [217]:
# Replacing the cabin values with their first characters to reduce their cardinality

new_cabin_col=df['Cabin'].str[0].fillna('N')
df['Cabin_reduced']=new_cabin_col
len(df['Cabin_reduced'].unique())

9

**Droppin the high cardinality Cabin column**

In [218]:
df.drop(columns=['Cabin'],inplace=True,axis=0)
df.rename(columns={'Cabin_reduced':'Cabin'},inplace=True)

In [219]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,cabin_null,age_null,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1,0,N
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0,0,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,0,N
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0,0,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1,0,N


In [220]:
data=df[['Cabin','Sex']]
Y=df[['Survived']]
data.head()

Unnamed: 0,Cabin,Sex
0,N,male
1,C,female
2,N,female
3,C,female
4,N,male


## **One hot encoding**

In [221]:
data=pd.get_dummies(data)
data.drop(columns=['Cabin_T'],inplace=True)
data.head()

Unnamed: 0,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_N,Sex_female,Sex_male
0,0,0,0,0,0,0,0,1,0,1
1,0,0,1,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,1,1,0
3,0,0,1,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0,1


In [222]:
data.shape

(891, 10)

## Splitting train and test data

In [223]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, Y, test_size=0.33, random_state=42)
X_train.shape,X_test.shape

((596, 10), (295, 10))

## **Training the model**

In [224]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [225]:
clf=RandomForestClassifier(n_estimators=200,random_state=42)
clf.fit(X_train,y_train)

  


RandomForestClassifier(n_estimators=200, random_state=42)

In [226]:
train_pred=clf.predict_proba(X_train)
test_pred=clf.predict_proba(X_test)

roc_auc_score(y_test,test_pred[:,1])

0.8421190476190477

In [227]:
test=pd.read_csv('test.csv')

In [228]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [229]:
# Replacing the cabin values with their first characters to reduce their cardinality

new_cabin_col=test['Cabin'].str[0].fillna('N')
test['Cabin_reduced']=new_cabin_col
len(test['Cabin_reduced'].unique())

8

In [230]:
test_data=test[['Cabin_reduced','Sex']]
# test_data.head()
test_data.shape

(418, 2)

In [231]:
test_data=pd.get_dummies(test_data)
test_data.head()

Unnamed: 0,Cabin_reduced_A,Cabin_reduced_B,Cabin_reduced_C,Cabin_reduced_D,Cabin_reduced_E,Cabin_reduced_F,Cabin_reduced_G,Cabin_reduced_N,Sex_female,Sex_male
0,0,0,0,0,0,0,0,1,0,1
1,0,0,0,0,0,0,0,1,1,0
2,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,1,0,1
4,0,0,0,0,0,0,0,1,1,0


In [232]:
final_pred=clf.predict_proba(test_data)
ans=[]
for x in final_pred:
  if x[0]>=x[1]:
    ans.append(0)
  else:
    ans.append(1)

Feature names unseen at fit time:
- Cabin_reduced_A
- Cabin_reduced_B
- Cabin_reduced_C
- Cabin_reduced_D
- Cabin_reduced_E
- ...
Feature names seen at fit time, yet now missing:
- Cabin_A
- Cabin_B
- Cabin_C
- Cabin_D
- Cabin_E
- ...



In [234]:
submission=pd.DataFrame()
submission['PassengerId']=test['PassengerId']
submission['Survived']=ans
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [235]:
submission.to_csv('gender_submission.csv',index=False)