# Titanic Survival Prediction

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the passenger records from the Excel workbook at the relative path below.
df = pd.read_excel(io='titanic.xls')
# Trailing expression: show (n_rows, n_columns) as the cell output.
df.shape

(1309, 14)

In [3]:
# Summarise column names, non-null counts and dtypes to spot missing data early.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [4]:
# Descriptive statistics for every column; include='all' covers the object
# (string) columns as well as the numeric ones.
df.describe(include='all')

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
count,1309.0,1309.0,1309,1309,1046.0,1309.0,1309.0,1309,1308.0,295,1307,486.0,121.0,745
unique,,,1307,2,,,,939,,186,3,28.0,,369
top,,,"Connolly, Miss. Kate",male,,,,CA. 2343,,C23 C25 C27,S,13.0,,"New York, NY"
freq,,,2,843,,,,11,,6,914,39.0,,64
mean,2.294882,0.381971,,,29.881135,0.498854,0.385027,,33.295479,,,,160.809917,
std,0.837836,0.486055,,,14.4135,1.041658,0.86556,,51.758668,,,,97.696922,
min,1.0,0.0,,,0.1667,0.0,0.0,,0.0,,,,1.0,
25%,2.0,0.0,,,21.0,0.0,0.0,,7.8958,,,,72.0,
50%,3.0,0.0,,,28.0,0.0,0.0,,14.4542,,,,155.0,
75%,3.0,1.0,,,39.0,1.0,0.0,,31.275,,,,256.0,


In [5]:
# Print the column labels as a plain Python list.
print(df.columns.tolist())

['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest']


In [6]:
# For every column, print a separator, the column name, and the frequency of
# each distinct value.
separator = '-' * 50
for column_name in df.columns:
    print(separator, '\n')
    print(column_name)
    print(df[column_name].value_counts())

-------------------------------------------------- 

pclass
3    709
1    323
2    277
Name: pclass, dtype: int64
-------------------------------------------------- 

survived
0    809
1    500
Name: survived, dtype: int64
-------------------------------------------------- 

name
Connolly, Miss. Kate             2
Kelly, Mr. James                 2
Allen, Miss. Elisabeth Walton    1
Ilmakangas, Miss. Ida Livija     1
Ilieff, Mr. Ylio                 1
                                ..
Hart, Miss. Eva Miriam           1
Harris, Mr. Walter               1
Harris, Mr. George               1
Harper, Rev. John                1
Zimmerman, Mr. Leo               1
Name: name, Length: 1307, dtype: int64
-------------------------------------------------- 

sex
male      843
female    466
Name: sex, dtype: int64
-------------------------------------------------- 

age
24.0000    47
22.0000    43
21.0000    41
30.0000    40
18.0000    39
           ..
0.3333      1
22.5000     1
70.5000     1
0.6

In [7]:
# Drop the text columns that the preprocessing pipeline does not use.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError for columns that are
# already gone.
unused_columns = ['name', 'ticket', 'cabin', 'boat', 'home.dest']
df = df.drop(columns=unused_columns, errors='ignore')

In [8]:
# Fraction of missing values per column, sorted ascending by missing count.
missing_counts = df.isna().sum().sort_values()
missing_counts / len(df)

pclass      0.000000
survived    0.000000
sex         0.000000
sibsp       0.000000
parch       0.000000
fare        0.000764
embarked    0.001528
age         0.200917
body        0.907563
dtype: float64

In [10]:
# 'body' is missing for roughly 91% of the rows, so it is dropped.
# Reassign with errors='ignore' rather than mutating in place so the cell can be
# re-run without a KeyError once the column is already gone.
df = df.drop(columns='body', errors='ignore')

## Preprocessing

In [13]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, KBinsDiscretizer, Binarizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer

In [14]:
# Per-column preprocessing steps, combined into a single ColumnTransformer below.

# sex: map the string categories to integer codes.
encode_sex = OrdinalEncoder()

# sibsp: 1 when the value is strictly greater than 1, otherwise 0.
# NOTE(review): this threshold differs from the one used for parch below —
# confirm the asymmetry is intentional.
encode_sibsp = Binarizer(threshold=1)

# parch: default threshold (0), i.e. 1 when the value is greater than 0.
encode_parch = Binarizer()

# fare: fill missing values with the column mean, then bucket into 3 ordinal bins.
process_fare = make_pipeline(
    SimpleImputer(strategy='mean'),
    KBinsDiscretizer(n_bins=3, encode='ordinal')
)

# embarked: fill missing values with the most frequent port, then integer-encode.
process_embarked = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OrdinalEncoder()
)

# age: fill missing values with the column mean, then bucket into 3 bins.
# No `encode` argument is given, so scikit-learn's default encoding applies
# (one-hot according to the KBinsDiscretizer docs), unlike the ordinal fare bins.
# NOTE(review): confirm this difference is intended.
process_age = make_pipeline(
    SimpleImputer(strategy='mean'),
    KBinsDiscretizer(n_bins=3)
)


# ColumnTransformer discards every column that is not listed in a step
# (remainder='drop' is the default). pclass is kept in the feature frame but had
# no step, so it was silently removed from the model input; it is now passed
# through unchanged as an explicit first step.
transformer = make_column_transformer(
    ('passthrough', ['pclass']),
    (encode_sex, ['sex']),
    (encode_sibsp, ['sibsp']),
    (encode_parch, ['parch']),
    (process_fare, ['fare']),
    (process_embarked, ['embarked']),
    (process_age, ['age'])
)

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Show the columns that remain after the drops, as a plain list.
list(df.columns)

['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']

In [17]:
# Separate the target from the features.
y = df['survived']
X = df.drop(columns='survived')

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and every metric computed from it) reproducible across kernel restarts;
# without it each run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [18]:
# Fit the preprocessing steps on the training rows and transform them in one go.
# NOTE(review): this rebinds X_train to the transformed output, so running the
# cell a second time would feed already-transformed data back into the
# transformer (presumably failing, since the steps select columns by name).
# Re-run the split cell first if this cell needs to be executed again.
X_train = transformer.fit_transform(X_train)

## Model