# Titanic

## 00 : Index

## 01 : Setting

In [122]:
# Basic Library
import numpy as np
import pandas as pd
import sklearn
from IPython.display import display

# Ensemble model
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# ML model
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# scoring metric
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedGroupKFold

# Encoder
from sklearn.preprocessing import LabelEncoder

# Scaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# plot
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
%matplotlib inline

## 02 : DATA

Load DATA & check

In [126]:
titanic_df = pd.read_csv('train.csv')
display(titanic_df.head(3))
titanic_df.rename(columns={'PassengerId':'Id', 'Survived':'Surv'}, inplace=True)
display(titanic_df.head(3))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


Unnamed: 0,Id,Surv,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


Split DATA

In [127]:
# split X + y
X_df = titanic_df.drop('Surv', axis=1)
y_df = titanic_df['Surv']
display('DF X',X_df.head(3))
display('DF y',y_df.head(3))

# split train(train+eval) + test
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size = 0.2, random_state=1)

# split tr + eval
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)
A_tr = pd.concat((X_tr, y_tr), axis=1)
display('DF A',A_tr.head(3))

'DF X'

Unnamed: 0,Id,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


'DF y'

0    0
1    1
2    1
Name: Surv, dtype: int64

'DF A'

Unnamed: 0,Id,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surv
639,640,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S,0
559,560,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36.0,1,0,345572,17.4,,S,1
543,544,2,"Beane, Mr. Edward",male,32.0,1,0,2908,26.0,,S,1


## 03 : EDA

### 01 - Null Check

In [128]:
A_tr.describe()

Unnamed: 0,Id,Pclass,Age,SibSp,Parch,Fare,Surv
count,640.0,640.0,516.0,640.0,640.0,640.0,640.0
mean,436.082812,2.298437,30.08124,0.5,0.3625,31.71675,0.390625
std,252.034031,0.833567,15.003337,1.031583,0.758778,46.253586,0.488272
min,5.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,218.75,2.0,20.0,0.0,0.0,7.925,0.0
50%,432.5,3.0,29.0,0.0,0.0,14.5,0.0
75%,651.25,3.0,39.0,1.0,0.0,30.5,1.0
max,891.0,3.0,74.0,8.0,6.0,512.3292,1.0


In [129]:
A_tr.isna().sum()

Id            0
Pclass        0
Name          0
Sex           0
Age         124
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       492
Embarked      2
Surv          0
dtype: int64

### 02 - Fill Null

In [130]:
def find_null_per(col_name) :
    print('Age null DATA : {:.2f} %'.format(A_tr[col_name].isna().sum()/(A_tr[col_name].count() + A_tr[col_name].isna().sum()) * 100))

Age

In [131]:
find_null_per('Age')

Age null DATA : 19.38 %


In [132]:
A_tr['Age'].fillna(A_tr['Age'].mean(), inplace=True)
find_null_per('Age')

Age null DATA : 0.00 %


Cabin

In [133]:
find_null_per('Cabin')

Age null DATA : 76.88 %


In [134]:
A_tr.drop('Cabin', axis=1, inplace=True)
display(A_tr.head(3))

Unnamed: 0,Id,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Surv
639,640,3,"Thorneycroft, Mr. Percival",male,30.08124,1,0,376564,16.1,S,0
559,560,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36.0,1,0,345572,17.4,S,1
543,544,2,"Beane, Mr. Edward",male,32.0,1,0,2908,26.0,S,1


Embarked

In [135]:
print(A_tr['Embarked'].value_counts())

S    462
C    114
Q     62
Name: Embarked, dtype: int64


In [136]:
A_tr['Embarked'].fillna('X', inplace=True)
find_null_per('Embarked')

Age null DATA : 0.00 %


### 03 - Drop DATA

ID

In [142]:
A_tr.drop('Id', axis=1, inplace=True)
display(A_tr.head(3))

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Surv
639,3,0,30.08124,1,0,376564,16.1,1.0,0
559,3,1,36.0,1,0,345572,17.4,2.0,1
543,2,0,32.0,1,0,2908,26.0,2.0,1


Name

In [137]:
A_tr.drop('Name', axis=1, inplace=True)
display(A_tr.head(3))

Unnamed: 0,Id,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Surv
639,640,3,male,30.08124,1,0,376564,16.1,S,0
559,560,3,female,36.0,1,0,345572,17.4,S,1
543,544,2,male,32.0,1,0,2908,26.0,S,1


Sex

In [138]:
A_tr['Sex'].replace({'male':'m', 'female':'f'}, inplace=True)
display(A_tr.head(3))

Unnamed: 0,Id,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Surv
639,640,3,m,30.08124,1,0,376564,16.1,S,0
559,560,3,f,36.0,1,0,345572,17.4,S,1
543,544,2,m,32.0,1,0,2908,26.0,S,1


### 04 - Encoding

In [139]:
encoder = LabelEncoder()
A_tr['Sex'].replace({'m':'0', 'f':'1'}, inplace=True)
A_tr['Embarked'] = pd.DataFrame(encoder.fit_transform(A_tr['Embarked']))

A_tr.head(3)

Unnamed: 0,Id,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Surv
639,640,3,0,30.08124,1,0,376564,16.1,1.0,0
559,560,3,1,36.0,1,0,345572,17.4,2.0,1
543,544,2,0,32.0,1,0,2908,26.0,2.0,1


### 05 - Scaling

### XX - EDA def

In [143]:
def eda_after(X_df_name) :
    X_df_name['Age'].fillna(X_df_name['Age'].mean(), inplace=True)
    X_df_name.drop('Cabin', axis=1, inplace=True)
    X_df_name['Embarked'].fillna('X', inplace=True)
    X_df_name.drop('Id', axis=1, inplace=True)
    X_df_name.drop('Name', axis=1, inplace=True)
    X_df_name['Sex'].replace({'male':'m', 'female':'f'}, inplace=True)
    X_df_name['Sex'].replace({'m':'0', 'f':'1'}, inplace=True)
    encoder = LabelEncoder()
    X_df_name['Embarked'] = pd.DataFrame(encoder.fit_transform(X_df_name['Embarked']))




    