# Titanic Survival Prediction

### ToDo
- [x] Prepare Notebook
- [x] Exploratory Data Analysis
- [x] Data Preprocessing
- [x] Model Training

### Prepare Notebook

In [None]:
# Importing Packages

import os
import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
# Load Dataset

# Named `data_dir` rather than `dir` so the builtin `dir()` is not shadowed
# for the rest of the notebook session.
data_dir = '../input/titanic/'

train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))

### Exploratory Data Analysis

In [None]:
# Preview the first five rows of the training data.
train_df.head(5)

In [None]:
# Summary statistics of the training data.
train_df.describe()

In [None]:
# Histogram of the `Age` column with a box plot drawn in the margin.
fig = px.histogram(
    train_df,
    x = 'Age',
    marginal = 'box',
    title = "Age Histogram Plot",
)
fig.update_layout(bargap=0.2)
fig.show()

In [None]:
# List the column labels of the training data.
train_df.columns

In [None]:
# One histogram per selected column, drawn without grid lines and with a legend.
train_df.hist(
    column = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch'],
    figsize = (15, 10),
    grid = False,
    legend = True,
)

In [None]:
# Correlation heatmap with rows ordered by their correlation with `Survived`.
# Only numeric columns are kept: the frame still holds text columns at this
# point (Name, Sex, Ticket, ...), and pandas >= 2.0 raises on them in `corr()`
# instead of silently skipping them.
numeric_df = train_df.select_dtypes(include = 'number')
corr_matrix = numeric_df.corr().sort_values(by = 'Survived', ascending = False)
px.imshow(corr_matrix, text_auto = True)

### Data Preprocessing

In [None]:
# Splitting Dataset

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is seeded (same seed as
# the classifier below) so that reruns of the notebook produce the same
# partition and therefore comparable scores.
train_set, val_set = train_test_split(train_df, test_size = 0.2, random_state = 4)
train_set.shape, val_set.shape

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
# Imputing NaN values

class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing ``Age`` values with the mean age learned in ``fit``.

    The mean is learned once in :meth:`fit` and reused by :meth:`transform`,
    so a pipeline fitted on one split imputes any other split with that same
    mean instead of with the other split's own statistics.
    """

    def fit(self, X, y = None):
        """Learn the mean of ``X['Age']``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing an ``Age`` column.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        AgeImputer
            ``self``.
        """
        self.imputer_ = SimpleImputer(strategy = 'mean').fit(X[['Age']])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose missing ``Age`` values are filled in.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing an ``Age`` column. It is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with ``Age`` imputed.
        """
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()

        imputer = getattr(self, 'imputer_', None)
        if imputer is None:
            # Backward compatibility: calling transform() without fit() used
            # to work, imputing with the mean of X itself. Keep that path.
            imputer = SimpleImputer(strategy = 'mean').fit(X[['Age']])

        # The imputer returns a single-column 2-D result; flatten it to 1-D
        # before assigning it back as a column.
        X['Age'] = np.ravel(imputer.transform(X[['Age']]))
        return X

In [None]:
# Encoding Categorical Features

class FeatureEncoder(BaseEstimator, TransformerMixin):
    """Add one-hot indicator columns for ``Embarked`` and ``Sex``.

    The output always gains the float columns ``C``, ``S``, ``Q``, ``N``
    (``N`` flags a missing ``Embarked``), ``Female`` and ``Male``, in that
    order, regardless of which categories occur in the frame being
    transformed. Each indicator is computed by comparing against an explicit
    raw value, so a column's name always matches the category it flags.

    NOTE(review): assumes the raw values are ``'C'``/``'S'``/``'Q'`` for
    ``Embarked`` and ``'female'``/``'male'`` for ``Sex`` -- confirm against
    the CSV and adjust the mappings below if the data differs.
    """

    # Indicator column name -> raw ``Embarked`` value it flags.
    _EMBARKED_COLUMNS = {'C': 'C', 'S': 'S', 'Q': 'Q'}
    # Indicator column name -> raw ``Sex`` value it flags.
    _SEX_COLUMNS = {'Female': 'female', 'Male': 'male'}

    def fit(self, X, y = None):
        """Do nothing; the encoding is fixed and has no state to learn.

        Returns
        -------
        FeatureEncoder
            ``self``.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the indicator columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``Embarked`` and ``Sex`` columns. It is not
            modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with the six indicator columns added as floats.
        """
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()

        for column, value in self._EMBARKED_COLUMNS.items():
            X[column] = (X['Embarked'] == value).astype(float)
        # Missing port of embarkation gets its own indicator.
        X['N'] = X['Embarked'].isna().astype(float)

        for column, value in self._SEX_COLUMNS.items():
            X[column] = (X['Sex'] == value).astype(float)

        return X

In [None]:
# Dropping Features

class FeatureDropper(BaseEstimator, TransformerMixin):
    """Drop the ``Embarked``, ``Name``, ``Ticket``, ``Cabin``, ``Sex`` and
    ``N`` columns from a frame, skipping any that are not present."""

    def fit(self, X, y = None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` without the unwanted columns."""
        unwanted = ['Embarked', 'Name', 'Ticket', 'Cabin', 'Sex', 'N']
        return X.drop(columns = unwanted, errors = 'ignore')

In [None]:
from sklearn.pipeline import Pipeline

# Preprocessing chain, applied in order: impute Age, add the one-hot
# indicator columns, then drop the columns that are no longer needed.
preprocessing_steps = [
    ('ageimputer', AgeImputer()),
    ('featureencoder', FeatureEncoder()),
    ('featuredropper', FeatureDropper()),
]
main_pipeline = Pipeline(steps = preprocessing_steps)

In [None]:
# Fit the preprocessing pipeline on the training split and apply it.
train_set = main_pipeline.fit_transform(train_set)
train_set

In [None]:
# Scaling Numerical Data

from sklearn.preprocessing import StandardScaler

# Separate the target column from the feature columns.
y = train_set['Survived']
X = train_set.drop(columns = 'Survived')

# Fit the scaler on the training features, then standardise them.
scaler = StandardScaler()
scaler.fit(X)
X_data = scaler.transform(X)
y_data = y.to_numpy()

In [None]:
# Apply the preprocessing that was already fitted on the training split.
# Refitting here would let the validation rows shape their own preprocessing,
# which the held-out split is supposed to be independent of.
val_set = main_pipeline.transform(val_set)
val_set

In [None]:
X = val_set.drop(['Survived'], axis = 1)
y = val_set['Survived']

# Reuse the scaler fitted on the training features. Fitting a second scaler on
# the validation rows would put them on a different scale from the one the
# model is trained on.
X_data_val = scaler.transform(X)
y_data_val = y.to_numpy()

### Model Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters are tuned by the search below.
clf = RandomForestClassifier(random_state = 4, n_jobs = -1)

# 5 x 5 grid over the number of trees and the maximum tree depth.
paramGrid = dict(
    n_estimators = [10, 20, 50, 100, 500],
    max_depth = [None, 10, 20, 50, 100],
)

# 3-fold cross-validated grid search scored on accuracy.
cv = GridSearchCV(
    estimator = clf,
    param_grid = paramGrid,
    scoring = 'accuracy',
    cv = 3,
    n_jobs = -1,
    verbose = 2,
    return_train_score = True,
)
cv.fit(X_data, y_data)

In [None]:
# Show the estimator refitted with the best parameters found by the search.
cv.best_estimator_

In [None]:
# Tabulate the grid-search results held in `cv.cv_results_`.
best_model_df = pd.DataFrame(cv.cv_results_)

In [None]:
# Parameter combination that achieved the best cross-validated score.
bestParams = cv.best_params_
bestParams

In [None]:
# Build the final forest from the hyper-parameters the grid search selected,
# instead of a hard-coded pair that goes stale whenever the search result
# changes (different split, different grid, different library version).
final_clf = RandomForestClassifier(n_jobs = -1, random_state = 4, **cv.best_params_)
final_clf.fit(X_data, y_data)

train_preds = final_clf.predict(X_data)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Predict labels for the scaled validation features.
val_preds = final_clf.predict(X_data_val)

In [None]:
# Accuracy on the training split and on the validation split, in that order.
train_accuracy = accuracy_score(y_data, train_preds)
val_accuracy = accuracy_score(y_data_val, val_preds)
train_accuracy, val_accuracy

In [None]:
# Confusion matrix of the training predictions, shown as an annotated heatmap.
train_cm = confusion_matrix(y_true = y_data, y_pred = train_preds)
px.imshow(img = train_cm, text_auto = True)

In [None]:
# Confusion matrix of the validation predictions, shown as an annotated heatmap.
val_cm = confusion_matrix(y_true = y_data_val, y_pred = val_preds)
px.imshow(img = val_cm, text_auto = True)