In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

--------------------------
# Introduction

![](https://c.tenor.com/csSu8i3jaRQAAAAC/titanic-sinking.gif)

Picture Credit: https://c.tenor.com



### Missing Values

> For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with scikit-learn estimators which assume that all values in an array are numerical, and that all have and hold meaning. A basic strategy to use incomplete datasets is to discard entire rows and/or columns containing missing values. However, this comes at the price of losing data which may be valuable (even though incomplete). A better strategy is to impute the missing values, i.e., to infer them from the known part of the data. 

Reference: https://scikit-learn.org/stable/

Titanic is probably the first project most people start with. However, I think the Titanic dataset is not an easy one.
My reasons are as follows.
* Missing Values: There are many missing values of features that are considered important for survivor judgment, such as Age and Cabin.
* Small Dataset: Because the dataset is small, it seems difficult to train the model sufficiently.

In this notebook, we will focus on imputation of missing values and ensemble methods to improve performance.
In addition, we will create new derivative variables if necessary for each feature.

---------------------------------------------------------------
# Setting up

In [None]:
!pip install pycaret-nightly
!pip install missingno

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import preprocessing
import umap
import umap.plot

import warnings
warnings.filterwarnings('ignore')

# Utility Function

In [None]:
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print binary-classification metrics and return the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted class labels. Required in practice; the ``None`` default is
        kept only so the signature stays unchanged.
    pred_proba : array-like, optional
        Scores passed to ``roc_auc_score``. When omitted, the AUC is skipped
        instead of failing inside ``roc_auc_score``.

    Returns
    -------
    The result of ``confusion_matrix(y_test, pred)``.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    print('confusion matrix')
    print(confusion)

    # Build the summary in one literal so no source indentation leaks into the output.
    summary = 'accuracy: {0:.4f}, precision: {1:.4f}, recall: {2:.4f}, F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if pred_proba is not None:
        roc_auc = roc_auc_score(y_test, pred_proba)
        summary += ', AUC:{0:.4f}'.format(roc_auc)
    print(summary)
    return confusion

In [None]:
def apply_pca(X, standardize=True):
    """Fit a full PCA on ``X`` and return the model, the scores and the loadings.

    Parameters
    ----------
    X : DataFrame
        Numeric feature table (its ``columns`` become the loadings index).
    standardize : bool
        When true, each column is centred on its mean and divided by its
        standard deviation before fitting.

    Returns
    -------
    (pca, X_pca, loadings)
        The fitted ``PCA`` object, a DataFrame of component scores with
        columns ``PC1..PCk``, and a DataFrame of loadings (rows = original
        features, columns = components).
    """
    data = (X - X.mean(axis=0)) / X.std(axis=0) if standardize else X

    model = PCA()
    scores = model.fit_transform(data)

    # One label per principal component, numbered from 1.
    labels = ["PC" + str(k) for k in range(1, scores.shape[1] + 1)]
    scores_df = pd.DataFrame(scores, columns=labels)

    # components_ has one row per component; transpose so features are rows.
    loadings = pd.DataFrame(model.components_.T, index=data.columns, columns=labels)
    return model, scores_df, loadings

In [None]:
def outlier_iqr(data):
    """Locate values lying more than 1.5 * IQR outside the quartiles.

    Returns the result of ``np.where`` on the outlier mask, i.e. a tuple of
    index arrays (positions, not labels).
    """
    first_quartile, third_quartile = np.percentile(data, [25, 75])
    spread = third_quartile - first_quartile
    low_fence = first_quartile - (spread * 1.5)
    high_fence = third_quartile + (spread * 1.5)
    is_outlier = (data > high_fence) | (data < low_fence)
    return np.where(is_outlier)

In [None]:
def encode_features(dataDF, feat_list):
    """Label-encode the listed columns of ``dataDF`` in place.

    A fresh ``LabelEncoder`` is fitted per column and the column is
    overwritten with its integer codes. The same (mutated) frame is returned.
    """
    for column in feat_list:
        encoder = preprocessing.LabelEncoder()
        dataDF[column] = encoder.fit_transform(dataDF[column])
    return dataDF

# Loading Dataset

In [None]:
# Load the Kaggle Titanic train/test CSVs and the sample submission file.
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
submission_data = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
# Stack train and test so that feature engineering is applied to both at once.
titanic_df = pd.concat([train_data, test_data], ignore_index = True, sort = False)
# Boolean mask of the rows that have a 'Survived' value; used later to select labelled rows.
tr_idx = titanic_df['Survived'].notnull()

In [None]:
# Preview the first three rows, transposed so that each feature is a row.
# 'darkblack' is not a CSS colour name, so plain 'black' is used instead.
titanic_df.head(3).T.style.set_properties(**{'background-color': 'lightyellow',
                           'color': 'black',
                           'border-color': 'black'})

In [None]:
titanic_df.shape

PassengerId has nothing to do with survival, so it can be removed immediately.

In [None]:
titanic_df.drop(['PassengerId'],axis=1,inplace=True)

--------------------------------------------
# EDA

## Checking Data Type

In [None]:
titanic_df.info()

# Checking Target Value Imbalance

In [None]:
print(plt.style.available)

In [None]:
# Number of rows that have a 'Survived' value; used as the percentage denominator.
total_cnt = titanic_df['Survived'].count()
sns.set(font_scale = 2)
sns.set_style("white")
sns.set_palette("bright")
f, ax = plt.subplots(1, 2, figsize = (18, 8))
# Left panel: share of each class as a pie chart.
titanic_df['Survived'].value_counts().plot.pie(explode = [0, 0.1],
                                               autopct = '%1.1f%%',
                                               ax = ax[0],
                                               shadow = True,
                                               colors = ['grey', 'green'])
ax[0].set_title('Survived %')
ax[0].set_ylabel('')
# Right panel: class counts. The column is passed as x= because recent seaborn
# releases no longer accept it as a positional argument.
sns.countplot(x = 'Survived', data = titanic_df, ax = ax[1], palette='Blues_r')
ax[1].set_title('Survived')
# Label every bar with "count / percentage of total_cnt".
for p in ax[1].patches:
    x, height, width = p.get_x(), p.get_height(), p.get_width()
    ax[1].text(x + width / 2, height + 10, f'{height} / {height / total_cnt * 100:2.1f}%', va='center', ha='center', size=20)
sns.despine()

Although survivors are the minority class, the imbalance is not large enough to require over/under sampling.
If you want to know more about over/under sampling, please refer to the notebook below.

[Over/Under sampling](https://www.kaggle.com/ohseokkim/preprocessing-resolving-imbalance-by-sampling)

## Checking and Handling Missing Values

In [None]:
import missingno as msno
msno.matrix(titanic_df.drop(['Survived'],axis=1))

In [None]:
titanic_df.drop(['Survived'],axis=1).isnull().sum()

<span style="color:Blue"> Observation:
    
There are missing values for Age, Cabin, Fare, and Embarked features. In particular, there are many missing values for Age and Cabin features. Let's think about how to handle these missing values.

-------------------------------------
# Checking features

## Cabin

![](https://www.retrograph.com/wp-content/uploads/thumbnails/s5RGL181.jpg)

Picture Credit: https://www.retrograph.com

Looking at the picture above, it can be hypothesized that the survival rate will be different depending on the location of the cabin.

If you are the captain or sailor of titanic, you will be able to explain the relationship between cabin and survival rate well. If you have some kind of domain knowledge, you will be able to process that feature well. However, in the absence of such domain knowledge, we must examine the corresponding features in detail and process them to be suitable for machine learning.

In [None]:
titanic_df['Cabin'].unique()

The first letter is an uppercase letter of the alphabet. Let's analyze it a bit more using this.

In [None]:
titanic_df['Cabin'].isnull().sum()

<span style="color:Blue"> Observation:
    
There are 1014 missing values. Whether or not a cabin is recorded may itself be informative, so we create a new derived variable for it.

------------------------------------------------------------------------
## Has_Cabin ( Derived variable )

**Question: Is there a difference in the survival rate between passengers with and without cabin?**

In [None]:
# 1 when a cabin is recorded, 0 when it is missing (notnull, so the flag matches its name).
titanic_df['Has_Cabin'] = titanic_df['Cabin'].notnull().astype(int)

In [None]:
# Number of rows that have a 'Survived' value; used as the percentage denominator.
total_cnt = titanic_df['Survived'].count()
rcParams['figure.figsize'] = 12,8
sns.set(font_scale = 2)
sns.set_style("white")
# Survived / not-survived counts for each Has_Cabin value.
ax = sns.countplot(x="Has_Cabin",
                   hue="Survived", 
                   data=titanic_df,
                   palette = 'Blues_r')
ax.set_title('Survived Count/Rate')
plt.legend(loc = 'upper right')
# Label every bar with "count / percentage of total_cnt".
for p in ax.patches:
    x, height, width = p.get_x(), p.get_height(), p.get_width()
    ax.text(x + width / 2, height + 10, f'{height} / {height / total_cnt * 100:2.1f}%', va='center', ha='center', size=20)
sns.despine()

<span style="color:Blue"> Observation:

* Cases with cabins have more survivors compared to cases without cabins. It is likely that the new derived variable will be helpful in the classification of survivors.

---------------------------------------------
## Cabin_Label ( Derived variable )

In [None]:
rcParams['figure.figsize'] = 10,7
sns.set(font_scale = 2)
sns.set_style("white")
# Missing cabins become their own category 'N' (this overwrites the Cabin column).
titanic_df['Cabin'] = titanic_df['Cabin'].fillna('N')
# Cabin_label is the first character of the cabin string.
titanic_df['Cabin_label'] = titanic_df['Cabin'].str.get(0)
# Bar height is seaborn's aggregate of 'Survived' per Cabin_label; error bars are disabled.
ax = sns.barplot(x = 'Cabin_label', y = 'Survived', data = titanic_df, palette = 'Blues_r',ci=False)
sns.despine()

In [None]:
rcParams['figure.figsize'] = 20,10
sns.set(font_scale = 2)
sns.set_style("white")
# Survived / not-survived counts per Cabin_label.
ax = sns.countplot(x="Cabin_label", hue="Survived", data=titanic_df,palette = 'Blues_r')
ax.set_title('Survived Rate')
plt.legend(loc = 'upper right')
# Label each bar with its height as a percentage of total_cnt (defined in an earlier cell).
for p in ax.patches:
    x, height, width = p.get_x(), p.get_height(), p.get_width()
    ax.text(x + width / 2, height + 7, f'{height / total_cnt * 100:2.1f}%',va='center', ha='center', size=20)
sns.despine()

------------------------------------------------
## Parch ( Number of parents/children )

In [None]:
rcParams['figure.figsize'] = 20,10
sns.set(font_scale = 2)
sns.set_style("white")
sns.set_palette("bright")
# Survived / not-survived counts per Parch value.
ax = sns.countplot(x='Parch',hue ='Survived',data=titanic_df,palette="Blues_r")
ax.set_title('Survived Rate')
plt.legend(loc = 'upper right')
# Label each bar with its height as a percentage of total_cnt (defined in an earlier cell).
for p in ax.patches:
    x, height, width = p.get_x(), p.get_height(), p.get_width()
    ax.text(x + width / 2, height + 7, f'{height / total_cnt * 100:2.1f}%',va='center', ha='center', size=20)
sns.despine()

----------------------------------------------------------------------
## SibSp ( Number of siblings/spouses )

In [None]:
rcParams['figure.figsize'] = 20,10
sns.set(font_scale = 2)
sns.set_style("white")
sns.set_palette("bright")
# Survived / not-survived counts per SibSp value.
ax = sns.countplot(x='SibSp',hue ='Survived',data=titanic_df,palette="Blues_r")
ax.set_title('Survived Rate')
plt.legend(loc = 'upper right')
# Label each bar with its height as a percentage of total_cnt (defined in an earlier cell).
for p in ax.patches:
    x, height, width = p.get_x(), p.get_height(), p.get_width()
    ax.text(x + width / 2, height + 7, f'{height / total_cnt * 100:2.1f}%',va='center', ha='center', size=20)
sns.despine()

-------------------------------------------------
## FamilySize ( Derived variable )

**Question: Does the number of accompanying family members affect the survival rate??**

In [None]:
titanic_df['FamilySize'] = titanic_df['SibSp'] + titanic_df['Parch'] + 1

In [None]:
rcParams['figure.figsize'] = 20,10
sns.set(font_scale = 2)
sns.set_style("white")
sns.set_palette("bright")
# Survived / not-survived counts per FamilySize value.
ax = sns.countplot(x='FamilySize',hue ='Survived',data=titanic_df,palette="Blues_r")
ax.set_title('Survived Rate')
plt.legend(loc = 'upper right')
# Label each bar with its height as a percentage of total_cnt (defined in an earlier cell).
for p in ax.patches:
    x, height, width = p.get_x(), p.get_height(), p.get_width()
    ax.text(x + width / 2, height + 7, f'{height / total_cnt * 100:2.1f}%',va='center', ha='center', size=20)
sns.despine()

<span style="color:Blue"> Observation:

When FamilySize is 1, the survival rate is significantly lower than in other cases. I think it will be helpful when the model is learning.

______________________________________________
## Alone ( Derived variable )

In [None]:
# IsAlone is 1 for passengers whose FamilySize is exactly 1, otherwise 0.
titanic_df['IsAlone'] = 0
titanic_df.loc[titanic_df['FamilySize'] == 1, 'IsAlone'] = 1

In [None]:
rcParams['figure.figsize'] = 10,6
sns.set(font_scale = 2)
sns.set_style("white")
sns.set_palette("bright")
# Survived / not-survived counts per IsAlone value.
ax = sns.countplot(x='IsAlone',hue ='Survived',data=titanic_df,palette="Blues_r")
ax.set_title('Survived Rate')
plt.legend(loc = 'upper right')
# Label each bar with its height as a percentage of total_cnt (defined in an earlier cell).
for p in ax.patches:
    x, height, width = p.get_x(), p.get_height(), p.get_width()
    ax.text(x + width / 2, height + 7, f'{height / total_cnt * 100:2.1f}%',va='center', ha='center', size=25)
sns.despine()

<span style="color:Blue"> Observation:

Those who were alone died more than those who were not alone. The derived feature seems to be helpful for model training.

________________________________________________________
## Name

It seems difficult to find the feature directly related to the survivor.

In [None]:
titanic_df['Name'].unique()[:5]

The names appear to contain English honorifics (titles). Let's check some more.

In [None]:
import re
def get_title(name):
    """Extract the honorific from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and followed by a period, e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.

    Parameters
    ----------
    name : str
        Full passenger name. Non-string input (such as a missing value)
        yields an empty string instead of raising.

    Returns
    -------
    str
        The title without the trailing period, or ``""`` when none is found.
    """
    if not isinstance(name, str):
        return ""
    # Raw string: '\.' is an invalid escape sequence in a normal string literal.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

titanic_df['Title'] = titanic_df['Name'].apply(get_title)

In [None]:
titanic_df['Title'].unique()

In [None]:
# Collapse the infrequent honorifics into a single 'Rare' category.
titanic_df['Title'] = titanic_df['Title'].replace(
       ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 
       'Rare')

# Fold spelling variants into their common equivalents.
titanic_df['Title'] = titanic_df['Title'].replace('Mlle', 'Miss')
titanic_df['Title'] = titanic_df['Title'].replace('Ms', 'Miss')
titanic_df['Title'] = titanic_df['Title'].replace('Mme', 'Mrs')
# Display the remaining distinct titles.
titanic_df['Title'].unique()

In [None]:
rcParams['figure.figsize'] = 20,10
sns.set(font_scale = 2)
sns.set_style("white")
sns.set_palette("bright")
# Survived / not-survived counts per Title.
ax = sns.countplot(x='Title',hue ='Survived',data=titanic_df,palette="Blues_r")
ax.set_title('Survived Rate')
plt.legend(loc = 'upper right')
# Label each bar with its height as a percentage of total_cnt (defined in an earlier cell).
for p in ax.patches:
    x, height, width = p.get_x(), p.get_height(), p.get_width()
    ax.text(x + width / 2, height + 7, f'{height / total_cnt * 100:2.1f}%',va='center', ha='center', size=25)
sns.despine()

<span style="color:Blue"> Observation:

The mortality rate is higher in the case of Mr. I think it will help with learning.

In [None]:
rcParams['figure.figsize'] = 20,15
# Distinct titles; this notebook-level name is reused by a later cell.
titles = titanic_df['Title'].unique()
plt.subplots_adjust(hspace=1.5)
# Running 1-based subplot position.
idx = 1
sns.set(font_scale = 2)
sns.set_style("white")
sns.set_palette("bright")

# One Age histogram per title; the 3x2 grid has room for at most six titles.
for title in titles:
    plt.subplot(3,2,idx)
    ax = sns.histplot(x='Age',data=titanic_df[titanic_df['Title']== title],hue ='Survived',palette="Blues_d",kde=True)
    ax.set_title(title)
    sns.despine()
    idx = idx + 1

<span style="color:Blue"> Observation:
* In the case of Mr, the number of survivors is small.
* In the case of Mrs and Miss, there are many survivors.

I think it will be helpful in judging survivors using this.
However, it seems difficult to find the relationship between age and title from the above distributions. Therefore, it seems difficult to use this to fill in the missing values of age.

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Has_Age ( Derived variable )

**Question: Does the survival rate differ between passengers with and without an age record?**

In [None]:
titanic_df['Age'].isnull().sum()

Let's check the distribution first to fill in the appropriate values.

### An Extension To Imputation

> Imputation is the standard approach, and it usually works well. However, imputed values may be systematically above or below their actual values (which weren't collected in the dataset). Or rows with missing values may be unique in some other way. In that case, your model would make better predictions by considering which values were originally missing.

![](https://i.imgur.com/UWOyg4a.png)

> In this approach, we impute the missing values, as before. And, additionally, for each column with missing entries in the original dataset, we add a new column that shows the location of the imputed entries.
> 
> In some cases, this will meaningfully improve results. In other cases, it doesn't help at all.

Ref: https://www.kaggle.com/alexisbcook/missing-values

In [None]:
# 1 when an age is recorded, 0 when it is missing (notnull, so the flag matches its name).
titanic_df['Has_Age'] = titanic_df['Age'].notnull().astype(int)

In [None]:
rcParams['figure.figsize'] = 10,6
sns.set(font_scale = 2)
sns.set_style("white")
sns.set_palette("bright")
# Survived / not-survived counts per Has_Age value.
ax = sns.countplot(x='Has_Age',hue ='Survived',data=titanic_df,palette="Blues_r")
plt.legend(loc = 'upper right')
ax.set_title('Survived Rate')
# Label each bar with its height as a percentage of total_cnt (defined in an earlier cell).
for p in ax.patches:
    x, height, width = p.get_x(), p.get_height(), p.get_width()
    ax.text(x + width / 2, height + 10,f'{height / total_cnt * 100:2.1f}%',va='center', ha='center', size=25)
sns.despine()

<span style="color:Blue"> Observation:
    
* There are more passengers with a recorded age than without one.
* Cases in which age is not missed have a higher survival rate than cases in which age is omitted.

-------------------------------------------------------------------------
## Age

In [None]:
rcParams['figure.figsize'] = 12,7
sns.set_palette("bright")
sns.set(font_scale = 2)
sns.set_style("white")
# Age distribution split by Survived, with a KDE overlay.
ax = sns.histplot(x="Age", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
# Dashed vertical line and label at the mean age.
plt.axvline(x=titanic_df['Age'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['Age'].mean(), 60, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()

In [None]:
# Summary statistics of Age. `mean`, `std` and `skew` are notebook-level names
# that later statistics cells overwrite.
mean = titanic_df['Age'].mean()
std = titanic_df['Age'].std()
skew = titanic_df['Age'].skew()
print('Age : mean: {0:.4f}, std: {1:.4f}, skew: {2:.4f}'.format(mean, std, skew))

Looking at the skewness, it is skewed to one side.

## Imputing Missing Values

### Univariate feature imputation

> The SimpleImputer class provides basic strategies for imputing missing values. Missing values can be imputed with a provided constant value, or using the statistics (mean, median or most frequent) of each column in which the missing values are located. This class also allows for different missing values encodings.

Ref: https://scikit-learn.org/stable/

In [None]:
from sklearn.impute import SimpleImputer
# Age_mean: Age with its missing values replaced by the column mean.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
titanic_df[['Age_mean']] = imp.fit_transform(titanic_df[['Age']])

In [None]:
rcParams['figure.figsize'] = 20,6
sns.set_palette("bright")
sns.set(font_scale = 2)
sns.set_style("white")
# Left: original Age (missing values excluded), with its mean marked.
plt.subplot(1,2,1)
ax = sns.histplot(x="Age", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
plt.axvline(x=titanic_df['Age'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['Age'].mean(), 60, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()
# Right: mean-imputed Age_mean, with its mean marked.
plt.subplot(1,2,2)
ax = sns.histplot(x="Age_mean", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
plt.axvline(x=titanic_df['Age_mean'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['Age_mean'].mean(), 60, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()

In [None]:
# Summary statistics of the mean-imputed column (overwrites `mean`, `std`, `skew`).
mean = titanic_df['Age_mean'].mean()
std = titanic_df['Age_mean'].std()
skew = titanic_df['Age_mean'].skew()
print('Age_mean : mean: {0:.4f}, std: {1:.4f}, skew: {2:.4f}'.format(mean, std, skew))

### Multivariate feature imputation

> A more sophisticated approach is to use the IterativeImputer class, which models each feature with missing values as a function of other features, and uses that estimate for imputation. It does so in an iterated round-robin fashion: at each step, a feature column is designated as output y and the other feature columns are treated as inputs X. A regressor is fit on (X, y) for known y. Then, the regressor is used to predict the missing values of y. This is done for each feature in an iterative fashion, and then is repeated for max_iter imputation rounds. The results of the final imputation round are returned.

Ref: https://scikit-learn.org/stable/

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# IterativeImputer models Age from the *other* columns it is given. Fitted on
# ['Age'] alone it has nothing to regress on and only returns the initial
# (mean) fill, so complete numeric columns are supplied as predictors.
iter_cols = ['Age', 'Pclass', 'SibSp', 'Parch']
imp = IterativeImputer(max_iter=10, random_state=0)
# Column 0 of the result is the imputed Age (same order as iter_cols).
titanic_df['Age_iter'] = imp.fit_transform(titanic_df[iter_cols])[:, 0]

In [None]:
rcParams['figure.figsize'] = 20,6
sns.set_palette("bright")
sns.set(font_scale = 2)
sns.set_style("white")
# Left: original Age (missing values excluded), with its mean marked.
plt.subplot(1,2,1)
ax = sns.histplot(x="Age", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
plt.axvline(x=titanic_df['Age'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['Age'].mean(), 60, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()
# Right: Age_iter from the IterativeImputer, with its mean marked.
plt.subplot(1,2,2)
ax = sns.histplot(x="Age_iter", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
plt.axvline(x=titanic_df['Age_iter'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['Age_iter'].mean(), 60, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()

In [None]:
# Summary statistics of the iteratively imputed column (overwrites `mean`, `std`, `skew`).
mean = titanic_df['Age_iter'].mean()
std = titanic_df['Age_iter'].std()
skew = titanic_df['Age_iter'].skew()
print('Age_iter : mean: {0:.4f}, std: {1:.4f}, skew: {2:.4f}'.format(mean, std, skew))

### Nearest neighbors imputation

> The KNNImputer class provides imputation for filling in missing values using the k-Nearest Neighbors approach. By default, a euclidean distance metric that supports missing values, nan_euclidean_distances, is used to find the nearest neighbors. Each missing feature is imputed using values from n_neighbors nearest neighbors that have a value for the feature. The feature of the neighbors are averaged uniformly or weighted by distance to each neighbor. If a sample has more than one feature missing, then the neighbors for that sample can be different depending on the particular feature being imputed. When the number of available neighbors is less than n_neighbors and there are no defined distances to the training set, the training set average for that feature is used during imputation. If there is at least one neighbor with a defined distance, the weighted or unweighted average of the remaining neighbors will be used during imputation. If a feature is always missing in training, it is removed during transform. 

Ref: https://scikit-learn.org/stable/

In [None]:
from sklearn.impute import KNNImputer
# KNN imputation finds neighbours using the *other* columns. Fitted on ['Age']
# alone, a row with missing Age has no defined distance to any other row and
# receives the column mean, so complete numeric columns are supplied as the
# neighbourhood features.
knn_cols = ['Age', 'Pclass', 'SibSp', 'Parch']
imputer = KNNImputer(n_neighbors=2, weights="uniform")
# Column 0 of the result is the imputed Age (same order as knn_cols).
titanic_df['Age_knn'] = imputer.fit_transform(titanic_df[knn_cols])[:, 0]

In [None]:
rcParams['figure.figsize'] = 20,6
sns.set_palette("bright")
sns.set(font_scale = 2)
sns.set_style("white")
# Left: original Age (missing values excluded), with its mean marked.
plt.subplot(1,2,1)
ax = sns.histplot(x="Age", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
plt.axvline(x=titanic_df['Age'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['Age'].mean(), 60, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()
# Right: Age_knn from the KNNImputer, with its mean marked.
plt.subplot(1,2,2)
ax = sns.histplot(x="Age_knn", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
plt.axvline(x=titanic_df['Age_knn'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['Age_knn'].mean(), 60, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()

In [None]:
# Summary statistics of the KNN-imputed column (overwrites `mean`, `std`, `skew`).
mean = titanic_df['Age_knn'].mean()
std = titanic_df['Age_knn'].std()
skew = titanic_df['Age_knn'].skew()
print('Age_knn : mean: {0:.4f}, std: {1:.4f}, skew: {2:.4f}'.format(mean, std, skew))

### Filling using other features

Another conceivable strategy is to use other features to fill in the missing values of the Age feature. Consider how to fill the Age feature using the Title feature above.

In [None]:
titanic_df['new_Age'] = titanic_df['Age']

In [None]:
# Print the mean of new_Age for each title (`titles` was built in an earlier cell).
for title in titles:   
    # Despite its name, t_mean is the sub-frame of rows carrying this title.
    t_mean = titanic_df[titanic_df['Title']== title]
    print('{} mean ===> {}'.format(title, t_mean['new_Age'].mean()))

In [None]:
# Fill missing new_Age with the mean age of passengers sharing the same Title.
# The result is assigned back explicitly: fillna(..., inplace=True) on a column
# selection acts on a temporary object under pandas copy-on-write.
titanic_df['new_Age'] = titanic_df['new_Age'].fillna(
    titanic_df.groupby('Title')['new_Age'].transform('mean'))

In [None]:
rcParams['figure.figsize'] = 20,6
sns.set_palette("bright")
sns.set(font_scale = 2)
sns.set_style("white")
# Left: original Age (missing values excluded), with its mean marked.
plt.subplot(1,2,1)
ax = sns.histplot(x="Age", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
plt.axvline(x=titanic_df['Age'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['Age'].mean(), 60, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()
# Right: new_Age filled from per-title means, with its mean marked.
plt.subplot(1,2,2)
ax = sns.histplot(x="new_Age", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
plt.axvline(x=titanic_df['new_Age'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['new_Age'].mean(), 60, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()

In [None]:
# Summary statistics of the title-filled column (overwrites `mean`, `std`, `skew`).
mean = titanic_df['new_Age'].mean()
std = titanic_df['new_Age'].std()
skew = titanic_df['new_Age'].skew()
print('new_Age : mean: {0:.4f}, std: {1:.4f}, skew: {2:.4f}'.format(mean, std, skew))

### Scaling 
Looking at the above result, the skewness was 0.4559, not skewed to one side. Let's try linear scaling.

Select RobustScaler during linear scaling. In this way, the influence of outliers can be minimized.

> This is a technique that minimizes the influence of outliers.  
> Since the median and IQR (interquartile range) are used, it can be confirmed that the same values are more widely distributed after standardization when compared with the StandardScaler.
> 
> $IQR = Q_3 - Q_1$: That is, it deals with values in the 25th and 75th percentiles.

If you want to know more about Scaling, please refer to the notebook below.

[Notebook](https://www.kaggle.com/ohseokkim/preprocessing-linear-nonlinear-scaling)

In [None]:
from sklearn.preprocessing import RobustScaler
# Rescale Age_knn with RobustScaler; the column is overwritten with the scaled values.
robuster = RobustScaler()
titanic_df['Age_knn'] = robuster.fit_transform(titanic_df[['Age_knn']])

In [None]:
rcParams['figure.figsize'] = 20,6
sns.set_palette("bright")
sns.set(font_scale = 2)
sns.set_style("white")
# Left: original Age (missing values excluded), with its mean marked.
plt.subplot(1,2,1)
ax = sns.histplot(x="Age", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
plt.axvline(x=titanic_df['Age'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['Age'].mean(), 60, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()
# Right: Age_knn after robust scaling, with its mean marked.
plt.subplot(1,2,2)
ax = sns.histplot(x="Age_knn", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
plt.axvline(x=titanic_df['Age_knn'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['Age_knn'].mean(), 60, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()

In [None]:
# Summary statistics of Age_knn after scaling (overwrites `mean`, `std`, `skew`).
mean = titanic_df['Age_knn'].mean()
std = titanic_df['Age_knn'].std()
skew = titanic_df['Age_knn'].skew()
print('Age_knn : mean: {0:.4f}, std: {1:.4f}, skew: {2:.4f}'.format(mean, std, skew))

**Let imputation be done with KNN. The remaining Age-related features are removed.**

In [None]:
titanic_df.drop(['Age','Age_mean','Age_iter','new_Age'],axis=1,inplace=True)

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Fare

First, let's check the distribution for that value.

In [None]:
rcParams['figure.figsize'] = 12,7
sns.set_palette("bright")
sns.set(font_scale = 2)
sns.set_style("white")
# Fare distribution split by Survived, with a KDE overlay.
ax = sns.histplot(x="Fare", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
# Dashed vertical line and label at the mean fare.
plt.axvline(x=titanic_df['Fare'].mean(), color='g', linestyle='--', linewidth=3)
plt.text(titanic_df['Fare'].mean(), 90, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()

In [None]:
# Summary statistics of Fare (overwrites `mean`, `std`, `skew`).
mean = titanic_df['Fare'].mean()
std = titanic_df['Fare'].std()
skew = titanic_df['Fare'].skew()
print('Fare : mean: {0:.4f}, std: {1:.4f}, skew: {2:.4f}'.format(mean, std, skew))

**It is skewed to one side. Consider nonlinear scaling. In this case, we will use QuantileTransformer.**

> The quantile function ranks or smooths out the relationship between observations and can be mapped onto other distributions, such as the uniform or normal distribution.

If you want to know more about Scaling, please refer to the notebook below.

[Notebook](https://www.kaggle.com/ohseokkim/preprocessing-linear-nonlinear-scaling)

In [None]:
from sklearn.preprocessing import QuantileTransformer
# Map Fare onto a normal distribution; the column is overwritten with the transformed values.
# NOTE(review): Fare is only imputed in the following cell, so any missing value
# is still present when the transformer is fitted here - confirm this order is intended.
transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal')
titanic_df['Fare'] = transformer.fit_transform(titanic_df[['Fare']])

In [None]:
from sklearn.impute import SimpleImputer
# Replace missing Fare values with the mean of the (already transformed) column.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
titanic_df[['Fare']] = imp.fit_transform(titanic_df[['Fare']])

In [None]:
rcParams['figure.figsize'] = 12,7
sns.set_palette("bright")
sns.set(font_scale = 2)
sns.set_style("white")
# Fare distribution after the transform and imputation, split by Survived.
ax = sns.histplot(x="Fare", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
plt.axvline(x=titanic_df['Fare'].mean(), color='g', linestyle='--', linewidth=3)
# The label is placed at a fixed x of 0 rather than at the computed mean.
plt.text(0, 100, "Mean", horizontalalignment='left', size='small', color='black', weight='semibold')
sns.despine()

Let's try binning.

In [None]:
titanic_df['Fare_class'] = pd.qcut(titanic_df['Fare'], 5, labels=['F1', 'F2', 'F3','F4','F5' ])

In [None]:
rcParams['figure.figsize'] = 12,7
sns.set_palette("bright")
sns.set(font_scale = 2)
sns.set_style("white")
# Row counts per Fare_class bin, split by Survived.
ax = sns.histplot(x="Fare_class", hue="Survived", data=titanic_df,palette = 'Blues_d',kde=True)
sns.despine()

In [None]:
titanic_df['Fare_class'] = titanic_df['Fare_class'].replace({'F1':1,'F2':2,'F3':3,'F4':4,'F5':5})

---------------------------------------
## Embarked

In [None]:
rcParams['figure.figsize'] = 12,7
sns.set_palette("bright")
sns.set(font_scale = 2)
sns.set_style("white")
# Survived / not-survived counts per port of embarkation.
ax = sns.countplot(x='Embarked',hue = 'Survived',data=titanic_df,palette="Blues_d")
# Label each bar with its height as a percentage of total_cnt (defined in an earlier cell).
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2., height + 3, f'{height / total_cnt * 100:2.1f}%', ha = 'center', size = 25)
sns.despine()

<span style="color:Blue"> Observation:
    
* Many passengers on board at S port died.
* For passengers boarding at port C, the survival rate is higher than the mortality rate.

In [None]:
rcParams['figure.figsize'] = 12,7
sns.set_palette("bright")
sns.set(font_scale = 2)
sns.set_style("white")
# Male / female counts per port of embarkation.
ax = sns.countplot(x='Embarked',hue = 'Sex',data=titanic_df,palette="Blues_d")
# Neither Embarked nor Sex depends on 'Survived', so this plot counts rows from
# the whole combined frame. The percentage denominator is therefore the number
# of rows having both values, not the count of labelled rows.
embarked_sex_cnt = len(titanic_df[['Embarked', 'Sex']].dropna())
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2., height + 3, f'{height / embarked_sex_cnt * 100:2.1f}%', ha = 'center', size = 25)
sns.despine()

<span style="color:Blue"> Observation:
    
* Among the passengers who boarded at S port, the proportion of males is higher than that of other ports.

Let's impute missing value for Embarked feature. The strategy for Embarked's missing values is to choose 'most_frequent'.

In [None]:
from sklearn.impute import SimpleImputer
# Replace missing Embarked values with the most frequent port.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
titanic_df[['Embarked']] = imp.fit_transform(titanic_df[['Embarked']])

------------------------------------------------------------------------------------------
# Checking Missing Value Again
Finally, let's check the missing values.

In [None]:
import missingno as msno
# Visual overview of missing entries in every column except the target.
msno.matrix(titanic_df.drop(columns='Survived'))
# Total count of missing cells across those same columns.
print(
    'Number of Missing Values in Dataset ',
    titanic_df.drop(columns='Survived').isna().sum().sum(),
)

**OK! There are no missing values left in the dataset.**

--------------------------------------------------------------------------------------------------
# Detecting Outliers by PCA

![](https://miro.medium.com/max/602/0*PnqMbZEdnuL9yHuo.png)

Picture Credit: https://miro.medium.com

The more features, the higher the dimension. When projecting to a lower dimension through PCA, new insights can be gained. PCA can effectively detect outliers.

PC 1 has the largest variance in the dataset distribution. That is, an outlier in PC 1 is very likely to be a real outlier.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Columns used for the PCA-based outlier check.
features = ["Sex","Age_knn","FamilySize","IsAlone",'Embarked','Cabin_label']
# NOTE(review): tr_idx is not assigned earlier in this section — confirm an earlier cell defines it.
titanic_copy = titanic_df[tr_idx].copy()
# pop() removes the Survived column from titanic_copy and returns it.
y_copy = titanic_copy.pop("Survived")
X_copy = titanic_copy.loc[:, features]
# The return value is discarded; presumably encode_features modifies X_copy in place — verify against its definition.
encode_features(X_copy,['Sex', 'Embarked','Cabin_label'])
# apply_pca is defined elsewhere; its three results are unpacked as (pca, X_pca, loadings).
pca, X_pca, loadings = apply_pca(X_copy)
print(loadings)

In [None]:
import plotly.express as px
# Overlaid density histograms (with marginal box plots) of every principal
# component; melt() turns the component columns into variable/value pairs.
fig = px.histogram(
    X_pca.melt(),
    color="variable",
    marginal="box",
    barmode="overlay",
    histnorm='density',
)
# Centre the title and set title/legend font colours.
fig.update_layout(
    title_font_color="black",
    legend_title_font_color="green",
    title=dict(
        text="PCA Histogram",
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
)

**Let's check out outliers in PC 1**

In [None]:
# First element of outlier_iqr's result for PC1, converted to a list.
# It is used with .iloc below, so presumably these are row positions — verify against outlier_iqr.
pc1_outlier_idx = list(outlier_iqr(X_pca['PC1'])[0])

In [None]:
# NOTE(review): this name is not referenced anywhere else in this section — possibly a leftover.
component = "PC1"

def highlight_min(s, props=''):
    """Return an array of style strings marking the minimum entries of `s`.

    Positions whose value equals the NaN-ignoring minimum of `s` receive
    `props`; every other position receives an empty string.
    """
    smallest = np.nanmin(s.values)
    is_smallest = s == smallest
    return np.where(is_smallest, props, '')

# Show the rows flagged as PC1 outliers with a uniform grey style.
# 'darkblack' is not a valid CSS colour name, so a valid one is used for the border.
# NOTE(review): train_data is also reassigned in the train/validation split cell
# further down, so the rows shown here depend on execution order — confirm which
# frame is intended.
train_data.iloc[pc1_outlier_idx, :].style.set_properties(**{'background-color': 'Grey',
                            'color': 'white',
                            'border-color': 'black'})

<span style="color:Blue"> Observation:
* The Sage family started from S port, there was no Cabin, and the ages were not recorded, and they appear to be a poor and pitiful family with a pclass 3 rating.
* All three females in this family using the Miss title have died.

**The last sad news is that the training dataset is small, so it seems difficult to remove even if the above data are outliers.    
Nevertheless, fortunately, this problem is not a regression problem, but a classification problem. If it is a regression problem, outliers should be removed.**

------------------------------------------------------
# Encoding

Let's perform encoding on categorical features.

When only tree-based models are used, label encoding is sufficient. However, we will use one-hot encoding for model extension in the future.

If you want to know more about the encoding of categorical features, please refer to the notebook below.

[Notebook](https://www.kaggle.com/ohseokkim/preprocessing-encoding-categorical-data)

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each feature to avoid a redundant dummy column.
titanic_df = pd.get_dummies(
    titanic_df,
    columns=['Title', 'Sex', 'Embarked', 'Cabin_label'],
    drop_first=True,
)
# Preview the first rows (transposed). 'darkblack' is not a valid CSS colour
# name, so the text and border colours are set to a valid value.
titanic_df.head().T.style.set_properties(**{'background-color': 'lightyellow',
                           'color': 'black',
                           'border-color': 'black'})

---------------------------------------
# Checking Correlation

In [None]:
# Correlate numeric/boolean columns only. Columns such as Name, Ticket and Cabin
# are still present at this point (they are dropped later), and recent pandas
# versions raise when DataFrame.corr() meets non-numeric data.
corr = titanic_df.select_dtypes(include=['number', 'bool']).corr().round(1)

sns.set(font_scale=1.15)
plt.figure(figsize=(14, 10))
sns.set_style("white")
sns.set_palette("bright")
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, annot=True, cmap='Blues', mask=mask, cbar=True)
plt.title('Correlation Plot')

<span style="color:Blue"> Observation and Decision:
    
* There is a large correlation between FamilySize and SibSp and Parch. Since the derived variable FamilySize is made of SibSp and Parch, SibSp and Parch are removed.
* The relationship between Cabin and Has_Cabin is high. Therefore, the derived variable Has_Cabin is left and Cabin is removed.
* The relationship between Fare and Fare_class is high. Fare is selected because skewness is removed by nonlinear transform of the Fare feature.
* There are many features that are not related to the survived value.

In [None]:
# Absolute correlation of every feature with Survived, in ascending order.
# The last (largest) entry is dropped before plotting.
(
    corr['Survived']
    .abs()
    .sort_values()
    .iloc[:-1]
    .plot.barh()
)
plt.gca().set_facecolor('#FFFFFF')

-------------------------------------------------------------
# Selecting Features

Judging from the heatmap above, features that are not helpful for predicting survival, or that have been superseded by derived variables, will be removed.

In [None]:
def drop_features(df):
    """Drop the columns that are not used for modelling.

    The frame is modified in place and also returned, so both
    ``drop_features(df)`` and ``df = drop_features(df)`` work.

    Columns that are not present are skipped instead of raising a KeyError.
    This keeps the cell re-runnable and tolerates one-hot columns (for example
    a rare cabin label) that may not exist for a given dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, without the listed columns.
    """
    unused_columns = [
        'Name', 'Ticket', 'SibSp', 'Parch', 'Fare_class',
        'Cabin', 'Cabin_label_G', 'Cabin_label_T',
        'Cabin_label_F', 'FamilySize', 'Embarked_Q', 'Title_Rare',
    ]
    df.drop(columns=unused_columns, errors='ignore', inplace=True)
    return df

# Remove the columns listed in drop_features from the combined frame.
titanic_df = drop_features(titanic_df)

**Let's check the correlation of each feature.**

In [None]:
# Correlate numeric/boolean columns only, so that any remaining non-numeric
# column cannot make DataFrame.corr() raise on recent pandas versions.
corr = titanic_df.select_dtypes(include=['number', 'bool']).corr().round(1)

sns.set(font_scale=1.15)
plt.figure(figsize=(14, 10))
sns.set_style("white")
sns.set_palette("bright")
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, annot=True, cmap='Blues', mask=mask, cbar=True)
plt.title('Correlation Plot')

Let's check the correlation between the target value (Survived) and the other features.

In [None]:
# Absolute correlation of every remaining feature with Survived, in ascending
# order. The last (largest) entry is dropped before plotting.
(
    corr['Survived']
    .abs()
    .sort_values()
    .iloc[:-1]
    .plot.barh()
)
plt.gca().set_facecolor('#FFFFFF')

In [None]:
sns.set(font_scale=2)
# NOTE(review): pairplot builds its own figure grid, so this figure is probably left empty — confirm.
plt.figure(figsize=(14, 10))
sns.set_style("white")
sns.set_palette("bright")
# Pairwise regression plots (lower triangle only, corner=True), coloured by Survived.
sns.pairplot(titanic_df,kind = 'reg',corner = True,palette ='Blues',hue='Survived' )

**OK! Looking at the heatmap and pairplot above, it seems that the features are properly selected.**

# Splitting Train/Validation/Test Data

In [None]:
# Rows with a known Survived value form the labelled set; the rest are the
# rows to predict.
tr_idx = titanic_df['Survived'].notna()
y_titanic_df = titanic_df.loc[tr_idx, 'Survived']
X_titanic_df = titanic_df.loc[tr_idx].drop(columns='Survived')
X_test_df = titanic_df.loc[~tr_idx].drop(columns='Survived')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X_titanic_df,
    y_titanic_df,
    test_size=0.2,
    random_state=11,
)

In [None]:
# A Series stores its label in `.name` (not `.names`); concat uses it as the
# column name of the target when joining it to the features.
y_train.name = 'Survived'
train_data = pd.concat([X_train, y_train], axis=1)
y_val.name = 'Survived'
val_data = pd.concat([X_val, y_val], axis=1)

In [None]:
# All labelled rows (features and target together), used as the modelling frame.
tr_idx = titanic_df['Survived'].notna()
train_final = titanic_df.loc[tr_idx]

----------------------------------------------------------------------------
# Visualizing Training Dataset after Dimension Reduction

In [None]:
# (rows, columns) of the training split.
X_train.shape

The training dataset has 16 dimensions. To show the approximate distribution of the training dataset preprocessed above, let's reduce the dimension to two dimensions and draw it.

In [None]:
# Fit a UMAP embedding of the training features and plot it coloured by the
# target. UMAP is stochastic, so a fixed seed (the same value used for the
# train/validation split) keeps the figure reproducible between runs.
mapper = umap.UMAP(random_state=11).fit(X_train)
umap.plot.points(mapper, labels=y_train, theme='fire')

As shown in the figure above, when viewed in two dimensions, there are quite a few areas where the survivors and the dead overlap. First, let's feel for a moment that the task of classification is a difficult task before learning the model. Our model does this difficult job!

**Thanks models!**

---------------------------------------------------
# Ensemble

![](https://media3.giphy.com/media/26xBvMWzk7FQr54Sk/giphy.gif)

Picture Credit: https://media3.giphy.com


> Empirically, ensembles tend to yield better results when there is a significant diversity among the models. Many ensemble methods, therefore, seek to promote diversity among the models they combine. Although perhaps non-intuitive, more random algorithms (like random decision trees) can be used to produce a stronger ensemble than very deliberate algorithms (like entropy-reducing decision trees). Using a variety of strong learning algorithms, however, has been shown to be more effective than using techniques that attempt to dumb-down the models in order to promote diversity. It is possible to increase diversity in the training stage of the model using correlation for regression tasks or using information measures such as cross entropy for classification tasks.

Ref: https://en.wikipedia.org/

## Setting up models

> This function initializes the training environment and creates the transformation pipeline. Setup function must be called before executing any other function.

Ref: https://pycaret.readthedocs.io/en/latest/api/classification.html

In [None]:
# Names of all feature columns (every column except the target).
all_cols = X_titanic_df.columns.tolist()

In [None]:
from pycaret.classification import *
# Initialise the PyCaret experiment on the labelled rows. Preprocessing is
# switched off because the features were prepared above, and every feature is
# declared numeric. A fixed session_id (the same seed used for the
# train/validation split) makes the experiment reproducible between runs.
clf1 = setup(data=train_final,
             target='Survived',
             preprocess=False,
             numeric_features=all_cols,
             silent=True,
             session_id=11)

## Choosing top models

> This function trains and evaluates performance of all estimators available in the model library using cross validation. The output of this function is a score grid with average cross validated scores.

Ref: https://pycaret.readthedocs.io/en/latest/api/classification.html

In [None]:
# Compare the available estimators (except the excluded ids), sort them by
# accuracy and keep the five best.
top5 = compare_models(sort='Accuracy',n_select = 5,
                      exclude = ['knn', 'svm','ridge','nb','dummy','qda','xgboost']
                     )

In [None]:
# Display the selected models.
top5

## Creating Models

> This function trains and evaluates the performance of a given estimator using cross validation. The output of this function is a score grid with CV scores by fold. 

Ref: https://pycaret.readthedocs.io/en/latest/api/classification.html

In [None]:
# Train one model per estimator id; these objects are the base models passed to
# the interpretation, tuning, stacking and blending cells below.
catboost = create_model('catboost')
rf = create_model('rf')
lightgbm = create_model('lightgbm')
# Disabled candidate:
#mlp = create_model('mlp')
gbc = create_model('gbc')
lda = create_model('lda')
lr = create_model('lr')

# Interpreting Models

This function analyzes the predictions generated from a trained model. Most plots in this function are implemented based on the SHAP (SHapley Additive exPlanations).

> SHAP (SHapley Additive exPlanations) is a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions

Ref: https://shap.readthedocs.io/en/latest/

**If you want to know more about feature importance and SHAP, please refer to the notebook below.**

[Notebook](https://www.kaggle.com/ohseokkim/explaning-machine-by-feature-importnace)

In [None]:
# Interpretation plot for the CatBoost model.
interpret_model(catboost)

In [None]:
# Interpretation plot for the random forest model.
interpret_model(rf)

In [None]:
# Interpretation plot for the LightGBM model.
interpret_model(lightgbm)

<span style="color:Blue"> Observation:
* Among the features, if you look at Fare and Age_knn, the features are spread in a wide distribution of importance, and the colors are also spread from blue to red. 
* Each model is learning with the importance of different features. The diversity of these models seems to increase the performance of the ensemble model.
* Title_Mr and Sex_male play an important role in how the model learns.

# Tuning Hyperparameters

> This function tunes the hyperparameters of a given estimator. The output of this function is a score grid with CV scores by fold of the best selected model based on optimize parameter. 

Ref: https://pycaret.readthedocs.io/en/latest/api/classification.html

In [None]:
# Tune the random forest's hyperparameters, optimising for accuracy.
# NOTE(review): tuned_rf is not used by the stacking/blending cells in this section — confirm that is intended.
# NOTE(review): per the PyCaret docs, early_stopping may be ignored with the default search library — confirm.
tuned_rf = tune_model(rf, optimize = 'Accuracy',early_stopping = True)

In [None]:
# Tune the LightGBM model's hyperparameters, optimising for accuracy.
# NOTE(review): tuned_lightgbm is not used by the stacking/blending cells in this section — confirm that is intended.
tuned_lightgbm = tune_model(lightgbm, optimize = 'Accuracy',early_stopping = True)

In [None]:
# Tune the CatBoost model's hyperparameters, optimising for accuracy.
# NOTE(review): tuned_catboost is not used by the stacking/blending cells in this section — confirm that is intended.
tuned_catboost = tune_model(catboost, optimize = 'Accuracy',early_stopping = True)

In [None]:
# Tune the gradient boosting classifier's hyperparameters, optimising for accuracy.
# NOTE(review): tuned_gbc is not used by the stacking/blending cells in this section — confirm that is intended.
tuned_gbc = tune_model(gbc, optimize = 'Accuracy',early_stopping = True)

In [None]:
# Tune the LDA model's hyperparameters, optimising for accuracy.
# NOTE(review): tuned_lda is not used by the stacking/blending cells in this section — confirm that is intended.
tuned_lda = tune_model(lda, optimize = 'Accuracy',early_stopping = True)

In [None]:
# Tune the logistic regression's hyperparameters, optimising for accuracy.
# NOTE(review): tuned_lr is not used by the stacking/blending cells in this section — confirm that is intended.
tuned_lr = tune_model(lr, optimize = 'Accuracy',early_stopping = True)

## Stacking

In [None]:
# Stack the six base models created above, with the first model returned by
# compare_models (top5[0]) as the meta-model.
# NOTE(review): the untuned estimators are passed here rather than the tuned_* variants — confirm.
stack_model = stack_models(estimator_list = [lr,rf,lightgbm,catboost,gbc,lda], meta_model = top5[0] ,optimize = 'Accuracy')

In [None]:
# Decision-boundary plot of the stacked model.
# NOTE(review): plot_model may draw on its own figure, leaving this one empty — confirm.
plt.figure(figsize=(10, 10))
plot_model(stack_model, plot='boundary')

In machine learning, it is important to determine the boundary. In particular, in tree-based models, it is more important to determine the boundary, because the process of creating a new leaf in the tree is also the process of determining the boundary.
Looking at the above picture again, there are many overlapping points, with the green dots indicating survivors and the blue dots indicating non-survivors. Determining the boundary in this situation would be a very difficult task.
If the feature engineering had been done well, the two groups of points would have been well separated, making the boundary easy to determine. However, this is difficult with the Titanic dataset because of its missing values and small size.

Let's look again at the picture above.
Boundary is not very clean. It can be judged that overfitting has occurred, and therefore it can be judged that it does not have generality.

If you want to know more about overfitting, please refer to the notebook below.

[Notebook](https://www.kaggle.com/ohseokkim/overfitting-and-underfitting-eda)

In [None]:
# ROC/AUC plot of the stacked model.
plt.figure(figsize=(10, 10))
plot_model(stack_model, plot = 'auc')

In [None]:
# Confusion matrix of the stacked model.
plt.figure(figsize=(8, 8))
plot_model(stack_model, plot='confusion_matrix')

## Soft Blending

In [None]:
# Blend the six base models with method='soft'.
# NOTE(review): the untuned estimators are passed here rather than the tuned_* variants — confirm.
blend_soft = blend_models(estimator_list = [lr,rf,lightgbm,catboost,gbc,lda], optimize = 'Accuracy',method = 'soft')

In [None]:
# Decision-boundary plot of the soft-blending model.
plt.figure(figsize=(10, 10))
plot_model(blend_soft, plot='boundary')

It seems that the Boundary is set properly.

In [None]:
# ROC/AUC plot of the soft-blending model.
plt.figure(figsize=(10, 10))
plot_model(blend_soft, plot = 'auc')

In [None]:
# Confusion matrix of the soft-blending model.
plt.figure(figsize=(8, 8))
plot_model(blend_soft, plot='confusion_matrix')

## Hard Blending

In [None]:
# Blend the six base models with method='hard'.
# NOTE(review): the untuned estimators are passed here rather than the tuned_* variants — confirm.
blend_hard = blend_models(estimator_list = [lr,rf,lightgbm,catboost,gbc,lda], optimize = 'Accuracy',method = 'hard')

In [None]:
# Decision-boundary plot of the hard-blending model.
plt.figure(figsize=(10, 10))
plot_model(blend_hard, plot='boundary')

Compared to the soft blending model, the boundary does not look clean.

In [None]:
# Confusion matrix of the hard-blending model.
plt.figure(figsize=(8, 8))
plot_model(blend_hard, plot='confusion_matrix')

-------------------------------------------------------------------------------------------------
## Calibrating the final model

> This function calibrates the probability of a given estimator using isotonic or logistic regression. 


In [None]:
# Calibrate the predicted probabilities of the soft-blending model.
cali_model = calibrate_model(blend_soft)

--------------------------------------------------
# Finalizing the last model
> This function trains a given estimator on the entire dataset including the holdout set.

Ref: https://pycaret.readthedocs.io/en/latest/api/classification.html

The soft-blending model (`blend_soft`), after calibration, is selected based on the above results. Finally, it is refit on the entire dataset, including the holdout set.


In [None]:
# Finalize the calibrated model; this is the model used for the submission below.
final_model = finalize_model(cali_model)

## Checking the final model

In [None]:
# Decision-boundary plot of the finalized model.
plt.figure(figsize=(8, 8))
plot_model(final_model, plot='boundary')

In [None]:
# Confusion matrix of the finalized model.
plt.figure(figsize=(8, 8))
plot_model(final_model, plot='confusion_matrix')

---------------------------------------
# Checking Last Results

Considering above results, the soft blending model seems appropriate among ensemble models. Therefore, we use this model to make the final prediction with the test dataset.

In [None]:
# Predict survival for the unlabelled rows with the finalized model.
last_prediction = final_model.predict(X_test_df)
# Store the predictions as integers in the submission frame.
# NOTE(review): assumes submission_data rows are in the same order as X_test_df — confirm.
submission_data['Survived'] = last_prediction.astype(int)
# Write the submission file without the index column.
submission_data.to_csv('submission.csv', index = False)