Following the code of "[Titanic Top 4% with ensemble modeling](https://www.kaggle.com/code/yassineghouzam/titanic-top-4-with-ensemble-modeling)" 

In [None]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import chain
%matplotlib inline

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, \
StratifiedKFold, learning_curve

import warnings 
# Silence every warning for the rest of the session. This also hides the
# deprecation notices raised by the pandas / seaborn / scikit-learn calls below.
warnings.filterwarnings("ignore")

# Global seaborn look applied to every figure drawn after this cell.
sns.set(style="white", context="notebook", palette="deep")

# 1. Data

## 1) Remove Outliers

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")
# Keep the test PassengerId column; it is joined with the predictions when the
# submission file is written at the end of the notebook.
IDtest = test["PassengerId"]

In [None]:
def detect_outliers(df, n_outliers, features):
    """Find rows that are Tukey (1.5 * IQR) outliers in the given features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the numeric columns to inspect.
    n_outliers : int
        A row is reported in ``multiple_outliers`` when it is an outlier in
        strictly more than ``n_outliers`` of the inspected features.
    features : iterable of str
        Column names to inspect.

    Returns
    -------
    outlier_dict : dict
        Maps each feature name to the list of index labels flagged for it.
    multiple_outliers : list
        Index labels flagged in more than ``n_outliers`` features.
    """
    outlier_dict = {}
    for col in features:
        # nanpercentile ignores missing values. With plain np.percentile a
        # single NaN turns both quartiles into NaN, every comparison below is
        # then False and the column silently never reports an outlier.
        q1, q3 = np.nanpercentile(df[col], [25, 75])
        outlier_step = 1.5 * (q3 - q1)

        is_outlier = (df[col] < q1 - outlier_step) | (df[col] > q3 + outlier_step)
        outlier_dict[col] = list(df.index[is_outlier.to_numpy()])

    # Count, per row label, in how many features it was flagged.
    outlier_counts = Counter(chain.from_iterable(outlier_dict.values()))
    multiple_outliers = [label for label, hits in outlier_counts.items() if hits > n_outliers]

    return outlier_dict, multiple_outliers

In [None]:
# Display the column labels of the training frame.
train.columns

In [None]:
# Outliers in the four numeric columns; with n_outliers=2 a row lands in
# multiple_outliers only when it is flagged in more than two of them.
outlier_dict, multiple_outliers = detect_outliers(train, 2, ["Age", "SibSp", "Parch", "Fare"])

In [None]:
# For every inspected feature: how many rows were flagged, and the smallest
# and largest flagged value.
for feature in outlier_dict:
    flagged_rows = outlier_dict[feature]
    print(feature, ":", len(flagged_rows))
    print(train.loc[flagged_rows, feature].agg(["min", "max"]))
    print('====================================')

In [None]:
# Show the rows that were flagged as outliers in several features at once.
train.loc[multiple_outliers]

In [None]:
# Drop the multi-feature outliers and renumber the remaining rows.
# NOTE: not idempotent - after reset_index the labels in multiple_outliers refer
# to different rows, so re-running this cell without reloading train drops
# other passengers.
train = train.drop(multiple_outliers, axis=0).reset_index(drop=True)

## 2) Join Train and Test to apply same function for categorical conversion

In [None]:
# Remember where the training rows end so the combined frame can be split back later.
train_len = len(train)
# Stack train on top of test and renumber the rows from 0.
dataset = pd.concat([train, test], axis=0).reset_index(drop=True)

## 3) Check Missing Values

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so the notebook's dependencies are visible in one place.
import missingno as msno

# Per-column completeness chart of the combined frame.
msno.bar(dataset)

# 2. Feature Analysis 

## 1) Numerical Values

In [None]:
# Pairwise correlation between Survived and the numeric columns (training rows only).
g = sns.heatmap(train[["Survived", "SibSp", "Parch", "Age", "Fare"]].corr(), annot=True, fmt=".2f", cmap="coolwarm")

### 1-1) SibSp & Parch

In [None]:
# Survived aggregated per SibSp value (left) and per Parch value (right).
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
for panel, column in zip(ax, ("SibSp", "Parch")):
    sns.barplot(train, x=column, y="Survived", ax=panel)

### 1-3) Age

In [None]:
# Age histogram (with KDE overlay) for each outcome, one panel per outcome.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
for panel, outcome, panel_title in zip(ax, (0, 1), ("Dead", "Survived")):
    sns.histplot(train[train["Survived"] == outcome], x="Age", kde=True, ax=panel)
    panel.set_xlim(0, 100)
    panel.set_title(panel_title)
plt.show()

In [None]:
# Overlay the age density of the passengers who died and of those who survived.
# Each comparison is parenthesised on its own: `&` binds tighter than `==`, so
# `a == 0 & b` is evaluated as `a == (0 & b)` and builds the wrong mask.
has_age = train["Age"].notnull()
g = sns.kdeplot(train.loc[(train["Survived"] == 0) & has_age, "Age"], label="Dead", color="Red", fill=True)
g = sns.kdeplot(train.loc[(train["Survived"] == 1) & has_age, "Age"], label="Survived", color="Blue", fill=True)
g.legend()

### 1-4) Fare

In [None]:
# Rows of the combined frame whose Fare is missing.
dataset[dataset["Fare"].isna()]

In [None]:
# Replace missing Fare values with the median Fare of the combined (train + test) frame.
dataset["Fare"] = dataset["Fare"].fillna(dataset["Fare"].median())

In [None]:
# Distribution of the raw Fare values, with the skewness shown in the legend.
# sns.distplot is deprecated; histplot with kde=True and stat="density" draws
# the equivalent normalised histogram plus density curve.
g = sns.histplot(dataset["Fare"], kde=True, stat="density", label=f"""Skewness : {dataset["Fare"].skew():.2f}""")
g.legend()

In [None]:
# Log-transform Fare; non-positive fares are mapped to 0 instead of -inf/NaN.
# NOTE: not idempotent - re-running this cell applies the log a second time.
dataset["Fare"] = dataset["Fare"].map(lambda x: np.log(x) if x > 0 else 0)
# Distribution after the transform. sns.distplot is deprecated; histplot with
# kde=True and stat="density" draws the equivalent histogram plus density curve.
g = sns.histplot(dataset["Fare"], kde=True, stat="density", label=f"""Skewness : {dataset["Fare"].skew():.2f}""")
g.legend()

## 2) Categorical Values

### 2-1) Sex

In [None]:
# Survived aggregated per Sex (training rows).
g = sns.barplot(train, x="Sex", y="Survived")

### 2-2) Pclass

In [None]:
# Survived aggregated per passenger class (training rows).
g = sns.barplot(train, x="Pclass", y="Survived")

In [None]:
# Same per-class view, with one bar per Sex inside every class.
g = sns.barplot(train, x="Pclass", y="Survived", hue="Sex")
g.legend()

### 2-3) Embarked

In [None]:
# Rows of the combined frame whose Embarked value is missing.
dataset[dataset["Embarked"].isna()]

In [None]:
# Fill missing Embarked values with "S".
# NOTE(review): presumably "S" is the most frequent port - confirm with value_counts().
dataset["Embarked"] = dataset["Embarked"].fillna("S")

In [None]:
# Survived aggregated per Embarked value. This reads train, not dataset, so the
# fill applied to dataset in the previous cell is not reflected here.
sns.barplot(train, x="Embarked", y="Survived")

In [None]:
# Passenger counts per Embarked value, with side-by-side bars for each Pclass (training rows).
sns.histplot(train, x="Embarked", hue="Pclass", multiple="dodge", shrink=.9, discrete=True)

In [None]:
# One-hot encode Embarked; the new indicator columns are prefixed with "Em".
# NOTE: not idempotent - the Embarked column no longer exists after this cell.
dataset = pd.get_dummies(dataset, columns=["Embarked"], prefix="Em")

# 3. Filling missing Values

## 1) Age

In [None]:
# Age distribution against four candidate grouping variables, one panel each.
fig, ax = plt.subplots(2, 2, figsize=(12, 12))
age_panels = [
    (ax[0][0], {"x": "Sex"}),
    (ax[0][1], {"x": "Sex", "hue": "Pclass"}),
    (ax[1][0], {"x": "Parch"}),
    (ax[1][1], {"x": "SibSp"}),
]
for panel, grouping in age_panels:
    g = sns.boxplot(dataset, y="Age", ax=panel, **grouping)

In [None]:
# Encode Sex as an integer: male -> 0, female -> 1.
# NOTE: not idempotent - on a second run the values are 0/1, which are not keys
# of the mapping, so the column would turn into NaN.
dataset["Sex"] = dataset["Sex"].map({"male": 0, "female":1})

In [None]:
# Correlation of Age with the columns considered as predictors for imputing it.
g = sns.heatmap(dataset[["Age","Sex", "SibSp", "Parch", "Pclass"]].corr(), cmap="coolwarm", annot=True)

In [None]:
# Impute Age for the rows where it is actually missing.
# (`dataset["Age"].isnull().index` is the index of the whole boolean mask, i.e.
# every row - iterating over it overwrote the known ages as well.)
nan_age_indices = list(dataset.index[dataset["Age"].isnull()])

# Reference medians, computed once from the observed ages so that the result
# does not depend on the order in which rows are filled.
same_group_median = dataset.groupby(["Parch", "SibSp", "Pclass"])["Age"].transform("median")
same_pcls_median = dataset.groupby("Pclass")["Age"].transform("median")
age_med = dataset["Age"].median()

# Fallback chain for a missing Age:
#   1. median of passengers with the same Parch, SibSp and Pclass;
#   2. median of the same Pclass, when that group has no known age;
#   3. overall median, as a last resort.
# A single column assignment is used instead of `dataset["Age"].iloc[n] = ...`,
# which is chained assignment and may write to a temporary copy.
dataset["Age"] = (
    dataset["Age"]
    .fillna(same_group_median)
    .fillna(same_pcls_median)
    .fillna(age_med)
)

In [None]:
# Rows whose Age is still missing after the imputation cell.
dataset[dataset["Age"].isna()]

In [None]:
# Age distribution per outcome. This reads train, so it shows the ages as
# loaded, not the imputed values stored in dataset.
g = sns.violinplot(train, x="Survived", y="Age")

# 4. Feature Engineering

## 1) Name / Title

In [None]:
# Peek at the first few Name values.
dataset["Name"].head()

In [None]:
# Rows with a missing Name; the title extraction below calls str methods on every Name.
dataset[dataset["Name"].isna()]

In [None]:
# Extract the title: the text between the first comma and the following period.
# Assumes every Name contains a comma followed by a period - TODO confirm.
dataset_title = []
for full_name in dataset["Name"]:
    after_comma = full_name.split(',')[1]
    dataset_title.append(after_comma.split('.')[0].strip())
dataset["Title"] = pd.Series(dataset_title)
dataset["Title"].head()

In [None]:
# Number of passengers per extracted title; tick labels are rotated for readability.
plt.figure(figsize=(12,5))
g = sns.countplot(dataset, x="Title")
g.set_xticklabels(g.get_xticklabels(), rotation=30)
plt.show()

In [None]:
# Collapse the listed uncommon titles into a single "Rare" value.
dataset["Title"] = dataset["Title"].replace(["Don", "Rev", "Dr", "Major", "Lady", "Sir", "the Countess", "Jonkheer", "Dona", "Col", "Capt"], "Rare")
# Integer code per title group (Miss, Ms, Mme and Mlle share code 2).
# This column is only used for the plots below; it is dropped before modelling.
dataset["Title_int"] = dataset["Title"].map({"Mr": 0, "Mrs": 1, "Miss":2, "Master":3, "Ms":2, "Mme":2, "Mlle":2, "Rare":4})
# A title missing from the mapping above becomes NaN and makes this cast raise.
dataset["Title_int"] = dataset["Title_int"].astype(int)

In [None]:
# Left: passenger count per title group. Right: Survived aggregated per title group.
fig, ax = plt.subplots(1, 2, figsize=(16, 5))
title_labels = ["Mr", "Mrs", "MS/Miss","Master", "Rare"]
title_plots = (
    (sns.countplot, {}),
    (sns.barplot, {"y": "Survived"}),
)
for panel, (draw, extra_kwargs) in zip(ax, title_plots):
    g = draw(dataset, x="Title_int", ax=panel, **extra_kwargs)
    # Replace the integer codes on the x axis with readable, rotated names.
    g.set_xticklabels(title_labels, rotation=30)
plt.show()

In [None]:
# Name is no longer needed once the title has been extracted.
# NOTE: in-place and not idempotent - a second run raises because the column is gone.
dataset.drop(labels=["Name"], axis=1, inplace=True)

In [None]:
# One-hot encode the string Title column (not Title_int), so Miss, Ms, Mme and
# Mlle each get their own indicator column.
dataset = pd.get_dummies(dataset, columns=["Title"])

## 2) Family Size

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself.
dataset["FamilySize"] = dataset["SibSp"] + dataset["Parch"] + 1

In [None]:
# Survived aggregated per family size, drawn as a point plot.
sns.catplot(dataset, x="FamilySize", y="Survived", kind="point", aspect=1.5)

In [None]:
# Four mutually exclusive 0/1 indicators derived from FamilySize:
# exactly 1, exactly 2, strictly between 2 and 5, and above 4.
family_size = dataset["FamilySize"]
dataset["Single"] = (family_size == 1).astype("int64")
dataset["SmallF"] = (family_size == 2).astype("int64")
dataset["MidF"] = ((family_size > 2) & (family_size < 5)).astype("int64")
dataset["LargeF"] = (family_size > 4).astype("int64")

In [None]:
# Survived aggregated per value of each family-size indicator, one panel per indicator.
fig, ax = plt.subplots(2, 2, figsize=(12, 12))
family_flags = ("Single", "SmallF", "MidF", "LargeF")
for panel, flag in zip(ax.ravel(), family_flags):
    sns.barplot(dataset, x=flag, y="Survived", ax=panel)

## 3) Cabin

In [None]:
# Rows of the combined frame whose Cabin is missing.
dataset[dataset["Cabin"].isna()]

In [None]:
# Keep only the first character of each Cabin value; missing cabins become "X".
# The new Series is aligned to dataset by index label, which relies on dataset
# having the default 0..n-1 index.
dataset["Cabin"] = pd.Series([i[0] if not pd.isnull(i) else "X" for i in dataset["Cabin"]])

In [None]:
# Distinct Cabin letters present after the reduction.
set(dataset["Cabin"])

In [None]:
# Left: passenger count per Cabin letter. Right: Survived aggregated per Cabin letter.
deck_order = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'X']
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
count_panel, survival_panel = ax
sns.countplot(dataset, x="Cabin", order=deck_order, ax=count_panel)
sns.barplot(dataset, x="Cabin", y="Survived", order=deck_order, ax=survival_panel)
plt.show()

In [None]:
# One-hot encode the Cabin letter into Cabin_* indicator columns.
dataset = pd.get_dummies(dataset, columns=["Cabin"], prefix="Cabin")

## 4) Ticket

In [None]:
# Peek at the first few raw Ticket values.
dataset["Ticket"].head()

In [None]:
# Reduce each ticket to its prefix: purely numeric tickets become "X"; for the
# others, '.' and '/' are removed and the first space-separated token is kept.
ticket = [
    "X" if raw_ticket.isdigit()
    else raw_ticket.replace('.', "").replace("/", "").strip().split(' ')[0]
    for raw_ticket in dataset.Ticket
]

dataset["Ticket"] = ticket
dataset["Ticket"].head()

In [None]:
# One-hot encode the ticket prefix into T_* indicator columns.
dataset = pd.get_dummies(dataset, columns=["Ticket"], prefix="T")

# 5. Other labels

In [None]:
# Treat passenger class as a categorical label rather than a number.
dataset["Pclass"] = dataset["Pclass"].astype("category")
dataset["Pclass"]

In [None]:
# One-hot encode Pclass into Pcls_* indicator columns.
dataset = pd.get_dummies(dataset, columns=["Pclass"], prefix="Pcls")

In [None]:
# PassengerId is removed from the feature frame (the test ids were saved in IDtest).
# NOTE: in-place and not idempotent - a second run raises because the column is gone.
dataset.drop(labels=["PassengerId"], axis=1, inplace=True)

In [None]:
# Title_int was only needed for plotting; the one-hot Title_* columns carry the title.
dataset.drop(["Title_int"], axis=1,inplace=True)

In [None]:
# First rows of the fully engineered frame.
dataset.head()

In [None]:
# Final list of columns in the engineered frame.
dataset.columns

# 6. Dataset

In [None]:
# Split the engineered frame back into its training and test parts.
# Both are built as independent frames (copy / non-inplace drop) instead of
# mutating a slice of `dataset` in place, which triggers SettingWithCopyWarning
# and makes later column assignments on `test` unreliable.
train = dataset.iloc[:train_len].copy()
test = dataset.iloc[train_len:].drop(columns=["Survived"])

# Target as integers, features as everything except the target.
Y_train = train["Survived"].astype(int)
X_train = train.drop(columns=["Survived"])

# 7. Modeling

In [None]:
# 10-fold stratified splitter shared by every cross-validation and grid search below.
kfold = StratifiedKFold(n_splits=10)

In [None]:
# Candidate models compared with default hyper-parameters. The order of this
# list must match the algorithm names used when the scores are tabulated.
random_state = 2
classifiers = [
    SVC(random_state=random_state),
    DecisionTreeClassifier(random_state=random_state),
    AdaBoostClassifier(
        DecisionTreeClassifier(random_state=random_state),
        random_state=random_state,
        learning_rate=0.1,
    ),
    RandomForestClassifier(random_state=random_state),
    ExtraTreesClassifier(random_state=random_state),
    GradientBoostingClassifier(random_state=random_state),
    MLPClassifier(random_state=random_state),
    KNeighborsClassifier(),
    LogisticRegression(random_state=random_state),
    LinearDiscriminantAnalysis(),
]

In [None]:
# Per-fold accuracy of every candidate, in the same order as `classifiers`.
cv_results = [
    cross_val_score(classifier, X_train, y=Y_train, scoring="accuracy", cv=kfold, n_jobs=4)
    for classifier in classifiers
]

# Mean and standard deviation of the fold scores for each candidate.
cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

In [None]:
# Tabulate the cross-validation scores and plot the mean accuracy per algorithm,
# with the fold standard deviation as horizontal error bars.
algorithm_names = ["SVC", "DecisionTree", "AdaBoost", "RandomForest", "ExtraTrees",
                   "GradientBoosting", "MLP", "K-NN", "LogisticRegression", "LDA"]
cv_dict = {
    "CrossValMeans": cv_means,
    "CrossValerrors": cv_std,
    "Algorithm": algorithm_names,
}
cv_res = pd.DataFrame(cv_dict)
sns.barplot(cv_res, x="CrossValMeans", y="Algorithm", xerr=cv_std)

In [None]:
# Print the five algorithms with the highest mean cross-validation accuracy.
cls_order = np.argsort(cv_means)[::-1]
top_five = [(cv_dict["Algorithm"][position], cv_means[position]) for position in cls_order[:5]]
for algorithm, mean_accuracy in top_five:
    print(algorithm, mean_accuracy)

## 1) Hyperparameter tuning for selected models

### a. GradientBoosting

In [None]:
# Grid search over GradientBoostingClassifier hyper-parameters, scored by accuracy
# on the shared stratified folds.
# NOTE(review): the estimator has no random_state, so results can vary between runs.
GBC = GradientBoostingClassifier()
# NOTE(review): newer scikit-learn releases renamed the "deviance" loss to
# "log_loss" - confirm against the installed version.
gb_param_grid = {"loss": ["deviance"],
                "n_estimators": [100,200,300],
                "learning_rate": [0.1, 0.05, 0.01],
                 "max_depth": [4,8],
                 "min_samples_leaf": [100, 150],
                 "max_features": [0.3, 0.1]
                }

gs_gbc = GridSearchCV(GBC, param_grid=gb_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)
gs_gbc.fit(X_train, Y_train)

# Keep the best configuration found and report its mean cross-validated accuracy.
GBC_best = gs_gbc.best_estimator_
print(gs_gbc.best_score_)

### b. AdaBoost

In [None]:
# Grid search over AdaBoost with a decision tree as base learner, scored by
# accuracy on the shared stratified folds.
DTC = DecisionTreeClassifier()
adaDTC = AdaBoostClassifier(DTC, random_state=7)
ada_param_grid = {
    # "base_estimator__*" entries tune the wrapped decision tree.
    # NOTE(review): newer scikit-learn releases renamed this prefix to
    # "estimator__" - confirm against the installed version.
    "base_estimator__criterion": ["gini", "entropy"],
    "base_estimator__splitter": ["best", "random"],
    # The real-valued boosting variant is spelled "SAMME.R". The previous
    # "SAMME_R" is not a recognised value, so every candidate using it failed
    # to fit and was scored NaN (silently, since warnings are suppressed).
    "algorithm": ["SAMME", "SAMME.R"],
    "n_estimators": [1, 2],
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 1.5]
}

gs_adaDTC = GridSearchCV(adaDTC, param_grid=ada_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)
gs_adaDTC.fit(X_train, Y_train)

# Keep the best configuration found and report its mean cross-validated accuracy.
ada_best = gs_adaDTC.best_estimator_
print(gs_adaDTC.best_score_)

### c. ExtraTrees

In [None]:
# Grid search over ExtraTreesClassifier hyper-parameters, scored by accuracy on
# the shared stratified folds.
# NOTE(review): the estimator has no random_state, so results can vary between runs.
ExtC = ExtraTreesClassifier()

ex_param_grid = {
    "max_depth": [None],
    "max_features": [1,3,10],
    "min_samples_split": [2,3,10],
    "min_samples_leaf":[1,3,10],
    "bootstrap": [False],
    "n_estimators":[100,300],
    "criterion":["gini"]
}

gs_ExtC = GridSearchCV(ExtC, ex_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)
gs_ExtC.fit(X_train, Y_train)

# Keep the best configuration found and report its mean cross-validated accuracy.
ExtC_best = gs_ExtC.best_estimator_

print(gs_ExtC.best_score_)

### d. RandomForest

In [None]:
# Grid search over RandomForestClassifier hyper-parameters (same grid as the
# ExtraTrees search), scored by accuracy on the shared stratified folds.
# NOTE(review): the estimator has no random_state, so results can vary between runs.
RFC = RandomForestClassifier()

rf_param_grid = {"max_depth":[None],
                 "max_features":[1,3,10],
                 "min_samples_split": [2, 3, 10],
                 "min_samples_leaf": [1, 3, 10],
                 "bootstrap": [False],
                 "n_estimators": [100, 300],
                 "criterion": ["gini"]
                }

gs_RFC = GridSearchCV(RFC, rf_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)
gs_RFC.fit(X_train, Y_train)

# Keep the best configuration found and report its mean cross-validated accuracy.
RFC_best = gs_RFC.best_estimator_

print(gs_RFC.best_score_)

### e. SVC

In [None]:
# Grid search over an RBF-kernel SVC, scored by accuracy on the shared folds.
# probability=True enables predict_proba, which the soft-voting ensemble needs.
SVMC = SVC(probability=True)
svc_param_grid = {
    "kernel": ['rbf'],
    'gamma': [0.001, 0.01, 0.1, 1],
    "C": [1, 10, 50, 100, 200, 300, 1000]
}

gs_SVMC = GridSearchCV(SVMC, svc_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)
gs_SVMC.fit(X_train, Y_train)

# Keep the best configuration found and report its mean cross-validated accuracy.
# Note the naming differs from the other searches (best_SVMC vs. *_best).
best_SVMC = gs_SVMC.best_estimator_
print(gs_SVMC.best_score_)

## 2) Plot Learning Curves

In [None]:
def plot_lr_curve(estimator, title, X, y, ylim=None, cv=None, \
                  n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Draw the learning curve of ``estimator`` on a new pyplot figure.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.
    title : str
        Figure title.
    X, y : array-like
        Features and target handed to ``learning_curve``.
    ylim : tuple, optional
        When given, unpacked into ``plt.ylim`` to fix the y-axis limits.
    cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    plt.grid()

    # (fold scores, colour, legend label) for the two curves.
    curves = (
        (train_scores, "r", "Training score"),
        (test_scores, "g", "Cross-validation score"),
    )

    # First pass: shaded band of +/- one standard deviation across folds.
    averaged = []
    for fold_scores, colour, legend_label in curves:
        mean_score = np.mean(fold_scores, axis=1)
        std_score = np.std(fold_scores, axis=1)
        plt.fill_between(train_sizes, mean_score - std_score,
                         mean_score + std_score, alpha=0.1, color=colour)
        averaged.append((mean_score, colour, legend_label))

    # Second pass: the mean curves, drawn on top of both bands.
    for mean_score, colour, legend_label in averaged:
        plt.plot(train_sizes, mean_score, 'o-', color=colour, label=legend_label)
    plt.legend(loc="best")

    return plt

In [None]:
# One learning-curve figure per tuned model, in the same order as the grid searches.
learning_curve_specs = [
    (gs_gbc, "Gradient Boosting learning curve"),
    (gs_adaDTC, "AdaBoost learning curve"),
    (gs_ExtC, "ExtraTrees learning curve"),
    (gs_RFC, "RandomForest learning curve"),
    (gs_SVMC, "SVM learning curve"),
]
for search, curve_title in learning_curve_specs:
    g = plot_lr_curve(search.best_estimator_, curve_title, X_train, Y_train, cv=kfold)

## 3) Feature Importance

In [None]:
# Top-40 feature importances of the four tree-based tuned models, one panel each.
fig, ax = plt.subplots(2, 2, sharex="all", figsize=(15,15))
name_classifiers = [("AdaBoosting", ada_best), ("ExtraTrees", ExtC_best),
                   ("RandomForest", RFC_best), ("GradientBoosting", GBC_best)]

for n_classifier, (name, classifier) in enumerate(name_classifiers):
    # Fill the 2x2 grid row by row.
    row, col = divmod(n_classifier, 2)

    importances = classifier.feature_importances_
    # Positions of the 40 largest importances, largest first.
    indices = np.argsort(importances)[::-1][:40]

    g = sns.barplot(x=importances[indices], y=X_train.columns[indices],
                    orient="h", ax=ax[row][col])
    g.set_xlabel("Relative importance")
    g.set_ylabel("Features")
    g.tick_params(labelsize=9)
    g.set_title(name + " feature importance")
        

In [None]:
# Median Age over the rows of the combined frame flagged as third class.
nan_age = dataset[dataset["Pcls_3"]==True]["Age"].median()
# Fill any Age still missing in the test features with that median.
# NOTE(review): if test is a slice of dataset this assignment can raise
# SettingWithCopyWarning - confirm it takes effect.
test["Age"] = test["Age"].fillna(nan_age)

In [None]:
# Test-set predictions of each tuned model, as named Series.
ensemble_members = [("RFC", RFC_best), ("Ext", ExtC_best), ("SVM", best_SVMC),
                    ("Ada", ada_best), ("GBC", GBC_best)]
(test_survived_RFC, test_survived_ExtC, test_survived_SVMC,
 test_survived_adaC, test_survived_GBC) = [
    pd.Series(model.predict(test), name=column_name)
    for column_name, model in ensemble_members
]

# Correlation between the models' predictions: lower values mean the models
# disagree more often.
member_predictions = [test_survived_RFC, test_survived_ExtC, test_survived_SVMC,
                      test_survived_adaC, test_survived_GBC]
ensemble_results = pd.concat(member_predictions, axis=1)
g = sns.heatmap(ensemble_results.corr(), annot=True)

## 4) Ensemble Modeling

In [None]:
# Combine the five tuned models with soft voting (voting="soft"), which relies on
# each member's predicted probabilities - hence SVC was built with probability=True.
votingC = VotingClassifier(estimators=[("rfc", RFC_best), ('extc', ExtC_best), ("svm", best_SVMC),
                                      ("ada", ada_best), ("gcb", GBC_best)], voting="soft", n_jobs=4)
# Fit the ensemble on the full training set.
votingC = votingC.fit(X_train, Y_train)

In [None]:
# Ensemble predictions for the test set, as a Series with a fresh 0..n-1 index.
test_survived = pd.Series(votingC.predict(test), name="Survived")
# Column-wise concat aligns on index labels, so this pairs ids and predictions
# correctly only while IDtest also carries the default 0..n-1 index.
results = pd.concat([IDtest, test_survived], axis=1)

# Write the submission file to the working directory, without the index column.
results.to_csv("./submission.csv", index=False)