In [9]:
# ===========================================================================
# ============ LIBRARIES IMPORT, CONSTANT and HELPER FUNCTION ===============

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import warnings


# Print wider: raise pandas' console width so more of a dataframe fits on screen
DESIRED_WIDTH = 500
pd.set_option("display.width", DESIRED_WIDTH)

# Silence the noisy ConvergenceWarning (filtered by its "The max_iter..." message)
warnings.filterwarnings(action='ignore', message='The max_iter.*')


# Little re-used helper: prints wider, so that we see more rows and columns
def print_wilder(to_be_printed):
    """Print ``to_be_printed`` with pandas' display limits temporarily relaxed.

    For the duration of the call only, ``display.max_rows`` is set to ``None``
    and ``display.max_columns`` to 205; ``pd.option_context`` restores the
    previous values afterwards.

    :param to_be_printed: object handed to ``print`` (dataframes in this script).
    """
    temporary_display_options = (
        "display.max_rows", None,
        "display.max_columns", 205,
    )
    with pd.option_context(*temporary_display_options):
        print(to_be_printed)


# Re-used helper: cross-validates a model once and prints several metrics for it
def cross_validate_and_metrics(name_of_model, trained_model, X=None, y=None, cv=5):
    """Cross-validate a classifier and print accuracy, F1, precision and ROC AUC.

    All four metrics are computed in a single ``cross_validate`` run, so the
    model is fitted once per fold instead of once per fold *and* per metric.
    Each metric is printed as ``mean (+/- 2 * standard deviation)`` over the folds.

    :param name_of_model: label used in the printed header.
    :param trained_model: scikit-learn estimator to evaluate.
    :param X: feature matrix; defaults to the module-level ``X_train``.
    :param y: target vector; defaults to the module-level ``y_train``.
    :param cv: cross-validation setting forwarded to scikit-learn (default 5).
    :return: None; the metrics are only printed.
    """
    # Local import so the helper is self-contained (the file only imports cross_val_score).
    from sklearn.model_selection import cross_validate

    # Fall back to the script's global training arrays, as the original helper did.
    features = X_train if X is None else X
    target = y_train if y is None else y

    # printed label -> scikit-learn scorer name (insertion order = print order)
    metrics = {
        "Accuracy": "accuracy",
        "F1": "f1",
        "Precision": "precision",
        "ROC AUC": "roc_auc",
    }
    results = cross_validate(trained_model, features, target, cv=cv, scoring=list(metrics.values()))

    print()
    print(f"Metrics for {name_of_model}:")
    for label, scorer in metrics.items():
        fold_scores = results[f"test_{scorer}"]
        print(f"    {label}: {fold_scores.mean():.2f} (+/- {fold_scores.std() * 2:.2f})")


# ===========================================================================
# ================= IMPORTING DATA, OVERVIEW, CHOOSING FEATURES  ============

# load data
train = pd.read_csv("trainTitanic.csv")
test = pd.read_csv("testTitanic.csv")
# save PassengerId for final submission
passengerId = test.PassengerId

# merge train and test, to avoid doing every preprocessing step twice.
# pd.concat is used because DataFrame.append was deprecated and then removed from pandas.
df = pd.concat([train, test], ignore_index=True, sort=False)
# create indexes to separate data later on:
#   train_idx is the label of the LAST train row, test_idx the position of the FIRST test row
train_idx = len(train) - 1
test_idx = len(df) - len(test)

print_wilder(df.describe())  # Among other things, good to find features with NaN

# Choosing baseline features
features_kept = ["Pclass", "Sex", "Age", "Fare", "Embarked", "Survived", "Name", "Cabin", "Parch", "SibSp"]
all_data_baseline = df.copy()
# explicit copy: data_baseline gets new columns assigned to it further down
data_baseline = all_data_baseline.loc[:, features_kept].copy()


# ===========================================================================
# ============== FEATURES ENGINEERING, IMPUTING MISSING DATA ================

# create a new feature to extract title names from the Name column
# (the text between the first comma and the following dot, stripped of spaces)
data_baseline["Title"] = data_baseline["Name"].apply(lambda name: name.split(",")[1].split(".")[0].strip())

# raw title -> normalized title category
normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir":       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess": "Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr":        "Mr",
    "Mrs":       "Mrs",
    "Miss":      "Miss",
    "Master":    "Master",
    "Lady":      "Royalty"
}

# map the normalized titles to the current titles (a title missing from the dict becomes NaN)
data_baseline["Title"] = data_baseline["Title"].map(normalized_titles)

# group by Sex, Pclass, and Title
subGroupFeatures = data_baseline.groupby(["Sex", "Pclass", "Title"])

# fill the Age NaN with the median Age of the passenger's (Sex, Pclass, Title) group.
# transform() returns a Series aligned on the original row index, so it can be used
# directly in fillna; groupby.apply may prepend the group keys to the index (newer
# pandas), and the result can then no longer be assigned back as a column.
group_median_age = subGroupFeatures["Age"].transform("median")
data_baseline["Age"] = data_baseline["Age"].fillna(group_median_age)
# fallback: rows whose group has no known Age at all get the overall median
data_baseline["Age"] = data_baseline["Age"].fillna(data_baseline["Age"].median())

# fill Cabin NaN with U for unknown
data_baseline["Cabin"] = data_baseline["Cabin"].fillna('U')
# find most frequent Embarked value and store it
most_embarked = data_baseline["Embarked"].value_counts().index[0]

# fill NaN with most_embarked value
data_baseline["Embarked"] = data_baseline["Embarked"].fillna(most_embarked)
# fill NaN with median fare
data_baseline["Fare"] = data_baseline["Fare"].fillna(data_baseline["Fare"].median())

# size of families (including the passenger)
data_baseline["FamilySize"] = data_baseline["Parch"] + data_baseline["SibSp"] + 1
data_baseline = data_baseline.drop(["Parch", "SibSp"], axis=1)  # axis=1 because it's a column
# family size -> category; sizes absent from this table (9, 10) would become NaN
family_size_cat = {1: "Single", 2: "Small", 3: "Small", 4: "Small", 5: "Large", 6: "Large", 7: "Large", 8: "Large",
                   11: "Large"}
data_baseline["FamilySize"] = data_baseline["FamilySize"].map(family_size_cat)

# keep only the first letter of the cabin (first letter is the section)
data_baseline["Cabin"] = data_baseline["Cabin"].str[0]


# ===========================================================================
# =================== DUMMIFYING CATEGORICAL VARIABLES ======================

# One-hot encode the categorical variables Sex, Embarked, Title, FamilySize, Cabin.
# drop_first leaves us with n-1 dummies per variable.
dummies_sex = pd.get_dummies(data_baseline["Sex"], prefix="Sex", drop_first=True)
dummies_embarked = pd.get_dummies(data_baseline["Embarked"], prefix="Embarked", drop_first=True)
dummies_title = pd.get_dummies(data_baseline["Title"], prefix="Title", drop_first=True)
dummies_family = pd.get_dummies(data_baseline["FamilySize"], prefix="Family", drop_first=True)
dummies_cabin = pd.get_dummies(data_baseline["Cabin"], prefix="Cabin", drop_first=True)

# Append the dummy columns to the right of the frame, then remove the text
# columns they replace (plus Name, which is not fed to the models).
encoded_columns = [dummies_sex, dummies_embarked, dummies_title, dummies_family, dummies_cabin]
columns_to_remove = ["Sex", "Embarked", "Title", "Name", "Cabin", "FamilySize"]
data = pd.concat([data_baseline, *encoded_columns], axis=1).drop(columns=columns_to_remove)
print_wilder(data.head())


# ===========================================================================
# ======================== RE SPLIT OF TRAIN/TEST ===========================

# create train and test data.
# .loc slices by label and INCLUDES train_idx (the last train row);
# the plain slice is positional and starts at test_idx (the first test row).
df_train = data.loc[:train_idx].copy()
test = data[test_idx:]
# convert Survived back to int on the frame that actually feeds the models: the
# column is float after the merge because the test rows carry no label (NaN).
# (Converting the raw `train` frame, as done before, left y_train as floats.)
df_train["Survived"] = df_train["Survived"].astype(int)

# create X and y for data and target values.
# astype(float) guarantees a numeric matrix even when the dummy columns are bool.
X_train = df_train.drop("Survived", axis=1).values.astype(float)
y_train = df_train["Survived"].values
# create array for test set
X_test = test.drop("Survived", axis=1).values.astype(float)


# ===========================================================================
# =============== MODEL 1: SIMPLE REGRESSION AS BASELINE ====================

# Baseline: logistic regression with the SAG solver and a fixed seed; the estimator
# is handed unfitted to the cross-validation helper.
# NOTE(review): the features are not scaled before this solver, and the file silences
# a ConvergenceWarning at the top - consider scaling; to be confirmed.
trained_model_regression = LogisticRegression(solver='sag', random_state=0)
cross_validate_and_metrics(name_of_model="Regression", trained_model=trained_model_regression)


# ===========================================================================
# ====================== MODEL 2: RANDOM FOREST =============================

# Shallow random forest (100 trees of depth 2, fixed seed), scored with the cross-validation helper
trained_model_random_forest = RandomForestClassifier(random_state=0, max_depth=2, n_estimators=100)
cross_validate_and_metrics(name_of_model="Random forest", trained_model=trained_model_random_forest)


# ===========================================================================
# ======================== MODEL 3: ADABOOST ================================

# AdaBoost with the library's default settings, scored with the cross-validation helper
trained_model_adaboost = AdaBoostClassifier()
cross_validate_and_metrics(name_of_model="Adaboost", trained_model=trained_model_adaboost)


# ===========================================================================
# ==================== MODEL 4: GRADIENT BOOSTING ===========================

# Gradient boosting with hand-picked hyper-parameters.
# `loss` is deliberately left at its default: the value previously spelled out
# ("deviance") is that default, and it was later renamed in scikit-learn, so
# omitting it builds the same model on old and new releases alike.
trained_model_gradient = GradientBoostingClassifier(criterion="friedman_mse", learning_rate=0.2,
                                                    max_depth=3, max_features="sqrt", min_samples_leaf=0.1,
                                                    min_samples_split=0.17273, n_estimators=10, subsample=1.0)
cross_validate_and_metrics("Gradient boosting", trained_model_gradient)


# ===========================================================================
# =========================== MODEL 5: SVM ==================================

# Support vector classifier with the library's default settings, scored with the cross-validation helper
trained_model_svm = SVC()
cross_validate_and_metrics(name_of_model="SVM", trained_model=trained_model_svm)


# ===========================================================================
# ============================ SUBMISSION ===================================

# re-training on the full training set before the final predict

# Tuned random forest. NOTE: it is fitted here but NOT used for the prediction below.
# Only the non-default hyper-parameters are spelled out: explicitly passing defaults
# such as min_impurity_split=None or max_features='auto' breaks on scikit-learn
# releases that no longer accept them, without changing the model on older ones.
trained_model_random_forest = RandomForestClassifier(max_depth=12, min_samples_leaf=4, min_samples_split=10,
                                                     n_estimators=50).fit(X_train, y_train)

# model actually used for the submission
trained_model_adaboost = AdaBoostClassifier().fit(X_train, y_train)

# cast to int so the file contains 0/1 labels even if the targets were stored as floats
y_pred_final = trained_model_adaboost.predict(X_test).astype(int)
submission = pd.DataFrame({"PassengerId": passengerId, "Survived": y_pred_final})

filename = "Titanic-Submission.csv"
submission.to_csv(filename, encoding="utf-8", index=False)


       PassengerId    Survived       Pclass          Age        SibSp        Parch         Fare
count  1309.000000  891.000000  1309.000000  1046.000000  1309.000000  1309.000000  1308.000000
mean    655.000000    0.383838     2.294882    29.881138     0.498854     0.385027    33.295479
std     378.020061    0.486592     0.837836    14.413493     1.041658     0.865560    51.758668
min       1.000000    0.000000     1.000000     0.170000     0.000000     0.000000     0.000000
25%     328.000000    0.000000     2.000000    21.000000     0.000000     0.000000     7.895800
50%     655.000000    0.000000     3.000000    28.000000     0.000000     0.000000    14.454200
75%     982.000000    1.000000     3.000000    39.000000     1.000000     0.000000    31.275000
max    1309.000000    1.000000     3.000000    80.000000     8.000000     9.000000   512.329200
   Pclass   Age     Fare  Survived  Sex_male  Embarked_Q  Embarked_S  Title_Miss  Title_Mr  Title_Mrs  Title_Officer  Title_Royalty  Fam