# Load Data

In [1]:
# define data load function from csv files with pandas
import pandas as pd

PATH = "datasets/titanic/"
FILE_GENDER_SUBMISSION = PATH + "gender_submission.csv"
FILE_TEST = PATH + "test.csv"
FILE_TRAIN = PATH + "train.csv"

def load_data():
    return pd.read_csv(FILE_TRAIN), pd.read_csv(FILE_TEST), pd.read_csv(FILE_GENDER_SUBMISSION)

In [2]:
# get data from file
data_train, data_test, data_gender_submission = load_data()
ori_data_train, ori_data_test = data_train.copy(), data_test.copy()

OSError: File b'datasets/titanic/train.csv' does not exist

# View Data

In [None]:
data_train.head()

In [None]:
data_train.info()

In [None]:
data_train["Embarked"].value_counts()

In [None]:
data_train["Ticket"].value_counts()

In [None]:
data_train["Cabin"].value_counts()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
data_train.hist(figsize=(14,10))

In [None]:
data_test.head()

In [None]:
data_test.info()

In [None]:
data_test.hist(figsize=(14,10))

In [None]:
data_gender_submission.head()

In [None]:
data_gender_submission.info()

In [None]:
data_gender_submission.hist(figsize=(8,4))

In [None]:
corr_matrix = data_train.corr()

In [None]:
corr_matrix["Survived"].sort_values(ascending=False)

## Conclusion
* none of the data features are symmetric
* some data samples are missing / NaN
* some attributes have to be converted from labels to an proper encoding

# Preprocess Data

## Encoding

Value | Label  | Encoding
----- | ------ | --------
Sex:  | male   | 1
      | female | 0
      
Value | Label  | Encoding
----- | ------ | --------
Embarked: | Southampton | 1 0 0
          | Queenstown  | 0 1 0
          | Cherbourg   | 0 0 1

The Cabin number will be removed and the encoding will be as shown below: 

Value | Label  | Encoding
----- | ------ | --------
Cabin: | A | 1
       | B | 2
       | C | 3
       | D | 4
       | E | 5
       | F | 6
       | G | 7

The age and fare attribute will be re-scaled / normalized.

The name, passengerid and ticket attribute will be dropped!

In [None]:
# drop unused attributes
for set_ in (data_train, data_test):
    set_.drop("Name", axis=1, inplace=True)
    set_.drop("Ticket", axis=1, inplace=True)
    set_.drop("PassengerId", axis=1, inplace=True)

In [None]:
# verify drop
data_train.head()

In [None]:
# set gender 
for set_ in (data_train, data_test):
    set_["Sex"] = (set_["Sex"] == "male").astype(int)

In [None]:
# verify gender set
data_train.head()

In [None]:
# encode embarked
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()

In [None]:
# get embarked labels
import numpy as np

for set_ in (data_train, data_test):
    set_["Embarked"] = set_["Embarked"].replace(np.nan, "X")

In [None]:
embarked_labels = set(data_train["Embarked"])
print(embarked_labels)
print(set(data_test["Embarked"]))

In [None]:
for set_ in (data_train, data_test):
    embarked_encoded = set_["Embarked"].str.get_dummies()
    set_["Embarked_C"] = embarked_encoded["C"]
    set_["Embarked_Q"] = embarked_encoded["Q"]
    set_["Embarked_S"] = embarked_encoded["S"]

In [None]:
data_train = data_train.drop("Embarked", axis=1)
data_test = data_test.drop("Embarked", axis=1)

data_train.head()

In [None]:
for set_ in (data_train, data_test):
    set_["Cabin"] = set_["Cabin"].replace(np.nan, "X")

data_train["Cabin"] = data_train["Cabin"].str[:1]
data_test["Cabin"] = data_test["Cabin"].str[:1]

In [None]:
print(set(data_train["Cabin"]))
print(set(data_test["Cabin"]))

In [None]:
for set_ in (data_train, data_test):
    embarked_encoded = set_["Cabin"].str.get_dummies()
    set_["Cabin_C"] = embarked_encoded["C"]
    set_["Cabin_D"] = embarked_encoded["D"]
    set_["Cabin_F"] = embarked_encoded["F"]
    set_["Cabin_B"] = embarked_encoded["B"] 
    set_["Cabin_A"] = embarked_encoded["A"]
    set_["Cabin_E"] = embarked_encoded["E"]
    set_["Cabin_G"] = embarked_encoded["G"]

In [None]:
data_train = data_train.drop("Cabin", axis=1)
data_test = data_test.drop("Cabin", axis=1)

In [None]:
data_train = data_train.fillna(data_train.mean())
data_test = data_test.fillna(data_test.mean())
data_train.info()
data_test.info()

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
for set_ in (data_train, data_test):
    set_[["Age", "Fare"]] = scaler.fit_transform(set_[["Age", "Fare"]])

In [None]:
data_train.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()

for set_ in (data_train, data_test):
    embarked_encoded = pd.DataFrame(encoder.fit_transform(set_["Pclass"].values.reshape(-1, 1)).toarray())
    set_["Pclass_1"] = embarked_encoded[0]
    set_["Pclass_2"] = embarked_encoded[1]
    set_["Pclass_3"] = embarked_encoded[2]

In [None]:
data_train = data_train.drop("Pclass", axis=1)
data_test = data_test.drop("Pclass", axis=1)

In [None]:
data_train.head()

In [None]:
data_test.head()

In [None]:
X_train = data_train.drop("Survived", axis=1)
y_train = data_train["Survived"]

In [None]:
X_train.head()

In [None]:
y_train.head()

# Feature Engineering

In [None]:
# view 
corr_matrix = ori_data_train.corr()
corr_matrix["Survived"].sort_values(ascending=False)

In [None]:
ax = ori_data_train.plot(figsize=(12, 8), kind="scatter", x="Age", y="Fare", alpha=0.3,
            c=ori_data_train["Survived"], cmap=plt.get_cmap("PiYG"), colorbar=True)
# set the xaxis label
plt.setp(ax.get_xticklabels(), visible=True, rotation=0, ha='right')
# turn off minor ticks
plt.minorticks_off()
plt.show()

In [None]:
ax = ori_data_train.plot(figsize=(12, 8), kind="scatter", x="Age", y="Fare", alpha=0.3,
            c=ori_data_train["Survived"], cmap=plt.get_cmap("PiYG"), colorbar=True)
plt.axis([0, 80, 0, 50])
# set the xaxis label
plt.setp(ax.get_xticklabels(), visible=True, rotation=0, ha='right')
# turn off minor ticks
plt.minorticks_off()
plt.show()

In [None]:
classes = ori_data_train.groupby(['Pclass']).count()
classes

# Feature Selection

In [None]:
# filter columns to see if models perfom better

#X_train = X_train.drop("Fare", axis=1)

#X_train = X_train.drop("Embarked_C", axis=1)
#X_train = X_train.drop("Embarked_Q", axis=1)
#X_train = X_train.drop("Embarked_S", axis=1)

#X_train = X_train.drop("Cabin_C", axis=1)
#X_train = X_train.drop("Cabin_D", axis=1)
#X_train = X_train.drop("Cabin_F", axis=1)
#X_train = X_train.drop("Cabin_B", axis=1)
#X_train = X_train.drop("Cabin_A", axis=1)
#X_train = X_train.drop("Cabin_E", axis=1)
#X_train = X_train.drop("Cabin_G", axis=1)

#X_train = X_train.drop("Pclass_1", axis=1)
#X_train = X_train.drop("Pclass_2", axis=1)
#X_train = X_train.drop("Pclass_3", axis=1)

# Train Models

## Train Random Forest Classifier

In [None]:
# try another model using Random Forests which builds a model on top of many models, also called Ensemble Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

forest_clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
y_train_pred_forest = cross_val_predict(forest_clf, X_train, y_train, cv=3)

In [None]:
from sklearn.metrics import confusion_matrix

y_train_pred = cross_val_predict(forest_clf, X_train, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)

## Train SVM Classifier

In [None]:
from sklearn.svm import SVC

svc_clf = SVC()
y_svc_train_pred = cross_val_predict(svc_clf, X_train, y_train, cv=3)
confusion_matrix(y_train, y_svc_train_pred)

## Train KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=4)
y_svc_train_pred = cross_val_predict(knn_clf, X_train, y_train, cv=3)
confusion_matrix(y_train, y_svc_train_pred)

## Train MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

nn_clf = MLPClassifier(solver="lbfgs", alpha=1)
y_train_pred = cross_val_predict(nn_clf, X_train, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)

## Train Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(max_depth=5)
y_train_pred = cross_val_predict(dt_clf, X_train, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)

## Train Gaussian Process Classifier

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

gp_clf = GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)
y_train_pred = cross_val_predict(dt_clf, X_train, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)

# Models Evaluation

## Analyse Random Forest Classifier

In [None]:
from sklearn.metrics import precision_score, recall_score

precisions = precision_score(y_train, y_train_pred_forest)
print("Precision:", precisions)
recalls = recall_score(y_train, y_train_pred_forest)
print("Recall:", recalls)

In [None]:
from sklearn.metrics import precision_recall_curve

y_probas_forest = cross_val_predict(forest_clf, X_train, y_train, cv=3,
                             method="predict_proba")
y_scores_forest = y_probas_forest[:, 1]
precisions_forest, recalls_forest, thresholds_forest = precision_recall_curve(y_train, y_scores_forest)

In [None]:
def plot_precision_vs_recall(precisions, recalls):
    plt.plot(recalls, precisions, "b-")
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])

In [None]:
plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions_forest, recalls_forest)
plt.show()

In [None]:
from sklearn.metrics import roc_curve

y_probas_forest = cross_val_predict(forest_clf, X_train, y_train, cv=3,
                             method="predict_proba")
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train, y_scores_forest)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

In [None]:
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="lower left", fontsize=16)
    plt.ylim([0, 1])

In [None]:
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores_forest)
plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

## Analyse SVM

In [None]:
svc_clf = SVC(gamma=0.1, C=1, kernel="rbf")

y_scores = cross_val_predict(svc_clf, X_train, y_train, cv=3,
                             method="decision_function")
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

In [None]:
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores_forest)
plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions_forest, recalls_forest)
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr_forest, tpr_forest)
plt.show()

# Fine-Tune Models

## Tune SVM

In [None]:
from sklearn.model_selection import GridSearchCV

svc_param_clf = [
    {'kernel': ["linear", "rbf"], 'C': [10**-5, 10**-3, 0.01, 0.1, 1, 10, 30, 50, 100], 'gamma': [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2]}
]

svc_clf = SVC()

grid_search_svr = GridSearchCV(svc_clf, svc_param_clf, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=4)
grid_search_svr.fit(X_train, y_train)
print(grid_search_svr.best_params_)
print(grid_search_svr.best_estimator_)


In [None]:
svc_clf = SVC(gamma=0.1, C=1, kernel="rbf")

y_svc_train_pred = cross_val_predict(svc_clf, X_train, y_train, cv=3)
confusion_matrix(y_train, y_svc_train_pred)

In [None]:
svc_clf = SVC(gamma=0.1, C=1, kernel="rbf")
svc_clf.fit(X_train, y_train)

y_scores = cross_val_predict(svc_clf, X_train, y_train, cv=3,
                             method="decision_function")
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores_forest)
plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions_forest, recalls_forest)
plt.show()

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr_forest, tpr_forest)
plt.show()

from sklearn.metrics import f1_score
y_pred = cross_val_predict(svc_clf, X_train, y_train, cv=3)
f1_score(y_train, y_pred)

# Test Model

In [None]:
data_test.head()

In [None]:
ori_data_test.head()

In [None]:
predictions = svc_clf.predict(data_test)

In [None]:
result = pd.DataFrame(np.c_[ori_data_test['PassengerId'], predictions])
result.rename(columns={0: 'PassengerId', 1: 'Survived'}, inplace=True)

In [None]:
# write to csv file for submission
result.to_csv(PATH + 'result.csv', index=False)