## Competition Description
The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.

One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.

In this challenge, we ask you to complete the analysis of what sorts of people were likely to survive. In particular, we ask you to apply the tools of machine learning to predict which passengers survived the tragedy.

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
train_data = pd.read_csv("train.csv")

In [3]:
# 抽离Label项：Survived
y_train = train_data["Survived"].copy()
x_train = train_data.drop(columns=["Survived"])

In [4]:
# 数据预处理 num pipeline
from sklearn.base import BaseEstimator
class NumberPreprocesser(BaseEstimator):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # 数字项：Pclass, Age, SibSp, Parch, Fare
        x_num = X[["Pclass", "Age", "Fare", "Sex"]].copy()
        # 调整船舱级别数值，越高级数字越高
        x_num["Pclass"].replace({1:3, 3:1}, inplace=True)
        # Sex 分类转换为数字项
        x_num["Sex"].replace({"male":0, "female":1}, inplace=True)
        # 添加计算属性
        x_num["Parch_b"] = X["Parch"] > 0
        x_num["SibSp_b"] = X["SibSp"] > 0
        # x_train_num["single_dog"] = (X["Parch"] == 0) & (X["SibSp"] == 0)
        x_num["Has_family"] = (X["Parch"] > 0) | (X["SibSp"] > 0)
        return x_num

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler
num_pipeline = Pipeline([
    ("NumberPreprecess", NumberPreprocesser()),
    ("Imputer", Imputer(strategy="median")),
#     ("SandardScaler", StandardScaler()), # 考虑这里加上Scaler
])
num_attribs = ["Pclass", "Age", "Fare", "Sex", "Parch_b", "SibSp_b", "Has_family"]
# x_train_num = num_pipeline.fit_transform(x_train)
# x_num_df = pd.DataFrame(x_train_num, columns=num_attribs)
# x_num_df.info()

In [6]:
# 分类数据预处理
class CagetoryPreprosser(BaseEstimator):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # process Embarked category
        embarked = X[["Embarked"]].copy()
        embarked = pd.get_dummies(embarked)
        # process Cabin
        cabin = x_train["Cabin"].copy()
        cabin.replace(to_replace="A.*", value="A", regex=True, inplace=True)
        cabin.replace(to_replace="B.*", value="B", regex=True, inplace=True)
        cabin.replace(to_replace="C.*", value="C", regex=True, inplace=True)
        cabin.replace(to_replace="D.*", value="D", regex=True, inplace=True)
        cabin.replace(to_replace="E.*", value="E", regex=True, inplace=True)
        cabin.replace(to_replace="F.*", value="F", regex=True, inplace=True)
        cabin.replace(to_replace="G.*", value="G", regex=True, inplace=True)
        cabin.replace(to_replace="T.*", value="T", regex=True, inplace=True)
        cabin = pd.get_dummies(cabin)
#         return np.hstack((embarked.values, cabin.values))
        return embarked.values

In [7]:
cat_pipeline = Pipeline([
    ("CagetoryPreprosser", CagetoryPreprosser()),
])

cat_attribs = ["Embarked_C","Embarked_Q","Embarked_S"]
# x_train_cat = cat_pipeline.fit_transform(x_train)
# x_cat_df = pd.DataFrame(x_train_cat, columns = cat_attribs)
# x_cat_df.info()

In [8]:
# 数据预处理 Full  pipeline
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
x_train_prepared = full_pipeline.fit_transform(x_train)
full_attribs = num_attribs + cat_attribs
x_prepared_df = pd.DataFrame(x_train_prepared, columns = full_attribs)
x_prepared_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Pclass        891 non-null float64
Age           891 non-null float64
Fare          891 non-null float64
Sex           891 non-null float64
Parch_b       891 non-null float64
SibSp_b       891 non-null float64
Has_family    891 non-null float64
Embarked_C    891 non-null float64
Embarked_Q    891 non-null float64
Embarked_S    891 non-null float64
dtypes: float64(10)
memory usage: 69.7 KB


In [9]:
x_prepared_df.head()

Unnamed: 0,Pclass,Age,Fare,Sex,Parch_b,SibSp_b,Has_family,Embarked_C,Embarked_Q,Embarked_S
0,1.0,22.0,7.25,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,3.0,38.0,71.2833,1.0,0.0,1.0,1.0,1.0,0.0,0.0
2,1.0,26.0,7.925,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3.0,35.0,53.1,1.0,0.0,1.0,1.0,0.0,0.0,1.0
4,1.0,35.0,8.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Try Different Model

In [10]:
# Cross Validation
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [11]:
# SGDClassifier
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier()
sgd_clf.fit(x_train_prepared, y_train)

y_train_pred = cross_val_predict(sgd_clf, x_train_prepared, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)

print("precision:", precision_score(y_train, y_train_pred))
print("recall:", recall_score(y_train, y_train_pred))
print("f1:", f1_score(y_train, y_train_pred))

cv_score = cross_val_score(sgd_clf, x_train_prepared, y_train, cv=3, scoring="accuracy")
print("cross_val_score:", cv_score)

precision: 0.42
recall: 0.5526315789473685
f1: 0.4772727272727273
cross_val_score: [0.43097643 0.68350168 0.4040404 ]


In [12]:
# RandomForestTree
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(x_train_prepared, y_train)

y_train_pred = cross_val_predict(forest_clf, x_train_prepared, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)

print("precision:", precision_score(y_train, y_train_pred))
print("recall:", recall_score(y_train, y_train_pred))
print("f1:", f1_score(y_train, y_train_pred))

cv_score = cross_val_score(forest_clf, x_train_prepared, y_train, cv=3, scoring="accuracy")
print("cross_val_score:", cv_score)

precision: 0.7412140575079872
recall: 0.6783625730994152
f1: 0.7083969465648854
cross_val_score: [0.77104377 0.79124579 0.79461279]


In [85]:
# Randomize Search CV
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint as sp_randint

# param_dict = {
#     "n_estimators": np.arange(1, 100, 3),
#     "max_features": np.arange(1, len(full_attribs) + 1, 1),
#     "bootstrap": [True, False],
# }
# randomized_search = RandomizedSearchCV(forest_clf, param_distributions=param_dict, n_iter=10, cv=3, scoring="f1", return_train_score=True)
# randomized_search.fit(x_train_prepared, y_train)
# randomized_search.cv_results_
# for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
#     print(mean_score, params)

In [14]:
# GridSearchCV
from sklearn.model_selection import GridSearchCV
param_grid = [
    {
    "n_estimators": [100, 300, 1000, 3000, 10000],
    "max_features": [8, 9, 10],
    "bootstrap": [True],
    },
]
grid_search = GridSearchCV(forest_clf, param_grid, cv=3, scoring="f1", return_train_score=True, n_jobs=3)
grid_search.fit(x_train_prepared, y_train)
cvres = grid_search.cv_results_
for mean_score, params in sorted(zip(cvres['mean_test_score'], cvres['params']), key=lambda x: x[0], reverse=True):
    print(mean_score, params)

0.7423924066801599 {'bootstrap': True, 'max_features': 10, 'n_estimators': 10000}
0.7412881634234504 {'bootstrap': True, 'max_features': 10, 'n_estimators': 3000}
0.7405567905811394 {'bootstrap': True, 'max_features': 10, 'n_estimators': 1000}
0.7384988132862503 {'bootstrap': True, 'max_features': 8, 'n_estimators': 1000}
0.7383937868835224 {'bootstrap': True, 'max_features': 9, 'n_estimators': 1000}
0.7373981651267787 {'bootstrap': True, 'max_features': 8, 'n_estimators': 3000}
0.7366895704150607 {'bootstrap': True, 'max_features': 9, 'n_estimators': 3000}
0.7365204515244805 {'bootstrap': True, 'max_features': 9, 'n_estimators': 10000}
0.7363172070565733 {'bootstrap': True, 'max_features': 8, 'n_estimators': 10000}
0.7357852778433378 {'bootstrap': True, 'max_features': 9, 'n_estimators': 300}
0.735575485799701 {'bootstrap': True, 'max_features': 10, 'n_estimators': 300}
0.7288692508015012 {'bootstrap': True, 'max_features': 8, 'n_estimators': 100}
0.7285513131101365 {'bootstrap': True

In [106]:
feature_importances = grid_search.best_estimator_.feature_importances_
sorted(zip(feature_importances, full_attribs), reverse=True)

[(0.3063945858677551, 'Sex'),
 (0.263049328444382, 'Fare'),
 (0.2462522455919866, 'Age'),
 (0.11233433102644412, 'Pclass'),
 (0.015971489372220656, 'Parch_b'),
 (0.01549810260578968, 'Embarked_S'),
 (0.013101673398162825, 'SibSp_b'),
 (0.010880133909198276, 'Embarked_C'),
 (0.009932244688338632, 'Has_family'),
 (0.006585865095724557, 'Embarked_Q')]

In [128]:
# Nerual Network
from sklearn.neural_network import MLPClassifier
mlp_nb_clf = MLPClassifier(solver='lbfgs')

param_grid = [
    {
#         "solver": ['lbfgs'],
        "alpha":[0.03, 0.1, 0.3, 1.0],
        "learning_rate": ['constant', 'invscaling', 'adaptive'],
        "activation": ['logistic', 'relu'],
        "hidden_layer_sizes": [3, 10, 30],
    },
]
grid_search = GridSearchCV(mlp_nb_clf, param_grid, cv=3, scoring="f1", return_train_score=True, n_jobs=3)
grid_search.fit(x_train_prepared, y_train)
cvres = grid_search.cv_results_
for mean_score, params in sorted(zip(cvres['mean_test_score'], cvres['params']), key=lambda x: x[0], reverse=True):
    print(mean_score, params)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.7240640813654924 {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': 3, 'learning_rate': 'constant'}
0.7209210751583631 {'activation': 'relu', 'alpha': 0.1, 'hidden_layer_sizes': 30, 'learning_rate': 'adaptive'}
0.7206441119063111 {'activation': 'relu', 'alpha': 0.03, 'hidden_layer_sizes': 10, 'learning_rate': 'constant'}
0.7205534764810477 {'activation': 'relu', 'alpha': 0.03, 'hidden_layer_sizes': 10, 'learning_rate': 'adaptive'}
0.7196887731249583 {'activation': 'logistic', 'alpha': 1.0, 'hidden_layer_sizes': 30, 'learning_rate': 'invscaling'}
0.7195849695849694 {'activation': 'relu', 'alpha': 0.1, 'hidden_layer_sizes': 3, 'learning_rate': 'adaptive'}
0.7188497880846091 {'activation': 'logistic', 'alpha': 0.3, 'hidden_layer_sizes': 30, 'learning_rate': 'invscaling'}
0.7175294315336967 {'activation': 'relu', 'alpha': 0.03, 'hidden_layer_sizes': 3, 'learning_rate': 'adaptive'}
0.7168215173778517 {'activation': 'logistic', 'alpha': 0.03, 'hidden_layer_sizes': 30, 'learning

In [None]:
# Support Vector Machine
from sklearn.svm import SVC
svc_clf = SVC()

param_grid = [
    {
        "C": [0.03, 0.1, 0.3, 1.0],
        "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
        "probability": [True, False],
        "shrinking": [True, False],
    },
]
grid_search = GridSearchCV(svc_clf, param_grid, cv=3, scoring="f1", return_train_score=True, n_jobs=3)
grid_search.fit(x_train_prepared, y_train)
cvres = grid_search.cv_results_
for mean_score, params in sorted(zip(cvres['mean_test_score'], cvres['params']), key=lambda x: x[0], reverse=True):
    print(mean_score, params)