# Titanic: Machine Learning from Disaster
In this project I will use various machine learning models such as
logistic regressions, k-neighbours and random forests to predict which
passengers survived the Titanic disaster.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

## Data Cleaning
The dataset includes the following variables:
- __Survived__ - passenger survived (0 = No, 1 = Yes)
- __Pclass__ - ticket class (1 = 1st, 2 = 2nd and 3 = 3rd)
- __Sex__ - Sex
- __Age__ - age in years
- __Sibsp__ - number of siblings / spouses on board
- __Parch__ - number of parents / children on board
- __Ticket__ - ticket number
- __Fare__ - passenger fare
- __Cabin__ - cabin number
- __Embarked__ - port of embarkation

In [3]:
# Determine variables that have missing data
print(train.isnull().sum())
print(test.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [4]:
# Fill missing values with -0.5 and categorise age variable
def pre_process_age(df):
    df['Age'] = df['Age'].fillna(-0.5)
    cut_points = [-1,0,12,18,35,60,100]
    label_names = ['Missing','Child','Teenager','Young Adult','Adult','Senior']
    df['Age_category'] = pd.cut(df['Age'],cut_points,labels=label_names)
    return df

In [5]:
# Fill missing fare value with its mean and fill missing embarked values with 'S' for Southampton
def pre_process_missing_fare_embarked(df):
    df['Fare'] = df['Fare'].fillna(train['Fare'].mean())
    df['Embarked'] = df['Embarked'].fillna('S')
    return df

In [6]:
# Categorise fares
def pre_process_fare(df):
    cut_points = [-1,12,50,100,1000]
    label_names = ["0-12","12-50","50-100","100+"]
    df["Fare_category"] = pd.cut(df["Fare"],cut_points,labels=label_names)
    return df

In [7]:
# The first letter indicates where the cabin was located. Strip first letter and fill missing values with 'Unknown'
def pre_process_cabin(df):
    df['Cabin_type'] = df['Cabin'].str[0]
    df['Cabin_type'] = df['Cabin_type'].fillna('Unknown')
    df = df.drop('Cabin',axis=1)
    return df

In [8]:
# The are a number of different titles which may be useful to indicate socio-economic status and potentially probability of survival
def pre_process_name(df):
    titles = {
        'Mr':        'Mr',
        'Mme':       'Mrs',
        'Ms':        'Mrs',
        'Mrs':       'Mrs',
        'Master':    'Master',
        'Mlle':      'Miss',
        'Miss':      'Miss',
        'Capt':      'Officer',
        'Col':       'Officer',
        'Major':     'Officer',
        'Dr':        'Officer',
        'Rev':       'Officer',
        'Jonkheer':  'Royalty',
        'Don':       'Royalty',
        'Sir':       'Royalty',
        'Countess':  'Royalty',
        'Dona':      'Royalty',
        'Lady':      'Royalty'
    }
    extracted_titles = df['Name'].str.extract(' ([A-Za-z]+)\.',expand=False)
    df['Title'] = extracted_titles.map(titles)
    return df

In [9]:
# Create dummy columns
def pre_process_create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [10]:
def pre_process_data(df):
    df = pre_process_age(df)
    df = pre_process_missing_fare_embarked(df)
    df = pre_process_fare(df)
    df = pre_process_cabin(df)
    df = pre_process_name(df)

    for col in ['Age_category','Fare_category','Title','Cabin_type','Sex']:
        df = pre_process_create_dummies(df,col)

    return df

train = pre_process_data(train)
test = pre_process_data(test)

### Data Exploration
For a detailed data analysis, please look at the Titanic_Visualisation file.

### Feature Selection

In [14]:
def select_features(df):
    # Remove non-numeric columns, columns that have null values
    df = df.select_dtypes([np.number]).dropna(axis=1)
    all_X = df.drop(["Survived","PassengerId"],axis=1)
    all_y = df["Survived"]

    clf = RandomForestClassifier()
    selector = RFECV(clf,cv=10)
    selector.fit(all_X,all_y)

    best_columns = list(all_X.columns[selector.support_])
    print("Best Columns \n"+"-"*12+"\n{}\n".format(best_columns))

    return best_columns

cols = select_features(train)

Best Columns 
------------
['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Age_category_Missing', 'Age_category_Child', 'Age_category_Teenager', 'Age_category_Young Adult', 'Age_category_Adult', 'Age_category_Senior', 'Fare_category_0-12', 'Fare_category_12-50', 'Fare_category_50-100', 'Fare_category_100+', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Cabin_type_A', 'Cabin_type_B', 'Cabin_type_C', 'Cabin_type_D', 'Cabin_type_E', 'Cabin_type_Unknown', 'Sex_female', 'Sex_male']



### Model Selection

In [17]:
def select_model(df,features):
    all_X = df[features]
    all_y = df["Survived"]

    # List of dictionaries, each containing a model name,
    # it's estimator and a dict of hyperparameters
    models = [
        {
            "name": "LogisticRegression",
            "estimator": LogisticRegression(),
            "hyperparameters":
                {
                    "solver": ["newton-cg", "lbfgs", "liblinear"]
                }
        },
        {
            "name": "KNeighborsClassifier",
            "estimator": KNeighborsClassifier(),
            "hyperparameters":
                {
                    "n_neighbors": range(1,50,2),
                    "weights": ["distance", "uniform"],
                    "algorithm": ["ball_tree", "kd_tree", "brute"],
                    "p": [1,2]
                }
        },
        {
            "name": "RandomForestClassifier",
            "estimator": RandomForestClassifier(),
            "hyperparameters":
                {
                    "n_estimators": [2,4,6,8,10,12],
                    "criterion": ["entropy", "gini"],
                    "max_depth": [2,5,10,15,20],
                    "max_features": ["log2", "sqrt"],
                    "min_samples_leaf": [1,2,4,6,8,10],
                    "min_samples_split": [1,2,4,6,8,10]

                }
        }
    ]

    for model in models:
        print(model['name'])
        print('-'*len(model['name']))

        grid = GridSearchCV(model["estimator"],
                            param_grid=model["hyperparameters"],
                            cv=10)
        grid.fit(all_X,all_y)
        model["best_params"] = grid.best_params_
        model["best_score"] = grid.best_score_
        model["best_model"] = grid.best_estimator_

        print("Best Score: {}".format(model["best_score"]))
        print("Best Parameters: {}\n".format(model["best_params"]))

    return models

result = select_model(train,cols)

LogisticRegression
------------------
Best Score: 0.8249313358302123
Best Parameters: {'solver': 'lbfgs'}

KNeighborsClassifier
--------------------
Best Score: 0.7755805243445691
Best Parameters: {'algorithm': 'ball_tree', 'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}

RandomForestClassifier
----------------------
Best Score: 0.8473657927590512
Best Parameters: {'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 10}



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### Export data to submission file

In [18]:
def save_submission_file(model,cols,filename="submission.csv"):
    test_data = test[cols]
    predictions = model.predict(test_data)

    test_ids = test["PassengerId"]
    submission_df = {"PassengerId": test_ids,
                 "Survived": predictions}
    submission = pd.DataFrame(submission_df)

    submission.to_csv(filename,index=False)

best_rf_model = result[2]["best_model"]
save_submission_file(best_rf_model,cols)
