## Setup

Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
# Add the parent of the current working directory to the module search path
# so that future_encoders.py (imported later in this notebook) can be found.
import sys
sys.path.append(os.path.join(os.getcwd(), '../'))

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seed NumPy's global random number generator so a fresh top-to-bottom run
# starts from the same random state every time.
np.random.seed(42)

## Read Data

In [2]:
# Dataset locations, resolved relative to the directory the notebook runs in.
DATASETS_PATH = os.path.join(os.getcwd(), '../', 'datasets')
TITANIC_PATH = os.path.join(DATASETS_PATH, 'titanic')


def load_titanic_data(filename):
    """Read one CSV file from the Titanic dataset directory.

    Parameters
    ----------
    filename : str
        File name that is joined onto ``TITANIC_PATH`` (e.g. ``'train.csv'``).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(TITANIC_PATH, filename))

# Read both CSV files through the shared loader, then preview the training rows.
train_raw, test_raw = (
    load_titanic_data(csv_name) for csv_name in ('train.csv', 'test.csv')
)

train_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Prepare Data

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) of the raw training data.
train_raw.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Drop columns with more than 50% missing values. The `get_missing_values_table` function is adapted from https://towardsdatascience.com/a-complete-machine-learning-walk-through-in-python-part-one-c62152f39420

In [4]:
def get_missing_values_table(df):
    """Summarise the missing values of each column of ``df``.

    Prints a two-line summary (how many columns ``df`` has and how many of
    them contain missing values) and returns the details as a table.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns ``'Missing Values'``
        (count) and ``'% of Total Values'`` (percentage of rows, rounded to
        one decimal). Rows are sorted by percentage, largest first.
    """
    # Count the missing entries once and reuse the result below.
    mis_val = df.isnull().sum()

    # Percentage of rows that are missing, per column. For a frame with no
    # rows this is 0 / 0 = NaN, which is why the filter below uses the counts.
    mis_val_percent = 100 * mis_val / len(df)

    # Two unnamed Series concatenated column-wise get the labels 0 and 1.
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only the columns that really have missing values. Filtering on the
    # count (rather than on "percentage != 0") stops NaN percentages from
    # being mistaken for missing data when ``df`` has no rows.
    has_missing = mis_val_table_ren_columns['Missing Values'] > 0
    mis_val_table_ren_columns = mis_val_table_ren_columns[has_missing].sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    return mis_val_table_ren_columns

# Build the missing-value summary for the raw training data and display it.
missing_values_table = get_missing_values_table(train_raw)
missing_values_table

Your selected dataframe has 12 columns.
There are 3 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2


In [5]:
# Names of the columns whose share of missing values is above 50 percent.
mostly_missing = missing_values_table["% of Total Values"] > 50
cols_with_over_50_missing = missing_values_table.loc[mostly_missing].index

# New frame without those columns; train_raw itself is left untouched.
train_useful = train_raw.drop(list(cols_with_over_50_missing), axis=1)
train_useful.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


Here we split out the features and target

In [6]:
# Features: every remaining column except the label.
X_train = train_useful.drop('Survived', axis=1)
# Target: the 'Survived' column, copied so it is independent of train_useful.
y_train = train_useful['Survived'].copy()

## Create Pipeline

The Name, PassengerId and Ticket columns are skipped because they contain too many distinct values to be useful as categorical features.

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, Imputer
from future_encoders import OneHotEncoder

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a fixed set of columns out of a DataFrame.

    Parameters
    ----------
    attributes : column label or list of column labels
        Passed unchanged to ``X[...]`` in :meth:`transform`.
    """

    def __init__(self, attributes):
        self.attributes = attributes

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.attributes]``."""
        selected = X[self.attributes]
        return selected
    
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most frequent value."""

    def fit(self, X, y=None):
        """Record, per column of ``X``, the first entry of ``value_counts()``.

        The values are stored in ``most_frequent_`` as a Series indexed by
        the column names of ``X``.
        """
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            # value_counts() lists the most common value first.
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values filled from ``most_frequent_``."""
        return X.fillna(self.most_frequent_)

# Columns treated as categories and columns treated as numbers.
cat_cols = ['Sex', 'Embarked', 'Pclass']
num_cols = ['Age', 'SibSp', 'Parch', 'Fare']

# Numeric branch: select the columns, fill gaps with the median, standardise.
num_pipeline = Pipeline(steps=[
    ('num_selector', DataFrameSelector(attributes=num_cols)),
    ('median_imputer', Imputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the columns, fill gaps with the most frequent
# value, then one-hot encode into a dense array.
cat_pipeline = Pipeline(steps=[
    ('cat_selector', DataFrameSelector(attributes=cat_cols)),
    ('most_frequent_imputer', MostFrequentImputer()),
    ('one_hot_encoder', OneHotEncoder(sparse=False)),
])

# Run both branches on the same input and join their outputs.
data_preparation_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

X_train_prep = data_preparation_pipeline.fit_transform(X_train)

print("X_train_prep shape: ", X_train_prep.shape)
print("X_train_prep single row: ", X_train_prep[0])

X_train_prep shape:  (891, 12)
X_train_prep single row:  [-0.56573646  0.43279337 -0.47367361 -0.50244517  0.          1.
  0.          0.          1.          0.          0.          1.        ]


## Model Evaluation

Import metric functions for evaluation. For models, let's try simple models that are well suited to classification: K-neighbors Classifier and Random Forest (SVM is another candidate, but it is not evaluated below).

In [8]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

def show_top_k_cv_params(cv_model, k):
    """Print the ``k`` best candidates of a fitted search, best first.

    Parameters
    ----------
    cv_model : object with a ``cv_results_`` mapping
        Must provide the keys ``'mean_test_score'`` and ``'params'``.
    k : int
        Number of ``(mean_test_score, params)`` tuples to print.
    """
    results = cv_model.cv_results_
    ranked = sorted(
        zip(results['mean_test_score'], results['params']),
        key=lambda pair: pair[0],
        reverse=True,
    )
    for entry in ranked[:k]:
        print(entry)

def show_best_score_results(cv_model, X_features, y_label):
    """Print classification metrics for the best estimator of a fitted search.

    The f1, precision, recall and accuracy figures compare ``y_label`` with
    predictions made on the very same ``X_features``; when that is the data
    the search was fitted on (as in this notebook), they are training-set
    scores. The last line is the mean of ``cross_val_score`` with ``cv=10``.

    Parameters
    ----------
    cv_model : fitted search object exposing ``best_estimator_``
    X_features : feature matrix passed to ``predict`` and ``cross_val_score``
    y_label : true labels matching ``X_features``
    """
    best_model = cv_model.best_estimator_
    y_predict = best_model.predict(X_features)
    cv_scores = cross_val_score(best_model, X_features, y_label, cv=10)

    labelled_metrics = (
        ("f1 score: ", f1_score),
        ("precision score: ", precision_score),
        ("recall score: ", recall_score),
        ("accuracy score: ", accuracy_score),
    )
    for label, metric in labelled_metrics:
        print(label, metric(y_label, y_predict))
    print("cross validation score: ", cv_scores.mean())

### K-neighbors

In [9]:
# Candidate hyper-parameter values for the K-neighbors search
# (2 * 5 * 7 = 70 possible combinations).
# NOTE(review): leaf_size is documented as a speed/memory setting of the tree
# algorithms — confirm it is worth searching over.
kn_params = {
    'weights': ["uniform", "distance"],
    'n_neighbors': [3, 4, 5, 6, 7],
    'leaf_size': [3, 10, 30, 100, 300, 1000, 3000]
}
# Randomly sample 50 of those combinations and score each one by 3-fold
# cross-validated accuracy.
# NOTE(review): no random_state is passed, so which candidates are sampled
# presumably depends on the global NumPy RNG seeded in the setup cell — confirm.
kn_cls = RandomizedSearchCV(
    KNeighborsClassifier(),
    param_distributions=kn_params,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
kn_cls.fit(X_train_prep, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    1.0s finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid=True, n_iter=50, n_jobs=-1,
          param_distributions={'n_neighbors': [3, 4, 5, 6, 7], 'weights': ['uniform', 'distance'], 'leaf_size': [3, 10, 30, 100, 300, 1000, 3000]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=1)

#### K-neighbors results

In [10]:
# Five best K-neighbors candidates, then the metrics of the best estimator.
# The metrics are computed on the training data the search was fitted on.
show_top_k_cv_params(kn_cls, 5)
show_best_score_results(kn_cls, X_train_prep, y_train)

(0.8013468013468014, {'n_neighbors': 6, 'weights': 'uniform', 'leaf_size': 10})
(0.8013468013468014, {'n_neighbors': 6, 'weights': 'uniform', 'leaf_size': 100})
(0.8013468013468014, {'n_neighbors': 6, 'weights': 'uniform', 'leaf_size': 30})
(0.8002244668911336, {'n_neighbors': 6, 'weights': 'uniform', 'leaf_size': 300})
(0.8002244668911336, {'n_neighbors': 6, 'weights': 'uniform', 'leaf_size': 1000})
f1 score:  0.7845659163987139
precision score:  0.8714285714285714
recall score:  0.7134502923976608
accuracy score:  0.8496071829405163
cross validation score:  0.809170355237771


### Random Forest

In [11]:
# Candidate hyper-parameter values for the Random Forest search
# (2 * 11 * 2 * 3 * 3 * 10 = 3960 possible combinations).
# NOTE(review): for RandomForestClassifier 'auto' is documented as
# sqrt(n_features), so the two max_features options look redundant — confirm.
rf_params = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}
# Randomly sample 100 of those combinations and score each one by 3-fold
# cross-validated accuracy. This is the slow cell of the notebook: the
# recorded run below took about 7 minutes.
# NOTE(review): the forest itself is seeded (random_state=42) but the search
# is not, so the sampled candidates presumably depend on the global NumPy RNG
# seeded in the setup cell — confirm.
rf_cls = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=rf_params,
    n_iter=100,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
rf_cls.fit(X_train_prep, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   53.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  7.1min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'bootstrap': [True, False], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'min_samples_leaf': [1, 2, 4], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'max_features': ['auto', 'sqrt']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=1)

#### Random Forest results

In [12]:
# Five best Random Forest candidates, then the metrics of the best estimator.
# The metrics are computed on the training data the search was fitted on.
show_top_k_cv_params(rf_cls, 5)
show_best_score_results(rf_cls, X_train_prep, y_train)

(0.8282828282828283, {'bootstrap': True, 'min_samples_split': 10, 'n_estimators': 1800, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 90})
(0.8282828282828283, {'bootstrap': True, 'min_samples_split': 10, 'n_estimators': 1400, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20})
(0.8282828282828283, {'bootstrap': True, 'min_samples_split': 10, 'n_estimators': 1800, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20})
(0.8282828282828283, {'bootstrap': True, 'min_samples_split': 10, 'n_estimators': 1400, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 50})
(0.8282828282828283, {'bootstrap': True, 'min_samples_split': 10, 'n_estimators': 1200, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 100})
f1 score:  0.8713178294573645
precision score:  0.9273927392739274
recall score:  0.8216374269005848
accuracy score:  0.9068462401795735
cross validation score:  0.8283855408012712


## Evaluate on Test Set

We have already found the best model with tuned hyperparameters, `rf_cls`, so let's output our predictions!

In [13]:
# Reuse the pipeline fitted on the training data (transform only, no refit)
# so the test features go through the same imputation, scaling and encoding.
X_test_prep = data_preparation_pipeline.transform(test_raw)
y_test_pred = rf_cls.best_estimator_.predict(X_test_prep)
# pd.concat(axis=1) aligns rows by index label; both pieces carry the default
# 0..n-1 index here (test_raw comes from read_csv without an index column).
result_array = [test_raw['PassengerId'], pd.Series(y_test_pred, name='Survived')]
result_df = pd.concat(result_array, axis=1)
# Write the PassengerId/Survived pairs without the index column.
result_df.to_csv('result.csv', index=False)