In [76]:
# Import Modules

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit

Load the relevant data (test, train and gender_submission)

In [77]:
# Read the three competition files: the labelled training data, the unlabelled
# data the model will be applied to, and the sample submission file.
df_train, df_apply_model, df_gender_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/titanic/train.csv',
        '../input/titanic/test.csv',
        '../input/titanic/gender_submission.csv',
    )
)

View dataframes

In [78]:
# Preview the first rows of the training data.
df_train.head()

In [79]:
# Preview the first rows of the data the model will be applied to.
df_apply_model.head()

In [80]:
# Preview the first rows of the sample submission file.
df_gender_submission.head()

Perform data exploration on train dataset

In [81]:
# Count the missing values in each column of the training data.
df_train.isna().sum()

In [82]:
# (rows, columns) of the training data.
df_train.shape

Perform data exploration on test dataset

In [83]:
# Count the missing values in each column of the test data.
df_apply_model.isna().sum()

In [84]:
# (rows, columns) of the test data.
df_apply_model.shape

Fill empty ['Age'] values with median age

In [85]:
# Replace missing ages in the training data with the median training age.
df_train = df_train.assign(
    Age=df_train['Age'].fillna(df_train['Age'].median())
)

In [86]:
# Impute missing test ages with the *training* median, so the data being scored
# is preprocessed with the same statistic the model is trained on rather than
# one derived from the test set itself. (Filling df_train['Age'] with its own
# median beforehand does not change that median.)
df_apply_model['Age'] = df_apply_model['Age'].fillna(df_train['Age'].median())

Columns treated as unimportant: PassengerId, Name, SibSp, Parch, Ticket, Cabin, Embarked.
These columns are dropped from the training dataset.
The same columns are dropped from the test dataset; 'Survived' is kept in the training data because it is the prediction target.

In [87]:
# Remove the columns that are not used as model features from the training data.
df_train = df_train.drop(
    columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
)

In [88]:
# Remove the same non-feature columns from the test data.
df_apply_model = df_apply_model.drop(
    columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
)

Handle the remaining missing values in the test dataset

In [89]:
# Impute rather than drop. If any row were removed here, there would be fewer
# predictions than PassengerIds in the submission template, and the later
# column-wise concat would pair predictions with the wrong passengers and
# leave a NaN at the end. Remaining numeric gaps are filled with the
# training-set medians; median entries for columns that are absent from
# df_apply_model (e.g. 'Survived') are ignored by fillna.
df_apply_model = df_apply_model.fillna(df_train.median(numeric_only=True))

Convert Sex to numeric via dummies

In [90]:
# Encode 'Sex' as indicator columns (the first category is dropped) and swap
# the original text column for the encoded one(s).
train_dummies = pd.get_dummies(df_train['Sex'], drop_first=True)
df_train = pd.concat(
    [df_train, train_dummies],
    axis=1,
).drop(columns=['Sex'])

In [91]:
# Apply the same 'Sex' encoding to the test data: indicator columns with the
# first category dropped, replacing the original text column.
test_dummies = pd.get_dummies(df_apply_model['Sex'], drop_first=True)
df_apply_model = pd.concat(
    [df_apply_model, test_dummies],
    axis=1,
).drop(columns=['Sex'])

Separate the features (x) from the target (y), then split the training data into train and hold-out sets

In [92]:
# Target is the 'Survived' column; features are every other remaining column.
y = df_train['Survived']
x = df_train.drop(columns=['Survived'])

# Hold out 20% of the labelled rows for evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)

Run a GridSearchCV comparison of LinearRegression, RandomForest, LogisticRegression, and DecisionTree

In [93]:
def best_model(x, y, random_state=10):
    """Grid-search several estimators and tabulate each one's best result.

    Every candidate is tuned with GridSearchCV over a 5-split ShuffleSplit
    (20% held out per split) using the estimator's default scorer.

    Parameters
    ----------
    x : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    random_state : int, default 10
        Seed for the cross-validation splitter and for the estimators that use
        randomness, so repeated calls give the same table.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with columns ``model``, ``best_score`` (mean
        cross-validated score of the best parameter set) and ``best_params``.
    """
    algos = {
        'linear regression': {
            # NOTE(review): LinearRegression is a regressor, so its default
            # score is R^2 rather than accuracy; this row is not directly
            # comparable with the classifier rows below.
            'model': LinearRegression(),
            'params': {}
        },
        'logistic regression': {
            # The default 'lbfgs' solver rejects 'l1' and 'elasticnet', and
            # the 'none' spelling is not accepted by newer scikit-learn
            # releases, so most of the original grid failed to fit.
            # 'liblinear' supports both penalties searched here.
            'model': LogisticRegression(
                solver='liblinear', max_iter=1000, random_state=random_state
            ),
            'params': {
                'penalty': ['l1', 'l2']
            }
        },
        'random forest': {
            'model': RandomForestClassifier(random_state=random_state),
            'params': {
                'n_estimators': [1, 5, 10, 20, 50],
                'criterion': ['gini', 'entropy']
            }
        },
        'decision tree': {
            'model': DecisionTreeClassifier(random_state=random_state),
            'params': {
                'criterion': ['gini', 'entropy'],
                'splitter': ['best', 'random']
            }
        }
    }

    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)

    scores = []
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(x, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
    

In [94]:
# Run the model comparison on the full labelled feature set and display the table.
best_model(x,y)

In the recorded run, Random Forest with criterion='entropy' and n_estimators=50 gave a mean cross-validation score of 81.79%.

Create Model


In [95]:
# Fit the chosen random forest on the training split and report its accuracy
# on the held-out split. The seed makes the fitted forest, and therefore the
# score and the submission predictions, reproducible between runs.
clf = RandomForestClassifier(criterion='entropy', n_estimators=50, random_state=10)
clf.fit(x_train, y_train.values)
clf.score(x_test, y_test)

Apply model to df_apply_model

In [96]:
# Predict on a DataFrame whose columns are in the same order the model was
# fitted with (passing .values would discard the feature names), and keep the
# test frame's index so each prediction stays attached to its original row
# when it is later joined to the PassengerId column.
prediction_values = pd.DataFrame(
    {'Survived': clf.predict(df_apply_model[x_train.columns])},
    index=df_apply_model.index,
)

In [97]:
# Display the predicted 'Survived' values.
prediction_values

In [98]:
# Build the submission from the template's PassengerId column plus the model's
# predictions. Selecting the column by name (instead of dropping 'Survived')
# keeps this cell re-runnable, and omitting ignore_index=True preserves the
# 'PassengerId' / 'Survived' headers instead of replacing them with 0 and 1.
df_gender_submission = pd.concat(
    [df_gender_submission[['PassengerId']], prediction_values],
    axis=1,
)

In [107]:
# rename() returns a new DataFrame, so the result must be assigned back;
# otherwise the positional 0/1 headers would still be written to the CSV.
# Labels that are not present are ignored, so this is a no-op if the columns
# already carry these names.
df_gender_submission = df_gender_submission.rename(
    columns={0: 'PassengerId', 1: 'Survived'}
)
df_gender_submission.head()

In [None]:
# Write only the data columns: without index=False pandas would add the row
# index as an extra unnamed first column in the submission file.
df_gender_submission.to_csv('./TitanicSubmission.csv', index=False)