# Titanic

### Load Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import random as rnd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

In [None]:
# Read both splits from the working directory and report their (rows, columns) shapes.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
print(train_df.shape, test_df.shape)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# List the column labels of the training frame.
train_df.columns.values

In [None]:
# Print the info() summary of each split, separated by a dashed rule.
train_df.info()
print('-'*40)
test_df.info()

In [None]:
# Summary statistics of the training frame.
train_df.describe()

### Visualization

In [None]:
# Mean survival rate per passenger class, highest first.
(
    train_df
    .groupby('Pclass', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)


In [None]:
# Mean survival rate per sex, highest first.
(
    train_df
    .groupby('Sex', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

In [None]:
# Mean survival rate per number of siblings/spouses aboard, highest first.
(
    train_df
    .groupby('SibSp', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

In [None]:
# Mean survival rate per number of parents/children aboard, highest first.
(
    train_df
    .groupby('Parch', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

In [None]:
# Age histogram, one panel per value of Survived.
g = (
    sns.FacetGrid(train_df, col='Survived')
    .map(plt.hist, 'Age', bins=20)
)
g

In [None]:
# Age histogram per passenger class, coloured by Survived.
g = (
    sns.FacetGrid(train_df, col='Pclass', hue='Survived')
    .map(plt.hist, 'Age', alpha=.5, bins=20)
    .add_legend()
)
g

In [None]:
# Passenger-class histogram per sex, coloured by Survived.
g = (
    sns.FacetGrid(train_df, col='Sex', hue='Survived')
    .map(plt.hist, 'Pclass', alpha=.5, bins=20)
    .add_legend()
)
g

In [None]:
# Remove the same three columns from both splits; they are not used from here on.
train_df = train_df.drop(columns=['PassengerId', 'Ticket', 'Cabin'])
test_df = test_df.drop(columns=['PassengerId', 'Ticket', 'Cabin'])

### Preprocess

In [None]:
label = LabelEncoder()

# Tunable thresholds for the engineered features.
RARE_TITLE_MIN_COUNT = 10  # titles seen fewer times than this in train become 'Misc'
N_FARE_BINS = 4            # quantile bins for Fare
N_AGE_BINS = 5             # equal-width bins for Age


def _extract_title(names):
    """Return the text between ', ' and the first '.' of each passenger name."""
    return names.str.split(", ", expand=True)[1].str.split(".", expand=True)[0]


def _open_ended(edges):
    """Copy bin edges with the outer two replaced by -inf/+inf so no value falls outside."""
    edges = np.asarray(edges, dtype=float).copy()
    edges[0], edges[-1] = -np.inf, np.inf
    return edges


# Everything that is "learned" (fill values, rare-title list, bin edges) comes from the
# training frame only and is then applied unchanged to both frames. Fitting these
# separately per frame makes the same integer code mean different things in train and test.
age_fill = train_df['Age'].median()
fare_fill = train_df['Fare'].median()
embarked_fill = train_df['Embarked'].mode()[0]

title_counts = _extract_title(train_df['Name']).value_counts()
common_titles = title_counts[title_counts >= RARE_TITLE_MIN_COUNT].index

fare_edges = _open_ended(
    pd.qcut(train_df['Fare'].fillna(fare_fill), N_FARE_BINS, retbins=True)[1]
)
age_edges = _open_ended(
    pd.cut(train_df['Age'].fillna(age_fill).astype(int), N_AGE_BINS, retbins=True)[1]
)

for df in [train_df, test_df]:
    # Complete: plain column assignment instead of chained `inplace=True`, which
    # operates on a temporary and is a no-op under pandas Copy-on-Write.
    df['Age'] = df['Age'].fillna(age_fill)
    df['Fare'] = df['Fare'].fillna(fare_fill)
    df['Embarked'] = df['Embarked'].fillna(embarked_fill)

    # Create
    df['FamilySize'] = 1 + df['SibSp'] + df['Parch']
    df['IsAlone'] = (df['FamilySize'] <= 1).astype(int)
    titles = _extract_title(df['Name'])
    df['Title'] = titles.where(titles.isin(common_titles), 'Misc')
    # labels=False yields the ordinal bin index directly, so no encoder is needed.
    df['FareBin'] = pd.cut(df['Fare'], bins=fare_edges, labels=False)
    df['AgeBin'] = pd.cut(df['Age'].astype(int), bins=age_edges, labels=False)

# Convert: one fit per column over the combined label vocabulary, so train and
# test share a single mapping and no test label is unknown to the encoder.
for col in ['Sex', 'Embarked', 'Title']:
    label.fit(pd.concat([train_df[col], test_df[col]]))
    for df in [train_df, test_df]:
        df[col] = label.transform(df[col])

In [None]:
def correlation_heatmap(df):
    """Draw an annotated Pearson-correlation heatmap for the numeric columns of ``df``.

    Non-numeric columns (for example free-text columns) are excluded before the
    correlation is computed; ``DataFrame.corr`` raises on them in pandas >= 2.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric/boolean columns are correlated pairwise.

    Returns
    -------
    None
        The figure is drawn on a newly created matplotlib Axes.
    """
    numeric_df = df.select_dtypes(include=['number', 'bool'])

    _, ax = plt.subplots(figsize=(14, 12))
    colormap = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(
        numeric_df.corr(),
        cmap=colormap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 12},
    )

    ax.set_title('Pearson Correlation of Features', y=1.05, size=15)

correlation_heatmap(train_df)

In [None]:
# Features: everything except the target and the raw name string.
y = train_df['Survived']
X = train_df.drop(columns=['Survived', 'Name'])

# Frame to predict on, with the raw name string removed.
X_pred = test_df.drop(columns=["Name"]).copy()

In [None]:
# Hold out 20% of the labelled rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2,
)

### Model Training + Evaluation

In [None]:
# Seed passed to every estimator that accepts `random_state`, so the comparison
# table is reproducible across kernel restarts.
SEED = 2

# Candidate classifiers, all with library defaults apart from the seed
# (and probability=True on the kernel SVMs).
MLA = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(random_state=SEED),
    ensemble.BaggingClassifier(random_state=SEED),
    ensemble.ExtraTreesClassifier(random_state=SEED),
    ensemble.GradientBoostingClassifier(random_state=SEED),
    ensemble.RandomForestClassifier(random_state=SEED),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(random_state=SEED),

    # GLM
    linear_model.LogisticRegressionCV(random_state=SEED),
    linear_model.PassiveAggressiveClassifier(random_state=SEED),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(random_state=SEED),
    linear_model.Perceptron(random_state=SEED),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    # SVM
    svm.SVC(probability=True, random_state=SEED),
    svm.NuSVC(probability=True, random_state=SEED),
    svm.LinearSVC(random_state=SEED),

    # Trees
    tree.DecisionTreeClassifier(random_state=SEED),
    tree.ExtraTreeClassifier(random_state=SEED),

    # Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]

In [None]:
# Fit every candidate on the training split, score it on both splits, and store
# its predictions for X_pred.

# One column of predictions per model, row-aligned with X_pred. This must be a
# fresh frame: aliasing `y` would write predictions into the training labels.
MLA_pred = pd.DataFrame(index=X_pred.index)

# Collect one record per model and build the table once, rather than growing a
# DataFrame cell-by-cell (which also leaves the score columns as object dtype).
score_rows = []
for alg in MLA:
    MLA_name = alg.__class__.__name__
    alg.fit(X_train, y_train)

    score_rows.append({
        'Model': MLA_name,
        'Train Accuracy': alg.score(X_train, y_train),
        'Test Accuracy': alg.score(X_test, y_test),
    })

    MLA_pred[MLA_name] = alg.predict(X_pred)

# Best held-out accuracy first; the index keeps each model's position in MLA.
MLA_compare = pd.DataFrame(
    score_rows, columns=['Model', 'Train Accuracy', 'Test Accuracy']
).sort_values(by=['Test Accuracy'], ascending=False)

In [None]:
# Per-model train/test accuracy table, sorted by test accuracy (descending).
MLA_compare

In [None]:
# Display the prediction container populated in the training loop.
MLA_pred

In [None]:
'''
Inspiration
1. https://www.kaggle.com/startupsci/titanic-data-science-solutions
2. https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy
'''