# Titanic

Following the Kaggle kernel https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy/notebook

## Configure environment

In [None]:
'''
Load Packages
'''

# Standard library
import sys       # interpreter details, used for the version report below
import random
import time
import warnings

# Third-party scientific stack
import pandas as pd        # data processing
import matplotlib          # visualization
import numpy as np         # scientific computing
import scipy as sp         # scientific computing and advanced mathematics
import IPython.display
from IPython import display  # rich output in the Jupyter notebook
import sklearn             # machine learning algorithms

# One (template, version) pair per package; printed in this order so the
# environment used for a run is recorded at the top of the notebook.
version_report = (
    ('Python version: {}', sys.version),
    ('pandas version: {}', pd.__version__),
    ('matplotlib version: {}', matplotlib.__version__),
    ('NumPy version: {}', np.__version__),
    ('SciPy version: {}', sp.__version__),
    ('IPython version: {}', IPython.__version__),
    ('scikit-learn version: {}', sklearn.__version__),
)
for pkg_template, pkg_version in version_report:
    print(pkg_template.format(pkg_version))

# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')
#print('-'*25)

# Input datafiles
#from subprocess import check_output
#print(check_output(['ls','/data']).decode('utf8'))

In [None]:
'''
Load data modelling libraries
'''
# Model algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
#from xgboost import XGBClassifier # only works on osx and linux

# Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure visualization defaults
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

## Load and investigate data

In [None]:
'''
First look at the data
'''

# data for training and testing
data_raw = pd.read_csv('data/train.csv')

# final validation set for competition submission
data_val = pd.read_csv('data/test.csv')

# work on a deep copy so data_raw stays untouched by the cleaning steps
data1 = data_raw.copy(deep=True)

# The list holds references (not copies) to both frames, so a loop over it
# applies the same cleaning operation to data1 and data_val in place.
data_cleaner = [data1, data_val]

# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() only appended a stray "None" line after each report.
data_raw.info()
data_val.info()
#data_raw.head()
#data_raw.tail()
#data_raw.sample(10)

In [None]:
# Per-column count of missing values, for the training copy and then for the
# validation set, each followed by a separator line.
null_reports = (
    ('Train columns with null values:\n', data1),
    ('Validation columns with null values:\n', data_val),
)
for report_heading, report_frame in null_reports:
    print(report_heading, report_frame.isnull().sum())
    print('-'*25)

# Last expression of the cell: summary statistics over every column of the
# raw training data (numeric and non-numeric alike).
data_raw.describe(include='all')

## Data cleaning: Correcting, Completing, Creating and Converting

In [None]:
# Correcting: nothing to do (no mistakes or extreme outliers)

# Completing
# Each fill is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection dataset[col]: the in-place form
# operates on an intermediate object and is not guaranteed to update the
# frame (it warns on recent pandas and is a no-op under Copy-on-Write).
for dataset in data_cleaner: # apply to both training and validation set
    # complete missing age with that dataset's median
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # complete embark with that dataset's mode (first one if there are ties)
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    # complete missing fare with that dataset's median
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

# delete the features to exclude in training dataset
# (only data1 is changed here; data_val keeps these columns)
drop_column = ['PassengerId','Cabin','Ticket']
data1.drop(drop_column,axis=1,inplace=True)



In [None]:
# Sanity check after completing: remaining null counts per column for the
# training copy, a separator line, then the same for the validation set.
print(data1.isnull().sum(), '-'*25, data_val.isnull().sum(), sep='\n')
# Yes

In [None]:
# Create: feature engineering from train and validation dataset
for dataset in data_cleaner:
    # Discrete variables

    # Size of families: siblings/spouses + parents/children + the passenger
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Travelling alone: start at 1 and clear the flag for families larger
    # than one. A single .loc[rows, column] assignment is used because the
    # chained form dataset['IsAlone'].loc[...] = 0 may write to a temporary
    # object and leave the frame unchanged.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # Extract title from name: the text between ", " and the first "."
    dataset['Title'] = dataset['Name'].str.split(", ",expand=True)[1].str.split(".",expand=True)[0]

    # Continuous variables

    # Bin Fares in quantiles (4 bins)
    dataset['FareBin'] = pd.qcut(dataset['Fare'],4)

    # Bin Ages (truncated to int) in 5 bins of equal width
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int),5)

# clean rare title names: titles occurring fewer than stat_min times are
# replaced by 'Misc'. The counts are taken per dataset.
# NOTE(review): a title could therefore be kept in one dataset and merged in
# the other - confirm this is acceptable.
stat_min = 10
title_names_data1 = (data1['Title'].value_counts() < stat_min)
data1['Title'] = data1['Title'].apply(lambda x: 'Misc' if title_names_data1.loc[x] else x)

title_names_data_val = (data_val['Title'].value_counts() < stat_min)
data_val['Title'] = data_val['Title'].apply(lambda x: 'Misc' if title_names_data_val.loc[x] else x)

# Preview data
#print(data1['Title'].value_counts())
#print(data_val['Title'].value_counts())
#data1.info()
#data_val.info()

In [None]:
# Converting: derive integer codes for the categorical columns with
# LabelEncoder and assemble the feature-name lists.

# (source column, encoded column) pairs, processed in this order. A fresh
# encoder is fitted for every column of every dataset.
encoded_columns = (
    ('Sex', 'Sex_code'),
    ('Embarked', 'Embarked_code'),
    ('Title', 'Title_code'),
    ('AgeBin', 'AgeBin_code'),
    ('FareBin', 'FareBin_code'),
)
for dataset in data_cleaner:
    for raw_name, code_name in encoded_columns:
        dataset[code_name] = LabelEncoder().fit_transform(dataset[raw_name])

# Outcome variable (y)
Target = ['Survived']

# Feature names: original (un-encoded) columns
data1_x = [
    'Sex', 'Pclass', 'Embarked', 'Title', 'SibSp',
    'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone',
]

# Feature names: label-encoded columns plus the continuous Age and Fare
data1_x_calc = [
    'Sex_code', 'Pclass', 'Embarked_code', 'Title_code', 'SibSp',
    'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone',
]

# Feature names: label-encoded columns with Age and Fare replaced by bins
data1_x_bin = [
    'Sex_code', 'Pclass', 'Embarked_code', 'Title_code',
    'FamilySize', 'AgeBin_code', 'FareBin_code',
]

# Target followed by the features, for the original and the binned variants
data1_xy = Target + data1_x
data1_xy_bin = Target + data1_x_bin

print('Original X Y: ', data1_xy, '\n')
print('Bin X Y: ', data1_xy_bin, '\n')

# Dummy (one-hot) variant built from the original feature columns
data1_dummy = pd.get_dummies(data1[data1_x])
data1_x_dummy = list(data1_dummy.columns)
data1_xy_dummy = Target + data1_x_dummy

print('Dummy X Y: ', data1_xy_dummy, '\n')

In [None]:
# Re-inspect both datasets after cleaning and feature creation.
# DataFrame.info() writes its report itself and returns None, so it is
# called directly; print(df.info()) appended a stray "None" line.
print('Train columns with null values: \n', data1.isnull().sum())
print('-'*25)
data1.info()
print('-'*25)

print('Validation columns with null values: \n', data_val.isnull().sum())
print('-'*25)
data_val.info()
print('-'*25)

# Last expression of the cell: summary statistics of the untouched raw data
data_raw.describe(include = 'all')

# Split training and testing data