## Titanic Challenge
#### Predicting the survival of Titanic's passengers based on data about the passengers.
#### Based on https://www.kaggle.com/startupsci/titanic-data-science-solutions

In [1]:
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
##Load  the train and test data into dataframes
train_dataframe = pd.read_csv('./Input/train2.csv')
test_dataframe = pd.read_csv('./Input/test2.csv')
combine = [train_dataframe, test_dataframe]
train_dataframe.shape, test_dataframe.shape

((891, 12), (418, 11))

In [3]:
print(train_dataframe.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [4]:
train_dataframe.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [5]:
##Check if there is a correlation between ticket class and survival
train_dataframe[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [6]:
##drop ticket and cabin as these data sets were incomplete
train_dataframe = train_dataframe.drop(['Ticket', 'Cabin'], axis=1)
test_dataframe = test_dataframe.drop(['Ticket', 'Cabin'], axis=1)
combine = [train_dataframe, test_dataframe]

In [7]:
##Extract the title from the name column, this may correlate to survival
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    
pd.crosstab(train_dataframe['Title'], train_dataframe['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [8]:
##Group the Titles into 5 categories, and check if these categories correlate to survuval
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',\
        'Rev', 'Sir', 'Jonkheer'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train_dataframe[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [9]:
##Since the Title categories do correlate to survival, give them ordinal 
##values from least to most likely to survive
title_mapping = {"Mr":1, "Rare":2, "Master":3, "Miss":4, "Mrs":5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

train_dataframe.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,5
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,4
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,5
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1


In [10]:
##Now that we have extracted all useful information from 'Name', drop it.
##Also drop passengerId from both datasets as is does not correlate to survival
train_dataframe = train_dataframe.drop(['Name', 'PassengerId'], axis=1)
test_dataframe = test_dataframe.drop(['Name', 'PassengerId'], axis=1)

combine = [train_dataframe, test_dataframe]
train_dataframe.shape, test_dataframe.shape

((891, 9), (418, 8))

In [11]:
##Sex is currently a string, so we will convert it to a more useful int. 0 is male, 1 is female
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'male':0, 'female':1} ).astype(int)

train_dataframe.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,S,1
1,1,1,1,38.0,1,0,71.2833,C,5
2,1,3,1,26.0,0,0,7.925,S,4
3,1,1,1,35.0,1,0,53.1,S,5
4,0,3,0,35.0,0,0,8.05,S,1


In [12]:
##Some age values are not present, so we will guess the missing values based on the median age in each Pclass
##and the sex of the passenger

guess_ages = np.zeros((2,3))   # Matrix that will contain these median values

for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            # For each sex and class, take the median value of the ages, and convert these to integers
            guess_dataframe = dataset[(dataset['Sex'] == i) & (dataset['Pclass'] == j+1)]['Age'].dropna()
            age_guess = guess_dataframe.median()
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
    
    for i in range(0, 2):
        for j in range(0, 3):
            # For each sex and class, locate the people that fall into this category and have no age listed
            # Set this persons age to the mean age for their sex and class
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1), \
                'Age'] = guess_ages[i, j]
    
    dataset['Age'] = dataset['Age'].astype(int)

train_dataframe.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22,1,0,7.25,S,1
1,1,1,1,38,1,0,71.2833,C,5
2,1,3,1,26,0,0,7.925,S,4
3,1,1,1,35,1,0,53.1,S,5
4,0,3,0,35,0,0,8.05,S,1


In [13]:
##Fill in any missing port values with the most common port
freq_port = train_dataframe.Embarked.dropna().mode()[0]

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
freq_port

'S'

In [14]:
##Convert the char port values to integers
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map({'S':0, 'C':1, 'Q':2}).astype(int)

In [15]:
##There are missing missing fares, so set missing fares equal to average fare for each Pclass
for dataset in combine:
    for i in range (1, 4):
        pClassAvg = dataset[dataset['Pclass'] == i]['Fare'].median()
        dataset.loc[ dataset.Fare.isnull() & (dataset.Pclass == i), 'Fare'] = pClassAvg 

In [16]:
test_dataframe.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,3,0,34,0,0,7.8292,2,1.0
1,3,1,47,1,0,7.0,0,5.0
2,2,0,62,0,0,9.6875,2,1.0
3,3,0,27,0,0,8.6625,0,1.0
4,3,1,22,1,1,12.2875,0,5.0


In [17]:
train_dataframe.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22,1,0,7.25,0,1
1,1,1,1,38,1,0,71.2833,1,5
2,1,3,1,26,0,0,7.925,0,4
3,1,1,1,35,1,0,53.1,0,5
4,0,3,0,35,0,0,8.05,0,1


In [18]:
##Get X and Y training values, and X testing values, to run models on
X_train = train_dataframe.drop("Survived", axis=1)
Y_train = train_dataframe["Survived"]
X_test = test_dataframe.copy()
X_train.shape, Y_train.shape, X_test.shape
#pd.set_option('max_rows', 500)
#X_test

((891, 8), (891,), (418, 8))

In [35]:
#Logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

logreg.score(X_train, Y_train)



0.8249158249158249

In [30]:
#Random Forest Model
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
#acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
#acc_random_forest

0.9820426487093153