In [1]:
# Import for data cleaning and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Set data
titanic = pd.read_excel("data-Titanic.xlsx", index_col=0)
titanic

Unnamed: 0_level_0,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Checks for empty data
titanic.isnull().sum()

Survived      0
Pclass        0
Name          0
Gender        0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [4]:
# Removes all lines in the set that have null values for age
titanic = titanic.drop(titanic.index[titanic.Age.isnull()])

# Show that they are gone
titanic.isnull().sum()

Survived      0
Pclass        0
Name          0
Gender        0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       529
Embarked      2
dtype: int64

In [5]:
# Set up survived table for later use
titanic_df_survived = titanic['Survived']

# Removed rubbish data - 'abstraction'
titanic_df = titanic.drop(['Name', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Survived'], axis=1)
titanic_df

Unnamed: 0_level_0,Pclass,Gender,Age,SibSp,Parch
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3,male,22.0,1,0
2,1,female,38.0,1,0
3,3,female,26.0,0,0
4,1,female,35.0,1,0
5,3,male,35.0,0,0
...,...,...,...,...,...
886,3,female,39.0,0,5
887,2,male,27.0,0,0
888,1,female,19.0,0,0
890,1,male,26.0,0,0


In [6]:
# Make Male & Female = 1's & 0's

gender_id = {
    'male': 1,
    'female': 0
}

titanic_df['Gender'] = [gender_id[i] for i in titanic_df['Gender'].values]

In [7]:
# New data
titanic_df

Unnamed: 0_level_0,Pclass,Gender,Age,SibSp,Parch
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3,1,22.0,1,0
2,1,0,38.0,1,0
3,3,0,26.0,0,0
4,1,0,35.0,1,0
5,3,1,35.0,0,0
...,...,...,...,...,...
886,3,0,39.0,0,5
887,2,1,27.0,0,0
888,1,0,19.0,0,0
890,1,1,26.0,0,0


# ANN

In [8]:
# Import sklearn to split the data
from sklearn.model_selection import train_test_split

# Set data - x_train/x_test is the training data - y_train/y_test is the testing data - test_size splits up the data from 0-1
# closer to 0 being more training data, closer to 1 being more testing data
x_train, x_test, y_train, y_test = train_test_split(titanic_df, titanic_df_survived, test_size = 0.1)

In [9]:
# Get the ANN
from sklearn.neural_network import MLPClassifier

# Setup ANN
mlp = MLPClassifier(hidden_layer_sizes=(10), max_iter=15000)

# Set the data
mlp.fit(x_train, y_train)

MLPClassifier(hidden_layer_sizes=10, max_iter=15000)

In [10]:
# Import metrics to find the stats about the accuracy
from sklearn import metrics

# Calculate the results of the training data
y_pred = mlp.predict(x_train)
# Compares it with the results (y_train)
print('Training accuracy: \t', metrics.accuracy_score(y_train, y_pred))

Training accuracy: 	 0.8099688473520249


# Decision Tree

In [11]:
# Import Tree
from sklearn.tree import DecisionTreeClassifier

# Setup tree
t_model = DecisionTreeClassifier(max_depth = 4)
t_model.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=4)

In [12]:
# See accuracy of training data
y_predict = t_model.predict(x_train)
print('Training accuracy: \t', metrics.accuracy_score(y_train, y_predict))

Training accuracy: 	 0.8271028037383178


In [13]:
# See accuracy of testing data
y_pred = t_model.predict(x_test) # Testing on the input of the testing partition (new / out-of-sample data)
print('Testing accuracy: \t', metrics.accuracy_score(y_test, y_pred))

Testing accuracy: 	 0.8611111111111112
