# Kaggle | Titanic: Machine Learning from Disaster
### My first Kaggle Problem, I will be using GraphLab Create to solve this one...

### Getting the required modules and packages

In [4]:
import graphlab
# Send GraphLab Canvas output (the .show() calls below) to the notebook itself.
graphlab.canvas.set_target('ipynb')

import numpy as np

%matplotlib inline
import matplotlib.pyplot as mp
# Default size, in inches, for every matplotlib figure created in this notebook.
mp.rcParams['figure.figsize'] = (15.0, 8.0)


In [11]:
# Declare the dtypes of the numeric columns up front instead of inlining
# the mapping in the read_csv call.
train_column_types = {
    'Survived': int,
    'Pclass': int,
    'Age': float,
    'SibSp': int,
    'Parch': int,
    'Fare': float,
}

# Load the labelled Kaggle training set.
titanic_data = graphlab.SFrame.read_csv('train.csv',
                                        column_type_hints=train_column_types)

In [12]:
# Preview the first rows of the training data to sanity-check the load.
titanic_data.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs ...",female,38.0,1,0,PC 17599
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel) ...",female,35.0,1,0,113803
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450
6,0,3,"Moran, Mr. James",male,,0,0,330877
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463
8,0,3,"Palsson, Master. Gosta Leonard ...",male,2.0,3,1,349909
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina ...",female,27.0,0,2,347742
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem) ...",female,14.0,1,0,237736

Fare,Cabin,Embarked
7.25,,S
71.2833,C85,C
7.925,,S
53.1,C123,S
8.05,,S
8.4583,,Q
51.8625,E46,S
21.075,,S
11.1333,,S
30.0708,,C


In [13]:
# Display the Canvas view of the training frame (rendered inline because the
# Canvas target was set to 'ipynb' in the setup cell).
titanic_data.show()

As we can see, the Age column has 177 null values (<code>num_undefined</code>), i.e. almost 20% of its values are missing. This proportion is likely small enough for reasonable replacement with some form of imputation, so in the next cell we fill the missing ages with the mean age. Once the gaps are filled, let's see what can be learned from the data we have: putting some simple data visualization tools to work can take us a long way toward understanding what might influence the outcome we're trying to predict, i.e. whether or not a passenger survived.

In [14]:
# Fill the undefined Age entries with the column mean, then re-display the
# frame to confirm the result.
mean_age = titanic_data["Age"].mean()
titanic_data = titanic_data.fillna("Age", mean_age)
titanic_data.show()

Let's first print the distribution of the passenger class variable. Then, we further investigate how the proportion of survival changes for each class.

We use the show method offered by the SArray structure, i.e. the Pclass column, in order to see the number of passengers for each class and plot a simple histogram.

In [15]:
# Cast Pclass to str before plotting — presumably so show() treats the three
# classes as categories rather than as a numeric range (TODO confirm).
pclass = titanic_data["Pclass"].astype(str)
pclass.show()

Now we know that the majority of the passengers were from the 3rd class.

Let's investigate the proportion of survivors in each class. In this case we use the groupby aggregator to aggregate passengers by their class (Pclass) and their fate (Survived) and to count the number of passengers in each group. Finally, we use the filter_by method to get the number of passengers who survived and the number who died in each class, and plot a stacked bar plot. The percentage over each bar represents the percentage of survivors in the corresponding passenger class.

In [17]:
# Boolean feature: True when SibSp + Parch together exceed 3.
# The parentheses only spell out the precedence Python already applies.
titanic_data['family'] = (titanic_data['SibSp'] + titanic_data['Parch']) > 3

In [18]:
# Boolean feature: True where Age is below 15.
titanic_data['Child'] = titanic_data['Age'] < 15

In [20]:
import re
# Honorifics recognised in a passenger name. The leading \b stops an
# abbreviation from matching when it is merely the tail of a longer word.
_TITLE_RE = re.compile(
    r"\b(Dr|Mrs?|Ms|Miss|Master|Rev|Capt|Mlle|Col|Major|Sir|Jonkheer|Lady"
    r"|the Countess|Mme|Dona?)\."
)

# Rare titles folded into a more common equivalent so the model sees fewer,
# better-populated categories.
_TITLE_ALIASES = {
    'Don.': 'Sir.',
    'Major.': 'Sir.',
    'Capt.': 'Sir.',
    'Mlle.': 'Miss.',
    'Mme.': 'Miss.',
    'Dona.': 'Lady.',  # feminine form of "Don."
}


def extTitle(name):
    """Extract a normalised honorific (including the trailing dot) from a name.

    Parameters
    ----------
    name : str or None
        Passenger name, e.g. "Braund, Mr. Owen Harris".

    Returns
    -------
    str
        The first recognised title in ``name`` (e.g. "Mr.", "Miss."), with
        rare titles mapped through ``_TITLE_ALIASES``. Returns "Other" when
        ``name`` is missing/empty or contains no recognised title.
    """
    # A missing or empty name cannot carry a title; avoid handing None to re.
    if not name:
        return "Other"

    match = _TITLE_RE.search(name)
    if match is None:
        return "Other"

    title = match.group(0)
    return _TITLE_ALIASES.get(title, title)

# Derive the Title feature from each passenger's name using extTitle.
titanic_data['Title'] = titanic_data['Name'].apply(extTitle)
# Show the distribution of the extracted titles.
titeDat = titanic_data['Title'].astype(str)
titeDat.show()

In [22]:
# FeatureBinner is presumably brought into scope by this wildcard import.
from graphlab.toolkits.feature_engineering import *

# Describe the transformation first: split Fare into 5 quantile-based bins.
fare_binner_spec = FeatureBinner(features=['Fare'],
                                 strategy='quantile',
                                 num_bins=5)

# Build the transformer on the training data and fit it there.
binner = graphlab.feature_engineering.create(titanic_data, fare_binner_spec)
fit_binner = binner.fit(titanic_data)

# Apply the fitted binner and inspect the resulting Fare column.
passengers_binned = fit_binner.transform(titanic_data)
passengers_binned["Fare"].show()

In [23]:
# Columns passed to the classifiers as input features.
mah_feat = [
    "Pclass",
    "Sex",
    "Age",
    "family",
    "Child",
    "Fare",
    "Title",
]

In [24]:
# Hold out roughly 20% of the rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
train_data, test_data = passengers_binned.random_split(0.8, seed=0)

# Fit on the training split only. Fitting on the full passengers_binned frame
# would let the model see every row of test_data, making the validation and
# evaluation numbers optimistic.
model = graphlab.logistic_classifier.create(train_data,
                                            target="Survived",
                                            features=mah_feat,
                                            validation_set=test_data)


In [25]:
# Compute the ROC-curve points for `model` on the test_data split.
# NOTE(review): this is only an honest estimate if `model` was not fit on
# rows that are also in test_data — check the training cell.
model.evaluate(test_data,metric='roc_curve')

{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+-----+-----+----+-----+
 | threshold | fpr | tpr | p  |  n  |
 +-----------+-----+-----+----+-----+
 |    0.0    | 1.0 | 1.0 | 75 | 113 |
 |   1e-05   | 1.0 | 1.0 | 75 | 113 |
 |   2e-05   | 1.0 | 1.0 | 75 | 113 |
 |   3e-05   | 1.0 | 1.0 | 75 | 113 |
 |   4e-05   | 1.0 | 1.0 | 75 | 113 |
 |   5e-05   | 1.0 | 1.0 | 75 | 113 |
 |   6e-05   | 1.0 | 1.0 | 75 | 113 |
 |   7e-05   | 1.0 | 1.0 | 75 | 113 |
 |   8e-05   | 1.0 | 1.0 | 75 | 113 |
 |   9e-05   | 1.0 | 1.0 | 75 | 113 |
 +-----------+-----+-----+----+-----+
 [100001 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}

In [26]:
# Open the 'Evaluation' view of the model.
model.show(view='Evaluation')

In [27]:
# Second classifier: same target and features, fit on every labelled row,
# with no validation set passed.
model2 = graphlab.logistic_classifier.create(
    passengers_binned,
    target='Survived',
    features=mah_feat,
    validation_set=None,
)

In [29]:
# Compute the ROC-curve points for model2 on the test_data split.
# NOTE(review): model2 was fit on passengers_binned, and test_data is a split
# of that same frame, so these rows were seen during training and the result
# is not a held-out estimate.
model2.evaluate(test_data,metric='roc_curve')

{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+-----+-----+----+-----+
 | threshold | fpr | tpr | p  |  n  |
 +-----------+-----+-----+----+-----+
 |    0.0    | 1.0 | 1.0 | 75 | 113 |
 |   1e-05   | 1.0 | 1.0 | 75 | 113 |
 |   2e-05   | 1.0 | 1.0 | 75 | 113 |
 |   3e-05   | 1.0 | 1.0 | 75 | 113 |
 |   4e-05   | 1.0 | 1.0 | 75 | 113 |
 |   5e-05   | 1.0 | 1.0 | 75 | 113 |
 |   6e-05   | 1.0 | 1.0 | 75 | 113 |
 |   7e-05   | 1.0 | 1.0 | 75 | 113 |
 |   8e-05   | 1.0 | 1.0 | 75 | 113 |
 |   9e-05   | 1.0 | 1.0 | 75 | 113 |
 +-----------+-----+-----+----+-----+
 [100001 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}

In [30]:
# Open the 'Evaluation' view of model2.
model2.show(view='Evaluation')

In [31]:
# Load the unlabelled Kaggle test set with the same explicit dtypes used for
# train.csv (minus the absent 'Survived' column), rather than relying on type
# inference from the first rows of the file.
submission = graphlab.SFrame.read_csv('test.csv',
                                      column_type_hints={'Pclass': int,
                                                         'Age': float,
                                                         'SibSp': int,
                                                         'Parch': int,
                                                         'Fare': float})

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[long,long,str,str,float,long,long,str,float,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [32]:
# Display the Canvas view of the test set before feature engineering.
submission.show()

In [34]:
# Build the same engineered features on the Kaggle test set that the model was
# trained on, re-using what was learned from the training data.

# Impute missing ages with the training-set mean, mirroring the training
# pipeline, so 'Age' and the derived 'Child' flag are not left undefined.
submission = submission.fillna("Age", titanic_data["Age"].mean())

submission['family'] = (submission['SibSp'] + submission['Parch']) > 3
submission["Child"] = submission["Age"] < 15
submission["Title"] = submission["Name"].apply(extTitle)

# Re-use the binner that was fitted on the training data (fit_binner).
# Re-fitting on the test set would produce different quantile edges, so a
# given Fare bin would no longer mean what it meant when the model was trained.
# NOTE(review): missing values in other columns (e.g. Fare) are passed through
# unchanged — confirm how the binner and the model treat them.
passengers_submission_binned = fit_binner.transform(submission)

# Inspect the feature columns that will be fed to the model.
passengers_submission_binned[mah_feat].show()

In [37]:
# Predict a hard class label for every row of the engineered test set and
# attach it to the submission frame.
prediction = model.predict(passengers_submission_binned, output_type='class')
submission["Survived"] = prediction

# Keep only the two columns that go into the submission file and write it out.
result = submission[["PassengerId", "Survived"]]
result.save('submission.csv')

In [38]:
# Display the frame that was written to submission.csv.
result

PassengerId,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1
901,0
