## Kaggle with Titanic
Predict survival on the Titanic using given training dataset

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
# for training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn import svm
# to ignore errors
pd.options.mode.chained_assignment = None

Use pandas to import CSV file to Notebook

In [2]:
dataset_train = pd.read_csv('datasets/titanic/train.csv')
dataset_test = pd.read_csv('datasets/titanic/test.csv')

Lets find out the number of rows and display the first fews rows of the training dataset

In [3]:
print("(rows, columns) =", dataset_train.shape)
dataset_train.head(4)

(rows, columns) = (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [4]:
# The target column
columns_target = ["Survived"]
# column that we will used for training
columns_training = ["Pclass", "Sex", "Age"]
columns_result = ['PassengerId', 'Survived']

dataset_target = dataset_train[columns_target]
dataset_training = dataset_train[columns_training]
dataset_result_training = dataset_test[columns_training]

Preprocess the data to remove any null/incompatable data inside the dataset

In [5]:
print("Pclass null rows: ", dataset_training["Pclass"].isnull().sum())
print("Sex null rows:    ", dataset_training["Sex"].isnull().sum())
print("Age null rows:    ", dataset_training["Age"].isnull().sum())

Pclass null rows:  0
Sex null rows:     0
Age null rows:     177


Instead of removing all null value rows, lets fill them with the median of that row. This way we will not loose signifacant amount of data.

In [6]:
dataset_result_training['Age'] = dataset_result_training['Age'].fillna(dataset_training['Age'].mean())
dataset_training["Age"] = dataset_training["Age"].fillna(dataset_training["Age"].median())
dataset_training["Age"].head(4)

0    22.0
1    38.0
2    26.0
3    35.0
Name: Age, dtype: float64

Null value checking

In [7]:
print("Pclass null rows: ", dataset_training["Pclass"].isnull().sum())
print("Sex null rows:    ", dataset_training["Sex"].isnull().sum())
print("Age null rows:    ", dataset_training["Age"].isnull().sum())

Pclass null rows:  0
Sex null rows:     0
Age null rows:     0


No more null values in our dataset, lets continue <br>
Now lets modify "sex" row to a categorical variable, set Male = 0, Female = 1

In [8]:
sex_obj = {
    "male": 0,
    "female": 1
}
dataset_result_training['Sex'] = dataset_result_training['Sex'].apply(lambda x:sex_obj[x])
dataset_training["Sex"] = dataset_training["Sex"].apply(lambda x:sex_obj[x])
dataset_training["Sex"].head(4)

0    0
1    1
2    1
3    1
Name: Sex, dtype: int64

Now lets take a look at of modified dataset

In [9]:
dataset_training.head(4)

Unnamed: 0,Pclass,Sex,Age
0,3,0,22.0
1,1,1,38.0
2,3,1,26.0
3,1,1,35.0


Lets start training<br>
First divide the dataset to "test" and "train" set

In [10]:
dataset_training_train, dataset_training_test, dataset_target_train, dataset_target_test = \
    train_test_split(dataset_training, dataset_target, test_size = 0.33, random_state = 42)

Lets create a clasifire

In [11]:
clasifire = svm.LinearSVC()

# here we chain our model
clasifire.fit(dataset_training_train, dataset_target_train.values.ravel())

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Now lets do some prediction with our model

In [12]:
# predict 1
print(clasifire.predict(dataset_training_test[0:1]))
# predict multiple
print(clasifire.predict(dataset_training_test[0:16]))

[0]
[0 0 0 1 1 1 1 0 1 1 0 0 1 0 0 1]


Now, lets confirm the accuracy o the model

In [13]:
print("Accuracy is: ", np.around(clasifire.score(dataset_training_test,\
                                              dataset_target_test), decimals = 4) * 100, "%")

Accuracy is:  80.68 %


Finally, lets generate and save our results

In [14]:
dataset_test['Survived'] = clasifire.predict(dataset_result_training)
dataset_test[columns_result].to_csv('datasets/titanic/result.csv', index=False)
print (dataset_test[columns_result])

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         1
5            897         0
6            898         1
7            899         0
8            900         1
9            901         0
10           902         0
11           903         0
12           904         1
13           905         0
14           906         1
15           907         1
16           908         0
17           909         0
18           910         1
19           911         1
20           912         0
21           913         0
22           914         1
23           915         0
24           916         1
25           917         0
26           918         1
27           919         0
28           920         0
29           921         0
..           ...       ...
388         1280         0
389         1281         0
390         1282         0
391         1283         1
392         1284         0
3