# Problem description
This is my take on an introductory Kaggle Competition: https://www.kaggle.com/c/titanic/ <br>
I will be using Keras (on TensorFlow) to solve this problem.<br>

This is a binary classification problem, so I'll be using the binary cross-entropy loss function with a sigmoid activation on the last network layer. The data will also require some preprocessing before it's usable for deep learning.

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras import models
from keras import layers

%matplotlib inline

#### Meet and greet data

In [13]:
# Read the training and test splits of the Titanic data from CSV files
# located next to the notebook.
train_ds, test_ds = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [15]:
# Report (rows, columns) for both splits in a single print call, one per line.
print(
    f"Train data shape: {train_ds.shape}",
    f"Test data shape: {test_ds.shape}",
    sep="\n",
)
# Last expression of the cell: ten random training rows for a first look.
train_ds.sample(n=10)

Train data shape: (891, 12)
Test data shape: (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
119,120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2.0,4,2,347082,31.275,,S
253,254,0,3,"Lobb, Mr. William Arthur",male,30.0,1,0,A/5. 3336,16.1,,S
567,568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S
37,38,0,3,"Cann, Mr. Ernest Charles",male,21.0,0,0,A./5. 2152,8.05,,S
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
355,356,0,3,"Vanden Steen, Mr. Leo Peter",male,28.0,0,0,345783,9.5,,S
631,632,0,3,"Lundahl, Mr. Johan Svensson",male,51.0,0,0,347743,7.0542,,S
420,421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C
827,828,1,2,"Mallet, Master. Andre",male,1.0,0,2,S.C./PARIS 2079,37.0042,,C
465,466,0,3,"Goncalves, Mr. Manuel Estanslas",male,38.0,0,0,SOTON/O.Q. 3101306,7.05,,S


In [24]:
# Column dtypes and non-null counts for the training split.
print("          Data summary")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
train_ds.info()
print('='*40)
# Number of missing values per column in each split.
print("          NaN values summary")
print("--- Train data: ")
print(train_ds.isnull().sum())
print("--- Test data: ")
print(test_ds.isnull().sum())

          Data summary
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
          NaN values summary
--- Train data: 
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
--- Test data: 
PassengerId      0
Pclass           0
Name             0
Sex              0
Age     

In [25]:
# Both splits receive the same cleaning. The list holds references to the
# original frames, so changes made inside the loop are visible through
# train_ds / test_ds as well.
dataset_cleaner = [train_ds, test_ds]
# NOTE(review): PassengerId is dropped from the test split too; a Kaggle
# submission usually needs it -- confirm it is recovered elsewhere.
drop_columns = ['Cabin', 'PassengerId','Ticket']

# Imputation values are computed once, from the training split only, and
# reused for the test split so both splits are preprocessed identically.
fill_values = {
    'Age': train_ds['Age'].mean(),
    'Fare': train_ds['Fare'].median(),
    'Embarked': train_ds['Embarked'].mode()[0],
}

for dataset in dataset_cleaner:
    # errors='ignore' keeps the cell re-runnable once the columns are gone.
    dataset.drop(columns=drop_columns, errors='ignore', inplace=True)
    for column, value in fill_values.items():
        # Assign the filled column back instead of calling
        # dataset[column].fillna(..., inplace=True): that chained in-place call
        # acts on a temporary object under pandas Copy-on-Write (leaving the
        # frame unchanged) and triggers a FutureWarning on pandas 2.2.
        dataset[column] = dataset[column].fillna(value)

In [29]:
# Sanity check after cleaning: count of missing values per training column.
train_ds.isna().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [30]:
# Five random rows of the cleaned training split.
train_ds.sample(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
132,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47.0,1,0,14.5,S
761,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,7.125,S
80,0,3,"Waelens, Mr. Achille",male,22.0,0,0,9.0,S
835,1,1,"Compton, Miss. Sara Rebecca",female,39.0,1,1,83.1583,C
514,0,3,"Coleff, Mr. Satio",male,24.0,0,0,7.4958,S


#### Now I need to encode qualitative data for use in a neural network

In [32]:
# Derive a 'Title' column from 'Name': take the text after the first ", " and
# keep what precedes the first "." in it (e.g. "Mr", "Miss", "Master").
for dataset in dataset_cleaner:
    after_comma = dataset['Name'].str.split(", ", expand=True)[1]
    dataset['Title'] = after_comma.str.split(".", expand=True)[0]

# Boolean Series indexed by title: True where the title occurs fewer than
# 10 times in the training split.
title_names = train_ds['Title'].value_counts().lt(10)

In [48]:
# title_names is a boolean Series indexed by title (True = rare in the
# training split). A title is kept only when it appears in that index and is
# not flagged as rare; every other value is replaced by 'Misc'.
frequent_titles = title_names[~title_names].index
for dataset in dataset_cleaner:
    keep_mask = dataset['Title'].isin(frequent_titles)
    dataset['Title'] = dataset['Title'].where(keep_mask, 'Misc')
    # The raw name is no longer needed once the title has been extracted.
    dataset.drop(columns=['Name'], inplace=True)

In [49]:
# Frequency of each title bucket in the test split after grouping.
test_title_counts = test_ds['Title'].value_counts()
test_title_counts

Mr        240
Miss       78
Mrs        72
Master     21
Misc        7
Name: Title, dtype: int64

In [57]:
# Five random rows of the training split.
# NOTE(review): the saved output lists 'Sex Code' and 'Embarked Code', which
# are only created by the cell below -- the cells appear to have been executed
# out of order; confirm the notebook survives Restart & Run All.
train_ds.sample(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Sex Code,Embarked Code
661,0,3,male,40.0,0,0,7.225,C,Mr,1,0
12,0,3,male,20.0,0,0,8.05,S,Mr,1,2
582,0,2,male,54.0,0,0,26.0,S,Mr,1,2
22,1,3,female,15.0,0,0,8.0292,Q,Miss,0,1
121,0,3,male,29.699118,0,0,8.05,S,Mr,1,2


In [56]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns into new '<name> Code' columns.
# The encoder is fitted on the training split only and then used to transform
# every split, so a given category always maps to the same integer in train
# and test. Re-fitting on each split separately would silently produce
# different codes whenever the splits do not contain the same categories.
# transform() raises ValueError if a split contains a category that was not
# seen in the training data.
label = LabelEncoder()
code_columns = {'Sex': 'Sex Code', 'Embarked': 'Embarked Code'}
for source_column, code_column in code_columns.items():
    label.fit(train_ds[source_column])
    for dataset in dataset_cleaner:
        dataset[code_column] = label.transform(dataset[source_column])