## Loading packages

In [45]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import seaborn as sns
from google.colab import files
!pip install xlrd
import xlrd
from sklearn.preprocessing import LabelBinarizer, LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



## Titanic dataset

In [0]:
data = files.urllib.request.urlretrieve('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls', filename=None)[0]
df = pd.read_excel(data)

### Data preprocessing

Firstly, let's get rid of features that have too many nans, as was decided in EDA.

In [0]:
df.drop(columns=['cabin', 'boat', 'body', 'home.dest', 'name'], inplace=True)

In [0]:
for column in df.columns:
  if (df[column].dtypes in ['float64', 'int64']) and sum(df[column].isnull()):
    df[column].fillna(df[column].median(), inplace=True)  
  if (df[column].dtypes == 'object') and sum(df[column].isnull()):
    df[column].fillna(df[column].mode()[0], inplace=True)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
pclass      1309 non-null int64
survived    1309 non-null int64
sex         1309 non-null object
age         1309 non-null float64
sibsp       1309 non-null int64
parch       1309 non-null int64
ticket      1309 non-null object
fare        1309 non-null float64
embarked    1309 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 92.1+ KB


In [50]:
df.ticket = df.ticket.str.replace(' \d+', '').str.replace('[0-9][0-9]+', '').replace([np.nan, ''], 'simple')
df.ticket.unique() # I will drop this feature due to high range of categories

array(['simple', u'PC', u'W.E.P.', u'WE/P', u'F.C.', u'P/PP', u'C.A.',
       u'C.A./SOTON', u'S.O.P.', u'F.C.C.', u'SC/AH', u'W./C.', u'S.O.C.',
       u'SC/PARIS', u'S.W./PP', u'W/C', u'SCO/W', u'SO/C', u'SC/AH Basle',
       u'SC/Paris', u'SC', u'S.O./P.P.', u'S.C./PARIS', u'SC/A.3',
       u'SW/PP', u'CA', u'SOTON/O2', u'C', u'SOTON/O.Q.', u'A/4', u'PP',
       u'A/5', u'A./5.', u'A/5.', u'SOTON/OQ', u'S.P.', u'SC/A4',
       u'AQ/3.', u'STON/O2.', u'A..', u'LINE', u'STON/O.', u'AQ/4',
       u'A/S', u'A.5.', u'A4.', u'A/4.', u'Fa', u'S.C./A.4.', u'LP',
       u'CA.', u'STON/OQ.'], dtype=object)

In [0]:
binarizer = LabelBinarizer()

In [0]:
df.sex = binarizer.fit_transform(df.sex)
df = pd.get_dummies(df)

In [53]:
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,51,211.3375,0,0,1
1,1,1,1,0.9167,1,2,51,151.55,0,0,1
2,1,0,0,2.0,1,2,51,151.55,0,0,1
3,1,0,1,30.0,1,2,51,151.55,0,0,1
4,1,0,0,25.0,1,2,51,151.55,0,0,1


### Train-test split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['survived', 'ticket']), df.survived, test_size=0.3, random_state=42)

### Neural Network building

### TensorFlow

In [76]:
n_samples = X_train.shape[0]
print( X_train.shape )
print( y_train.shape )

(916, 9)
(916,)


In [0]:
learning_rate = 0.03
n_epochs = 15

X = tf.placeholder(tf.float32, [None, 9])
y = tf.placeholder(tf.float32, [None, 1])
weights = tf.Variable(tf.zeros([9, 1]))
bias = tf.Variable(tf.zeros([1, 1]))
logits = tf.add(tf.matmul(X, weights), bias)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
predictions = tf.rint(tf.nn.sigmoid(logits))

In [115]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())    
    for i in range(n_epochs):
      total_loss=0
      for j in range(n_samples): 
        _, l = sess.run([optimizer, loss], feed_dict={X: X_train.values[j].reshape((-1, 9)), y:y_train.values[j].reshape(-1, 1)})
      total_loss += l
      
      print('Epoch {0}: {1}'.format(i, total_loss/n_samples))
    pred = sess.run(predictions, {X: X_test})
    sess.close()

Epoch 0: 5.20701841431e-06
Epoch 1: 8.07236201144e-08
Epoch 2: 0.0099651490757
Epoch 3: 0.0205761272314
Epoch 4: 4.19329514312e-07
Epoch 5: 0.0039634157997
Epoch 6: 0.0111309742823
Epoch 7: 0.00574592003135
Epoch 8: 0.00318316028628
Epoch 9: 0.00767234839727
Epoch 10: 0.00858968938803
Epoch 11: 0.00727143662465
Epoch 12: 0.00987441779224
Epoch 13: 0.0106983091113
Epoch 14: 0.0178406061564


In [116]:
accuracy_score(y_test, pred)

0.7150127226463104