## Loading packages

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import seaborn as sns
from google.colab import files
!pip install xlrd
import xlrd
from sklearn.preprocessing import LabelBinarizer, LabelEncoder 
from sklearn.model_selection import train_test_split

Collecting xlrd
[?25l  Downloading https://files.pythonhosted.org/packages/07/e6/e95c4eec6221bfd8528bcc4ea252a850bffcc4be88ebc367e23a1a84b0bb/xlrd-1.1.0-py2.py3-none-any.whl (108kB)
[K    100% |████████████████████████████████| 112kB 4.0MB/s 
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-1.1.0


## Titanic dataset

In [0]:
data = files.urllib.request.urlretrieve('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls', filename=None)[0]
df = pd.read_excel(data)

### Data preprocessing

Firstly, let's get rid of features that have too many nans, as was decided in EDA.

In [0]:
df.drop(columns=['cabin', 'boat', 'body', 'home.dest', 'name'], inplace=True)

In [0]:
for column in df.columns:
  if (df[column].dtypes in ['float64', 'int64']) and sum(df[column].isnull()):
    df[column].fillna(df[column].median(), inplace=True)  
  if (df[column].dtypes == 'object') and sum(df[column].isnull()):
    df[column].fillna(df[column].mode()[0], inplace=True)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
pclass      1309 non-null int64
survived    1309 non-null int64
sex         1309 non-null object
age         1309 non-null float64
sibsp       1309 non-null int64
parch       1309 non-null int64
ticket      1309 non-null object
fare        1309 non-null float64
embarked    1309 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 92.1+ KB


In [5]:
df.ticket = df.ticket.str.replace(' \d+', '').str.replace('[0-9][0-9]+', '').replace([np.nan, ''], 'simple')
df.ticket.unique()

array(['simple', u'PC', u'W.E.P.', u'WE/P', u'F.C.', u'P/PP', u'C.A.',
       u'C.A./SOTON', u'S.O.P.', u'F.C.C.', u'SC/AH', u'W./C.', u'S.O.C.',
       u'SC/PARIS', u'S.W./PP', u'W/C', u'SCO/W', u'SO/C', u'SC/AH Basle',
       u'SC/Paris', u'SC', u'S.O./P.P.', u'S.C./PARIS', u'SC/A.3',
       u'SW/PP', u'CA', u'SOTON/O2', u'C', u'SOTON/O.Q.', u'A/4', u'PP',
       u'A/5', u'A./5.', u'A/5.', u'SOTON/OQ', u'S.P.', u'SC/A4',
       u'AQ/3.', u'STON/O2.', u'A..', u'LINE', u'STON/O.', u'AQ/4',
       u'A/S', u'A.5.', u'A4.', u'A/4.', u'Fa', u'S.C./A.4.', u'LP',
       u'CA.', u'STON/OQ.'], dtype=object)

In [0]:
binarizer = LabelBinarizer()
encoder = LabelEncoder()

In [0]:
df.sex = binarizer.fit_transform(df.sex)
df.ticket = encoder.fit_transform(df.ticket)
df = pd.get_dummies(df)

In [24]:
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,51,211.3375,0,0,1
1,1,1,1,0.9167,1,2,51,151.55,0,0,1
2,1,0,0,2.0,1,2,51,151.55,0,0,1
3,1,0,1,30.0,1,2,51,151.55,0,0,1
4,1,0,0,25.0,1,2,51,151.55,0,0,1


### Train-test split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='survived'), df.survived, test_size=0.3, random_state=42)

### Neural Network building

In [9]:
n_samples = X_train.shape[0]
X_train.shape

(916, 10)

In [0]:
learning_rate = 0.01
n_epochs = 20

X = tf.placeholder(tf.float32, [None, 10])
y = tf.placeholder(tf.float32, [None, 1])
weights = tf.Variable(tf.zeros([10, 1]))
bias = tf.Variable(tf.zeros([1,1]))
logits = tf.add(tf.matmul(X, weights), bias)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
predictions = tf.rint(tf.nn.sigmoid(logits))
accuracy = tf.reduce_sum(tf.cast(tf.equal(predictions, y), tf.float32))

In [55]:
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())    
    #n_batches = int(mnist.train.num_examples/batch_size)
    
    # train the model n_epochs times
    for i in range(n_epochs): 
      total_loss = 0
      _, l = sess.run([optimizer, loss], feed_dict={X: X, y: y}) 
      total_loss += l
      
      print('Average loss epoch {0}: {1}'.format(i, total_loss/n_batches))

    # test the model
    #total_correct_preds = 0

    #for i in range(n_batches):
    #    X_batch, Y_batch = mnist.test.next_batch(batch_size)
    #    accuracy_batch = sess.run(accuracy, {X: X_batch, Y:Y_batch})
    #    total_correct_preds += accuracy_batch    

    #print('Accuracy {0}'.format(total_correct_preds/mnist.test.num_examples))


0.0


In [37]:
1 / (1 + np.exp(-26))

0.999999999994891

(u'pclass', 0)
(u'sex', 0)
(u'age', 1)
(u'sibsp', 0)
(u'parch', 0)
(u'ticket', 1)
(u'fare', 0)
(u'embarked_C', 1)
(u'embarked_Q', 0)
(u'embarked_S', 0)
