# Loading packages

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import seaborn as sns
from google.colab import files
!pip install xlrd
import xlrd
from sklearn.preprocessing import LabelBinarizer, LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



# Titanic dataset

In [0]:
data = files.urllib.request.urlretrieve('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls', filename=None)[0]
df = pd.read_excel(data)

## Data preprocessing

Firstly, let's get rid of features that have too many nans, as was decided in EDA.

In [0]:
df.drop(columns=['cabin', 'boat', 'body', 'home.dest', 'name'], inplace=True)

In [0]:
for column in df.columns:
  if (df[column].dtypes in ['float64', 'int64']) and sum(df[column].isnull()):
    df[column].fillna(df[column].median(), inplace=True)  
  if (df[column].dtypes == 'object') and sum(df[column].isnull()):
    df[column].fillna(df[column].mode()[0], inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
pclass      1309 non-null int64
survived    1309 non-null int64
sex         1309 non-null object
age         1309 non-null float64
sibsp       1309 non-null int64
parch       1309 non-null int64
ticket      1309 non-null object
fare        1309 non-null float64
embarked    1309 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 92.1+ KB


In [6]:
df.ticket = df.ticket.str.replace(' \d+', '').str.replace('[0-9][0-9]+', '').replace([np.nan, ''], 'simple')
df.ticket.unique() # I will drop this feature due to high range of categories

array(['simple', 'PC', 'W.E.P.', 'WE/P', 'F.C.', 'P/PP', 'C.A.',
       'C.A./SOTON', 'S.O.P.', 'F.C.C.', 'SC/AH', 'W./C.', 'S.O.C.',
       'SC/PARIS', 'S.W./PP', 'W/C', 'SCO/W', 'SO/C', 'SC/AH Basle',
       'SC/Paris', 'SC', 'S.O./P.P.', 'S.C./PARIS', 'SC/A.3', 'SW/PP',
       'CA', 'SOTON/O2', 'C', 'SOTON/O.Q.', 'A/4', 'PP', 'A/5', 'A./5.',
       'A/5.', 'SOTON/OQ', 'S.P.', 'SC/A4', 'AQ/3.', 'STON/O2.', 'A..',
       'LINE', 'STON/O.', 'AQ/4', 'A/S', 'A.5.', 'A4.', 'A/4.', 'Fa',
       'S.C./A.4.', 'LP', 'CA.', 'STON/OQ.'], dtype=object)

In [0]:
df.drop(columns='ticket', inplace=True)

In [0]:
binarizer = LabelBinarizer()
df.sex = binarizer.fit_transform(df.sex)
df = pd.get_dummies(df)

In [9]:
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0,0,1
1,1,1,1,0.9167,1,2,151.55,0,0,1
2,1,0,0,2.0,1,2,151.55,0,0,1
3,1,0,1,30.0,1,2,151.55,0,0,1
4,1,0,0,25.0,1,2,151.55,0,0,1


## Train-test split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='survived'), df.survived, test_size=0.3, random_state=42)

## Neural Network building

### TensorFlow

In [11]:
n_samples = X_train.shape[0]
print( X_train.shape )
print( y_train.shape )

(916, 9)
(916,)


In [0]:
learning_rate = 0.03
n_epochs = 15

X = tf.placeholder(tf.float32, [None, 9])
y = tf.placeholder(tf.float32, [None, 1])
weights = tf.Variable(tf.random_normal([9, 1], 0.0, 0.01, tf.float32))
bias = tf.Variable(tf.zeros([1, 1]))
logits = tf.add(tf.matmul(X, weights), bias)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
predictions = tf.rint(tf.nn.sigmoid(logits))
accuracy = tf.reduce_sum(tf.cast(tf.equal(predictions, y), tf.float32))

In [13]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())    
    for i in range(n_epochs):
      total_loss=0
      for j in range(n_samples): 
        _, l = sess.run([optimizer, loss], feed_dict={X: X_train.values[j].reshape((-1, 9)), y:y_train.values[j].reshape(-1, 1)})
      total_loss += l
      pred = sess.run(predictions, {X: X_test})
      score = sess.run(accuracy, {predictions: pred, y: y_test.values.reshape(-1, 1)})
      print('Epoch {0}: {1}, Accuracy {0}: {2}'.format(i, total_loss/n_samples, score/len(pred)))
    sess.close()

Epoch 0: 4.912745843320683e-10, Accuracy 0: 0.6590330788804071
Epoch 1: 0.000208523368471054, Accuracy 1: 0.6717557251908397
Epoch 2: 0.00258746912385699, Accuracy 2: 0.6590330788804071
Epoch 3: 0.020662490977991096, Accuracy 3: 0.6921119592875318
Epoch 4: 0.005992527611911557, Accuracy 4: 0.6768447837150128
Epoch 5: 0.0024768894936840606, Accuracy 5: 0.6870229007633588
Epoch 6: 0.0030937049066135456, Accuracy 6: 0.6844783715012722
Epoch 7: 0.003826468792544702, Accuracy 7: 0.6743002544529262
Epoch 8: 0.005919622542035632, Accuracy 8: 0.6768447837150128
Epoch 9: 0.003255336565742326, Accuracy 9: 0.6870229007633588
Epoch 10: 0.009772465218623133, Accuracy 10: 0.6921119592875318
Epoch 11: 0.01703639530198543, Accuracy 11: 0.7175572519083969
Epoch 12: 0.015253047235147402, Accuracy 12: 0.712468193384224
Epoch 13: 0.013967974217177478, Accuracy 13: 0.7099236641221374
Epoch 14: 0.016106195324893602, Accuracy 14: 0.712468193384224


# Thyroid dataset

In [0]:
data2 = files.urllib.request.urlretrieve('https://www.openml.org/data/get_csv/57/dataset_57_hypothyroid.arff', filename=None)[0]
df2 = pd.read_csv(data2)

## Data preprocessing

In [0]:
df2.replace('?', np.nan, inplace=True)
df2.drop(columns='TBG', inplace=True)

In [0]:
for i in df2:
  try:
    df[i] = df[i].astype(float)
  except:
    continue

In [0]:
df.age[1364] = df.age.median()
df.age.fillna(df.age.median(), inplace=True)

In [160]:
import sys
sys.version

'2.7.14 (default, Sep 23 2017, 22:06:14) \n[GCC 7.2.0]'