# Import needed libraries

In [4]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import os
from sklearn.pipeline import make_pipeline
import category_encoders
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

import matplotlib.pylab as plt
%matplotlib inline

# Titanic dataset

## Data import and simple preprocessing.

In [0]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Preprocessing is taken from https://www.kaggle.com/abevallerian/titanic-with-tensorflow

In [0]:
#These columns are not useful for survive prediction
del train['Name']
del train['Ticket']
del train['Fare']
del train['Embarked']

train = train.fillna(value=0.0)

#Replacing sex with 0 (Female) or 1 (Male)
for i in range(train.shape[0]):
    if train.at[i, 'Sex'] == 'male':
        train.at[i, 'Sex'] = 1
    else:
        train.at[i, 'Sex'] = 0

#Since the age is categorical data, I group the age 8 groups: NaN, 0-10, 10-20, ..., 70-80.
# From the desribe above, it's shown that the maximum age is 80.
train['Age_group'] = 0
for i in range(train.shape[0]):
    for j in range(70, 0, -10):
        if train.at[i, 'Age'] > j:
            train.at[i, 'Age_group'] = int(j / 10)
            break
del train['Age']  # it's unnecessary anymore

#Cabin is quite interesting. It is stored in string.
# I think the format is written as Cabin Section + Cabin Number. I'm only interested in obtaining the Cabin Section.
train['Cabin_section'] = '0'
for i in range(train.shape[0]):
    if train.at[i, 'Cabin'] != 0:
        train.at[i, 'Cabin_section'] = train.at[i, 'Cabin'][0]
CABIN_SECTION = list(set(
    train['Cabin_section'].values))  # will be reused for test data
for i in range(train.shape[0]):
    train.at[i, 'Cabin_section'] = CABIN_SECTION.index(train.at[i, 'Cabin_section'])
del train['Cabin']  # it's unnecessary anymore

Converting data into numpy array

In [0]:
pclass = np.eye(train['Pclass'].values.max()+1)[train['Pclass'].values]
age_group = np.eye(train['Age_group'].values.max()+1)[train['Age_group'].values]
cabin_section = np.eye(train['Cabin_section'].values.max()+1) \
                    [train['Cabin_section'].values.astype(int)] # prevent IndexError

X = train[['Sex', 'SibSp', 'Parch']].values
X = np.concatenate([X, age_group], axis=1)
X = np.concatenate([X, pclass], axis=1)
X = np.concatenate([X, cabin_section], axis=1)
X = X.astype(float)

y = train['Survived'].values
y = y.astype(float).reshape(-1, 1)

#Dividing training data into training and validation (development) sets
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.1, random_state=0)

y_train = np.float32(y_train)

Same preprocessing for the test data

In [0]:
del test['Name']
del test['Ticket']
del test['Fare']
del test['Embarked']

test = test.fillna(value=0.0)

test['Age_group'] = 0
test['Cabin_section'] = '0'
for i in range(test.shape[0]):
    if test.at[i, 'Sex'] == 'male':
        test.at[i, 'Sex'] = 1
    else:
        test.at[i, 'Sex'] = 0

    for j in range(70, 0, -10):
        if test.at[i, 'Age'] > j:
            test.at[i, 'Age_group'] = int(j/10)
            break

    if test.at[i, 'Cabin'] != 0:
        test.at[i, 'Cabin_section'] = test.at[i, 'Cabin'][0]
    test.at[i, 'Cabin_section'] = CABIN_SECTION.index(test.at[i, 'Cabin_section'])

del test['Cabin'] # it's unnecessary anymore
del test['Age'] # it's unnecessary anymore


pclass_test = np.eye(test['Pclass'].values.max()+1)[test['Pclass'].values]
age_group_test = np.eye(test['Age_group'].values.max()+1)[test['Age_group'].values]
cabin_section_test = np.eye(test['Cabin_section'].values.max()+1) \
                    [test['Cabin_section'].values.astype(int)] # prevent IndexError

X_test = test[['Sex', 'SibSp', 'Parch']].values
X_test = np.concatenate([X_test, age_group_test], axis=1)
X_test = np.concatenate([X_test, pclass_test], axis=1)
X_test = np.concatenate([X_test, cabin_section_test], axis=1)
X_test = X_test.astype(float)

id_test = test['PassengerId'].values
id_test = id_test.reshape(-1, 1)

# Titanic classifcation with clear TensorFlow

In [0]:
tf.reset_default_graph()

N_EPOCHS = 10000
LEARNING_RATE = 0.001
input_size = X_train.shape[1]  #number of features
seed = 42

#Creating graph
graph = tf.Graph()
with graph.as_default():
    tf.set_random_seed(seed)
    np.random.seed(seed)

    #Creating placeholders for X and Y input
    X_input = tf.placeholder(dtype=tf.float32,
                             shape=[None, input_size],
                             name='X_input')
    y_input = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='y_input')

    #Creating weight and bias, initialized from normal distribution
    w = tf.Variable(tf.random_normal(shape=[input_size, 1], seed=seed),
                    name='w')
    b = tf.Variable(tf.random_normal(shape=[1], seed=seed), name='b')

    #Building model to predict Y
    logits = tf.add(tf.matmul(X_input, w), b)
    sigm = tf.nn.sigmoid(logits, name='pred')
    pred = tf.cast(tf.greater_equal(sigm, 0.5), tf.float32,
                   name='pred')  # 1 if >= 0.5, 0 if < 0.5

    #Using cross enthropy loss function
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=y_input,
                                                logits=logits,
                                                name='loss'))

    #Using Adam optimizer with learning rate of 0.001 to minimize loss
    train_steps = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

    #Calculating accuracy
    acc = tf.reduce_mean(tf.cast(tf.equal(pred, y_input), tf.float32),
                         name='acc')

    #Inializing all variables at once
    init_var = tf.global_variables_initializer()

#Feeding placeholders
train_feed_dict = {X_input: X_train, y_input: y_train}
dev_feed_dict = {X_input: X_dev, y_input: y_dev}
test_feed_dict = {
    X_input: X_test
}  # no y_input since the goal is to predict it

sess = tf.Session(graph=graph)
sess.run(init_var)

cur_loss = sess.run(loss, feed_dict=train_feed_dict)
train_acc = sess.run(acc, feed_dict=train_feed_dict)
dev_acc = sess.run(acc, feed_dict=dev_feed_dict)
print('step   0: loss {0:.5f}, train_acc {1:.2f}%, dev_acc {2:.2f}%'.format(
    cur_loss, 100 * train_acc, 100 * dev_acc))

for step in range(1, N_EPOCHS + 1):
    sess.run(train_steps, feed_dict=train_feed_dict)
    cur_loss = sess.run(loss, feed_dict=train_feed_dict)
    train_acc = sess.run(acc, feed_dict=train_feed_dict)
    dev_acc = sess.run(acc, feed_dict=dev_feed_dict)
    if step % 100 != 0:  # print result every 100 steps
        continue
    print(
        'step {3}: loss {0:.5f}, train_acc {1:.2f}%, dev_acc {2:.2f}%'.format(
            cur_loss, 100 * train_acc, 100 * dev_acc, step))

step   0: loss 1.76236, train_acc 35.96%, dev_acc 42.22%
step 100: loss 1.49557, train_acc 36.83%, dev_acc 41.11%
step 200: loss 1.28232, train_acc 37.83%, dev_acc 43.33%
step 300: loss 1.12105, train_acc 46.69%, dev_acc 52.22%
step 400: loss 1.00299, train_acc 47.82%, dev_acc 52.22%
step 500: loss 0.91789, train_acc 52.18%, dev_acc 52.22%
step 600: loss 0.85641, train_acc 51.56%, dev_acc 51.11%
step 700: loss 0.81103, train_acc 56.93%, dev_acc 57.78%
step 800: loss 0.77620, train_acc 57.43%, dev_acc 57.78%
step 900: loss 0.74816, train_acc 60.55%, dev_acc 64.44%
step 1000: loss 0.72459, train_acc 61.42%, dev_acc 64.44%
step 1100: loss 0.70413, train_acc 61.42%, dev_acc 64.44%
step 1200: loss 0.68602, train_acc 62.17%, dev_acc 65.56%
step 1300: loss 0.66982, train_acc 62.42%, dev_acc 64.44%
step 1400: loss 0.65523, train_acc 66.17%, dev_acc 67.78%
step 1500: loss 0.64205, train_acc 66.42%, dev_acc 67.78%
step 1600: loss 0.63008, train_acc 67.17%, dev_acc 68.89%
step 1700: loss 0.61914,

Making prediction on test set

In [0]:
y_pred = sess.run(pred, feed_dict=test_feed_dict).astype(int)
prediction = pd.DataFrame(np.concatenate([id_test, y_pred], axis=1),
                          columns=['PassengerId', 'Survived'])

In [0]:
prediction.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


# Titanic classification with Keras

In [0]:
# Initialising the NN
model = Sequential()

# layer
model.add(
    Dense(1,
          kernel_initializer='uniform',
          activation='sigmoid',
          input_dim=input_size))

# summary
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 25        
Total params: 25
Trainable params: 25
Non-trainable params: 0
_________________________________________________________________


In [0]:
N_EPOCHS = 1000
LEARNING_RATE = 0.001

#initializing optimizer
adam_opt = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

# Compiling the NN
model.compile(optimizer = adam_opt, loss = 'binary_crossentropy', metrics = ['accuracy'])

# Train the NN
model.fit(X_train, y_train, validation_data=(X_dev, y_dev), batch_size = 32, epochs = N_EPOCHS)

Train on 801 samples, validate on 90 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000


Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000
Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000


Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000
Epoch 126/1000
Epoch 127/1000
Epoch 128/1000
Epoch 129/1000
Epoch 130/1000
Epoch 131/1000
Epoch 132/1000
Epoch 133/1000
Epoch 134/1000
Epoch 135/1000
Epoch 136/1000
Epoch 137/1000
Epoch 138/1000
Epoch 139/1000
Epoch 140/1000
Epoch 141/1000
Epoch 142/1000
Epoch 143/1000
Epoch 144/1000
Epoch 145/1000
Epoch 146/1000
Epoch 147/1000
Epoch 148/1000
Epoch 149/1000
Epoch 150/1000
Epoch 151/1000
Epoch 152/1000
Epoch 153/1000
Epoch 154/1000
Epoch 155/1000
Epoch 156/1000
Epoch 157/1000
Epoch 158/1000
Epoch 159/1000
Epoch 160/1000
Epoch 161/1000
Epoch 162/1000
Epoch 163/1000
Epoch 164/1000
Epoch 165/1000
Epoch 166/1000
Epoch 167/1000
Epoch 168/1000
Epoch 169/1000
Epoch 170/1000
Epoch 171/1000
Epoch 172/1000
Epoch 173/1000
Epoch 174/1000
Epoch 175/1000
Epoch 176/1000
Epoch 177/1000
Epoch 178/1000
Epoch 179/1000
Epoch 180/1000


Epoch 181/1000
Epoch 182/1000
Epoch 183/1000
Epoch 184/1000
Epoch 185/1000
Epoch 186/1000
Epoch 187/1000
Epoch 188/1000
Epoch 189/1000
Epoch 190/1000
Epoch 191/1000
Epoch 192/1000
Epoch 193/1000
Epoch 194/1000
Epoch 195/1000
Epoch 196/1000
Epoch 197/1000
Epoch 198/1000
Epoch 199/1000
Epoch 200/1000
Epoch 201/1000
Epoch 202/1000
Epoch 203/1000
Epoch 204/1000
Epoch 205/1000
Epoch 206/1000
Epoch 207/1000
Epoch 208/1000
Epoch 209/1000
Epoch 210/1000
Epoch 211/1000
Epoch 212/1000
Epoch 213/1000
Epoch 214/1000
Epoch 215/1000
Epoch 216/1000
Epoch 217/1000
Epoch 218/1000
Epoch 219/1000
Epoch 220/1000
Epoch 221/1000
Epoch 222/1000
Epoch 223/1000
Epoch 224/1000
Epoch 225/1000
Epoch 226/1000
Epoch 227/1000
Epoch 228/1000
Epoch 229/1000
Epoch 230/1000
Epoch 231/1000
Epoch 232/1000
Epoch 233/1000
Epoch 234/1000
Epoch 235/1000
Epoch 236/1000
Epoch 237/1000
Epoch 238/1000
Epoch 239/1000
Epoch 240/1000


Epoch 241/1000
Epoch 242/1000
Epoch 243/1000
Epoch 244/1000
Epoch 245/1000
Epoch 246/1000
Epoch 247/1000
Epoch 248/1000
Epoch 249/1000
Epoch 250/1000
Epoch 251/1000
Epoch 252/1000
Epoch 253/1000
Epoch 254/1000
Epoch 255/1000
Epoch 256/1000
Epoch 257/1000
Epoch 258/1000
Epoch 259/1000
Epoch 260/1000
Epoch 261/1000
Epoch 262/1000
Epoch 263/1000
Epoch 264/1000
Epoch 265/1000
Epoch 266/1000
Epoch 267/1000
Epoch 268/1000
Epoch 269/1000
Epoch 270/1000
Epoch 271/1000
Epoch 272/1000
Epoch 273/1000
Epoch 274/1000
Epoch 275/1000
Epoch 276/1000
Epoch 277/1000
Epoch 278/1000
Epoch 279/1000
Epoch 280/1000
Epoch 281/1000
Epoch 282/1000
Epoch 283/1000
Epoch 284/1000
Epoch 285/1000
Epoch 286/1000
Epoch 287/1000
Epoch 288/1000
Epoch 289/1000
Epoch 290/1000
Epoch 291/1000
Epoch 292/1000
Epoch 293/1000
Epoch 294/1000
Epoch 295/1000
Epoch 296/1000
Epoch 297/1000
Epoch 298/1000
Epoch 299/1000
Epoch 300/1000
Epoch 301/1000
Epoch 302/1000
Epoch 303/1000
Epoch 304/1000
Epoch 305/1000
Epoch 306/1000
Epoch 307/

Epoch 360/1000
Epoch 361/1000
Epoch 362/1000
Epoch 363/1000
Epoch 364/1000
Epoch 365/1000
Epoch 366/1000
Epoch 367/1000
Epoch 368/1000
Epoch 369/1000
Epoch 370/1000
Epoch 371/1000
Epoch 372/1000
Epoch 373/1000
Epoch 374/1000
Epoch 375/1000
Epoch 376/1000
Epoch 377/1000
Epoch 378/1000
Epoch 379/1000
Epoch 380/1000
Epoch 381/1000
Epoch 382/1000
Epoch 383/1000
Epoch 384/1000
Epoch 385/1000
Epoch 386/1000
Epoch 387/1000
Epoch 388/1000
Epoch 389/1000
Epoch 390/1000
Epoch 391/1000
Epoch 392/1000
Epoch 393/1000
Epoch 394/1000
Epoch 395/1000
Epoch 396/1000
Epoch 397/1000
Epoch 398/1000
Epoch 399/1000
Epoch 400/1000
Epoch 401/1000
Epoch 402/1000
Epoch 403/1000
Epoch 404/1000
Epoch 405/1000
Epoch 406/1000
Epoch 407/1000
Epoch 408/1000
Epoch 409/1000
Epoch 410/1000
Epoch 411/1000
Epoch 412/1000
Epoch 413/1000
Epoch 414/1000
Epoch 415/1000
Epoch 416/1000
Epoch 417/1000
Epoch 418/1000
Epoch 419/1000


Epoch 420/1000
Epoch 421/1000
Epoch 422/1000
Epoch 423/1000
Epoch 424/1000
Epoch 425/1000
Epoch 426/1000
Epoch 427/1000
Epoch 428/1000
Epoch 429/1000
Epoch 430/1000
Epoch 431/1000
Epoch 432/1000
Epoch 433/1000
Epoch 434/1000
Epoch 435/1000
Epoch 436/1000
Epoch 437/1000
Epoch 438/1000
Epoch 439/1000
Epoch 440/1000
Epoch 441/1000
Epoch 442/1000
Epoch 443/1000
Epoch 444/1000
Epoch 445/1000
Epoch 446/1000
Epoch 447/1000
Epoch 448/1000
Epoch 449/1000
Epoch 450/1000
Epoch 451/1000
Epoch 452/1000
Epoch 453/1000
Epoch 454/1000
Epoch 455/1000
Epoch 456/1000
Epoch 457/1000
Epoch 458/1000
Epoch 459/1000
Epoch 460/1000
Epoch 461/1000
Epoch 462/1000
Epoch 463/1000
Epoch 464/1000
Epoch 465/1000
Epoch 466/1000
Epoch 467/1000
Epoch 468/1000
Epoch 469/1000
Epoch 470/1000
Epoch 471/1000
Epoch 472/1000
Epoch 473/1000
Epoch 474/1000
Epoch 475/1000
Epoch 476/1000
Epoch 477/1000
Epoch 478/1000
Epoch 479/1000
Epoch 480/1000
Epoch 481/1000
Epoch 482/1000
Epoch 483/1000
Epoch 484/1000
Epoch 485/1000
Epoch 486/

Epoch 539/1000
Epoch 540/1000
Epoch 541/1000
Epoch 542/1000
Epoch 543/1000
Epoch 544/1000
Epoch 545/1000
Epoch 546/1000
Epoch 547/1000
Epoch 548/1000
Epoch 549/1000
Epoch 550/1000
Epoch 551/1000
Epoch 552/1000
Epoch 553/1000
Epoch 554/1000
Epoch 555/1000
Epoch 556/1000
Epoch 557/1000
Epoch 558/1000
Epoch 559/1000
Epoch 560/1000
Epoch 561/1000
Epoch 562/1000
Epoch 563/1000
Epoch 564/1000
Epoch 565/1000
Epoch 566/1000
Epoch 567/1000
Epoch 568/1000
Epoch 569/1000
Epoch 570/1000
Epoch 571/1000
Epoch 572/1000
Epoch 573/1000
Epoch 574/1000
Epoch 575/1000
Epoch 576/1000
Epoch 577/1000
Epoch 578/1000
Epoch 579/1000
Epoch 580/1000
Epoch 581/1000
Epoch 582/1000
Epoch 583/1000
Epoch 584/1000
Epoch 585/1000
Epoch 586/1000
Epoch 587/1000
Epoch 588/1000
Epoch 589/1000
Epoch 590/1000
Epoch 591/1000
Epoch 592/1000
Epoch 593/1000
Epoch 594/1000
Epoch 595/1000
Epoch 596/1000
Epoch 597/1000
Epoch 598/1000


Epoch 599/1000
Epoch 600/1000
Epoch 601/1000
Epoch 602/1000
Epoch 603/1000
Epoch 604/1000
Epoch 605/1000
Epoch 606/1000
Epoch 607/1000
Epoch 608/1000
Epoch 609/1000
Epoch 610/1000
Epoch 611/1000
Epoch 612/1000
Epoch 613/1000
Epoch 614/1000
Epoch 615/1000
Epoch 616/1000
Epoch 617/1000
Epoch 618/1000
Epoch 619/1000
Epoch 620/1000
Epoch 621/1000
Epoch 622/1000
Epoch 623/1000
Epoch 624/1000
Epoch 625/1000
Epoch 626/1000
Epoch 627/1000
Epoch 628/1000
Epoch 629/1000
Epoch 630/1000
Epoch 631/1000
Epoch 632/1000
Epoch 633/1000
Epoch 634/1000
Epoch 635/1000
Epoch 636/1000
Epoch 637/1000
Epoch 638/1000
Epoch 639/1000
Epoch 640/1000
Epoch 641/1000
Epoch 642/1000
Epoch 643/1000
Epoch 644/1000
Epoch 645/1000
Epoch 646/1000
Epoch 647/1000
Epoch 648/1000
Epoch 649/1000
Epoch 650/1000
Epoch 651/1000
Epoch 652/1000
Epoch 653/1000
Epoch 654/1000
Epoch 655/1000
Epoch 656/1000
Epoch 657/1000
Epoch 658/1000


Epoch 659/1000
Epoch 660/1000
Epoch 661/1000
Epoch 662/1000
Epoch 663/1000
Epoch 664/1000
Epoch 665/1000
Epoch 666/1000
Epoch 667/1000
Epoch 668/1000
Epoch 669/1000
Epoch 670/1000
Epoch 671/1000
Epoch 672/1000
Epoch 673/1000
Epoch 674/1000
Epoch 675/1000
Epoch 676/1000
Epoch 677/1000
Epoch 678/1000
Epoch 679/1000
Epoch 680/1000
Epoch 681/1000
Epoch 682/1000
Epoch 683/1000
Epoch 684/1000
Epoch 685/1000
Epoch 686/1000
Epoch 687/1000
Epoch 688/1000
Epoch 689/1000
Epoch 690/1000
Epoch 691/1000
Epoch 692/1000
Epoch 693/1000
Epoch 694/1000
Epoch 695/1000
Epoch 696/1000
Epoch 697/1000
Epoch 698/1000
Epoch 699/1000
Epoch 700/1000
Epoch 701/1000
Epoch 702/1000
Epoch 703/1000
Epoch 704/1000
Epoch 705/1000
Epoch 706/1000
Epoch 707/1000
Epoch 708/1000
Epoch 709/1000
Epoch 710/1000
Epoch 711/1000
Epoch 712/1000
Epoch 713/1000
Epoch 714/1000
Epoch 715/1000
Epoch 716/1000
Epoch 717/1000
Epoch 718/1000


Epoch 719/1000
Epoch 720/1000
Epoch 721/1000
Epoch 722/1000
Epoch 723/1000
Epoch 724/1000
Epoch 725/1000
Epoch 726/1000
Epoch 727/1000
Epoch 728/1000
Epoch 729/1000
Epoch 730/1000
Epoch 731/1000
Epoch 732/1000
Epoch 733/1000
Epoch 734/1000
Epoch 735/1000
Epoch 736/1000
Epoch 737/1000
Epoch 738/1000
Epoch 739/1000
Epoch 740/1000
Epoch 741/1000
Epoch 742/1000
Epoch 743/1000
Epoch 744/1000
Epoch 745/1000
Epoch 746/1000
Epoch 747/1000
Epoch 748/1000
Epoch 749/1000
Epoch 750/1000
Epoch 751/1000
Epoch 752/1000
Epoch 753/1000
Epoch 754/1000
Epoch 755/1000
Epoch 756/1000
Epoch 757/1000
Epoch 758/1000
Epoch 759/1000
Epoch 760/1000
Epoch 761/1000
Epoch 762/1000
Epoch 763/1000
Epoch 764/1000
Epoch 765/1000
Epoch 766/1000
Epoch 767/1000
Epoch 768/1000
Epoch 769/1000
Epoch 770/1000
Epoch 771/1000
Epoch 772/1000
Epoch 773/1000
Epoch 774/1000
Epoch 775/1000
Epoch 776/1000
Epoch 777/1000
Epoch 778/1000


Epoch 779/1000
Epoch 780/1000
Epoch 781/1000
Epoch 782/1000
Epoch 783/1000
Epoch 784/1000
Epoch 785/1000
Epoch 786/1000
Epoch 787/1000
Epoch 788/1000
Epoch 789/1000
Epoch 790/1000
Epoch 791/1000
Epoch 792/1000
Epoch 793/1000
Epoch 794/1000
Epoch 795/1000
Epoch 796/1000
Epoch 797/1000
Epoch 798/1000
Epoch 799/1000
Epoch 800/1000
Epoch 801/1000
Epoch 802/1000
Epoch 803/1000
Epoch 804/1000
Epoch 805/1000
Epoch 806/1000
Epoch 807/1000
Epoch 808/1000
Epoch 809/1000
Epoch 810/1000
Epoch 811/1000
Epoch 812/1000
Epoch 813/1000
Epoch 814/1000
Epoch 815/1000
Epoch 816/1000
Epoch 817/1000
Epoch 818/1000
Epoch 819/1000
Epoch 820/1000
Epoch 821/1000
Epoch 822/1000
Epoch 823/1000
Epoch 824/1000
Epoch 825/1000
Epoch 826/1000
Epoch 827/1000
Epoch 828/1000
Epoch 829/1000
Epoch 830/1000
Epoch 831/1000
Epoch 832/1000
Epoch 833/1000
Epoch 834/1000
Epoch 835/1000
Epoch 836/1000
Epoch 837/1000
Epoch 838/1000
Epoch 839/1000
Epoch 840/1000
Epoch 841/1000
Epoch 842/1000
Epoch 843/1000
Epoch 844/1000
Epoch 845/

Epoch 898/1000
Epoch 899/1000
Epoch 900/1000
Epoch 901/1000
Epoch 902/1000
Epoch 903/1000
Epoch 904/1000
Epoch 905/1000
Epoch 906/1000
Epoch 907/1000
Epoch 908/1000
Epoch 909/1000
Epoch 910/1000
Epoch 911/1000
Epoch 912/1000
Epoch 913/1000
Epoch 914/1000
Epoch 915/1000
Epoch 916/1000
Epoch 917/1000
Epoch 918/1000
Epoch 919/1000
Epoch 920/1000
Epoch 921/1000
Epoch 922/1000
Epoch 923/1000
Epoch 924/1000
Epoch 925/1000
Epoch 926/1000
Epoch 927/1000
Epoch 928/1000
Epoch 929/1000
Epoch 930/1000
Epoch 931/1000
Epoch 932/1000
Epoch 933/1000
Epoch 934/1000
Epoch 935/1000
Epoch 936/1000
Epoch 937/1000
Epoch 938/1000
Epoch 939/1000
Epoch 940/1000
Epoch 941/1000
Epoch 942/1000
Epoch 943/1000
Epoch 944/1000
Epoch 945/1000
Epoch 946/1000
Epoch 947/1000
Epoch 948/1000
Epoch 949/1000
Epoch 950/1000
Epoch 951/1000
Epoch 952/1000
Epoch 953/1000
Epoch 954/1000
Epoch 955/1000
Epoch 956/1000
Epoch 957/1000


Epoch 958/1000
Epoch 959/1000
Epoch 960/1000
Epoch 961/1000
Epoch 962/1000
Epoch 963/1000
Epoch 964/1000
Epoch 965/1000
Epoch 966/1000
Epoch 967/1000
Epoch 968/1000
Epoch 969/1000
Epoch 970/1000
Epoch 971/1000
Epoch 972/1000
Epoch 973/1000
Epoch 974/1000
Epoch 975/1000
Epoch 976/1000
Epoch 977/1000
Epoch 978/1000
Epoch 979/1000
Epoch 980/1000
Epoch 981/1000
Epoch 982/1000
Epoch 983/1000
Epoch 984/1000
Epoch 985/1000
Epoch 986/1000
Epoch 987/1000
Epoch 988/1000
Epoch 989/1000
Epoch 990/1000
Epoch 991/1000
Epoch 992/1000
Epoch 993/1000
Epoch 994/1000
Epoch 995/1000
Epoch 996/1000
Epoch 997/1000
Epoch 998/1000
Epoch 999/1000
Epoch 1000/1000


<keras.callbacks.History at 0x7f23f6932240>

Making predictions

In [0]:
y_pred = model.predict(X_test)
y_final = (y_pred > 0.5).astype(int).reshape(X_test.shape[0],1)

prediction = pd.DataFrame(np.concatenate([id_test, y_final], axis=1),
                          columns=['PassengerId', 'Survived'])

In [0]:
prediction.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


# Thyroid Desiase dataset 

## Data import and preprocessing

Let's take these steps from previous work (HW4)

In [0]:
all_data=pd.read_csv('dataset_57_hypothyroid.csv')
display(all_data.shape)

(3772, 30)

In [0]:
# Replacing ? with nan
all_data = all_data.replace('?', np.nan)
# Deleting variables with constant values
all_data = all_data.drop(['TBG', 'TBG_measured', 'hypopituitary'], 1)

# Converting categorical variables to numeric
to_numeric = ['FTI', 'T3', 'T4U', 'TSH', 'TT4', 'age']
for cat in to_numeric:
    all_data[cat] = pd.to_numeric(all_data[cat])

# Converting categorical variables into binary
all_data = all_data.replace('t', 1)
all_data = all_data.replace('f', 0)
all_data['sex'] = all_data['sex'].replace('M', 1)
all_data['sex'] = all_data['sex'].replace('F', 0)
all_data = all_data.rename(index=str, columns={"sex": "Male"})
pipeline = make_pipeline(
    category_encoders.OneHotEncoder(handle_unknown="ignore",
                                    use_cat_names=True,
                                    cols=['referral_source']))
all_data = pipeline.fit_transform(all_data)

# Deleting useless variables
all_data = all_data[all_data.columns.drop(
    list(all_data.filter(regex='measured')))]
# Deleting NA
all_data = all_data.dropna()
# Converting Male into binary
all_data['Male'] = all_data['Male'].astype(np.int64)

# Selecting numeric variables
numeric_feats = all_data.dtypes[all_data.dtypes == "float64"].index
# Numeric variables processing
all_data = all_data[all_data['age'] < 400]
all_data = all_data.drop('T4U', 1)
numeric_feats = numeric_feats.drop('T4U', 1)
# Selecting binary variables
binary_feats = all_data.columns
for feat in numeric_feats:
    binary_feats = binary_feats.drop(feat, 1)

# Deleting variables with equal ratio for sick and healthy people
# Deleting variables with constant values
all_data = all_data.drop([
    'referral_source_other', 'referral_source_SVI', 'referral_source_SVHD',
    'query_on_thyroxine', 'sick', 'query_hyperthyroid', 'lithium', 'tumor',
    'goitre', 'pregnant'
], 1)
binary_feats = binary_feats.drop([
    'referral_source_other', 'referral_source_SVI', 'referral_source_SVHD',
    'query_on_thyroxine', 'sick', 'query_hyperthyroid', 'lithium', 'tumor',
    'goitre', 'pregnant'
], 1)

# Target selecting
target = all_data.pop('Class')
binary_feats = binary_feats.drop('Class', 1)

all_data = all_data.dropna()

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(all_data, target)

In [0]:
# Converting to DataFrame
X_resampled_pd = pd.DataFrame(data=X_resampled, columns=all_data.columns)
# Converting non-binary variables which sholud be binary
X_resampled_pd[binary_feats] = X_resampled_pd[binary_feats].astype(np.int64)

display(X_resampled_pd.shape, X_resampled_pd.head())

y_resampled_pd = pd.Series(data=y_resampled, name='Class')

display(y_resampled_pd.shape, y_resampled_pd.head())

(9704, 14)

Unnamed: 0,referral_source_SVHC,referral_source_STMW,age,Male,on_thyroxine,on_antithyroid_medication,thyroid_surgery,I131_treatment,query_hypothyroid,psych,TSH,T3,TT4,FTI
0,1,0,41.0,0,0,0,0,0,0,0,1.3,2.5,125.0,109.0
1,0,0,70.0,0,0,0,0,0,0,0,0.72,1.2,61.0,70.0
2,0,0,80.0,0,0,0,0,0,0,0,2.2,0.6,80.0,115.0
3,0,0,66.0,0,0,0,0,0,0,0,0.6,2.2,123.0,132.0
4,0,0,68.0,1,0,0,0,0,0,0,2.4,1.6,83.0,93.0


(9704,)

0    negative
1    negative
2    negative
3    negative
4    negative
Name: Class, dtype: object

Let's provide target in one-hot form

In [0]:
num_labels = len(set(y_resampled_pd))

In [0]:
Counter(y_resampled_pd)

Counter({'compensated_hypothyroid': 2426,
         'negative': 2426,
         'primary_hypothyroid': 2426,
         'secondary_hypothyroid': 2426})

In [0]:
y_resampled_pd.replace('negative', 0, inplace=True)
y_resampled_pd.replace('primary_hypothyroid', 1, inplace=True)
y_resampled_pd.replace('secondary_hypothyroid', 2, inplace=True)
y_resampled_pd.replace('compensated_hypothyroid', 3, inplace=True)

In [0]:
# One-hot for y
y_resampled_pd = (np.arange(num_labels) == np.array(y_resampled_pd)[:,None]).astype(np.float32)

In [0]:
X_train_dev, X_test, y_train_dev, y_test = train_test_split(X_resampled_pd,
                                                            y_resampled_pd,
                                                            test_size=0.15,
                                                            random_state=0)
print('X_train_dev: ', X_train_dev.shape, '\nX_test: ', X_test.shape,
      '\ny_train_dev: ', y_train_dev.shape, '\ny_test: ', y_test.shape)

X_train_dev:  (8248, 14) 
X_test:  (1456, 14) 
y_train_dev:  (8248, 4) 
y_test:  (1456, 4)


In [0]:
X_train, X_dev, y_train, y_dev = train_test_split(X_train_dev,
                                                  y_train_dev,
                                                  test_size=0.2,
                                                  random_state=0)
print('X_train: ', X_train.shape, '\nX_dev: ', X_dev.shape, '\ny_train: ',
      y_train.shape, '\ny_dev: ', y_dev.shape)

X_train:  (6598, 14) 
X_dev:  (1650, 14) 
y_train:  (6598, 4) 
y_dev:  (1650, 4)


In [0]:
X_train = X_train.values.astype(float)
X_dev = X_dev.values.astype(float)
X_test = X_test.values.astype(float)

# Thyroid Desiase classification with clear TensorFlow

In [0]:
tf.reset_default_graph()

N_EPOCHS = 10000
LEARNING_RATE = 0.001
input_size = X_train.shape[1]  #number of features
seed = 42

#Creating graph
graph = tf.Graph()
with graph.as_default():
    tf.set_random_seed(seed)
    np.random.seed(seed)

    #Creating placeholders for X and Y input
    X_input = tf.placeholder(dtype=tf.float32,
                             shape=[None, input_size],
                             name='X_input')
    y_input = tf.placeholder(dtype=tf.float32,
                             shape=[None, num_labels],
                             name='y_input')

    #Creating weight and bias, initialized from normal distribution
    w = tf.Variable(tf.random_normal(shape=[input_size, num_labels],
                                     seed=seed),
                    name='w')
    b = tf.Variable(tf.random_normal(shape=[num_labels], seed=seed), name='b')

    #Building model to predict Y
    logits = tf.add(tf.matmul(X_input, w), b)
    pred = tf.nn.softmax(logits)

    #Using cross enthropy loss function
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_input,
                                                logits=logits,
                                                name='loss'))

    #Using Adam optimizer with learning rate of 0.001 to minimize loss
    train_steps = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

    #Calculating accuracy
    acc = tf.reduce_mean(tf.cast(tf.equal(np.argmax(pred), np.argmax(y_input)),
                                 tf.float32),
                         name='acc')

    #Inializing all variables at once
    init_var = tf.global_variables_initializer()

#Feeding placeholders
train_feed_dict = {X_input: X_train, y_input: y_train}
dev_feed_dict = {X_input: X_dev, y_input: y_dev}
test_feed_dict = {
    X_input: X_test
}  # no y_input since the goal is to predict it

sess = tf.Session(graph=graph)
sess.run(init_var)

#Initial run
cur_loss = sess.run(loss, feed_dict=train_feed_dict)
train_acc = sess.run(acc, feed_dict=train_feed_dict)
dev_acc = sess.run(acc, feed_dict=dev_feed_dict)
print('step   0: loss {0:.5f}, train_acc {1:.2f}%, dev_acc {2:.2f}%'.format(
    cur_loss, 100 * train_acc, 100 * dev_acc))

for step in range(1, N_EPOCHS + 1):
    sess.run(train_steps, feed_dict=train_feed_dict)

    cur_loss = sess.run(loss, feed_dict=train_feed_dict)
    train_acc = sess.run(acc, feed_dict=train_feed_dict)
    dev_acc = sess.run(acc, feed_dict=dev_feed_dict)
    if step % 100 != 0:  # print result every 100 steps
        continue
    print(
        'step {3}: loss {0:.5f}, train_acc {1:.2f}%, dev_acc {2:.2f}%'.format(
            cur_loss, 100 * train_acc, 100 * dev_acc, step))

step   0: loss 205.99641, train_acc 100.00%, dev_acc 100.00%
step 100: loss 175.45625, train_acc 100.00%, dev_acc 100.00%
step 200: loss 151.16364, train_acc 100.00%, dev_acc 100.00%
step 300: loss 128.14554, train_acc 100.00%, dev_acc 100.00%
step 400: loss 105.42755, train_acc 100.00%, dev_acc 100.00%
step 500: loss 86.00930, train_acc 100.00%, dev_acc 100.00%
step 600: loss 70.67134, train_acc 100.00%, dev_acc 100.00%
step 700: loss 61.84097, train_acc 100.00%, dev_acc 100.00%
step 800: loss 55.88947, train_acc 100.00%, dev_acc 100.00%
step 900: loss 50.19123, train_acc 100.00%, dev_acc 100.00%
step 1000: loss 44.52027, train_acc 100.00%, dev_acc 100.00%
step 1100: loss 38.82970, train_acc 100.00%, dev_acc 100.00%
step 1200: loss 33.21935, train_acc 100.00%, dev_acc 100.00%
step 1300: loss 27.81915, train_acc 100.00%, dev_acc 100.00%
step 1400: loss 22.77228, train_acc 100.00%, dev_acc 100.00%
step 1500: loss 17.78319, train_acc 100.00%, dev_acc 100.00%
step 1600: loss 13.40028, tra

Making prediction on test set

In [0]:
y_pred = sess.run(pred, feed_dict=test_feed_dict)

In [0]:
y_pred[:5]

array([[9.4432153e-23, 9.9975628e-01, 5.7006046e-19, 2.4366759e-04],
       [1.4105912e-02, 2.8582759e-02, 9.3752563e-01, 1.9785790e-02],
       [1.0000000e+00, 1.4310562e-13, 3.3855738e-17, 1.9260706e-10],
       [2.0475909e-19, 9.9998355e-01, 2.3275499e-09, 1.6477563e-05],
       [2.6482583e-31, 9.9996090e-01, 4.9270070e-18, 3.9072256e-05]],
      dtype=float32)

In [0]:
y_test[:5]

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)

In [0]:
correct = 0
for i in range(len(y_pred)):
    if(np.argmax(y_pred[i])==np.argmax(y_test[i])):
        correct+=1
print('test accuracy is ',correct/len(y_test))

test accuracy is  0.9629120879120879


# Thyroid Desiase classification with Keras

In [0]:
input_size = X_train.shape[1]

# Initialising the NN
model = Sequential()

# layer
model.add(
    Dense(num_labels,
          kernel_initializer='uniform',
          activation='softmax',
          input_dim=input_size))

# summary
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 4)                 60        
Total params: 60
Trainable params: 60
Non-trainable params: 0
_________________________________________________________________


In [0]:
N_EPOCHS = 100
LEARNING_RATE = 0.001

#initializing optimizer
adam_opt = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

# Compiling the NN
model.compile(optimizer = adam_opt, loss = 'categorical_crossentropy', metrics = ['accuracy'])

# Train the NN
model.fit(X_train, y_train, validation_data=(X_dev, y_dev), batch_size = 32, epochs = N_EPOCHS)

Train on 6598 samples, validate on 1650 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

<keras.callbacks.History at 0x7f01af1970f0>

Making prediction on test set

In [0]:
y_pred = model.predict(X_test)

In [0]:
y_pred[:5]

array([[7.9889335e-17, 5.2069753e-01, 5.0540625e-06, 4.7929737e-01],
       [7.0900400e-04, 1.1808803e-02, 2.7189169e-02, 9.6029299e-01],
       [1.0000000e+00, 5.5427488e-14, 1.2631550e-14, 3.0414964e-09],
       [2.6823562e-20, 8.2109475e-01, 3.7917489e-10, 1.7890523e-01],
       [0.0000000e+00, 9.9883240e-01, 1.4575347e-37, 1.1675935e-03]],
      dtype=float32)

In [0]:
y_test[:5]

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)

In [0]:
correct = 0
for i in range(len(y_pred)):
    if(np.argmax(y_pred[i])==np.argmax(y_test[i])):
        correct+=1
print('test accuracy is ',correct/len(y_test))

test accuracy is  0.9608516483516484
