In [1]:
import tensorflow as tf

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,Gender,Age,Chest Pain,Genereal level of tiredness,"Pulse, resting",Blood Type,Heart disease mom/dad,Smoking,Cholesterol,Alcohol,BMI,Fitness,Use of contact lenses,Diabetes,H�matokritv�rdi,EKG,Go to doctor
0,M,4,Periodic,4,59,0 pos,N,0,1,6,22,121,N,Healthy,38,Anormalities,NO
1,M,6,Often,8,80,0 pos,N,0,6,10,35,118,Y,Type 1,39,Anormalities,OBS
2,F,15,Periodic,0,67,B pos,Y,4,1,0,22,154,N,Healthy,41,Normal,NO
3,M,15,Often,10,69,B pos,Y,6,1,0,22,150,N,Healthy,40,Anormalities,NO
4,F,17,,1,72,B pos,N,7,1,6,22,124,N,Healthy,42,Anormalities,NO


In [4]:

category_fields = ['Gender', 'Chest Pain', 'Blood Type', 'Heart disease mom/dad', 'Use of contact lenses', 
                   'Diabetes', 'EKG', 'Go to doctor']

for each in category_fields:
    dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
    df = pd.concat([df, dummies], axis=1)
    
df = df.drop(category_fields, axis=1)

pd_y = df[['Go to doctor_YES', 'Go to doctor_OBS', 'Go to doctor_NO']]
pd_x = df.drop(['Go to doctor_YES', 'Go to doctor_OBS', 'Go to doctor_NO'], axis=1)
print(pd_y.head())

   Go to doctor_YES  Go to doctor_OBS  Go to doctor_NO
0                 0                 0                1
1                 0                 1                0
2                 0                 0                1
3                 0                 0                1
4                 0                 0                1


In [5]:
X_train, X_test, y_train, y_test = train_test_split(pd_x, pd_y, 
                                                    test_size=0.2, 
                                                    random_state=0)
X_test, X_eval, y_test, y_eval = train_test_split(X_test, y_test, 
                                                    test_size=0.5, 
                                                    random_state=0)

def chunk(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

train_xb, train_yb = list(chunk(X_train, 8)), list(chunk(y_train, 8))

In [6]:
def hidden_layer(inputs, nodes, keep_rate, norm, training):
    outputs = tf.layers.dense(inputs, nodes, kernel_initializer=tf.contrib.layers.xavier_initializer())
    outputs = tf.nn.relu(outputs)
    if norm == 1:
        outputs = tf.layers.batch_normalization(outputs, training=training)
    outputs = tf.layers.dropout(outputs, 1-keep_rate, training=training)
    return outputs

In [7]:
def get_logits(net):
    norm_input = tf.layers.batch_normalization(net.inputs_, training=net.training_)
    hl = hidden_layer(norm_input, net.nodes, net.krate, 1, net.training_)
    hl = hidden_layer(hl, net.nodes, net.krate, 1, net.training_)
    logits = hidden_layer(hl, 3, net.krate, 1, net.training_)
    return logits

In [8]:
def get_cost(logits, targets):
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets),name='cross_entropy')
    return cross_entropy

In [9]:
def get_softmax(logits):
    return tf.nn.softmax(logits)

In [10]:
def write_cross_entropy(cost):
    with tf.name_scope('cross_entropy'):
        cross_entropy = tf.summary.scalar('cross_entropy', cost)
        return cross_entropy

In [11]:
def set_hypertune_param(cost):
    with tf.name_scope('hypertune'):
        hypertune = tf.summary.scalar('training/hptuning/metric', cost)
        return hypertune

In [12]:
def get_optimizer(cost, lr):
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(cost)
        return optimizer

In [13]:
def calculate_metrics(pred, targets):
    predictions_pd = pd.DataFrame(pred[:,:], columns=['YES', 'OBS', 'NO'])
    targets_pd = pd.DataFrame(targets[:,:], columns=['YES', 'OBS', 'NO'])
    predictions_pd['pred_bins'] = predictions_pd.idxmax(axis=1)
    targets_pd['targ_bins'] = targets_pd.idxmax(axis=1)
    test_pd = pd.concat([predictions_pd, targets_pd], axis=1)
    test_pd['tp'] = 0
    test_pd['fp'] = 0
    test_pd['fn'] = 0

    for each in ['YES', 'OBS', 'NO']:
        test_pd['tp_' + each] = test_pd.apply(
            lambda x: 1 if x['pred_bins'] == each and x['targ_bins'] == each else 0, axis=1).astype(float)
        test_pd['tp'] += test_pd['tp_' + each]
    
        test_pd['fp_' + each] = test_pd.apply(
            lambda x: 1 if x['pred_bins'] == each and x['targ_bins'] != each else 0, axis=1).astype(float)
        test_pd['fp'] += test_pd['fp_' + each]

        test_pd['fn_' + each] = test_pd.apply(
            lambda x: 1 if x['pred_bins'] != each and x['targ_bins'] == each else 0, axis=1).astype(float)
        test_pd['fn'] += test_pd['fn_' + each]
    
    print(test_pd.head())
    return test_pd

In [22]:
class FFDNN:
    def __init__(self):
        print('MODEL INIT')
        tf.reset_default_graph()
        self.epochs = 100
        self.layers = 2
        self.nodes = 200
        self.krate = 0.5
        self.batchtrain = False
        self.input_shape = [None, train_xb[0].as_matrix().shape[1]]
        self.output_shape = [None, train_yb[0].as_matrix().shape[1]]
        self.inputs_ = tf.placeholder(tf.float32, shape=self.input_shape, name='inputs')
        self.targets_ = tf.placeholder(tf.float32, shape=self.output_shape, name='targets')
        self.train_learning_rate = 0.001
        self.lr_ = tf.placeholder(tf.float32, shape=[], name='lr')
        self.training_ = tf.placeholder(tf.bool, shape=[], name='training')
        self.logits = get_logits(self)
        self.softmax = get_softmax(self.logits)
        self.cost = get_cost(self.logits, self.targets_)
        self.cross_entropy = write_cross_entropy(self.cost)
        self.hypertune = set_hypertune_param(self.cost)
        self.optimizer = get_optimizer(self.cost, self.lr_)
        self.saver = tf.train.Saver()

In [23]:
def train(net):
    print('TRAIN')
    with tf.Session() as sess:
        ##sess.run(tf.global_variables_initializer())
        ckpt = './activity6.ckpt'
        net.saver.restore(sess, ckpt)
        train_writer = tf.summary.FileWriter('logs/train', sess.graph)
        eval_writer = tf.summary.FileWriter('logs/eval', sess.graph)
        
        iteration = 0
        for e in range(net.epochs):
            
            if net.batchtrain == True:
                for b_idx, b in enumerate(train_xb,0):
                    iteration += 1
                    x, y = train_xb[b_idx].as_matrix(), train_yb[b_idx].as_matrix()
                    train_merged = tf.summary.merge([net.cross_entropy])

                    # Train
                    feed = {net.inputs_: x, net.targets_: y, net.lr_: net.train_learning_rate, net.training_: True}
                    summary_train, _ = sess.run([train_merged, net.optimizer], feed_dict=feed)
                    train_writer.add_summary(summary_train, iteration)
            else:
                iteration += 1
                x, y = X_train.as_matrix(), y_train.as_matrix()
                train_merged = tf.summary.merge([net.cross_entropy])

                # Train
                feed = {net.inputs_: x, net.targets_: y, net.lr_: net.train_learning_rate, net.training_: True}
                summary_train, _ = sess.run([train_merged, net.optimizer], feed_dict=feed)
                train_writer.add_summary(summary_train, iteration)
                
            x, y = X_eval.as_matrix(), y_eval.as_matrix()

            feed = {net.inputs_: x, net.targets_: y, net.lr_: net.train_learning_rate, net.training_: False}
            summary_train_eval, cost = sess.run([train_merged, net.cost], feed_dict=feed)
            
            eval_writer.add_summary(summary_train_eval, iteration)
            
            print('Epoch ' + str(e) + ': Eval Loss = ' + str(cost))

        train_writer.close()
        eval_writer.close()
        net.saver.save(sess, './activity6.ckpt')

In [24]:
def test(net):
    print('TEST')
    with tf.Session() as sess:
        ckpt = './activity6.ckpt'
        net.saver.restore(sess, ckpt)
            
        x, y = X_test.as_matrix(), y_test.as_matrix()

        feed = {net.inputs_: x, net.targets_: y, net.lr_: net.train_learning_rate, net.training_: False}
        pred = sess.run(net.softmax, feed_dict=feed)
    
    details_test = calculate_metrics(pred, y)
    
    with tf.Session() as sess:
        net.saver.restore(sess, ckpt)
        
        x_all = pd.concat([X_train, X_test, X_eval], axis=0)
        y_all = pd.concat([y_train, y_test, y_eval], axis=0)
        x, y = x_all.as_matrix(), y_all.as_matrix()

        feed = {net.inputs_: x, net.targets_: y, net.lr_: net.train_learning_rate, net.training_: False}
        pred = sess.run(net.softmax, feed_dict=feed)
    
    details_all = calculate_metrics(pred, y)
    return details_test, details_all


In [25]:
net = FFDNN()
train(net)

MODEL INIT


AttributeError: FFDNN instance has no attribute 'batchtrain'

In [18]:
details, details_all = test(net)
print(details.shape)

TEST
INFO:tensorflow:Restoring parameters from ./activity6.ckpt
        YES       OBS        NO pred_bins  YES  OBS  NO targ_bins   tp   fp  \
0  0.557001  0.309277  0.133722       YES    1    0   0       YES  1.0  0.0   
1  0.338974  0.479035  0.181991       OBS    1    0   0       YES  0.0  1.0   
2  0.273580  0.431403  0.295017       OBS    1    0   0       YES  0.0  1.0   
3  0.765355  0.079647  0.154998       YES    0    0   1        NO  0.0  1.0   
4  0.641995  0.117322  0.240683       YES    1    0   0       YES  1.0  0.0   

    fn  tp_YES  fp_YES  fn_YES  tp_OBS  fp_OBS  fn_OBS  tp_NO  fp_NO  fn_NO  
0  0.0     1.0     0.0     0.0     0.0     0.0     0.0    0.0    0.0    0.0  
1  1.0     0.0     0.0     1.0     0.0     1.0     0.0    0.0    0.0    0.0  
2  1.0     0.0     0.0     1.0     0.0     1.0     0.0    0.0    0.0    0.0  
3  1.0     0.0     1.0     0.0     0.0     0.0     0.0    0.0    0.0    1.0  
4  0.0     1.0     0.0     0.0     0.0     0.0     0.0    0.0    0.0   

In [19]:
summaries_test = details.sum(axis=0)
summaries_all = details_all.sum(axis=0)
#summaries_test = calc_overall(summaries_test)
print("TEST SUMMARY")
print(summaries_test)
print("ALL SUMMARY")
print(summaries_all)

TEST SUMMARY
YES                                                     27.122
OBS                                                    22.0452
NO                                                     50.8329
pred_bins    YESOBSOBSYESYESNOYESNONONONONOOBSNONONONONONOO...
YES                                                         28
OBS                                                         16
NO                                                          56
targ_bins    YESYESYESNOYESNOYESNONONOYESNONONONOYESYESNONO...
tp                                                          73
fp                                                          27
fn                                                          27
tp_YES                                                      15
fp_YES                                                      10
fn_YES                                                      13
tp_OBS                                                       7
fp_OBS                                    

In [21]:
def calc_recall_precision(summary, affix):
    recall = summary['tp' + affix] / (summary['tp' + affix] + summary['fn' + affix])
    precision = summary['tp' + affix] / (summary['tp' + affix] + summary['fp' + affix])
    print(affix + ' recall: ' + str(recall) + ' ' + affix + ' precision: ' + str(precision))

In [22]:
print('TEST SET METRICS')
calc_recall_precision(summaries_test, '')
calc_recall_precision(summaries_test, '_YES')
calc_recall_precision(summaries_test, '_NO')
calc_recall_precision(summaries_test, '_OBS')

TEST SET METRICS
 recall: 0.73  precision: 0.73
_YES recall: 0.535714285714 _YES precision: 0.6
_NO recall: 0.910714285714 _NO precision: 0.879310344828
_OBS recall: 0.4375 _OBS precision: 0.411764705882


In [23]:
print('ALL DATA METRICS')
calc_recall_precision(summaries_all, '')
calc_recall_precision(summaries_all, '_YES')
calc_recall_precision(summaries_all, '_NO')
calc_recall_precision(summaries_all, '_OBS')

ALL DATA METRICS
 recall: 0.782  precision: 0.782
_YES recall: 0.548387096774 _YES precision: 0.683417085427
_NO recall: 0.917089678511 _NO precision: 0.888524590164
_OBS recall: 0.645962732919 _OBS precision: 0.544502617801
