# Table of Contents

1. [Setup](#Setup)
2. [Deep Learning Pipeline](#Machine-Learning-Pipeline)

# Setup
[Back to Top](#Table-of-Contents)

In [1]:
%matplotlib inline

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm,trange

import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
# ROOT_DIR - root directory
ROOT_DIR = os.getcwd()+'/'

# FEATURE_DIR - directory where feature dataframes are saved
DATA_DIR = ROOT_DIR + 'dataframes/'

In [3]:
X_train = np.loadtxt(DATA_DIR+'X_train.csv',delimiter=',')
X_test = np.loadtxt(DATA_DIR+'X_test.csv',delimiter=',')
X_val = np.loadtxt(DATA_DIR+'X_val.csv',delimiter=',')

y_train = np.loadtxt(DATA_DIR+'y_train.csv',delimiter=',')
y_test = np.loadtxt(DATA_DIR+'y_test.csv',delimiter=',')
y_val = np.loadtxt(DATA_DIR+'y_val.csv',delimiter=',')

In [4]:
from sklearn.preprocessing import StandardScaler
scl = StandardScaler().fit(X_train)
X_train_scl = scl.transform(X_train)
X_test_scl = scl.transform(X_test)
X_val_scl = scl.transform(X_val)

In [5]:
train_set = tf.contrib.learn.python.learn.datasets.base.Dataset(data=X_train_scl,target=y_train)
test_set = tf.contrib.learn.python.learn.datasets.base.Dataset(data=X_test_scl,target=y_test)
val_set = tf.contrib.learn.python.learn.datasets.base.Dataset(data=X_val_scl,target=y_val)

# Deep Learning Pipeline

## Parameter search

In [6]:
# Specify important validation metrics
validation_metrics = {'accuracy': tf.contrib.metrics.streaming_accuracy,
                      'precision': tf.contrib.metrics.streaming_precision,
                      'recall': tf.contrib.metrics.streaming_recall}

In [7]:
# Specify that all features have real-value data
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=148)]

In [8]:
monitor_params = {'x':test_set.data,
                 'y':test_set.target,
                 'every_n_steps':50,
                 'metrics':validation_metrics,
                 'early_stopping_metric':'loss',
                 'early_stopping_metric_minimize':True,
                 'early_stopping_rounds':200}

clf_params = {'feature_columns':feature_columns,
              'hidden_units':[500,500,500],
              'n_classes':2,
              'activation_fn':tf.nn.relu,
              'optimizer':tf.train.RMSPropOptimizer(learning_rate=0.0001),
              'dropout':0.5,
              'config':tf.contrib.learn.RunConfig(save_checkpoints_secs=1)}

fit_params = {'x':train_set.data,
              'y':train_set.target,
              'steps':10000}

In [9]:
def paramSweep(param,vals,start=0,monitor_params=dict(monitor_params),clf_params=dict(clf_params),
               fit_params=dict(fit_params)):
    
    # Initialize Variables
    n_clfs = len(vals)
    clfs = [None]*n_clfs
    monitors = [None]*n_clfs
    
    print('Sweeping through %d values for "%s"'%(n_clfs,param))
    
    for i in range(n_clfs):
       
        # Initialize model directory
        model_dir = os.path.join('models',param,str(i+start))
        
        print('"%s" = %s'%(param,str(vals[i])))
        print('Model saved in "%s"'%model_dir)
        
        # Initialize monitor
        monitors[i] = tf.contrib.learn.monitors.ValidationMonitor(**monitor_params)
        
        # Update parameters
        clf_params[param] = vals[i]
        clf_params['model_dir'] = model_dir
        fit_params['monitors'] = [monitors[i]]

        # Initialize classifier
        clfs[i] = tf.contrib.learn.DNNClassifier(**clf_params)
        
        # Train classifier
        clfs[i].fit(**fit_params)
        
        # Test classifier
        scores = clfs[i].evaluate(x=test_set.data, y=test_set.target)
        print('Accuracy: {0:f}'.format(scores['accuracy']))
        print('AUC: {0:f}'.format(scores['auc']))

    return clfs

In [10]:
opt_clfs = paramSweep('optimizer',[tf.train.AdagradOptimizer(learning_rate=0.05),
                                   tf.train.RMSPropOptimizer(learning_rate=0.0001),
                                   tf.train.RMSPropOptimizer(learning_rate=0.0005),
                                   tf.train.RMSPropOptimizer(learning_rate=0.0001,decay=0.5),
                                   tf.train.AdamOptimizer(learning_rate=0.0001)])

Sweeping through 5 values for "optimizer"
"optimizer" = <tensorflow.python.training.adagrad.AdagradOptimizer object at 0x7f2319d9b810>
Model saved in "models/optimizer/0"
Accuracy: 0.681020
AUC: 0.749779
"optimizer" = <tensorflow.python.training.rmsprop.RMSPropOptimizer object at 0x7f2319d9b850>
Model saved in "models/optimizer/1"
Accuracy: 0.682200
AUC: 0.751229
"optimizer" = <tensorflow.python.training.rmsprop.RMSPropOptimizer object at 0x7f2319d9b990>
Model saved in "models/optimizer/2"
Accuracy: 0.674384
AUC: 0.752192
"optimizer" = <tensorflow.python.training.rmsprop.RMSPropOptimizer object at 0x7f2319d9b910>
Model saved in "models/optimizer/3"
Accuracy: 0.680873
AUC: 0.751913
"optimizer" = <tensorflow.python.training.adam.AdamOptimizer object at 0x7f23174bc390>
Model saved in "models/optimizer/4"
Accuracy: 0.684117
AUC: 0.752831


In [10]:
hidden_unit_clfs = paramSweep('hidden_units',[[100],[500],[100,200,100],[500,500,500]])

Sweeping through 4 values for "hidden_units"
"hidden_units" = [100]
Model saved in "models/hidden_units/0"
Accuracy: 0.676154
AUC: 0.746102
"hidden_units" = [500]
Model saved in "models/hidden_units/1"
Accuracy: 0.683233
AUC: 0.748555
"hidden_units" = [100, 200, 100]
Model saved in "models/hidden_units/2"
Accuracy: 0.679988
AUC: 0.751736
"hidden_units" = [500, 500, 500]
Model saved in "models/hidden_units/3"
Accuracy: 0.688247
AUC: 0.755893


In [10]:
dropout_clfs = paramSweep('dropout',[0.25,0.5,0.75])

Sweeping through 3 values for "dropout"
"dropout" = 0.25
Model saved in "models/dropout/0"
Accuracy: 0.682053
AUC: 0.753058
"dropout" = 0.5
Model saved in "models/dropout/1"
Accuracy: 0.689279
AUC: 0.755312
"dropout" = 0.75
Model saved in "models/dropout/2"
Accuracy: 0.661702
AUC: 0.753494


# Final Classifier

Best parameters:
* Optimizer - RMSProp/Adam
* Hidden Units - [500,500,500]
* Dropout: 0.5

## Construct Deep NN Classifier