## Imports & Settings

In [21]:
import warnings
warnings.filterwarnings('ignore')

In [22]:
import os
from pathlib import Path
from importlib import reload
from joblib import dump, load

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from keras.models import Sequential
from keras import backend as K
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import Callback, EarlyStopping, TensorBoard

In [23]:
def set_keras_backend(backend):
    """Make `backend` the active Keras backend.

    Does nothing when the requested backend is already active. Otherwise the
    KERAS_BACKEND environment variable is set, the `keras.backend` module
    (imported as `K`) is reloaded, and the switch is verified with an assert.

    backend: Name of the backend to activate, compared against `K.backend()`.
    """
    already_active = K.backend() == backend
    if already_active:
        return

    os.environ['KERAS_BACKEND'] = backend
    reload(K)
    assert K.backend() == backend

set_keras_backend("tensorflow")

In [24]:
np.random.seed(42)

## Build Dataset

In [25]:
# Load adjusted close prices from the local HDF5 store, pivot the innermost
# index level into columns with `unstack()` and keep rows from 2007 onwards.
# NOTE(review): the saved output of this cell reports an index starting at
# 2000-01-03, which does not match the '2007' slice - it looks stale; re-run
# to confirm.
prices = (pd.read_hdf('../data/assets.h5', 'quandl/wiki/prices')
          .adj_close
          .unstack().loc['2007':])
prices.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4706 entries, 2000-01-03 to 2018-03-27
Columns: 3199 entries, A to ZUMZ
dtypes: float64(3199)
memory usage: 114.9 MB


In [31]:
# Monthly returns: take the last price of each month, convert to
# period-over-period percentage changes, keep 2008-2017, drop every column
# that still contains a missing value, and order the rows newest-first.
# The newest-first ordering matters downstream: the windowing loop slices
# consecutive rows and treats the first row of each slice as the target month.
returns = (prices
           .resample('M')
           .last()
           .pct_change()
           .loc['2008': '2017']
           .dropna(axis=1)
           .sort_index(ascending=False))
returns.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 120 entries, 2017-12-31 to 2008-01-31
Freq: -1M
Columns: 2489 entries, A to ZUMZ
dtypes: float64(2489)
memory usage: 2.3 MB


In [33]:
# Preview the first and last five rows. `pd.concat` replaces
# `DataFrame.append`, which was deprecated in pandas 1.4 and removed in 2.0.
pd.concat([returns.head(), returns.tail()])

ticker,A,AAL,AAN,AAON,AAP,AAPL,AAWW,ABAX,ABC,ABCB,...,ZEUS,ZIGO,ZINC,ZION,ZIOP,ZIXI,ZLC,ZMH,ZQK,ZUMZ
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-31,-0.032785,0.030501,0.056469,0.006859,-0.01297,-0.015246,0.015584,0.016003,0.082528,-0.028226,...,0.078815,0.0,0.0,0.025832,-0.094092,-0.004545,0.0,0.0,0.0,-0.044725
2017-11-30,0.017786,0.078385,0.025,0.041429,0.235625,0.016623,-0.05868,0.007025,0.107587,0.035491,...,0.055085,0.0,0.0,0.066509,-0.019313,-0.092784,0.0,0.0,0.0,0.235127
2017-10-31,0.061814,-0.014108,-0.156544,0.015228,-0.176008,0.096808,-0.067629,0.083987,-0.070091,-0.001043,...,-0.141818,0.0,0.0,-0.015261,-0.241042,-0.00818,0.0,0.0,0.0,-0.024862
2017-09-30,-0.008035,0.061466,-0.013832,0.057515,0.013928,-0.060244,-0.01497,-0.033968,0.031153,0.090808,...,0.205479,0.0,0.0,0.080623,-0.039124,-0.079096,0.0,0.0,0.0,0.453815
2017-08-31,0.082455,-0.111179,-0.043431,-0.035503,-0.125971,0.106251,0.124579,-0.013579,-0.140733,-0.03821,...,0.069057,0.0,0.0,-0.034067,0.155515,-0.003752,0.0,0.0,0.0,-0.019685
2008-05-31,0.23767,-0.538999,-0.122768,0.162611,0.162053,0.085082,0.020105,0.153454,0.021099,-0.073431,...,0.269937,0.026587,0.00214,-0.06206,-0.163399,-0.321053,0.051158,-0.018339,-0.122302,0.000477
2008-04-30,0.012739,-0.035915,0.178947,-0.097354,0.018502,0.212195,0.103273,0.099698,-0.010493,-0.067248,...,0.135255,-0.062701,0.210708,0.017563,0.040816,-0.018088,0.048583,-0.047521,-0.008155,0.335245
2008-03-31,-0.025482,-0.281452,0.041991,0.213204,0.017068,0.147816,0.086957,-0.204873,-0.017737,0.13929,...,0.09201,-0.023548,-0.26242,-0.046073,-0.048544,-0.012755,0.022774,0.034135,0.09,-0.107509
2008-02-29,-0.095983,-0.104046,0.067251,-0.072472,-0.062605,-0.076389,0.013216,-0.104762,-0.102822,-0.098859,...,0.223413,0.086104,0.047365,-0.120684,-0.063636,0.101124,0.180929,-0.036473,-0.055614,-0.085803
2008-01-31,-0.078389,-0.059143,-0.00927,-0.101917,-0.058173,-0.31664,-0.078938,-0.092303,0.03811,-0.063501,...,0.065594,-0.058587,-0.116676,0.172414,-0.067797,-0.226087,0.01868,0.181255,0.110723,-0.210591


In [51]:
n = len(returns)            # number of monthly rows in `returns`
T = 24                      # look-back window: lagged monthly returns per sample
# Return columns 0..T of each window (0 = target month, 1..T = lags); derived
# from T so the two settings cannot drift apart.
tcols = list(range(T + 1))

In [None]:
# Build one sample per (ticker, window). `returns` is ordered newest-first, so
# after transposing each T+1 row slice, column 0 holds the most recent monthly
# return (the prediction target) and columns 1..T hold the preceding months.
# The year and month of the target month are attached for one-hot encoding.
# NOTE(review): `range(n-T-1)` stops one short of the last complete window
# (start index n-T-1 would still yield T+1 rows) - kept as-is so the dataset
# matches the recorded shape; confirm whether skipping it is intended.
windows = []
for i in range(n-T-1):
    window = returns.iloc[i:i+T+1]
    windows.append(window.reset_index(drop=True).T
                         .assign(year=window.index[0].year,
                                 month=window.index[0].month))

# Concatenate once: growing `data` inside the loop re-copies every previously
# accumulated row on each iteration.
data = pd.concat(windows, ignore_index=True)

# Clip each return column at its own 1st / 99th percentile.
data[tcols] = (data[tcols].apply(lambda x: x.clip(lower=x.quantile(.01),
                                                  upper=x.quantile(.99))))

# Binary label: was the target-month return positive? The raw target column is
# then dropped so it cannot leak into the features.
data['label'] = (data[0] > 0).astype(int)
data = pd.get_dummies(data.drop(0, axis=1).apply(
    pd.to_numeric), columns=['year', 'month'])
data.info()

In [53]:
data.shape

(236455, 45)

In [49]:
data.to_hdf('data.h5', 'returns')

## Define Network Architecture

### Custom AUC Metric

In [55]:
def auc_roc(y_true, y_pred):
    """Streaming ROC AUC usable as a Keras metric.

    Wraps `tf.metrics.auc`, registers the local variables it created as
    global variables so they are initialized for a new session, and ties the
    returned tensor to the metric's update op so every evaluation also
    updates the running value.

    y_true: Tensor of ground-truth labels.
    y_pred: Tensor of predicted scores.

    Returns a tensor holding the current AUC value.
    """
    # any tensorflow metric
    value, update_op = tf.metrics.auc(y_true, y_pred)

    def _belongs_to_metric(variable):
        # The second name-scope component identifies this metric. Variables
        # whose name has no such component are skipped instead of raising
        # IndexError.
        scopes = variable.name.split('/')
        return len(scopes) > 1 and 'auc_roc' in scopes[1]

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in filter(_belongs_to_metric, tf.local_variables()):
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # force to update metric values
    with tf.control_dependencies([update_op]):
        return tf.identity(value)

### Set up `build_fn` for `keras.wrappers.scikit_learn.KerasClassifier`

In [5]:
def make_model(dense_layers, activation, dropout, input_dim=None):
    '''Creates a multi-layer perceptron model

    dense_layers: List of layer sizes; one number per layer
    activation: Activation applied after every hidden Dense layer
    dropout: Rate of the single Dropout layer placed after the hidden layers
    input_dim: Number of input features declared on the first Dense layer.
        When omitted, a notebook-level `input_dim` variable is used if one
        exists (this function previously always read that global).

    Returns the compiled Sequential model (binary cross-entropy loss, Adam
    optimizer, tracking binary accuracy and `auc_roc`).

    Raises ValueError when hidden layers are requested but no input
    dimension can be determined.
    '''
    if input_dim is None and dense_layers:
        # Backward compatibility with the former implicit global lookup.
        input_dim = globals().get('input_dim')
        if input_dim is None:
            raise ValueError('make_model needs `input_dim`; pass it explicitly, '
                             'e.g. KerasClassifier(make_model, '
                             'input_dim=X_train.shape[1], ...)')

    model = Sequential()
    for i, layer_size in enumerate(dense_layers):
        # Only the first layer declares the input size.
        if i == 0:
            model.add(Dense(layer_size, input_dim=input_dim))
        else:
            model.add(Dense(layer_size))
        model.add(Activation(activation))
    model.add(Dropout(dropout))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='Adam',
                  metrics=['binary_accuracy', auc_roc])

    return model

## Run Keras with `GridSearchCV`

### Train-Test Split

In [65]:
data = pd.read_hdf('data.h5', 'returns')
features, label = data.drop('label' , axis=1), data.label

In [50]:
test_size = .1
n_splits = 5

In [49]:
# Hold out a `test_size` share of the samples as a test set. Rows are shuffled
# with a fixed seed and the split is stratified on the label column.
X_train, X_test, y_train, y_test = train_test_split(features, label,
                                                    test_size=test_size,
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=data.label)

### Define GridSearch inputs

In [62]:
# Wrap `make_model` so scikit-learn can drive the Keras network as an estimator.
# NOTE(review): `epochs=10` here is presumably overridden by the `epochs=50`
# passed through `fit_params` at fit time - confirm which value is intended.
clf = KerasClassifier(make_model, epochs=10, batch_size=32)

In [58]:
# Reuse the `n_splits` setting defined alongside `test_size` instead of a
# second hard-coded 5, and seed the shuffle (same seed as the train-test
# split) so fold assignments are reproducible across runs.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [60]:
param_grid = {'dense_layers': [[32], [32, 32], [64], [64, 64], [64, 64, 32], [64, 32], [128]],
              'activation'  : ['relu', 'tanh'],
              'dropout'     : [.25, .5, .75],
              }

In [64]:
# Exhaustive search over `param_grid`, scoring every candidate by ROC AUC on
# the folds produced by `cv`. Fits that raise are recorded with a NaN score
# (`error_score=np.nan`) instead of aborting the search.
# NOTE(review): `n_jobs=-1` runs fits in parallel workers - confirm this is
# safe with the Keras/TensorFlow backend in use.
# NOTE(review): the `iid` argument was removed from GridSearchCV in
# scikit-learn 0.24; it must be dropped when upgrading.
gs = GridSearchCV(estimator=clf,
                  param_grid=param_grid,
                  scoring='roc_auc',
                  cv=cv,
                  refit=True,
                  return_train_score=True,
                  n_jobs=-1,
                  verbose=1,
                  iid=False,
                  error_score=np.nan)

In [None]:
# Extra arguments forwarded to the Keras `fit` call by `gs.fit`.
# EarlyStopping watches the training `auc_roc` metric and stops a fit once it
# has not improved for `patience` epochs. The patience must stay below
# `epochs`: the previous value of 300 could never be reached within 50 epochs,
# so the callback had no effect.
fit_params = dict(callbacks=[EarlyStopping(monitor='auc_roc',
                                           patience=5,
                                           verbose=1, mode='max')],
                  verbose=2,
                  epochs=50)

In [None]:
gs.fit(X=X_train.astype(float), y=y_train, **fit_params)
print('\nBest Score: {:.2%}'.format(gs.best_score_))
print('Best Params:\n', pd.Series(gs.best_params_))

In [None]:
"""
Best Score: 50.49%
Best Params:
 activation            tanh
dense_layers    [256, 256]
dropout               0.25
optimizer          RMSprop
"""

### Persist best model and training data

In [None]:
gs.best_estimator_.model.save('best_model.h5')

In [9]:
# Persist the four train/test objects in the same HDF5 file that already holds
# the assembled dataset, one key per object.
with pd.HDFStore('data.h5') as store:
    for key, split in (('X_train', X_train),
                       ('X_test', X_test),
                       ('y_train', y_train),
                       ('y_test', y_test)):
        store.put(key, split)

## Custom Callback

In [None]:
class roc_callback(Callback):
    """Report ROC AUC on training and validation data after every epoch.

    training_data: Indexable whose first two items are the training inputs
        and labels.
    validation_data: Indexable whose first two items are the validation
        inputs and labels.

    After each epoch the model predicts on both sets, the scores are printed
    and also stored in the epoch `logs` under 'roc_auc' and 'val_roc_auc' so
    that callbacks running after this one can monitor them.

    The remaining hooks (train/epoch/batch begin and end) are inherited
    unchanged from `Callback`; the former explicit overrides did nothing.
    """

    def __init__(self, training_data, validation_data):
        # Let the base class set up its own attributes.
        super().__init__()
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Compute, record and print train / validation ROC AUC."""
        y_pred = self.model.predict(self.x)
        roc = roc_auc_score(self.y, y_pred)
        y_pred_val = self.model.predict(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)

        # `logs` is the per-epoch dict shared between callbacks; skip when the
        # caller did not provide one.
        if logs is not None:
            logs['roc_auc'] = roc
            logs['val_roc_auc'] = roc_val

        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')