# Eval - DNN Evaluations

In [1]:
import pyspark

# Spark settings for this notebook. The master URL lives here only (it was previously
# also passed to setMaster), so there is a single place to change it.
sparkConfig = {
    'spark.executor.memory': '30g',
    'spark.driver.memory': '60g',
    'spark.master': 'local[*]',
    'spark.default.parallelism': '30',
    'spark.driver.maxResultSize': '4g',
}
conf = pyspark.SparkConf() \
    .setAppName('Eval - DNN Evaluations') \
    .setAll(list(sparkConfig.items()))
# getOrCreate instead of the bare constructor: re-running this cell in a live kernel
# would otherwise fail because only one SparkContext may be active at a time.
# NOTE: when a context already exists it is returned as-is and `conf` is not re-applied;
# restart the kernel (or stop the context) to pick up changed settings.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

In [2]:
import datetime
import importlib
import json, pickle
import numpy as N
import numpy.random as NR
import matplotlib.pyplot as pyplot
import seaborn
import sklearn.metrics as SkM
import pandas as P
from pathlib import Path
from os import listdir
import tqdm
import re

seaborn.set_style('whitegrid')

import keras as K
import keras.backend as KB
import keras.callbacks as KCb
import keras.layers as KL
import keras.models as KM
import keras.regularizers as KR
import keras.optimizers as KO

## Load Data

In [3]:
import common.data

# --- Filesystem locations of inputs and previously produced artefacts ---
pathChirps = Path('datasets/Chirps')
pathStocks = Path('datasets/Stocks')
pathAnalyticsChirp = Path('analytics/Chirps')
pathModelEmbedding = Path("models/embedding/word2vec")
pathStage1 = Path("models/stage1")
pathTestInstances = pathStage1 / 'instances_test'

# --- Date windows; the filters below treat them as half-open [start, end) ---
startTrain, endTrain = datetime.date(2017, 1, 1), datetime.date(2019, 1, 1)
startTest, endTest = endTrain, datetime.date(2019, 7, 1)

# Properties stored alongside the stage-1 instances.
with (pathStage1 / 'properties.json').open('r') as f:
    sampleProperties = json.load(f)

# Word2Vec: embedding properties plus the pickled embedding dictionary.
with (pathModelEmbedding / 'properties.json').open('r') as f:
    embeddingProperties = json.load(f)
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# dict.pickle from a trusted source.
with (pathModelEmbedding / 'dict.pickle').open('rb') as f:
    word2vec = pickle.load(f)
print(f"{len(word2vec)} embedding entries loaded")

# Stock data: one CSV per ticker, indexed by its 'Date' column.
ticker = "^DJI"
stockDf = P.read_csv(pathStocks / f'{ticker}.csv').set_index('Date')
# Convert the ISO-format date strings of the index into datetime.date objects so they
# can be compared with the window bounds above.
stockDf.index = stockDf.index.map(lambda s: datetime.datetime.fromisoformat(s).date())
stockDf_train = stockDf[stockDf.index.map(lambda d: startTrain <= d < endTrain)]
stockDf_test = stockDf[stockDf.index.map(lambda d: startTest <= d < endTest)]

77176 embedding entries loaded


In [4]:
# Callable built from the embedding dictionary, its properties and the stock 'Y' column.
# Later cells call it once per test instance and use element [0] of its result as the
# model input. (Its exact output layout is defined in common.data, not shown here.)
data_to_sample_transform = common.data.get_data_to_sample_transform(
    word2vec,
    embeddingProperties,
    stockDf['Y'],
    sc=sc,
)

# Read the pickled test instances (requesting at least 32 partitions) and cache them,
# since the RDD is read more than once: sampled here, then collected in full.
rddTest = sc.pickleFile(name=str(pathTestInstances), minPartitions=32).cache()
# Peek at two randomly drawn instances (sampling without replacement).
rddTest.takeSample(withReplacement=False, num=2)

[(datetime.date(2019, 5, 8),
  array(['reuters journalists free in myanmar', 'separate kids to parents',
         'news tell a story', ..., 'barr to look into the material g',
         'start from the ground with dirt', 'the family get a settlement'],
        dtype='<U129')),
 (datetime.date(2019, 1, 22),
  array(['bet he\'ll be heading to the caribbean soon... gofundme "trump wall" donors have give million',
         'to pull e-cigarettes off the market', 'rig polls for trump', ...,
         'the moment mr dunkerton financially supported people',
         'a court have order four', 'trump laud football players'],
        dtype='<U114'))]

In [5]:
# Bring every test instance from the RDD into a Python list on the driver;
# model_evaluate slices this list into batches.
test_samples = rddTest.collect()
# The data now lives in test_samples, so release the RDD's cached blocks.
rddTest.unpersist()

MapPartitionsRDD[1] at objectFile at NativeMethodAccessorImpl.java:0

## Load models

In [6]:
import tensorflow as tf
# Distribution strategy under whose scope the saved models are restored
# (the logged run reports two GPU replicas).
tf_strategy = tf.distribute.MirroredStrategy()


def sign_accuracy(y_true, y_pred):
    """Mean, over all elements, of the indicator ``y_true * y_pred >= 0``.

    An element counts as a hit when target and prediction share a sign, or when either
    of them is exactly zero (the product is then 0, which satisfies ``>= 0``).
    """
    sign_matches = (y_true * y_pred) >= 0
    return KB.mean(sign_matches)


# The saved models reference `sign_accuracy` by name, so it has to be supplied on load.
custom_objects = dict(sign_accuracy=sign_accuracy)
with tf_strategy.scope():
    model_cnn = KM.load_model(
        f'models/nn-conv-{ticker}/e10',
        custom_objects=custom_objects,
    )
    model_cnnatt = KM.load_model(
        f'models/nn-convatt-{ticker}/e10',
        custom_objects=custom_objects,
    )

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [7]:
def model_evaluate(model, batch_size=8):
    """Predict on every entry of the global ``test_samples`` and tabulate the results.

    The samples are processed in consecutive slices of ``batch_size``. Each sample is
    passed through ``data_to_sample_transform`` and element ``[0]`` of the result is
    used as the model input; the last axis of ``model.predict``'s output is indexed
    at 0 to obtain the prediction column.

    Parameters
    ----------
    model
        Object exposing ``predict(array)``.
    batch_size : int, default 8
        Number of ``test_samples`` entries transformed and predicted per step.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (first element of each sample tuple) and ``Yhat``
        (prediction). The row index restarts at 0 for every batch, as before; call
        ``reset_index(drop=True)`` if a unique index is needed. When ``test_samples``
        is empty, an empty frame with these two columns is returned.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Collect one small frame per batch and concatenate once at the end; concatenating
    # inside the loop re-copies all previously accumulated rows on every iteration.
    frames = []
    for start in tqdm.tqdm(range(0, len(test_samples), batch_size)):
        batch = test_samples[start:start + batch_size]
        batch_embed = N.array([data_to_sample_transform(x)[0] for x in batch])

        y = model.predict(batch_embed)[..., 0]

        frames.append(P.DataFrame({
            'date': [d for d, _ in batch],
            'Yhat': y,
        }))

    if not frames:
        # P.concat raises on an empty list, so handle the no-data case explicitly.
        return P.DataFrame(columns=['date', 'Yhat'])
    return P.concat(frames, axis=0)

# Score the full test set with model_cnn, write the raw predictions to CSV
# (without the row index), then display the frame as the cell output.
df_cnn = model_evaluate(model_cnn)
df_cnn.to_csv(f'analytics/cnn_{ticker}_raw.csv', index=False)
df_cnn

  0%|          | 0/496 [00:00<?, ?it/s]

Instructions for updating:
Use `tf.data.Iterator.get_next_as_optional()` instead.


100%|██████████| 496/496 [15:04<00:00,  1.82s/it]


Unnamed: 0,date,Yhat
0,2019-02-13,-0.487070
1,2019-02-13,-0.621753
2,2019-02-13,-0.095621
3,2019-02-13,-0.305309
4,2019-02-13,-0.114439
...,...,...
3,2019-06-21,-1.003469
4,2019-06-21,-0.252180
5,2019-06-21,-1.429891
6,2019-06-21,-0.145209


In [8]:

# Score the full test set with model_cnnatt, write the raw predictions to CSV
# (without the row index), then display the frame as the cell output.
df_cnnatt = model_evaluate(model_cnnatt)
df_cnnatt.to_csv(f'analytics/cnnatt_{ticker}_raw.csv', index=False)
df_cnnatt

100%|██████████| 496/496 [14:58<00:00,  1.81s/it]


Unnamed: 0,date,Yhat
0,2019-02-13,0.320398
1,2019-02-13,0.205320
2,2019-02-13,-0.329300
3,2019-02-13,0.312234
4,2019-02-13,-0.313654
...,...,...
3,2019-06-21,0.155694
4,2019-06-21,-0.199397
5,2019-06-21,0.169672
6,2019-06-21,0.113658
