# Model - Stage2 Type2 - CNN-Attention

This model uses Stage 1 - Word Embedding.

In [1]:
import pyspark

sparkConfig = {
    'spark.executor.memory': '30g',
    'spark.driver.memory': '60g',
    'spark.master': 'local[*]',
    'spark.default.parallelism': '30',
    'spark.driver.maxResultSize': '4g',
}
conf = pyspark.SparkConf() \
    .setMaster('local[*]') \
    .setAppName('Model - Stage2 Type2 - CNN-Attention based')
for k,v in sparkConfig.items():
    conf = conf.set(k, v)
sc = pyspark.SparkContext(conf=conf)
nWorkers = 0 # Set this number to the number of GPU machines on your cluster. If 0 it forces single machine training.

In [2]:
import datetime
import importlib
import json, pickle
import numpy as N
import numpy.random as NR
import matplotlib.pyplot as pyplot
import seaborn
import pandas as P
from pathlib import Path
import pyspark.mllib as SM
import pyspark.mllib.feature as SMF
import os

seaborn.set_style('whitegrid')

import keras as K
import keras.backend as KB
import keras.callbacks as KCb
import keras.layers as KL
import keras.models as KM
import keras.regularizers as KR
import keras.optimizers as KO
import elephas as E
import elephas.spark_model as ESm



In [3]:
import tensorflow as tf
tf_strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


## Load Data

In [4]:
import common.data
importlib.reload(common.data)

<module 'common.data' from '/data/common/data.py'>

In [5]:
pathChirps = Path('datasets/Chirps')
pathStocks = Path('datasets/Stocks')
pathAnalyticsChirp = Path('analytics/Chirps')
pathModelEmbedding = Path("models/embedding/word2vec")
pathStage1 = Path("models/stage1")

startTrain = datetime.date(2017, 1, 1)
endTrain = datetime.date(2019, 1, 1)
startTest = endTrain
endTest = datetime.date(2019, 7, 1)

pathTrainInstances = pathStage1 / 'instances_train'
pathTestInstances = pathStage1 / 'instances_test'

with open(pathStage1 / 'properties.json', 'r') as f:
    sampleProperties = json.load(f)

In [6]:
# Word2Vec

with open(pathModelEmbedding / 'properties.json', 'r') as f:
    embeddingProperties = json.load(f)
with open(pathModelEmbedding / 'dict.pickle', 'rb') as f:
    word2vec = pickle.load(f)
print(f"{len(word2vec)} embedding entries loaded")

77176 embedding entries loaded


In [7]:
# Stock data

ticker = "^DJI"
stockDf = P.read_csv(pathStocks / f'{ticker}.csv').set_index('Date')
stockDf.index = stockDf.index.map(datetime.datetime.fromisoformat).map(lambda x:x.date())
stockDf_train = stockDf[stockDf.index.map(lambda x:startTrain <= x and x < endTrain)]
stockDf_test = stockDf[stockDf.index.map(lambda x:startTest <= x and x < endTest)]
stockDf_train

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,S,X,sigma_hat,Y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-01-03,19872.859375,19938.529297,19775.929688,19881.759766,339180000,0,0,19877.309570,19875.675886,57.021305,0.673759
2017-01-04,19890.939453,19956.140625,19878.830078,19942.160156,280010000,0,0,19916.549805,19914.094491,62.168018,-0.087547
2017-01-05,19924.560547,19948.599609,19811.119141,19899.289062,269920000,0,0,19911.924805,19908.651881,58.908224,0.384209
2017-01-06,19906.960938,19999.630859,19834.080078,19963.800781,277700000,0,0,19935.380859,19931.284969,73.430061,-0.387225
2017-01-09,19931.410156,19943.779297,19887.380859,19887.380859,287510000,0,0,19909.395508,19902.851029,54.289081,-0.815226
...,...,...,...,...,...,...,...,...,...,...,...
2018-12-24,22317.279297,22339.869141,21792.199219,21792.199219,308420000,0,0,22054.739258,21409.960941,1493.130066,0.202531
2018-12-26,21857.730469,22878.919922,21712.529297,22878.449219,433080000,0,0,22368.089844,21712.365957,959.602075,0.520857
2018-12-27,22629.060547,23138.890625,22267.419922,23138.820312,407940000,0,0,22883.940430,22212.181458,579.844457,0.423706
2018-12-28,23213.609375,23381.880859,22981.330078,23062.400391,336510000,0,0,23138.004883,22457.864894,587.052455,0.165055


In [8]:
data_to_sample_transform = common.data.get_data_to_sample_transform(word2vec, embeddingProperties, stockDf['Y'], sc=sc)

rddTrain = sc.pickleFile(str(pathTrainInstances), 32) \
    .cache()
rddTrain.takeSample(False, 2)

[(datetime.date(2017, 3, 16),
  array(['play resume after tea', 'lack could cost millions',
         'to waive off farm loans in uttar pradesh', ...,
         "judge put donald trump's travel ban", 'people have get notices',
         'trumps budget would cut epa funding'], dtype='<U103')),
 (datetime.date(2018, 6, 1),
  array(['ray mckinnon to be appoint as greenock morton manager',
         'on twitter realdonaldtrump have a lot',
         'donald trump must hold via usatoday', ...,
         'cnbc - oil prices be tank as russia',
         'archaeologists find his 2,000-year-old remains',
         'with trump takeover gop'], dtype='<U118'))]

## Model

In [9]:
pathModel = Path(f'models/nn-convatt-{ticker}')
pathModel.mkdir(exist_ok=True, parents=True)
nWorkers = 0

In [10]:

def time_dist(l):
    return KL.TimeDistributed(l, name='td_' + l.name)
def create_model(name="Stage2Type2"):
    
    def get_act(n):
        return KL.LeakyReLU(0.01, name=n)
    def get_kernel_reg():
        return KR.l2(1e-4)
    lIn = KL.Input((sampleProperties['sampleSize'], embeddingProperties['tweet_len'], embeddingProperties['embedding_size']), name="Input")
    layers = [
        # 
        time_dist(KL.Conv1D(64, 4, 1, kernel_regularizer=get_kernel_reg(), padding='same', name='Conv1')),
        get_act('Conv1A'),
        time_dist(KL.MaxPooling1D(2, name='Pool1')),
        time_dist(KL.Conv1D(64, 4, 1, kernel_regularizer=get_kernel_reg(), padding='same', name='Conv2')),
        get_act('Conv2A'),
        time_dist(KL.MaxPooling1D(2, name='Pool2')),
        time_dist(KL.Conv1D(6, 4, 1, kernel_regularizer=get_kernel_reg(), padding='same', name='Conv3')),
        get_act('Conv3A'),
        time_dist(KL.Flatten(name="Flatten")),
    ]
    out = lIn
    for l in layers:
        out = l(out)
    
    query_out = out
    query_out = KL.Dense(36, activation='sigmoid', name='AttentionQueryDense')(query_out)
    query_out = KL.GlobalMaxPooling1D(name="QueryPool")(query_out)
    query_out = KL.Reshape(target_shape=(1,36), name="QueryReshape")(query_out)
    out = KL.Attention(dropout=0.1, name="Attention")([query_out, out])
    
    layers = [
        KL.Flatten(name="Flatten2"),
        KL.Dense(32, kernel_regularizer=get_kernel_reg(), name="Dense1"),
        get_act('Dense1A'),
        KL.Dense(1, kernel_regularizer=get_kernel_reg(), activation=None, name="DenseO"),
    ]
    for l in layers:
        out = l(out)
        
    return KM.Model(inputs=lIn, outputs=out, name=name)

if nWorkers > 0:
    model = create_model()
else:
    with tf_strategy.scope():
        model = create_model()
model.summary()

Model: "Stage2Type2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input (InputLayer)              [(None, 2048, 26, 12 0                                            
__________________________________________________________________________________________________
td_Conv1 (TimeDistributed)      (None, 2048, 26, 64) 32832       Input[0][0]                      
__________________________________________________________________________________________________
Conv1A (LeakyReLU)              (None, 2048, 26, 64) 0           td_Conv1[0][0]                   
__________________________________________________________________________________________________
td_Pool1 (TimeDistributed)      (None, 2048, 13, 64) 0           Conv1A[0][0]                     
________________________________________________________________________________________

In [11]:
def sign_accuracy(y_true, y_pred):
    return KB.mean((y_true * y_pred) >= 0)
model.compile(loss='mse', metrics=['mse', sign_accuracy], optimizer='adam')
callback_checkpoint = KCb.ModelCheckpoint(str(pathModel / 'e{epoch:02d}'))

In [12]:
nEpochs = 10
if nWorkers > 0:
    sparkModel = ESm.SparkModel(model, frequency='epoch', mode='asynchronous')
    sparkModel.master_optimizer = KO.Adam()
    hist = sparkModel.fit(rddTrain.repartition(nWorkers), epochs=nEpochs, batch_size=32, verbose=1)
    model.save(pathModel / 'e10')
else:
    batch_size = 16
    repeat_batch = 4
    def sample_generator():
        while True:
            x = rddTrain.takeSample(False, batch_size*repeat_batch)
            x = [data_to_sample_transform(s) for s in x]
            x, y = zip(*x)
            x = N.array(x)
            y = N.array(y)
            for i in range(repeat_batch):
                slic = slice(i*batch_size, (i+1)*batch_size)
                yield x[slic], y[slic]
    hist = model.fit(x=sample_generator(), initial_epoch=1, epochs=nEpochs,  steps_per_epoch=512, callbacks=[callback_checkpoint])

Epoch 2/10
Instructions for updating:
Use `tf.data.Iterator.get_next_as_optional()` instead.
INFO:tensorflow:batch_all_reduce: 12 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:

In [13]:
with open(pathModel / 'hist.json', 'w') as f:
    json.dump({
        'params': hist.params,
        'history': hist.history,
        'epoch': hist.epoch,
    }, f)