# Train and test the LSTM model 

The initial goal is to predict the highest month gainers
We will also try for week and day
Vary the input data to 50-100 days before prediction.
Initial testing will be done on the AAPL dataset

## 0. Imports

In [27]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from collections import deque
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
import random
import json
from sklearn.metrics import r2_score
import pendulum
import talib as ta


if not os.path.isdir("results"):
    os.mkdir("results")
if not os.path.isdir("logs"):
    os.mkdir("logs")
if not os.path.isdir("data"):
    os.mkdir("data")

f = open('tradeData_22_08_2022.json')
tradeDataJson = json.load(f)
for k in tradeDataJson:
    # only need the adjusted values so drop all the other columns
    tradeDataJson[k]['data'] = pd.DataFrame(tradeDataJson[k]['data']).drop(columns=['close','high','low','open','volume','divCash','splitFactor']).set_index('date')


## 1. Add some features to the data
- SME
- RSI
- SMA
- Corr
- SAR
- ADX

In [29]:
AAPL = tradeDataJson['AAPL']['data'].copy(deep=True)
window = 10
AAPL['RSI'] = ta.RSI(np.array(AAPL['adjClose'].shift(1)), timeperiod=window)
AAPL['SMA'] = AAPL['adjClose'].shift(1).rolling(window=window).mean()
AAPL['Corr'] = AAPL['adjClose'].shift(1).rolling(window=window).corr(AAPL['SMA'].shift(1))
AAPL['SAR'] = ta.SAR(np.array(AAPL['adjHigh'].shift(1)), np.array(AAPL['adjLow'].shift(1)),
                   0.2, 0.2)
AAPL['ADX'] = ta.ADX(np.array(AAPL['adjHigh'].shift(1)), np.array(AAPL['adjLow'].shift(1)),
                   np.array(AAPL['adjOpen']), timeperiod=window)

AAPL.dropna(inplace=True)
AAPL.head()

Unnamed: 0_level_0,adjClose,adjHigh,adjLow,adjOpen,adjVolume,RSI,SMA,Corr,SAR,ADX
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-09-20T00:00:00.000Z,37.028129,37.547714,36.496682,37.462303,206772956,44.908471,37.945588,0.43916,38.199591,16.080041
2017-09-21T00:00:00.000Z,36.39229,36.964071,36.240448,36.964071,146573528,35.197181,37.807032,0.711825,38.079162,17.105807
2017-09-22T00:00:00.000Z,36.03641,36.126567,35.720863,36.067253,184457696,28.337284,37.620314,0.83098,37.905967,18.743902
2017-09-25T00:00:00.000Z,35.718491,36.022175,35.388709,35.585629,175689336,25.273935,37.460405,0.923875,37.572863,21.411638
2017-09-26T00:00:00.000Z,36.332977,36.518035,35.98896,36.010312,141883940,22.824782,37.200612,0.946329,37.202463,24.42957


## 2. Split the data into test and train sets

In [42]:
def generate_x_and_y_sets(data,LOOKUP_STEP =1,N_STEPS =60):
    n = LOOKUP_STEP  - 1
    x_ray = []
    y_ray = []

    for i in range(N_STEPS ,len(data)-n):
        x_ray.append(data[i-N_STEPS :i])
        y_ray.append(data[i+n,1])

    return np.array(x_ray), np.array(y_ray)

    
def generate_test_and_train(data,N_STEPS  = 60, LOOKUP_STEP =1,train_percentage=0.8):
    scaler = MinMaxScaler(feature_range=(0,1))
    scaledData = scaler.fit_transform(data)
    train_data = (scaledData[:int((scaledData.shape[0])*train_percentage)])
    test_data = (scaledData[int((scaledData.shape[0])*train_percentage)-N_STEPS:])


    x_train, y_train = generate_x_and_y_sets(train_data,LOOKUP_STEP =1,N_STEPS=N_STEPS)
    x_test, y_test = generate_x_and_y_sets(test_data,LOOKUP_STEP =1,N_STEPS=N_STEPS)

    return x_train, y_train, x_test, y_test#, scaler


dataDict = {}

dataDict['n_steps_50-lookup_step_20'] = generate_test_and_train(AAPL, 50,20)
dataDict['n_steps_50-lookup_step_5'] = generate_test_and_train(AAPL, 50,5)
dataDict['n_steps_50-lookup_step_1'] = generate_test_and_train(AAPL, 50,1)

## 3. Train the models

### 3.1 Set the model parameters

In [37]:
def create_model(sequence_length, n_features, units=256, cell=LSTM, n_layers=2, dropout=0.3,
                loss="mean_absolute_error", optimizer="rmsprop", bidirectional=False):
    model = Sequential()
    for i in range(n_layers):
        if i == 0:
            # first layer
            if bidirectional:
                model.add(Bidirectional(cell(units, return_sequences=True), batch_input_shape=(None, sequence_length, n_features)))
            else:
                model.add(cell(units, return_sequences=True, batch_input_shape=(None, sequence_length, n_features)))
        elif i == n_layers - 1:
            # last layer
            if bidirectional:
                model.add(Bidirectional(cell(units, return_sequences=False)))
            else:
                model.add(cell(units, return_sequences=False))
        else:
            # hidden layers
            if bidirectional:
                model.add(Bidirectional(cell(units, return_sequences=True)))
            else:
                model.add(cell(units, return_sequences=True))
        # add dropout after each layer
        model.add(Dropout(dropout))
    model.add(Dense(1, activation="linear"))
    model.compile(loss=loss, metrics=["mean_absolute_error"], optimizer=optimizer)
    return model

model_n_steps_50_lookup_step_20 = create_model(50,10,loss = 'huber_loss',optimizer = 'adam')
model_n_steps_50_lookup_step_5 = create_model(50,10,loss = 'huber_loss',optimizer = 'adam')
model_n_steps_50_lookup_step_1 = create_model(50,10,loss = 'huber_loss',optimizer = 'adam')


### 3.2 Train the models

In [47]:
callback = EarlyStopping(monitor='loss',patience=3)

model_n_steps_50_lookup_step_20_history = model_n_steps_50_lookup_step_20.fit(dataDict['n_steps_50-lookup_step_20'][0], dataDict['n_steps_50-lookup_step_20'][1],
                                                                                batch_size=64,
                                                                                epochs=500,
                                                                                validation_data=(dataDict['n_steps_50-lookup_step_20'][2], dataDict['n_steps_50-lookup_step_20'][3]),
                                                                                callbacks=[callback],
                                                                                verbose=1)

model_n_steps_50_lookup_step_5_history = model_n_steps_50_lookup_step_5.fit(dataDict['n_steps_50-lookup_step_5'][0], dataDict['n_steps_50-lookup_step_5'][1],
                                                                                batch_size=64,
                                                                                epochs=500,
                                                                                validation_data=(dataDict['n_steps_50-lookup_step_5'][2], dataDict['n_steps_50-lookup_step_5'][3]),
                                                                                callbacks=[callback],
                                                                                verbose=1)

model_n_steps_50_lookup_step_1_history = model_n_steps_50_lookup_step_1.fit(dataDict['n_steps_50-lookup_step_1'][0], dataDict['n_steps_50-lookup_step_1'][1],
                                                                                batch_size=64,
                                                                                epochs=500,
                                                                                validation_data=(dataDict['n_steps_50-lookup_step_1'][2], dataDict['n_steps_50-lookup_step_1'][3]),
                                                                                callbacks=[callback],
                                                                                verbose=1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500


## 4. Graphs

In [None]:
predictions = model.predict(x)