In [85]:
import os
import sys
import time
import warnings
import numpy as np
import rrsBdtDevDependencies
import dataFunctions as dataFun
from datetime import datetime as dt
from datetime import timedelta as td
import matplotlib.pyplot as plt
import pandas as pd
import quandl
QAPIKEY = "YpAydSEsKoSAfuQ9UKhu"
quandl.ApiConfig.api_key = QAPIKEY
import yfinance as yf
import pickle
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import logging
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Dense
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
from tqdm._tqdm_notebook import tqdm_notebook


In [447]:
### CONFIGURE ###

# Cost parameters set by the task for running ship for one day
barrels = 750000
costPerDay = 30000
daysToPredict = 1

# Data split for training and testing.
trainDataDate = '2017-01-01'
testSplitDate = '2019-11-26'

# Parameters for the model.
params = {
    "batch_size": 20,  # 20<16<10, 25 was a bust
    "epochs": 300,
    "lr": 0.00010000,
    "time_steps": 5
}

# Replace with Cloud path with data later on
PATH_TO_DRIVE_ML_DATA = "/Users/qw19176/Documents/Courses/Team-Cpp"
INPUT_PATH = PATH_TO_DRIVE_ML_DATA+"/inputs/"
OUTPUT_PATH = PATH_TO_DRIVE_ML_DATA+"/outputs/lstm/"
TIME_STEPS = params["time_steps"]
BATCH_SIZE = params["batch_size"]

# Some environmental parameters.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TZ'] = 'Europe/London'
time.tzset()
stime = time.time()

modDate = dt.today().date()
dataDate = modDate - td(days=1)
updateData = False
is_update_model = True

dataFileName = "inputData_" + str(dataDate) + ".csv"
modFileName = "LSTM_" + str(modDate) + ".sav"

In [283]:
# check if directory already exists
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)
    print("Directory created", OUTPUT_PATH)
else:
    raise Exception("Directory already exists. Don't override.")




Exception: Directory already exists. Don't override.

In [284]:
def print_time(text, stime):
    seconds = (time.time()-stime)
    print(text, seconds//60,"minutes : ",np.round(seconds%60),"seconds")

In [285]:
print('Running...')

def show_more(df, lines):
    with pd.option_context("display.max_rows", lines):
        display(df)

Running...


In [286]:
def create_features(fd, features, label=None, shift = 0, nonShiftFeatures = None):
    fd = fd.set_index('Date')
    #X = df[['OilProduction', 'NatGasPrices', 'BrentPrices', '20dSMA', 'Momentum_14', 'MACD_12_26', 'MACDdiff_12_26', 'ROC_14', 'RSI_14', 'bollAmplitude', 'distFromTopBoll', 'distFromLowBoll', '20d200dDist','dayofyear','dayofmonth','weekofyear']]

    # X = df[['OilProduction', '20dSMA', 'Momentum_14', 'MACD_12_26', 'MACDdiff_12_26', 'ROC_14', 'RSI_14', 'bollAmplitude', 'distFromTopBoll', 'distFromLowBoll', '20d200dDist','dayofyear','dayofmonth','weekofyear']]
    # if shift > 0:
    #     tiems = X[['dayofyear','dayofmonth','weekofyear']]
    #     #X = X[['OilProduction', 'NatGasPrices', 'BrentPrices', '20dSMA', 'Momentum_14', 'MACD_12_26', 'MACDdiff_12_26','ROC_14', 'RSI_14', 'bollAmplitude', 'distFromTopBoll', 'distFromLowBoll', '20d200dDist']].shift(shift)
    #     X = X[['OilProduction', '20dSMA', 'Momentum_14', 'MACD_12_26', 'MACDdiff_12_26','ROC_14', 'RSI_14', 'bollAmplitude', 'distFromTopBoll', 'distFromLowBoll', '20d200dDist']].shift(shift)
    #     X = X.merge(tiems, how='inner', left_index=True, right_index=True)

    X = fd[features]
    if shift > 0:
        tiems = fd[nonShiftFeatures]
        newFeatures = features
        for f in nonShiftFeatures:
            newFeatures.remove(f)
        X = X[newFeatures].shift(shift)
        X = X.merge(tiems, how='inner', left_index=True, right_index=True)
        X = X.iloc[shift:]

    if label:
        y = fd[label]
        return X, y
    return X

In [448]:
"""
Getting WTI price data 
"""

if (os.path.exists(INPUT_PATH+dataFileName) and updateData is False):
    df = pd.read_csv(INPUT_PATH+dataFileName)
    print("Dataframe already exists, reading from file...")
else:
    print("Datafile not found, querying data and building dataframe...")
    wtiData         = quandl.get("FRED/DCOILWTICO")
    wtiData.reset_index(level=0, inplace=True)
    wtiData         = wtiData.rename(columns={"Value": "Prices"})
    yfStartDate     = wtiData['Date'].iloc[-1].strftime('%Y-%m-%d')
    stocks          = "CL=F"
    period          = "1d"
    Stocks, yfInfo  = dataFun.yFinData(yfStartDate)
    wtiData         = wtiData.append(Stocks, ignore_index =True)
    wtiData         = wtiData.sort_values(by = ["Date"])

    # Getting Oil production data and combining dataframes
    oilDF   = dataFun.oilProduction()
    df      = dataFun.combineFrames(wtiData,oilDF)
    df      = df[np.isfinite(df['Prices'])]
    df      = df.reset_index().drop(["index"], axis = 1)

    # Getting natural gas data and combining frames
    natGasData          = quandl.get("EIA/NG_RNGWHHD_D")
    natGasData.reset_index(level=0, inplace=True)
    natGasData          = natGasData.rename(columns={"Value": "NatGasPrices"})
    yfStartDate         = natGasData['Date'].iloc[-1].strftime('%Y-%m-%d')
    stocks              = "NG=F"
    period              = "1d"
    NGStocks, yfInfo    = dataFun.yFinData(yfStartDate,stock=stocks,name ="NatGasPrices")
    natGasData          = natGasData.append(NGStocks, ignore_index =True)
    natGasData          = natGasData.sort_values(by = ["Date"])
    newdf               = pd.merge(df, natGasData, on=['Date'], how ="left")

    """
    Getting Brent oil data and combining dataframes
    """

    brentData = quandl.get("FRED/DCOILBRENTEU")
    brentData.reset_index(level=0, inplace=True)
    name = "BrentPrices"
    brentData = brentData.rename(columns={"Value": name})
    yfStartDate = brentData['Date'].iloc[-1].strftime('%Y-%m-%d')
    stocks = "BZ=F"
    period = "1d"
    BStocks, yfInfo = dataFun.yFinData(yfStartDate,stock=stocks,name = name)
    brentData = brentData.append(BStocks, ignore_index =True)
    brentData = brentData.sort_values(by = ["Date"])
    df = pd.merge(newdf, brentData, on=['Date'], how ="left")

    df["BrentPrices"] = df["BrentPrices"].interpolate(method='nearest')
    df["NatGasPrices"] = df["NatGasPrices"].interpolate(method='nearest')

    # Calculating the technical indicators for price data
    df = df.reset_index().drop(["index"], axis = 1)
    df["20dSMA"] = dataFun.SMA(20, df["Prices"])
    df["10dSMA"] = dataFun.SMA(10, df["Prices"])
    df["5dSMA"] = dataFun.SMA(5, df["Prices"])
    df["50dSMA"] = dataFun.SMA(50, df["Prices"])
    df["200dSMA"] = dataFun.SMA(200, df["Prices"])


    df["boll_lo"] = dataFun.bollinger(df['Prices'])[0]
    df["boll_hi"] = dataFun.bollinger(df['Prices'])[1]

    df = dataFun.momentum(df, 14)
    df = dataFun.macd(df, 12, 26)
    df = dataFun.rate_of_change(df, 14)
    df = dataFun.relative_strength_index(df)

    df["boll_hi"] = pd.to_numeric(df["boll_hi"])
    df["boll_lo"] = pd.to_numeric(df["boll_lo"])
    df["20dSMA"] = pd.to_numeric(df["20dSMA"])
    df["10dSMA"] = pd.to_numeric(df["10dSMA"])
    df["5dSMA"] = pd.to_numeric(df["5dSMA"])
    df["50dSMA"] = pd.to_numeric(df["50dSMA"])
    df["200dSMA"] = pd.to_numeric(df["200dSMA"])

    df["bollAmplitude"] = df["boll_hi"] - df["boll_lo"]
    df["distFromTopBoll"] = df["boll_hi"] - df["Prices"]
    df["distFromLowBoll"] = df["boll_lo"] - df["Prices"]
    df["20d200dDist"] = np.abs(df["20dSMA"] - df["200dSMA"])

    df = df[np.isfinite(df['200dSMA'])]
    df = df.rename(columns={"Production of Crude Oil": "OilProduction"})
    df = df.drop_duplicates("Date",keep="first")
    df = df[np.isfinite(df['Prices'])]
    df = df.reset_index().drop(["index"], axis = 1)

    """
    Creating time series features from datetime index
    """

    df['dayofweek'] = df['Date'].dt.dayofweek
    df['quarter'] = df['Date'].dt.quarter
    df['month'] = df['Date'].dt.month
    df['year'] = df['Date'].dt.year
    df['dayofyear'] = df['Date'].dt.dayofyear
    df['dayofmonth'] = df['Date'].dt.day
    df['weekofyear'] = df['Date'].dt.weekofyear

    print("Saving dataframe to file ", dataFileName, "at ", INPUT_PATH)
    df.to_csv(INPUT_PATH+dataFileName)



Dataframe already exists, reading from file...


In [449]:
df.tail()
df.describe()
df[df.isna().any(axis=1)]

Unnamed: 0.1,Unnamed: 0,Date,Prices,OilProduction,NatGasPrices,BrentPrices,20dSMA,10dSMA,5dSMA,50dSMA,...,distFromTopBoll,distFromLowBoll,20d200dDist,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
0,0,1986-10-17,14.85,8773.0,,,14.7820,14.943,14.748,15.0912,...,0.764336,-0.900336,0.09925,4,4,10,1986,290,17,42
1,1,1986-10-20,15.17,8773.0,,,14.8170,14.974,14.786,15.0910,...,0.482958,-1.188958,0.01010,0,4,10,1986,293,20,43
2,2,1986-10-21,15.22,8773.0,,,14.8810,15.013,14.920,15.0988,...,0.405210,-1.083210,0.11045,1,4,10,1986,294,21,43
3,3,1986-10-22,14.85,8773.0,,,14.9085,14.938,14.924,15.0974,...,0.751177,-0.634177,0.19295,2,4,10,1986,295,22,43
4,4,1986-10-23,14.88,8773.0,,,14.9250,14.891,14.994,15.0850,...,0.717137,-0.627137,0.26440,3,4,10,1986,296,23,43
5,5,1986-10-24,14.93,8773.0,,,14.9575,14.879,15.010,15.0780,...,0.627285,-0.572285,0.35240,4,4,10,1986,297,24,43
6,6,1986-10-27,14.40,8773.0,,,14.9560,14.821,14.856,15.0574,...,1.161463,-0.049463,0.40715,0,4,10,1986,300,27,44
7,7,1986-10-28,14.18,8773.0,,,14.9185,14.784,14.648,15.0244,...,1.436566,0.040434,0.42415,1,4,10,1986,301,28,44
8,8,1986-10-29,13.73,8773.0,,,14.8700,14.674,14.424,14.9874,...,2.014480,0.265520,0.43185,2,4,10,1986,302,29,44
9,9,1986-10-30,15.08,8773.0,,,14.8625,14.729,14.464,14.9894,...,0.646490,-1.081490,0.47485,3,4,10,1986,303,30,44


In [450]:
df = df[df["Date"] > trainDataDate]
df = df.reset_index().drop(["index"], axis = 1)
# df_train = df[df["Date"] <= testSplitDate].copy()
# df_test = df[df["Date"] > testSplitDate].copy()
df_train, df_test = train_test_split(df, train_size=0.8, test_size=0.2, shuffle=False)
# df_train, df_test = train_test_split(df, train_size=0.9, test_size=0.1, shuffle=False)

# training_set = df.set_index('Date')
# training_set = training_set[features]
feat = ['Prices']#,'Momentum_14', 'MACD_12_26', 'RSI_14','20d200dDist','dayofmonth','weekofyear']
nonShiftFeat = ['Prices','dayofmonth','weekofyear']
#['OilProduction', '20dSMA', 'Momentum_14', 'MACD_12_26', 'MACDdiff_12_26', 'ROC_14', 'RSI_14', 'bollAmplitude', 'distFromTopBoll', 'distFromLowBoll', '20d200dDist','dayofyear','dayofmonth','weekofyear']


# X_train, y_train = create_features(df_train,features,label='Prices', shift =1)
# X_test, y_test = create_features(df_test,label='Prices', shift =1)
# X_train = X_train.iloc[1:]
# X_test = X_test.iloc[1:]
# y_train = y_train.iloc[1:]
# y_test = y_test.iloc[1:]

In [451]:
X = df[feat]
df_train[feat]
df_train[df_train.isna().any(axis=1)]
test = df_train.iloc[1:]
test

Unnamed: 0.1,Unnamed: 0,Date,Prices,OilProduction,NatGasPrices,BrentPrices,20dSMA,10dSMA,5dSMA,50dSMA,...,distFromTopBoll,distFromLowBoll,20d200dDist,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
1,7621,2017-01-04,53.26,8770.0,3.42,54.57,52.1250,52.765,53.436,48.9904,...,1.104948,-3.374948,5.88610,2,1,1,2017,4,4,1
2,7622,2017-01-05,53.77,8770.0,3.42,54.99,52.2660,52.920,53.388,49.0622,...,0.779132,-3.787132,5.96550,3,1,1,2017,5,5,1
3,7623,2017-01-06,53.98,8946.0,3.38,55.90,52.4725,53.174,53.424,49.1528,...,0.595539,-3.610539,6.09350,4,1,1,2017,6,6,1
4,7624,2017-01-09,51.95,8946.0,3.14,54.39,52.5280,53.171,53.064,49.2168,...,2.554415,-1.398415,6.07995,0,1,1,2017,9,9,2
5,7625,2017-01-10,50.82,8946.0,3.21,53.20,52.4935,53.052,52.756,49.2390,...,3.746467,-0.399467,5.98130,1,1,1,2017,10,10,2
6,7626,2017-01-11,52.19,8946.0,3.27,53.61,52.4660,52.989,52.542,49.3084,...,2.349790,-1.797790,5.87740,2,1,1,2017,11,11,2
7,7627,2017-01-12,53.01,8946.0,3.36,54.51,52.4670,52.889,52.390,49.4320,...,1.531873,-2.617873,5.79790,3,1,1,2017,12,12,2
8,7628,2017-01-13,52.36,8944.0,3.36,54.37,52.5345,52.745,52.066,49.5460,...,2.134451,-1.785451,5.78830,4,1,1,2017,13,13,2
9,7629,2017-01-17,52.45,8944.0,3.37,54.68,52.6120,52.615,52.166,49.6886,...,1.966213,-1.642213,5.78035,1,1,1,2017,17,17,3
10,7630,2017-01-18,51.12,8944.0,3.26,53.77,52.5715,52.491,52.226,49.8178,...,3.353867,-0.450867,5.65575,2,1,1,2017,18,18,3


In [452]:
df_train = create_features(df_train,features=feat, shift = 0, nonShiftFeatures=nonShiftFeat)
feat = ['Prices']#,'Momentum_14', 'MACD_12_26', 'RSI_14','20d200dDist','dayofmonth','weekofyear']
nonShiftFeat = ['Prices','dayofmonth','weekofyear']
df_test = create_features(df_test,features=feat, shift = 0, nonShiftFeatures=nonShiftFeat)



In [453]:
feat = ['Prices']#,'Momentum_14', 'MACD_12_26', 'RSI_14','20d200dDist','dayofmonth','weekofyear']
nonShiftFeat = ['Prices','dayofmonth','weekofyear']
x = df_train.loc[:,feat].values

In [454]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range=(0,1))

x_train = sc.fit_transform(x)
x_test = sc.transform(df_test.loc[:,feat])

In [455]:
# print("Deleting unused dataframes of total size(KB)",(sys.getsizeof(df)+sys.getsizeof(df_train)+sys.getsizeof(df_test))//1024)
# del df
# del df_test
# del df_train
# del x

In [456]:
def build_timeseries(mat, y_col_index):
    """
    Converts ndarray into timeseries format and supervised data format. Takes first TIME_STEPS
    number of rows as input and sets the TIME_STEPS+1th data as corresponding output and so on.
    :param mat: ndarray which holds the dataset
    :param y_col_index: index of column which acts as output
    :return: returns two ndarrays-- input and output in format suitable to feed
    to LSTM.
    """
    # total number of time-series samples would be len(mat) - TIME_STEPS
    dim_0 = mat.shape[0] - TIME_STEPS
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, TIME_STEPS, dim_1))
    y = np.zeros((dim_0,))
    print("dim_0",dim_0)
    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:TIME_STEPS+i]
        y[i] = mat[TIME_STEPS+i, y_col_index]
#         if i < 10:
#           print(i,"-->", x[i,-1,:], y[i])
    print("length of time-series i/o",x.shape,y.shape)
    return x, y


def trim_dataset(mat, batch_size):
    """
    trims dataset to a size that's divisible by BATCH_SIZE
    """
    no_of_rows_drop = mat.shape[0]%batch_size
    if(no_of_rows_drop > 0):
        return mat[:-no_of_rows_drop]
    else:
        return mat

In [457]:
tqdm_notebook.pandas('Processing...')
# df_ge = process_dataframe(df_ge)
print(df.shape)
print(df.columns)
print(df.dtypes)
print("Train--Test size", len(df_train), len(df_test))

(775, 30)
Index(['Unnamed: 0', 'Date', 'Prices', 'OilProduction', 'NatGasPrices',
       'BrentPrices', '20dSMA', '10dSMA', '5dSMA', '50dSMA', '200dSMA',
       'boll_lo', 'boll_hi', 'Momentum_14', 'MACD_12_26', 'MACDsign_12_26',
       'MACDdiff_12_26', 'ROC_14', 'RSI_14', 'bollAmplitude',
       'distFromTopBoll', 'distFromLowBoll', '20d200dDist', 'dayofweek',
       'quarter', 'month', 'year', 'dayofyear', 'dayofmonth', 'weekofyear'],
      dtype='object')
Unnamed: 0           int64
Date                object
Prices             float64
OilProduction      float64
NatGasPrices       float64
BrentPrices        float64
20dSMA             float64
10dSMA             float64
5dSMA              float64
50dSMA             float64
200dSMA            float64
boll_lo            float64
boll_hi            float64
Momentum_14        float64
MACD_12_26         float64
MACDsign_12_26     float64
MACDdiff_12_26     float64
ROC_14             float64
RSI_14             float64
bollAmplitude      floa

In [458]:
print("Are any NaNs present in train/test matrices?",np.isnan(x_train).any(), np.isnan(x_train).any())
target_idx = 0
x_t, y_t = build_timeseries(x_train, target_idx)
x_t = trim_dataset(x_t, BATCH_SIZE)
y_t = trim_dataset(y_t, BATCH_SIZE)
print("Batch trimmed size",x_t.shape, y_t.shape)

Are any NaNs present in train/test matrices? False False
dim_0 615


HBox(children=(FloatProgress(value=0.0, max=615.0), HTML(value='')))


length of time-series i/o (615, 5, 1) (615,)
Batch trimmed size (600, 5, 1) (600,)


In [459]:
x_temp, y_temp = build_timeseries(x_test, target_idx)
x_val, x_test_t = np.split(trim_dataset(x_temp, BATCH_SIZE),2)
y_val, y_test_t = np.split(trim_dataset(y_temp, BATCH_SIZE),2)

print("Test size", x_test_t.shape, y_test_t.shape, x_val.shape, y_val.shape)


dim_0 150


HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))


length of time-series i/o (150, 5, 1) (150,)
Test size (70, 5, 1) (70,) (70, 5, 1) (70,)


In [460]:
x_temp
y_val

array([0.41740624, 0.42570856, 0.43200687, 0.50959061, 0.49957057,
       0.50128829, 0.4815345 , 0.42828514, 0.40137418, 0.36072144,
       0.3704552 , 0.3833381 , 0.40366447, 0.38419697, 0.38362439,
       0.38820498, 0.41139422, 0.44546235, 0.45949041, 0.31949614,
       0.37761237, 0.34783853, 0.31835099, 0.24792442, 0.2897223 ,
       0.34154022, 0.35785857, 0.41711995, 0.36301174, 0.34440309,
       0.35356427, 0.39393072, 0.392213  , 0.37703979, 0.36787861,
       0.33209276, 0.31663327, 0.35814486, 0.38018895, 0.40624105,
       0.36043516, 0.32722588, 0.39335814, 0.3965073 , 0.39994274,
       0.44088176, 0.42628113, 0.37732608, 0.36215288, 0.35156026,
       0.5903235 , 0.48038935, 0.44975666, 0.44975666, 0.44202691,
       0.464071  , 0.42198683, 0.39793873, 0.39393072, 0.3856284 ,
       0.33237904, 0.31835099, 0.29172631, 0.28428285, 0.29659319,
       0.29430289, 0.29086745, 0.29058116, 0.31749213, 0.35156026])

In [461]:
# lr = 0.1
# lstm_model = Sequential()
# lstm_model.add(LSTM(100, batch_input_shape=(BATCH_SIZE, TIME_STEPS, x_t.shape[2]), dropout=0.0, recurrent_dropout=0.0, stateful=True,     kernel_initializer='random_uniform'))
# lstm_model.add(Dropout(0.5))
# lstm_model.add(Dense(20,activation='relu'))
# lstm_model.add(Dense(1,activation='sigmoid'))
# optimizer = optimizers.RMSprop(lr=lr)
# lstm_model.compile(loss='mean_squared_error', optimizer=optimizer)

def create_model():
    lstm_model = Sequential()
    # (batch_size, timesteps, data_dim)
    lstm_model.add(LSTM(100, batch_input_shape=(BATCH_SIZE, TIME_STEPS, x_t.shape[2]),
                        dropout=0.0, recurrent_dropout=0.0, stateful=True, return_sequences=True,
                        kernel_initializer='random_uniform'))
    lstm_model.add(Dropout(0.4))
    lstm_model.add(LSTM(60, dropout=0.0))
    lstm_model.add(Dropout(0.4))
    lstm_model.add(Dense(20,activation='relu'))
    lstm_model.add(Dense(1,activation='sigmoid'))
    optimizer = optimizers.RMSprop(lr=params["lr"])
    # optimizer = optimizers.SGD(lr=0.000001, decay=1e-6, momentum=0.9, nesterov=True)
    lstm_model.compile(loss='mean_squared_error', optimizer=optimizer)
    return lstm_model

# model = Sequential()
# model.add(LSTM(units=50,return_sequences=True,input_shape=(X_train.shape[1], 1)))
# model.add(Dropout(0.2))
# model.add(LSTM(units=50,return_sequences=True))
# model.add(Dropout(0.2))
# model.add(LSTM(units=50,return_sequences=True))
# model.add(Dropout(0.2))
# model.add(LSTM(units=50))
# model.add(Dropout(0.2))
# model.add(Dense(units=1))
# model.compile(optimizer='adam',loss='mean_squared_error')
# model.fit(X_train,y_train,epochs=100,batch_size=32)

In [462]:
model = None
try:
    model = pickle.load(open(modFileName, 'rb'))
    print("Loaded saved model...")
except FileNotFoundError:
    print("Model not found")

if model is None or is_update_model:
    from keras import backend as K
    print("Building model...")
    print("checking if GPU available", K.tensorflow_backend._get_available_gpus())
    model = create_model()
    
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                       patience=40, min_delta=0.0001)
    
    mcp = ModelCheckpoint(os.path.join(OUTPUT_PATH,
                          "best_model.h5"), monitor='val_loss', verbose=1,
                          save_best_only=True, save_weights_only=False, mode='min', period=1)

    # Not used here. But leaving it here as a reminder for future
    r_lr_plat = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=30, 
                                  verbose=0, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)
    
    csv_logger = CSVLogger(os.path.join(OUTPUT_PATH, 'training_log_' + time.ctime().replace(" ","_") + '.log'), append=True)
    
    history = model.fit(x_t, y_t, epochs=params["epochs"], verbose=2, batch_size=BATCH_SIZE,
                        shuffle=False, validation_data=(trim_dataset(x_val, BATCH_SIZE),
                        trim_dataset(y_val, BATCH_SIZE)), callbacks=[es, mcp, csv_logger])
    
    print("saving model...")
    pickle.dump(model, open(modFileName, "wb"))


In [0]:
# OUTPUT_PATH = "/Users/qw19176/Documents/Courses/Team-Cpp/"
# csv_logger = CSVLogger(os.path.join(OUTPUT_PATH, 'LSTMRegressor' + '.log'), append=True)
# epochs = 100
# history = lstm_model.fit(x_t, y_t, epochs=epochs, verbose=2, batch_size=BATCH_SIZE,
#                     shuffle=False, validation_data=(trim_dataset(x_val, BATCH_SIZE),
#                     trim_dataset(y_val, BATCH_SIZE)), callbacks=[csv_logger])

In [0]:
# search_params = {
#     "batch_size": [20, 30, 40],
#     "time_steps": [30, 60, 90], 
#     "lr": [0.01, 0.001, 0.0001],
#     "epochs": [30, 50, 70]
# }

# def eval_model():
#     """
#     implement your logic to build a model, train it and then calculate validation loss.
#     Save this validation loss using CSVLogger of Keras or in a text file. Later you can
#     query to get the best combination.
#     """
#     pass

# def get_all_combinations(params):
#     all_names = params.keys()
#     combinations = it.product(*(params[name] for name in all_names))
#     return list(combinations)

# def run_search(mat, params):
#     param_combs = get_all_combinations(params) # list of tuples
#     logging.info("Total combinations to try = {}".format(len(param_combs)))
#     for i, combination in enumerate(param_combs):
#         logging.info("Trying combo no. {} {}".format(i, combination))
#         eval_model(mat, combination, i)

# run_search(x_input, search_params)

In [0]:
# model.evaluate(x_test_t, y_test_t, batch_size=BATCH_SIZE
y_pred = model.predict(trim_dataset(x_test_t, BATCH_SIZE), batch_size=BATCH_SIZE)
y_pred = y_pred.flatten()
y_test_t = trim_dataset(y_test_t, BATCH_SIZE)
error = mean_squared_error(y_test_t, y_pred)
print("Error is", error, y_pred.shape, y_test_t.shape)
print(y_pred[0:15])
print(y_test_t[0:15])

In [0]:
# convert the predicted value to range of real data
y_pred_org = (y_pred * sc.data_range_[target_idx]) + sc.data_min_[target_idx]
# min_max_scaler.inverse_transform(y_pred)
y_test_t_org = (y_test_t * sc.data_range_[target_idx]) + sc.data_min_[target_idx]
# min_max_scaler.inverse_transform(y_test_t)
print(y_pred_org[0:15])
print(y_test_t_org[0:15])

In [0]:
# Visualize the training data
from matplotlib import pyplot as plt
plt.figure()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
#plt.show()
plt.savefig(os.path.join(OUTPUT_PATH, 'train_vis_BS_'+str(BATCH_SIZE)+"_"+time.ctime()+'.png'))

In [0]:
# load the saved best model from above
saved_model = load_model(os.path.join(OUTPUT_PATH, 'best_model.h5')) # , "lstm_best_7-3-19_12AM",
print(saved_model)

In [0]:
y_pred = saved_model.predict(trim_dataset(x_test_t, BATCH_SIZE), batch_size=BATCH_SIZE)
y_pred = y_pred.flatten()
y_test_t = trim_dataset(y_test_t, BATCH_SIZE)
error = mean_squared_error(y_test_t, y_pred)
print("Error is", error, y_pred.shape, y_test_t.shape)
print(y_pred[0:15])
print(y_test_t[0:15])
y_pred_org = (y_pred * sc.data_range_[target_idx]) + sc.data_min_[target_idx] # min_max_scaler.inverse_transform(y_pred)
y_test_t_org = (y_test_t * sc.data_range_[target_idx]) + sc.data_min_[target_idx] # min_max_scaler.inverse_transform(y_test_t)
print(y_pred_org[0:15])
print(y_test_t_org[0:15])

In [0]:
# Visualize the prediction
from matplotlib import pyplot as plt
plt.figure()
plt.plot(y_pred_org)
plt.plot(y_test_t_org)
plt.title('Prediction vs Real Stock Price')
plt.ylabel('Price')
plt.xlabel('Days')
plt.legend(['Prediction', 'Real'], loc='upper left')
#plt.show()
plt.savefig(os.path.join(OUTPUT_PATH, 'pred_vs_real_BS'+str(BATCH_SIZE)+"_"+time.ctime()+'.png'))
print_time("program completed ", stime)

In [None]:
"""
TALOS OPTIMISATION

NOT IMPLEMENTED YET
"""

def data(search_params):
    """
    The function that prepares the data for LSTM training specific to this problem as per values in search_params.
    """
    global mat

    BATCH_SIZE = search_params["batch_size"]
    TIME_STEPS = search_params["time_steps"]
    x_train, x_test = train_test_split(mat, train_size=0.8, test_size=0.2, shuffle=False)

    # scale the train and test dataset
    min_max_scaler = MinMaxScaler()
    x_train = min_max_scaler.fit_transform(x_train)
    x_test = min_max_scaler.transform(x_test)

    x_train_ts, y_train_ts = build_timeseries(x_train, 3, TIME_STEPS)
    x_test_ts, y_test_ts = build_timeseries(x_test, 3, TIME_STEPS)
    x_train_ts = trim_dataset(x_train_ts, BATCH_SIZE)
    y_train_ts = trim_dataset(y_train_ts, BATCH_SIZE)
    x_test_ts = trim_dataset(x_test_ts, BATCH_SIZE)
    y_test_ts = trim_dataset(y_test_ts, BATCH_SIZE)
    print("Test size(trimmed) {}, {}".format(x_test_ts.shape, y_test_ts.shape))
    return x_train_ts, y_train_ts, x_test_ts, y_test_ts
  
  def create_model_talos(x_train_ts, y_train_ts, x_test_ts, y_test_ts, params):
    """
    function that builds model, trains, evaluates on validation data and returns Keras history object and model for
    talos scanning. Here I am creating data inside function because data preparation varies as per the selected value of 
    batch_size and time_steps during searching. So we ignore data that's received here as argument from scan method of Talos.
    """
    x_train_ts, y_train_ts, x_test_ts, y_test_ts = data(params)
    BATCH_SIZE = params["batch_size"]
    TIME_STEPS = params["time_steps"]
    lstm_model = Sequential()
    # (batch_size, timesteps, data_dim)
    lstm_model.add(LSTM(params["lstm1_nodes"], batch_input_shape=(BATCH_SIZE, TIME_STEPS, x_train_ts.shape[2]), dropout=0.2,
                        recurrent_dropout=0.2, stateful=True, return_sequences=True,
                        kernel_initializer='random_uniform'))
    if params["lstm_layers"] == 2:
        lstm_model.add(LSTM(params["lstm2_nodes"], dropout=0.2))
    else:
        lstm_model.add(Flatten())

    if params["dense_layers"] == 2:
        lstm_model.add(Dense(params["dense2_nodes"], activation='relu'))

    lstm_model.add(Dense(1, activation='sigmoid'))
    if params["optimizer"] == 'rms':
        optimizer = optimizers.RMSprop(lr=params["lr"])
    else:
        optimizer = optimizers.SGD(lr=params["lr"], decay=1e-6, momentum=0.9, nesterov=True)
    lstm_model.compile(loss='mean_squared_error', optimizer=optimizer)  # binary_crossentropy
    history = lstm_model.fit(x_train_ts, y_train_ts, epochs=params["epochs"], verbose=2, batch_size=BATCH_SIZE,
                             validation_data=[x_test_ts, y_test_ts],
                             callbacks=[LogMetrics(search_params, params, -1), csv_logger])
    return history, lstm_model
  
print("Starting Talos scanning...")
t = ta.Scan(x=mat, # data parameter is ignored in this example as here data varies based on batch_size & time_steps
            y=mat[:,0], # dummy data just to avoid errors. input and output calculated in create_model_talos
            model=create_model_talos,
            params=search_params,
            dataset_name='stock_ge',
            experiment_no='1',
            reduction_interval=10)

pickle.dump(t, open(os.path.join(OUTPUT_PATH,"talos_res"),"wb"))

In [5]:
features = ["easy", "easter", "eastmas", "estover"]
nonShiftFeatures = ["easy", "easter"]


In [7]:
for f in nonShiftFeatures:
    features.remove(f)
features

['eastmas', 'estover']