In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU

First, clean up the data set to make testing easier. A few important things to note:
    -  Analysing data from too far in the past is not very applicable to today's markets, e.g. NYA in the 1960s. Furthermore, some stock exchanges do not have data that far back, either because records are missing or because the exchange did not yet exist. Thus, to allow for cross-comparison of exchanges, we will truncate the data sets so that they all start in 2010.
    -  Convert the open and close prices into returns

In [17]:
# Load the combined index data and keep only rows dated in year `var` or later.
df = pd.read_csv("indexProcessed.csv")
var = 2010  # first calendar year to keep
df['Date'] = pd.to_datetime(df['Date'])
# Compare the year numerically instead of as formatted strings, and build the
# result as a new frame: an in-place drop on a filtered slice can trigger
# pandas' SettingWithCopyWarning.
df = df[df['Date'].dt.year >= var].drop(columns="Volume")
df

Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,CloseUSD
5687,HSI,2010-01-04,21860.03906,22024.83008,21689.22070,21823.27930,21823.27930,2837.026309
5688,HSI,2010-01-05,22092.15039,22297.03906,21987.26953,22279.58008,22279.58008,2896.345410
5689,HSI,2010-01-06,22357.46094,22514.78906,22277.13086,22416.66992,22416.66992,2914.167090
5690,HSI,2010-01-07,22548.02930,22548.02930,22169.60938,22269.44922,22269.44922,2895.028399
5691,HSI,2010-01-08,22282.75000,22443.22070,22206.16016,22296.75000,22296.75000,2898.577500
...,...,...,...,...,...,...,...,...
104219,J203.JO,2021-05-25,66054.92188,66812.45313,66022.97656,66076.67969,66076.67969,4625.367578
104220,J203.JO,2021-05-26,66076.67969,66446.36719,66030.35156,66108.22656,66108.22656,4627.575859
104221,J203.JO,2021-05-27,66108.22656,66940.25000,66102.54688,66940.25000,66940.25000,4685.817500
104222,J203.JO,2021-05-28,66940.25000,67726.56250,66794.60938,67554.85938,67554.85938,4728.840157


In [18]:
# list of all indexes
index_list = []

# create folder for splitting the original csv (no error if it already exists)
os.makedirs("data_by_index", exist_ok=True)
# split the original csv into one file per index
for index, group in df.groupby('Index'):
    group.to_csv(f'data_by_index/{index}.csv', index=False)
    # Store the ticker string itself; `{index}` would build a one-element set.
    index_list.append(index)

# example csv split
pd.read_csv("data_by_index/NYA.csv").set_index("Date")

Unnamed: 0_level_0,Index,Open,High,Low,Close,Adj Close,CloseUSD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,NYA,7184.979980,7331.120117,7184.979980,7326.740234,7326.740234,7326.740234
2010-01-05,NYA,7326.740234,7359.459961,7313.600098,7354.870117,7354.870117,7354.870117
2010-01-06,NYA,7354.850098,7389.279785,7342.490234,7377.700195,7377.700195,7377.700195
2010-01-07,NYA,7377.700195,7398.209961,7325.509766,7393.930176,7393.930176,7393.930176
2010-01-08,NYA,7393.930176,7426.410156,7367.810059,7425.350098,7425.350098,7425.350098
...,...,...,...,...,...,...,...
2021-05-24,NYA,16375.000000,16508.519530,16375.000000,16464.689450,16464.689450,16464.689450
2021-05-25,NYA,16464.689450,16525.810550,16375.150390,16390.189450,16390.189450,16390.189450
2021-05-26,NYA,16390.189450,16466.339840,16388.320310,16451.960940,16451.960940,16451.960940
2021-05-27,NYA,16451.960940,16546.359380,16451.960940,16531.949220,16531.949220,16531.949220


In [19]:
# Read the NYA and N225 splits written earlier, keyed by their "Date" column.
nya_df, tko_df = [
    pd.read_csv(f"data_by_index/{ticker}.csv").set_index("Date")
    for ticker in ("NYA", "N225")
]

# Log return function
def lg_return(df):
    """Return the open-to-close log return, log(Close) - log(Open), per row.

    Args:
        df: DataFrame with numeric "Open" and "Close" columns.

    Returns:
        A list of floats, one per row of ``df``, in row order.
    """
    # Work on whole columns at once instead of a per-row iterrows() loop.
    log_close = np.log(df["Close"].to_numpy(dtype=float))
    log_open = np.log(df["Open"].to_numpy(dtype=float))
    return (log_close - log_open).tolist()

# Simple (arithmetic) return function
def returns(df):
    """Return the open-to-close simple return, Close / Open - 1, per row.

    Args:
        df: DataFrame with numeric "Open" and "Close" columns.

    Returns:
        A list of floats, one per row of ``df``, in row order. The division
        follows NumPy semantics, so an "Open" of 0 gives inf/nan.
    """
    # Work on whole columns at once instead of a per-row iterrows() loop.
    close_px = df["Close"].to_numpy(dtype=float)
    open_px = df["Open"].to_numpy(dtype=float)
    return (close_px / open_px - 1).tolist()

# insert log returns to each dataset
# Plain column assignment appends the column on the first run and overwrites it
# on later runs, so this cell can be re-executed. DataFrame.insert raises
# ValueError once the column already exists.
nya_df["Daily Log Returns"] = lg_return(nya_df)
tko_df["Daily Log Returns"] = lg_return(tko_df)

nya_df

Unnamed: 0_level_0,Index,Open,High,Low,Close,Adj Close,CloseUSD,Daily Log Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-04,NYA,7184.979980,7331.120117,7184.979980,7326.740234,7326.740234,7326.740234,0.019538
2010-01-05,NYA,7326.740234,7359.459961,7313.600098,7354.870117,7354.870117,7354.870117,0.003832
2010-01-06,NYA,7354.850098,7389.279785,7342.490234,7377.700195,7377.700195,7377.700195,0.003102
2010-01-07,NYA,7377.700195,7398.209961,7325.509766,7393.930176,7393.930176,7393.930176,0.002197
2010-01-08,NYA,7393.930176,7426.410156,7367.810059,7425.350098,7425.350098,7425.350098,0.004240
...,...,...,...,...,...,...,...,...
2021-05-24,NYA,16375.000000,16508.519530,16375.000000,16464.689450,16464.689450,16464.689450,0.005462
2021-05-25,NYA,16464.689450,16525.810550,16375.150390,16390.189450,16390.189450,16390.189450,-0.004535
2021-05-26,NYA,16390.189450,16466.339840,16388.320310,16451.960940,16451.960940,16451.960940,0.003762
2021-05-27,NYA,16451.960940,16546.359380,16451.960940,16531.949220,16531.949220,16531.949220,0.004850


In [20]:
# even out the dataframes such that they have the same dates
def even_out_frames(df1, df2):
    
    df1_index = df1.index.to_numpy()
    for index, row in df2.iterrows():
        if index not in df1_index:
            df2.drop(index=index, inplace=True)
    
    df2_index = df2.index.to_numpy()
    for index, row in df1.iterrows():
        if index not in df2_index:
                df1.drop(index=index, inplace=True)
    return 0

# Trim the Tokyo and New York frames to their shared index labels
# (both frames are modified in place), then show the Tokyo frame.
even_out_frames(df1=tko_df, df2=nya_df)
tko_df

Unnamed: 0_level_0,Index,Open,High,Low,Close,Adj Close,CloseUSD,Daily Log Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-04,N225,10609.33984,10694.49023,10608.13965,10654.79004,10654.79004,106.547900,0.004275
2010-01-05,N225,10719.44043,10791.04004,10655.57031,10681.83008,10681.83008,106.818301,-0.003515
2010-01-06,N225,10709.54981,10768.61035,10661.16992,10731.45020,10731.45020,107.314502,0.002043
2010-01-07,N225,10742.75000,10774.00000,10636.66992,10681.66016,10681.66016,106.816602,-0.005703
2010-01-08,N225,10743.29981,10816.45020,10677.55957,10798.32031,10798.32031,107.983203,0.005108
...,...,...,...,...,...,...,...,...
2021-05-24,N225,28212.32031,28584.17969,28212.32031,28364.60938,28364.60938,283.646094,0.005383
2021-05-25,N225,28516.99023,28576.97070,28443.74023,28553.98047,28553.98047,285.539805,0.001296
2021-05-26,N225,28396.61914,28710.83008,28396.61914,28642.18945,28642.18945,286.421895,0.008611
2021-05-27,N225,28543.32031,28587.21094,28360.56055,28549.00977,28549.00977,285.490098,0.000199


In [21]:
# Tokyo close, matched to the NYA rows through the shared Date index.
tko_close = tko_df["Close"]

nya_df.insert(8, "Prev Tokyo Close", tko_close)

# Pair each NYA row with the Tokyo open of the following row: remove Tokyo's
# first row, then pad the end with 0 so the lengths match again.
# Rows are addressed by position rather than by hard-coded date labels, so the
# cell keeps working if the start year or the end of the data changes.
tko_df.drop(index=tko_df.index[0], inplace=True)

tko_open = tko_df["Open"]

tko_open = np.append(tko_open, 0)

nya_df.insert(9, "Next Tokyo Open", tko_open)
# The final NYA row only holds the 0 placeholder, so discard it.
nya_df.drop(index=nya_df.index[-1], inplace=True)

In [22]:
nya_df.drop(columns=["CloseUSD", "Adj Close"], inplace=True)

# The regression target is the next Tokyo open. Taking it from NYA "Close"
# would make the target identical to one of the input features.
next_tko_open = nya_df["Next Tokyo Open"].astype(np.float32)

# Features: every remaining column except the ticker label, the log return and
# the target itself.
nya_features = nya_df.drop(columns=["Index", "Daily Log Returns", "Next Tokyo Open"]).astype(np.float32)

# A fixed random_state makes the shuffled split reproducible across runs.
nya_train, nya_test, tko_open_train, tko_open_test = train_test_split(
    nya_features,
    next_tko_open,
    test_size=0.33,
    shuffle=True,
    random_state=42,
)


In [29]:
# Each sample is fed as a sequence of `nya_train.shape[1]` steps with one value
# per step.
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(nya_train.shape[1], 1)))
# The last recurrent layer returns only its final output, so the network emits
# one prediction per sample. With return_sequences=True here the output would
# be one value per step, which does not match a scalar target.
model.add(GRU(128))
model.add(Dense(32, activation="relu"))
model.add(Dense(1))
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_11 (LSTM)              (None, 5, 128)            66560     
                                                                 
 gru_2 (GRU)                 (None, 5, 128)            99072     
                                                                 
 lstm_12 (LSTM)              (None, 5, 64)             49408     
                                                                 
 gru_3 (GRU)                 (None, 64)                24960     
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 1)                 33        
                                                                 
Total params: 242,113
Trainable params: 242,113
Non-tr

In [30]:
min_max_scaler = MinMaxScaler()

# Learn the min/max range from the training features only ...
train_scale = min_max_scaler.fit_transform(nya_train)

# ... and apply that same range to the test features. Re-fitting on the test
# set would scale the two sets differently and leak test statistics.
test_scale = min_max_scaler.transform(nya_test)

In [31]:
# Train on the scaled training features against mean squared error with Adam.
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x=train_scale, y=tko_open_train, epochs=50, batch_size=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50

KeyboardInterrupt: 