In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

stock_df = pd.read_csv('indexData.csv')
stock_df

Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume
0,NYA,1965-12-31,528.690002,528.690002,528.690002,528.690002,528.690002,0.0
1,NYA,1966-01-03,527.210022,527.210022,527.210022,527.210022,527.210022,0.0
2,NYA,1966-01-04,527.840027,527.840027,527.840027,527.840027,527.840027,0.0
3,NYA,1966-01-05,531.119995,531.119995,531.119995,531.119995,531.119995,0.0
4,NYA,1966-01-06,532.070007,532.070007,532.070007,532.070007,532.070007,0.0
...,...,...,...,...,...,...,...,...
112452,N100,2021-05-27,1241.119995,1251.910034,1241.119995,1247.069946,1247.069946,379696400.0
112453,N100,2021-05-28,1249.469971,1259.209961,1249.030029,1256.599976,1256.599976,160773400.0
112454,N100,2021-05-31,1256.079956,1258.880005,1248.140015,1248.930054,1248.930054,91173700.0
112455,N100,2021-06-01,1254.609985,1265.660034,1254.609985,1258.579956,1258.579956,155179900.0


### Data preprocessing

In [20]:
stock_NYA_df = stock_df[stock_df['Index'] == 'NYA']
stock_NYA_df

Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume
0,NYA,1965-12-31,528.690002,528.690002,528.690002,528.690002,528.690002,0.000000e+00
1,NYA,1966-01-03,527.210022,527.210022,527.210022,527.210022,527.210022,0.000000e+00
2,NYA,1966-01-04,527.840027,527.840027,527.840027,527.840027,527.840027,0.000000e+00
3,NYA,1966-01-05,531.119995,531.119995,531.119995,531.119995,531.119995,0.000000e+00
4,NYA,1966-01-06,532.070007,532.070007,532.070007,532.070007,532.070007,0.000000e+00
...,...,...,...,...,...,...,...,...
13943,NYA,2021-05-24,16375.000000,16508.519530,16375.000000,16464.689450,16464.689450,2.947400e+09
13944,NYA,2021-05-25,16464.689450,16525.810550,16375.150390,16390.189450,16390.189450,3.420870e+09
13945,NYA,2021-05-26,16390.189450,16466.339840,16388.320310,16451.960940,16451.960940,3.674490e+09
13946,NYA,2021-05-27,16451.960940,16546.359380,16451.960940,16531.949220,16531.949220,5.201110e+09


In [21]:
stock_NYA_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13948 entries, 0 to 13947
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Index      13948 non-null  object 
 1   Date       13948 non-null  object 
 2   Open       13947 non-null  float64
 3   High       13947 non-null  float64
 4   Low        13947 non-null  float64
 5   Close      13947 non-null  float64
 6   Adj Close  13947 non-null  float64
 7   Volume     13947 non-null  float64
dtypes: float64(6), object(2)
memory usage: 980.7+ KB


In [22]:
stock_NYA_df = stock_NYA_df.dropna()
stock_NYA_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13947 entries, 0 to 13947
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Index      13947 non-null  object 
 1   Date       13947 non-null  object 
 2   Open       13947 non-null  float64
 3   High       13947 non-null  float64
 4   Low        13947 non-null  float64
 5   Close      13947 non-null  float64
 6   Adj Close  13947 non-null  float64
 7   Volume     13947 non-null  float64
dtypes: float64(6), object(2)
memory usage: 980.6+ KB


In [23]:
stock_NYA_df['Date'] = pd.to_datetime(stock_NYA_df['Date'])
stock_NYA_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13947 entries, 0 to 13947
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Index      13947 non-null  object        
 1   Date       13947 non-null  datetime64[ns]
 2   Open       13947 non-null  float64       
 3   High       13947 non-null  float64       
 4   Low        13947 non-null  float64       
 5   Close      13947 non-null  float64       
 6   Adj Close  13947 non-null  float64       
 7   Volume     13947 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 980.6+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_NYA_df['Date'] = pd.to_datetime(stock_NYA_df['Date'])


In [24]:
stock_NYA_df['day_difference'] = stock_NYA_df['Date'].diff().dt.days
stock_NYA_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_NYA_df['day_difference'] = stock_NYA_df['Date'].diff().dt.days


Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume,day_difference
0,NYA,1965-12-31,528.690002,528.690002,528.690002,528.690002,528.690002,0.000000e+00,
1,NYA,1966-01-03,527.210022,527.210022,527.210022,527.210022,527.210022,0.000000e+00,3.0
2,NYA,1966-01-04,527.840027,527.840027,527.840027,527.840027,527.840027,0.000000e+00,1.0
3,NYA,1966-01-05,531.119995,531.119995,531.119995,531.119995,531.119995,0.000000e+00,1.0
4,NYA,1966-01-06,532.070007,532.070007,532.070007,532.070007,532.070007,0.000000e+00,1.0
...,...,...,...,...,...,...,...,...,...
13943,NYA,2021-05-24,16375.000000,16508.519530,16375.000000,16464.689450,16464.689450,2.947400e+09,3.0
13944,NYA,2021-05-25,16464.689450,16525.810550,16375.150390,16390.189450,16390.189450,3.420870e+09,1.0
13945,NYA,2021-05-26,16390.189450,16466.339840,16388.320310,16451.960940,16451.960940,3.674490e+09,1.0
13946,NYA,2021-05-27,16451.960940,16546.359380,16451.960940,16531.949220,16531.949220,5.201110e+09,1.0


In [25]:
stock_NYA_df['day_difference'].value_counts()

day_difference
1.0    10889
3.0     2556
4.0      332
2.0      165
5.0        3
7.0        1
Name: count, dtype: int64

In [26]:
data = stock_NYA_df['Adj Close'].to_numpy()
data

array([  528.690002,   527.210022,   527.840027, ..., 16451.96094 ,
       16531.94922 , 16555.66016 ])

In [27]:
data.shape

(13947,)

In [28]:
data = np.reshape(data, (len(data), 1))
data.shape

(13947, 1)

In [29]:
data

array([[  528.690002],
       [  527.210022],
       [  527.840027],
       ...,
       [16451.96094 ],
       [16531.94922 ],
       [16555.66016 ]])

In [30]:
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)
scaled_data

array([[0.01113857],
       [0.01104745],
       [0.01108624],
       ...,
       [0.991475  ],
       [0.99639958],
       [0.99785937]])

In [33]:
window_size = 60

X, y = [], []

for i in range(window_size, len(scaled_data)):
    X.append(scaled_data[i-window_size:i, 0])
    y.append(scaled_data[i, 0])

X[:2]

[array([0.01113857, 0.01104745, 0.01108624, 0.01128818, 0.01134666,
        0.01137929, 0.01145748, 0.01148334, 0.01142486, 0.01149012,
        0.01155476, 0.01167851, 0.01176962, 0.01169143, 0.0115683 ,
        0.01158739, 0.01169143, 0.01171791, 0.01168528, 0.01167235,
        0.0115683 , 0.01142486, 0.01116443, 0.01130788, 0.01134666,
        0.01155476, 0.01166558, 0.01163972, 0.01182811, 0.01173699,
        0.01173084, 0.01169821, 0.01155476, 0.0115486 , 0.01137252,
        0.01130788, 0.0111121 , 0.01096926, 0.0107544 , 0.01084551,
        0.0108843 , 0.01047427, 0.01016213, 0.01029203, 0.01019414,
        0.00976503, 0.00979704, 0.01005747, 0.01006424, 0.01003838,
        0.00967391, 0.0094652 , 0.00963451, 0.0097324 , 0.009882  ,
        0.01012273, 0.01021384, 0.0101098 , 0.01015536, 0.01023355]),
 array([0.01104745, 0.01108624, 0.01128818, 0.01134666, 0.01137929,
        0.01145748, 0.01148334, 0.01142486, 0.01149012, 0.01155476,
        0.01167851, 0.01176962, 0.01169143, 0.

In [35]:
X, y = np.array(X), np.array(y)
X[:2]

array([[0.01113857, 0.01104745, 0.01108624, 0.01128818, 0.01134666,
        0.01137929, 0.01145748, 0.01148334, 0.01142486, 0.01149012,
        0.01155476, 0.01167851, 0.01176962, 0.01169143, 0.0115683 ,
        0.01158739, 0.01169143, 0.01171791, 0.01168528, 0.01167235,
        0.0115683 , 0.01142486, 0.01116443, 0.01130788, 0.01134666,
        0.01155476, 0.01166558, 0.01163972, 0.01182811, 0.01173699,
        0.01173084, 0.01169821, 0.01155476, 0.0115486 , 0.01137252,
        0.01130788, 0.0111121 , 0.01096926, 0.0107544 , 0.01084551,
        0.0108843 , 0.01047427, 0.01016213, 0.01029203, 0.01019414,
        0.00976503, 0.00979704, 0.01005747, 0.01006424, 0.01003838,
        0.00967391, 0.0094652 , 0.00963451, 0.0097324 , 0.009882  ,
        0.01012273, 0.01021384, 0.0101098 , 0.01015536, 0.01023355],
       [0.01104745, 0.01108624, 0.01128818, 0.01134666, 0.01137929,
        0.01145748, 0.01148334, 0.01142486, 0.01149012, 0.01155476,
        0.01167851, 0.01176962, 0.01169143, 0.0

In [36]:
X.shape

(13887, 60)

In [40]:
test_split = 0.2
split_index = int(len(y) * (1-test_split))
X_train = X[:split_index]
Y_train = y[:split_index]
X_test = X[(split_index + window_size - 1):]
Y_test = y[(split_index + window_size - 1):]

indices = np.arange(len(Y_train))
np.random.shuffle(indices)
X_train = X_train[indices]
Y_train = Y_train[indices]

In [41]:
X_train[:2]

array([[0.14419314, 0.14450589, 0.14548233, 0.14468812, 0.14492885,
        0.14455145, 0.14228582, 0.14156979, 0.14128967, 0.14220147,
        0.14337922, 0.14548848, 0.14546941, 0.14617249, 0.14653696,
        0.14651727, 0.1460549 , 0.14675182, 0.14559254, 0.14497441,
        0.14361379, 0.14362057, 0.143881  , 0.14496149, 0.14747399,
        0.14699256, 0.14572305, 0.14522806, 0.14567073, 0.14407616,
        0.14412788, 0.14477247, 0.144597  , 0.14416727, 0.14345126,
        0.1444862 , 0.14427809, 0.14440123, 0.14360703, 0.14283252,
        0.14171325, 0.13893969, 0.13884857, 0.1397733 , 0.14033292,
        0.14061922, 0.14030031, 0.13875745, 0.14003987, 0.14008543,
        0.13993582, 0.13928506, 0.13729955, 0.13754029, 0.13824953,
        0.13854935, 0.14024797, 0.14065861, 0.14168676, 0.14136171],
       [0.10831108, 0.10855858, 0.10911144, 0.10762707, 0.10926782,
        0.10924196, 0.11034215, 0.11014021, 0.11252282, 0.11238615,
        0.11227533, 0.11200876, 0.11141586, 0.1

### Creating a model

In [42]:
model_lstm = tf.keras.models.Sequential()
model_lstm.add(tf.keras.layers.LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model_lstm.add(tf.keras.layers.LSTM(units=50))
model_lstm.add(tf.keras.layers.Dense(units=1))

model_lstm.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
model_lstm.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 60, 50)            10400     
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 30651 (119.73 KB)
Trainable params: 30651 (119.73 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Traning a model

In [43]:
model_lstm.fit(X_train, Y_train, epochs=100, batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
 70/348 [=====>........................] - ETA: 14s - loss: 2.6633e-05 - accuracy: 4.4643e-04

KeyboardInterrupt: 