In [1]:
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import joblib

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [2]:
seed = 42

In [3]:
coin = 'KRW-ARK'

In [4]:
df = pd.read_csv(f'../data/from_pyupbit/{coin}.csv')

In [5]:
df

Unnamed: 0,timestamp,open,high,low,close,volume,value
0,2020-08-22 11:27:00,558.0,558.0,558.0,558.0,98.000000,5.468400e+04
1,2020-08-22 11:28:00,560.0,560.0,560.0,560.0,29.374300,1.644961e+04
2,2020-08-22 11:29:00,561.0,561.0,560.0,561.0,1351.474971,7.581088e+05
3,2020-08-22 11:30:00,561.0,561.0,561.0,561.0,9.074733,5.090925e+03
4,2020-08-22 11:40:00,561.0,561.0,560.0,560.0,95.126560,5.336000e+04
...,...,...,...,...,...,...,...
1576795,2024-10-12 20:08:00,660.1,661.5,660.1,661.5,43947.008730,2.903625e+07
1576796,2024-10-12 20:09:00,661.5,661.5,660.1,660.8,102755.667870,6.787026e+07
1576797,2024-10-12 20:10:00,660.8,660.8,660.3,660.7,19605.391412,1.295274e+07
1576798,2024-10-12 20:11:00,660.7,660.7,659.4,660.3,33766.902404,2.228660e+07


In [6]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.set_index('timestamp', inplace=True)

In [7]:
df

Unnamed: 0_level_0,open,high,low,close,volume,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-08-22 11:27:00,558.0,558.0,558.0,558.0,98.000000,5.468400e+04
2020-08-22 11:28:00,560.0,560.0,560.0,560.0,29.374300,1.644961e+04
2020-08-22 11:29:00,561.0,561.0,560.0,561.0,1351.474971,7.581088e+05
2020-08-22 11:30:00,561.0,561.0,561.0,561.0,9.074733,5.090925e+03
2020-08-22 11:40:00,561.0,561.0,560.0,560.0,95.126560,5.336000e+04
...,...,...,...,...,...,...
2024-10-12 20:08:00,660.1,661.5,660.1,661.5,43947.008730,2.903625e+07
2024-10-12 20:09:00,661.5,661.5,660.1,660.8,102755.667870,6.787026e+07
2024-10-12 20:10:00,660.8,660.8,660.3,660.7,19605.391412,1.295274e+07
2024-10-12 20:11:00,660.7,660.7,659.4,660.3,33766.902404,2.228660e+07


In [8]:
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df['close'].values.reshape(-1, 1))
joblib.dump(scaler, f'models/{coin}_scaler.pkl')

['models/KRW-ARK_scaler.pkl']

In [9]:
timestep = 1

In [10]:
X = []
y = []
for i in range(len(scaled_data) - timestep - 1):
    X.append(scaled_data[i:(i + timestep), 0])
    y.append(scaled_data[i + timestep, 0])

In [11]:
X, y = np.array(X), np.array(y)

In [12]:
train_size = int(len(X) * .8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [13]:
train_dates = df.index[:train_size]
test_dates = df.index[train_size:]

In [14]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [15]:
print(X_train.shape, y_train.shape)

(1261438, 1, 1) (1261438,)


In [16]:
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(timestep, 1)))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))

In [17]:
model.compile(optimizer='adam', loss='mean_squared_error')

In [18]:
early_stop = EarlyStopping(monitor='val_loss', patience=10)

In [19]:
model.fit(X_train, y_train, batch_size=32, epochs=20,
          validation_data=(X_test, y_test), callbacks=[early_stop])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20


<keras.callbacks.History at 0x21c47daf8e0>

In [20]:
model.save(f'models/lstm_{coin}.h5')

In [21]:
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)



In [22]:
train_predict = scaler.inverse_transform(train_predict)
test_predict = scaler.inverse_transform(test_predict)
y_train = scaler.inverse_transform([y_train])
y_test = scaler.inverse_transform([y_test])

In [23]:
train_score = np.sqrt(mean_squared_error(y_train[0], train_predict[:, 0]))
test_score = np.sqrt(mean_squared_error(y_test[0], test_predict[:, 0]))
print(f'Train RMSE: {train_score:.2f}')
print(f'Test RMSE: {test_score:.2f}')

Train RMSE: 6.37
Test RMSE: 3.07


In [24]:
last_1_day = scaled_data[-timestep:]
X_predict = last_1_day.reshape(1, timestep, 1)
print(last_1_day)

[[0.06495296]]


In [25]:
pred = model.predict(X_predict)
pred = scaler.inverse_transform(pred)
print(pred)

[[657.81744]]


In [26]:
preds = []
for i in range(30):
    pred = model.predict(X_predict)
    preds.append(scaler.inverse_transform(pred)[0][0])
    pred = pred.reshape(1, 1, 1)
    X_predict = np.append(X_predict[:, 1:, :], pred, axis=1).reshape(1, timestep, 1)



In [27]:
preds

[657.81744,
 656.04114,
 654.27106,
 652.5071,
 650.7494,
 648.998,
 647.2528,
 645.5139,
 643.7813,
 642.05505,
 640.3352,
 638.62177,
 636.9147,
 635.214,
 633.5198,
 631.832,
 630.1507,
 628.47595,
 626.80774,
 625.1461,
 623.491,
 621.8425,
 620.2006,
 618.56537,
 616.9367,
 615.3147,
 613.69934,
 612.09064,
 610.4887,
 608.8935]