<a href="https://colab.research.google.com/github/Sina-Akhavi/bitcoin-timeseries-Forecasting/blob/main/bitcoin_forecasting_using_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

# Later cells read and write files relative to this directory. Guard the
# chdir so that re-running this cell does not try to enter
# ./sample_data/sample_data and fail.
if os.path.basename(os.getcwd()) != 'sample_data':
    os.chdir('./sample_data')

In [3]:
import pandas as pd
import numpy as np
import math
import datetime as dt
import matplotlib.pyplot as plt
from itertools import cycle
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
# ------------------------------------------------
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
# ------------------------------------------------
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM


# Load the raw price table; the path is resolved against the current working directory.
maindf = pd.read_csv('./btc_data.csv')

# Bare expression so the notebook renders the frame as this cell's output.
maindf


Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2014-09-17,457.334015,468.174011,452.421997,465.864014,21056800
1,2014-09-18,424.440002,456.859985,413.104004,456.859985,34483200
2,2014-09-19,394.795990,427.834991,384.532013,424.102997,37919700
3,2014-09-20,408.903992,423.295990,389.882996,394.673004,36863600
4,2014-09-21,398.821014,412.425995,393.181000,408.084991,26580100
...,...,...,...,...,...,...
3862,2025-04-14,84542.390625,85785.000000,83690.640625,83694.523438,34090769777
3863,2025-04-15,83668.992188,86429.351562,83598.820312,84539.695312,28040322885
3864,2025-04-16,84033.867188,85428.281250,83100.617188,83674.507812,29617804112
3865,2025-04-17,84895.750000,85449.070312,83749.750000,84030.671875,21276866029


# Check for null values

In [4]:
# Total number of missing cells across every column of the raw frame.
print("Null values: ", maindf.isna().to_numpy().sum())


Null values:  0


In [5]:
# True if at least one cell anywhere in the raw frame is missing.
print("NA values: ", maindf.isna().to_numpy().any())

NA values:  False


In [6]:
# (rows, columns) of the raw frame.
maindf.shape

(3867, 6)

# Overall overview from 2014 to 2022

In [None]:
# Parse the dates once so range filters and the `.dt` accessor operate on timestamps.
maindf['Date'] = pd.to_datetime(maindf['Date'], format='%Y-%m-%d')

# Overview window, inclusive on both ends.
in_window = maindf['Date'].between('2014-09-17', '2022-02-19')

# Drop the columns the overview does not use. The original call discarded its
# result and named 'Adj Close', which is not among the loaded columns and would
# raise KeyError; errors='ignore' tolerates files with or without that column.
y_overall = maindf.loc[in_window].drop(columns=['Adj Close', 'Volume'], errors='ignore')

# Mean Open/Close per calendar month (all years pooled), listed January..December.
new_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
monthvise = (
    y_overall
    .groupby(y_overall['Date'].dt.strftime('%B'))[['Open', 'Close']]
    .mean()
    .reindex(new_order, axis=0)
)
monthvise

Unnamed: 0_level_0,Open,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
January,12855.131425,12828.374881
February,12773.095753,12837.782739
March,10918.895761,10957.226324
April,11338.4489,11359.962198
May,10659.455257,10580.209317
June,9299.305977,9294.420703
July,9285.4025,9330.128271
August,11312.971706,11345.157739
September,10489.365578,10462.37815
October,11321.578327,11416.077925


In [None]:
# Legend names, applied to the traces in the same order as the `y` columns below.
names = cycle(['Stock Open Price', 'Stock Close Price', 'Stock High Price', 'Stock Low Price'])

# Wide-form input: one trace per listed column. Plotly Express names the melted
# y-values column 'value' (lower case), so that is the key `labels` must use;
# the previous 'Value' key matched nothing and left the axis unlabeled.
fig = px.line(y_overall, x='Date', y=['Open', 'Close', 'High', 'Low'],
              labels={'Date': 'Date', 'value': 'Stock Value'})

fig.update_layout(title_text='Stock Analysis chart', font_size=15, font_color='black', legend_title_text='Stock Parameters')
fig.for_each_trace(lambda t: t.update(name=next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.show()

# Normalization

In [7]:
# Keep only the date and the closing price for the rest of the notebook.
closedf = maindf[['Date', 'Close']]
closedf

Unnamed: 0,Date,Close
0,2014-09-17,457.334015
1,2014-09-18,424.440002
2,2014-09-19,394.795990
3,2014-09-20,408.903992
4,2014-09-21,398.821014
...,...,...
3862,2025-04-14,84542.390625
3863,2025-04-15,83668.992188
3864,2025-04-16,84033.867188
3865,2025-04-17,84895.750000


In [8]:
# Restrict the series to the modelling window: strictly after 2014-09-17 and
# strictly before 2020-08-01.
closedf = closedf[(closedf['Date'] > '2014-09-17') & (closedf['Date'] < '2020-08-01')]

# Independent copy that keeps the Date column for the plots further down.
close_stock = closedf.copy()


In [9]:
# Display the windowed Date/Close frame.
close_stock

Unnamed: 0,Date,Close
1,2014-09-18,424.440002
2,2014-09-19,394.795990
3,2014-09-20,408.903992
4,2014-09-21,398.821014
5,2014-09-22,402.152008
...,...,...
2140,2020-07-27,10990.873047
2141,2020-07-28,10912.823242
2142,2020-07-29,11100.467773
2143,2020-07-30,11111.213867


In [10]:
# `labels` is keyed by column name; the columns are 'Date' and 'Close', so the
# previous lower-case keys ('date', 'close') were silently ignored.
fig = px.line(closedf, x='Date', y='Close', labels={'Date': 'Date', 'Close': 'Close Stock'})
fig.update_traces(marker_line_width=2, opacity=0.8, marker_line_color='orange')
fig.update_layout(title_text='Considered Period to predict Bitcoin Close Price', plot_bgcolor='white',
                  font_size=15, font_color='black')
fig.show()

In [11]:
# Closing prices as an (n, 1) column vector. Selecting the column avoids
# deleting 'Date' in place from a filtered frame.
close_values = closedf['Close'].to_numpy().reshape(-1, 1)

# Fit the scaler on the chronological training portion only (the same 80% cut
# used by the train/test split cell) so that no statistics from the later test
# rows leak into preprocessing. Rows outside the fitted range can therefore
# scale to values outside [0, 1].
scaler_fit_rows = int(len(close_values) * 0.80)
scaler = MinMaxScaler()
scaler.fit(close_values[:scaler_fit_rows])

# From here on `closedf` is the scaled NumPy array, not a DataFrame.
closedf = scaler.transform(close_values)

closedf

array([[0.01275083],
       [0.0112164 ],
       [0.01194666],
       ...,
       [0.56536035],
       [0.56591659],
       [0.57690316]])

In [12]:
# Chronological 80/20 split: the first block of rows trains, the remainder tests.
training_size = int(0.80 * len(closedf))
test_size = len(closedf) - training_size

train_data = closedf[:training_size, :]
test_data = closedf[training_size:, :1]

print('train_data shape: ', train_data.shape)
print('test_data shape: ', test_data.shape)

train_data shape:  (1715, 1)
test_data shape:  (429, 1)


# Getting familiar with the dataset

- Training data: 1715 records, starting on 2014-09-18
- Test data: 429 records, ending on 2020-07-31


In [13]:
def create_XtrainYtrain(dataset, time_step, include_last=False):
  """Build sliding-window samples from the first column of a 2-D series.

  Sample ``i`` is ``dataset[i : i + time_step, 0]`` and its target is
  ``dataset[i + time_step, 0]``.

  Args:
    dataset: 2-D array-like of shape (n, k); only column 0 is used.
    time_step: number of consecutive rows in each input window.
    include_last: when False (the default, matching the historical
      behaviour) the final valid window is skipped, giving
      ``n - time_step - 1`` samples. When True every valid window is
      returned (``n - time_step`` samples).

  Returns:
    ``(X, Y)`` where ``X`` has shape (samples, time_step) and ``Y`` has shape
    (samples,). If the series is too short for any window, ``X`` has shape
    (0, time_step) so a later reshape to 3-D still works.
  """
  dataset = np.asarray(dataset)

  n_windows = len(dataset) - time_step - (0 if include_last else 1)
  if n_windows <= 0:
    empty_X = np.empty((0, time_step), dtype=dataset.dtype)
    empty_Y = np.empty((0,), dtype=dataset.dtype)
    return empty_X, empty_Y

  data_X = np.stack([dataset[i: i + time_step, 0] for i in range(n_windows)])
  # Targets are the values immediately following each window.
  data_Y = dataset[time_step: time_step + n_windows, 0].copy()

  return data_X, data_Y



In [14]:
# Number of past observations fed to the model for each prediction.
time_step = 15

# Window the train and test splits separately so no window spans both.
(X_train, Y_train), (X_test, Y_test) = (
    create_XtrainYtrain(split, time_step) for split in (train_data, test_data)
)



In [15]:
# Report the shape of every windowed array, inputs first, then targets, per split.
for caption, array in (
    ('x_train shape:', X_train),
    ('y_train shape:', Y_train),
    ('x_test shape:', X_test),
    ('y_test shape:', Y_test),
):
    print(caption, array.shape)

x_train shape: (1699, 15)
y_train shape: (1699,)
x_test shape: (413, 15)
y_test shape: (413,)


In [16]:
# Append a trailing axis of size 1 so the inputs are (samples, time steps, 1).
# Built from the first two dimensions, so re-running the cell is a no-op.
X_train = X_train.reshape(*X_train.shape[:2], 1)
X_test = X_test.reshape(*X_test.shape[:2], 1)

print('X_train.shape: ', X_train.shape)
print('X_test.shape: ', X_test.shape)

X_train.shape:  (1699, 15, 1)
X_test.shape:  (413, 15, 1)


# LSTM Model Construction

In [17]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Two stacked LSTM layers followed by dropout and a single linear output unit.
# The input shape is declared with an explicit Input layer instead of the
# deprecated `input_shape=` layer argument, which triggers a Keras warning.
model = Sequential([
    tf.keras.Input(shape=(time_step, 1)),
    LSTM(31, activation='relu', return_sequences=True),  # passes the full sequence to the next LSTM
    LSTM(31, activation='relu'),
    Dropout(0.5),
    Dense(1),
])

model.compile(loss="mean_squared_error", optimizer="adam")

# NOTE(review): the test split doubles as validation data here, so `val_loss`
# is not an independent estimate if it is ever used for model selection.
history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test),
                    epochs=200, batch_size=32, verbose=1)

Epoch 1/200



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - loss: 0.0308 - val_loss: 0.0043
Epoch 2/200
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.0092 - val_loss: 0.0142
Epoch 3/200
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.0081 - val_loss: 0.0120
Epoch 4/200
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - loss: 0.0058 - val_loss: 0.0026
Epoch 5/200
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 0.0049 - val_loss: 0.0067
Epoch 6/200
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.0052 - val_loss: 0.0037
Epoch 7/200
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 0.0047 - val_loss: 0.0074
Epoch 8/200
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 0.0045 - val_loss: 0.0019
Epoch 9/200
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━

In [20]:
# Write the model to 'lstm_model.h5' in the current working directory.
model.save('lstm_model.h5')



In [None]:
# Let's do the prediction

In [18]:
# Run inference on both splits (train first, then test).
train_predict, test_predict = model.predict(X_train), model.predict(X_test)

# Map the scaled predictions back to price units.
train_predict, test_predict = (
    scaler.inverse_transform(scaled) for scaled in (train_predict, test_predict)
)

# Do the same for the targets, which need a column shape for the scaler.
original_ytrain, original_ytest = (
    scaler.inverse_transform(target.reshape(-1, 1)) for target in (Y_train, Y_test)
)

[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [19]:
# Make sure your arrays are 1D
train_pred = train_predict.flatten()
test_pred  = test_predict.flatten()
y_train    = original_ytrain.flatten()
y_test     = original_ytest.flatten()

# 1) Save train set results
df_train = pd.DataFrame({
    'y_true_train': y_train,
    'y_pred_train': train_pred
})
df_train.to_csv('train_results.csv', index=False)

# 2) Save test set results
df_test = pd.DataFrame({
    'y_true_test': y_test,
    'y_pred_test': test_pred
})
df_test.to_csv('test_results.csv', index=False)

print("CSVs saved: train_results.csv, test_results.csv")

CSVs saved: train_results.csv, test_results.csv


# Evaluation metrics


In [None]:
def calculate_mape(actual, predicted) -> float:
    """Return the mean absolute percentage error, in percent, rounded to 2 decimals.

    Both inputs are flattened before comparison, so a column vector of shape
    (n, 1) and a flat vector of shape (n,) are compared element by element
    instead of being broadcast into an (n, n) matrix.

    Args:
        actual: array-like of true values.
        predicted: array-like of predicted values with the same number of
            elements as ``actual``.

    Returns:
        ``mean(|actual - predicted| / |actual|) * 100`` rounded to two decimals.
        Zero entries in ``actual`` are not special-cased: NumPy's division
        yields inf/nan for them and that propagates into the result.

    Raises:
        ValueError: if the inputs do not contain the same number of elements.
    """
    actual = np.asarray(actual, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()

    if actual.shape != predicted.shape:
        raise ValueError(
            f"actual and predicted must have the same number of elements, "
            f"got {actual.size} and {predicted.size}"
        )

    relative_error = np.abs((actual - predicted) / actual)
    return round(float(np.mean(relative_error) * 100), 2)



# Compute each split's MSE once; RMSE is its square root.
train_mse = mean_squared_error(original_ytrain, train_predict)
test_mse = mean_squared_error(original_ytest, test_predict)

# Training-set errors, in price units (MAPE in percent).
print('Train Data RMSE: ', math.sqrt(train_mse))
print('Train Data MSE: ', train_mse)
print('Train Data MAE: ', mean_absolute_error(original_ytrain, train_predict))
print('Train MAPE=', calculate_mape(original_ytrain, train_predict))
print('----------------------------------------------------------')
# Test-set errors, same metrics.
print('Test Data RMSE: ', math.sqrt(test_mse))
print('Test Data MSE: ', test_mse)
print('Test Data MAE: ', mean_absolute_error(original_ytest, test_predict))
print('Test MAPE=', calculate_mape(original_ytest, test_predict))

Train Data RMSE:  350.3446208693014
Train Data MSE:  122741.35337205455
Train Data MAE:  242.47171880348043
Train MAPE= 33.31
----------------------------------------------------------
Test Data RMSE:  456.7160096862385
Test Data MSE:  208589.51350372034
Test Data MAE:  335.2106419208535
Test MAPE= 3.86


In [None]:
# shift train predictions for plotting
look_back = time_step
trainPredictPlot = np.empty_like(closedf)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back: len(train_predict) + look_back, :] = train_predict
print("Train Predicted data: ", trainPredictPlot.shape)

# shift test predictions for plotting
testPredictPlot = np.empty_like(closedf)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(train_predict) + (look_back * 2) + 1: len(closedf)-1, :] = test_predict
print("Test Predicted data: ", testPredictPlot.shape)

names = cycle(['Original close price', 'Train Predicted close price',  'Test Predicted close price'])

plotdf = pd.DataFrame({'date': close_stock['Date'],
                       'original_close': close_stock['Close'],
                       'train_predicted_close': trainPredictPlot.reshape(1, -1)[0].tolist(),
                       'test_predicted_close': testPredictPlot.reshape(1, -1)[0].tolist()})

fig = px.line(plotdf, x=plotdf['date'], y=[plotdf['original_close'], plotdf['train_predicted_close'], plotdf['test_predicted_close']],
              labels={'value': 'stock price', 'date': 'Data'})

fig.update_layout(title_text='Comparison between original close price vs predicted close price', plot_bgcolor='white', font_size=15,
                  font_color='black', legend_title_text='Close price')

fig.for_each_trace(lambda t: t.update(name = next(names)))
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()


Train Predicted data:  (2144, 1)
Test Predicted data:  (2144, 1)
