In [28]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
# download ethereum data from yfinance
eth = yf.download('ETH-USD', start='2022-12-01', end='2023-12-31', interval='5m')

# convert to pandas dataframe
eth = pd.DataFrame(eth)

eth.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-11-30 22:00:00+00:00,1295.510254,1295.809082,1295.049683,1295.049683,1295.049683,0
2022-11-30 22:05:00+00:00,1295.099365,1295.763306,1295.093018,1295.763306,1295.763306,12383232
2022-11-30 22:10:00+00:00,1295.908325,1295.908325,1295.14856,1295.14856,1295.14856,3593216
2022-11-30 22:15:00+00:00,1295.0271,1295.0271,1294.24939,1294.24939,1294.24939,0
2022-11-30 22:20:00+00:00,1294.17749,1294.395996,1294.007812,1294.395996,1294.395996,11730944


In [30]:
# change all the values to percentage change
eth['Open'] = eth['Open'].pct_change()
eth['High'] = eth['High'].pct_change()
eth['Low'] = eth['Low'].pct_change()
eth['Close'] = eth['Close'].pct_change()
eth['Volume'] = eth['Volume'].pct_change()

eth.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-11-30 22:00:00+00:00,,,,,1295.049683,
2022-11-30 22:05:00+00:00,-0.000317,-3.5e-05,3.3e-05,0.000551,1295.763306,inf
2022-11-30 22:10:00+00:00,0.000625,0.000112,4.3e-05,-0.000474,1295.14856,-0.709832
2022-11-30 22:15:00+00:00,-0.00068,-0.00068,-0.000694,-0.000694,1294.24939,-1.0
2022-11-30 22:20:00+00:00,-0.000656,-0.000487,-0.000187,0.000113,1294.395996,inf


In [31]:
# drop adj close column
eth = eth.drop(['Adj Close'], axis=1)

# drop nan values
eth = eth.dropna()

In [32]:
# calculate the number on infs
eth.isin([np.inf, -np.inf]).sum()

Open         0
High         0
Low          0
Close        0
Volume    2378
dtype: int64

In [33]:
# change all the infs to 0
eth = eth.replace([np.inf, -np.inf], 0)

In [34]:
# split the data into x and y train and test using built-in function
from sklearn.model_selection import train_test_split

x = eth.drop(['Close'], axis=1)
y = eth['Close']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, shuffle=False)

# scale the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [35]:
# calculate how many datapoints in 1 day
num_of_datapoints = int(24 * 60 / 5)

In [36]:
# LSTM model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from sklearn.metrics import mean_absolute_error, mean_squared_error

model = Sequential()
model.add(LSTM(50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(25))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')

# Create an empty list to store the input and output sequences
x_train_seq = []
y_train_seq = []
x_test_seq = []
y_test_seq = []

# Loop through the entire training dataset
for i in range(num_of_datapoints, len(x_train)):
    # Get one day's worth of data points from the training dataset
    x_train_seq.append(x_train[i-num_of_datapoints:i, 0])
    # Get the corresponding output (i.e. the next day's data points)
    y_train_seq.append(y_train.iloc[i])

# Loop through the entire testdataset
for i in range(num_of_datapoints, len(x_test)):
    # Get one day's worth of data points from the test dataset
    x_test_seq.append(x_test[i-num_of_datapoints:i, 0])
    # Get the corresponding output (i.e. the next day's data points)
    y_test_seq.append(y_test.iloc[i])

# Convert the input and output sequences to numpy arrays
x_train_seq = np.array(x_train_seq)
y_train_seq = np.array(y_train_seq)
x_test_seq = np.array(x_test_seq)
y_test_seq = np.array(y_test_seq)

# Reshape the input data to match the LSTM's input shape
x_train_seq = np.reshape(x_train_seq, (x_train_seq.shape[0], x_train_seq.shape[1], 1))
x_test_seq = np.reshape(x_test_seq, (x_test_seq.shape[0], x_test_seq.shape[1], 1))

print(x_train_seq.shape)

# train the model
model.fit(x_train_seq, y_train_seq, batch_size=1, epochs=1, validation_data=(x_test, y_test))

# plot the train and test loss
plt.plot(model.history.history['loss'], label='train')
plt.plot(model.history.history['val_loss'], label='test')



(8023, 288, 1)
  20/8023 [..............................] - ETA: 12:23 - loss: 0.0031

KeyboardInterrupt: 

In [None]:
# make predictions on the test set
y_pred = model.predict(x_test_seq)

# calculate the MAE
mae = mean_absolute_error(y_test_seq, y_pred)
print("MAE:", mae)


In [None]:
# plot the prediction and the test set
plt.plot(y_test_seq, label='test')
plt.plot(y_pred, label='prediction')
plt.legend()
plt.show()