In [None]:
# https://towardsdatascience.com/time-series-forecasting-with-deep-learning-and-attention-mechanism-2d001fc871fc

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Setting hyperparameters
n_iters = 299969
epochs = 1
learning_rate = 0.001
batch_size = 128
lr_decay = False
hidden_size = 64
num_layers = 1
teacher_forcing_ratio = 0.5

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')


# path = 'drive/MyDrive/code/bitcoin_data/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv'
path = "bitcoin_data/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
df1 = pd.read_csv(path)

In [5]:
df1['Volume_(BTC)'].fillna(value=0, inplace=True)
df1['Volume_(Currency)'].fillna(value=0, inplace=True)
df1['Weighted_Price'].fillna(value=0, inplace=True)

# next we need to fix the OHLC (open high low close) data which is a continuous timeseries so
# lets fill forwards those values...
df1['Open'].fillna(method='ffill', inplace=True)
df1['High'].fillna(method='ffill', inplace=True)
df1['Low'].fillna(method='ffill', inplace=True)
df1['Close'].fillna(method='ffill', inplace=True)

print(df1.shape)
df1.isnull().sum()

(4857377, 8)


Timestamp            0
Open                 0
High                 0
Low                  0
Close                0
Volume_(BTC)         0
Volume_(Currency)    0
Weighted_Price       0
dtype: int64

In [6]:
def generate_data(X, window, horizon=0):
  features = []
  y = []
  for i in range(0, len(X)-window-1-horizon):
    features.append(X[i:window+i])
    y.append(X[window+i: window+i+horizon])
  return np.array(features), np.array(y)

In [7]:
scaler = MinMaxScaler()
scaler.fit(df1["Weighted_Price"].to_numpy().reshape(-1, 1))
data = scaler.transform(df1["Weighted_Price"].to_numpy().reshape(-1, 1))

In [8]:
# Generating dataset

class bitcoin(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.len = X.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        # print(index)
        # print(self.X[index])
        # print(self.y[index])
        return torch.Tensor(self.X[index]), torch.Tensor([self.y[index]]).squeeze()

In [9]:
window = 60
horizon = 4

X, y = generate_data(data[2400000:4800000], window, horizon)

In [10]:
X.shape, y.shape

((2399935, 60, 1), (2399935, 4, 1))

In [11]:
train = bitcoin(X.squeeze()[:2299935], y.squeeze()[:2299935])

In [12]:
val = bitcoin(X.squeeze()[2299935:2349935], y.squeeze()[2299935:2349935])

In [13]:
test = bitcoin(X.squeeze()[2349935:2399934], y.squeeze()[2349935:2399934])

In [14]:
train_loader = DataLoader(train, batch_size=batch_size, shuffle=False, drop_last=True)
val_loader = DataLoader(val, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False, drop_last=True)

In [15]:
dataset_iter = iter(train_loader)
temp = next(dataset_iter)
features, labels = temp
print(features.shape, labels.shape)

torch.Size([128, 60]) torch.Size([128, 4])


  return torch.Tensor(self.X[index]), torch.Tensor([self.y[index]]).squeeze()


In [16]:
class Encoder(nn.Module):
    def __init__(self, seq_len, input_shape, hidden_size, num_layers):
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        self.hidden = hidden_size
        self.input_shape = input_shape
        self.gru = nn.GRU(input_shape, hidden_size, num_layers, batch_first=True)

    def forward(self, input):
        # input [batch, window]
        # hidden = torch.zeros([self.num_layers, self.hidden]).cuda()
        # hidden = torch.zeros([self.num_layers, batch_size, self.hidden]).cuda()
        output, hidden = self.gru(input)
        return output, hidden

In [17]:
class Decoder(nn.Module):
    def __init__(self, seq_len, input_shape, hidden_size, num_layers):
        super(Decoder, self).__init__()
        self.num_layers = num_layers
        self.hidden = hidden_size
        self.input_shape = input_shape
        self.gru = nn.GRU(input_shape + hidden_size, hidden_size, num_layers, batch_first=True)

        # For computing attention
        self.w1 = nn.Linear(hidden_size, hidden_size)
        self.w2 = nn.Linear(hidden_size, hidden_size)
        self.w3 = nn.Linear(hidden_size, 1)

        # Initialize fully connected layer
        self.final = nn.Linear(hidden_size, 1)

    def compute_attention(self, dec_hs, enc_output):
        # dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
        # enc_output: Encoder outputs; [batch_size, window, hidden_units]
        dec_x = dec_hs.permute(1, 0, 2)

        # context_vector: Context vector, according to formula; [batch_size, hidden_units]
        # attention_weights: The attention weights you have calculated; [batch_size, max_len_src, 1]

        # alignment scores e(j,t)
        out = torch.tanh(self.w1(dec_x) + self.w2(enc_output))
        out = self.w3(out)

        # he scores e(j,t) are normalized using softmax function over the encoder time steps j, obaining the attention weights α(j,t)
        soft = nn.Softmax(dim=1)
        attention = soft(out)

        #  The context vector c(t) is calculated as the weighted sum of all the hidden values of the encoder according to the attention weights
        # context vector [batch_size, hidden_units]
        # attention weights [batch_size, max_len_src, 1]
        context_vector, attention_weights = torch.sum(
            attention * enc_output, dim=1), attention
        return context_vector, attention_weights


    def forward(self, input, dec_hs, enc_output):
        # **This function runs the decoder for a single time step.**
        # input [batch_size, 1]

        # hidden = torch.zeros([self.num_layers, self.hidden]).cuda()
        # hidden = torch.zeros([self.num_layers, batch_size, self.hidden]).cuda()

        context_vector, attention_weights = self.compute_attention(dec_hs, enc_output)
        # context vector [batch, hidden_size]
        s = input.size(0)
        out = torch.cat((context_vector.unsqueeze(1), input.reshape(s, 1, 1)), 2)
        fc_out, dec_hs = self.gru(out)
        output = self.final(fc_out)
        final_out = torch.flatten(output, start_dim = 1)

        return final_out, dec_hs, attention_weights

In [18]:
# Test encoder model,
input_shape = 1
enc_test = Encoder(window, input_shape, hidden_size, num_layers=1).to(device)
# (seq,btch,in_sh)
out, h = enc_test(features.unsqueeze(2).to(device))
print(out.shape, h.shape)

torch.Size([128, 60, 64]) torch.Size([1, 128, 64])


In [19]:
# Test decoder model,
input_shape = 1
dec_test = Decoder(window, input_shape, hidden_size, num_layers=1).to(device)
# (seq,btch,in_sh)
input = features[:, 0].to(device)
dec_hs = torch.zeros([num_layers, batch_size, hidden_size]).to(device)
out, dec_hs, att = dec_test(input, dec_hs, out)
print(out.shape, dec_hs.shape, att.shape)

torch.Size([128, 1]) torch.Size([1, 128, 64]) torch.Size([128, 60, 1])


In [20]:
input_shape = 1
encoder = Encoder(window, input_shape, hidden_size, num_layers=1).to(device)

decoder = Decoder(window, input_shape, hidden_size, num_layers=1).to(device)

In [21]:
criterion = nn.MSELoss()
rnn_model_params = list(encoder.parameters()) + list(decoder.parameters())
optimizer = torch.optim.Adam(rnn_model_params, lr=learning_rate)

In [25]:
for epoch in range(epochs):
  n_batch = 0
  total_loss = 0

  for batch_x, batch_y in train_loader:
    n_batch += 1
    mse_train = 0
    batch_x = batch_x.to(device)
    batch_y = batch_y.to(device)
    batch_x = batch_x.unsqueeze(2)
    batch_y = batch_y.unsqueeze(2)
    optimizer.zero_grad()

    enc_output, enc_hidden = encoder(batch_x)
    dec_hidden = enc_hidden

    # use teacher forcing - feeding the target as the next input (via dec_input)
    dec_input = batch_y[:, 0].view(batch_size, 1, 1)

    # run code below for every timestep in the ys batch
    for t in range(1, batch_y.size(1)):
      predictions, dec_hidden, _ = decoder(dec_input.to(device), dec_hidden.to(device), enc_output.to(device))
      # assert len(predictions.shape) == 2 and predictions.shape[0] == dec_input.shape[0] and predictions.shape[1] == len(trg_vocab.word2idx), "First output of decoder must have shape [batch_size, vocab_size], you returned shape " + str(predictions.shape)
      mse_train_step = criterion(batch_y[:, t], predictions)
      mse_train += mse_train_step
      dec_input = batch_y[:, t].view(batch_y.size(0), 1, 1)

    batch_loss = mse_train / (batch_y.size(1) - 1)
    total_loss += batch_loss.item()
    if n_batch % 100 == 0:
      print(f"Iteration:{n_batch}, Loss: {[batch_loss.item()]}")

    batch_loss.backward()
    optimizer.step()

  print(f"Epoch:{epoch+1}, Loss: {[total_loss/n_batch]}")

Iteration:100, Loss: [2.1276073312037624e-05]
Iteration:200, Loss: [2.579393367341254e-05]


KeyboardInterrupt: ignored

In [None]:
with torch.no_grad():
  preds = []
  true = []
  total_loss = 0
  n_batch = 0
  for batch_x, batch_y in test_loader:
        n_batch += 1
        batch_x = batch_x.cuda()
        batch_y = batch_y.cuda()
        true.extend(scaler.inverse_transform(batch_y.detach().cpu().numpy()))
        batch_x = batch_x.unsqueeze(2)
        batch_y = batch_y.unsqueeze(2)
        encoder_output, encoder_hidden = encoder(batch_x)
        dec_input = batch_y[:, 0].view(1, 1, 1)
        temp = dec_input.detach().cpu().numpy()
        preds.append(scaler.inverse_transform(temp.reshape(1, -1)))

        dec_hidden = encoder_hidden
        for t in range(1, batch_y.size(1)):
          predictions, dec_hidden, _ = decoder(dec_input.to(device), dec_hidden.to(device), enc_output.to(device))
          best_guess = predictions.detach().cpu().numpy()
          preds.append(scaler.inverse_transform(best_guess.reshape(1, -1)))
          dec_input = predictions.view(batch_y.size(0), 1, 1)



  preds = np.concatenate(preds)
  true = np.concatenate(true)
  mse = mean_squared_error(true, preds)
  mae = mean_absolute_error(true, preds)

  print(mse, mae)

In [None]:
preds.shape, true.shape

In [None]:
preds_reshaped = preds[996:].reshape(-1, 500)
true_reshaped = true[996:].reshape(-1, 500)

preds_avg = np.mean(preds_reshaped, axis=1)
true_avg = np.mean(true_reshaped, axis=1)

plt.figure(figsize=(8, 6))
plt.plot(preds_avg, 'b', label='Preds')
plt.plot(true_avg, 'g', label='True')
plt.legend()
plt.show()