In [None]:
# Clone the performer-pytorch repository; the next cell changes into this checkout.
!git clone https://github.com/lucidrains/performer-pytorch

In [None]:
# Make the cloned repository the working directory for the cells that follow.
%cd performer-pytorch

In [None]:
# Install the performer-pytorch package; `performer_pytorch` is imported further down when the model is defined.
!pip install performer-pytorch

In [None]:
# Install whatever requirements.txt in the current working directory lists.
!pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
import torch

from torch.utils.data import DataLoader

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device('cpu')
DEVICE

In [None]:
from sklearn.model_selection import train_test_split

# Load the quote data into a DataFrame; the next cell reads its bid_price and ask_price columns.
# NOTE(review): absolute path (looks like a Colab upload location) — adjust when running elsewhere.
price_data = pd.read_csv('/content/btc-usdt-1.csv')

In [None]:
# Mid price: the average of the bid_price and ask_price columns for each row.
input_data = (price_data.bid_price + price_data.ask_price) / 2

# Split chronologically. train_test_split shuffles rows by default, which would
# scramble the time order that the sliding windows built later rely on and
# would mix later observations into the training split.
train_data, val_data = train_test_split(input_data, test_size=0.33, shuffle=False)
train_data.shape, val_data.shape

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler(feature_range=(-1, 1)) 
# train_data = scaler.fit_transform(train_data.values.reshape(-1, 1)).reshape(-1)
# val_data = scaler.transform(val_data.values.reshape(-1, 1)).reshape(-1) 

In [None]:
# Display the training split as produced by the split cell (before the integer cast below).
train_data

In [None]:
# Cast both splits to integers.
# NOTE(review): presumably so each value can serve as a token id for the language model defined later — confirm.
train_data, val_data = (split.astype(int) for split in (train_data, val_data))

In [None]:
# Display the training split again to check the result of the integer cast.
train_data

In [None]:
# Clone the helper repository whose modules are imported in the next cell.
# NOTE(review): the checkout directory is named `crypto-transformers` while the next cell imports `crypto_transformers` — confirm the import path resolves.
!git clone https://github.com/Nik212/crypto-transformers

In [None]:
import crypto_transformers.ts_dataset as ds
import crypto_transformers.utils as utils

def get_dataloader(data, enc_seq_len=60, dec_seq_len=120, step_size=1, batch_first=True, batch_size=None):
    '''
    Slice a series into sliding windows and return a DataLoader over them.

    Every window covers enc_seq_len + dec_seq_len consecutive time steps,
    e.g. looking at the past 100 steps to predict the next 50 gives a
    window of 100 + 50 = 150.

    Args:

        data: the series to window; it must provide .tolist() and is also
            handed to utils.get_indices_entire_sequence

        enc_seq_len: int, length of input given to encoder

        dec_seq_len: int, length of input given to decoder; it is also used
            as the target sequence length

        step_size: int, how many time steps the moving window advances
            between consecutive windows

        batch_first: accepted for interface compatibility; this function
            does not use it

        batch_size: forwarded to DataLoader as its batch size

    Returns:

        A DataLoader over the windowed dataset, without shuffling and with
        two worker processes.
    '''
    target_seq_len = dec_seq_len
    total_window = enc_seq_len + target_seq_len

    window_indices = utils.get_indices_entire_sequence(
        data=data,
        window_size=total_window,
        step_size=step_size,
    )

    # Append a trailing axis of size one to the flat list of values.
    values = torch.tensor(data.tolist()).unsqueeze(1)

    windowed_dataset = ds.TransformerDataset(
        data=values,
        indices=window_indices,
        enc_seq_len=enc_seq_len,
        dec_seq_len=dec_seq_len,
        target_seq_len=target_seq_len,
    )

    return DataLoader(windowed_dataset, batch_size, shuffle=False, num_workers=2)

In [None]:
# Batch size and window lengths shared by the training and validation loaders.
BATCH_SIZE = 64
enc_seq_len, dec_seq_len = 300, 60

# Build both loaders with identical settings, training split first.
train_loader, val_loader = (
    get_dataloader(split, enc_seq_len, dec_seq_len, batch_size=BATCH_SIZE)
    for split in (train_data, val_data)
)

In [None]:
# Replace each loader with a dict that keeps only element 0 of its first batch.
# NOTE(review): after this cell the names no longer refer to DataLoaders, so
# re-running it operates on the dicts instead — re-create the loaders first.
train_loader, val_loader = (
    {'input_ids': next(iter(loader))[0]}
    for loader in (train_loader, val_loader)
)

In [None]:
# Install PyTorch Lightning, used for the LightningModule and Trainer below (no version is pinned).
!pip install pytorch_lightning

In [None]:
# Install Weights & Biases, used by the WandbLogger configured below (no version is pinned).
!pip install wandb

In [None]:
import pytorch_lightning as pl
from performer_pytorch.autoregressive_wrapper import AutoregressiveWrapper
from performer_pytorch import PerformerLM

class LitGpt(pl.LightningModule):
    """Lightning module that wraps a causal PerformerLM and trains it with AdamW.

    Training logs a single metric, `train_loss`.

    NOTE(review): `AutoregressiveWrapper` is imported next to this class but is
    never applied to the model — confirm that PerformerLM itself honours the
    `return_loss` argument used here.
    """

    def __init__(self, weight_decay, lr, pretrained=False):
        """Store the optimizer settings and build the PerformerLM.

        Args:
            weight_decay: weight decay for the decayed parameter group
            lr: learning rate handed to AdamW
            pretrained: accepted but not used anywhere in this class
        """
        super().__init__()
        self.weight_decay = weight_decay
        self.lr = lr

        performer_config = dict(
            num_tokens=28512,
            max_seq_len=2048,                # maximum sequence length
            dim=512,                         # model dimension
            depth=12,                        # number of layers
            heads=8,                         # attention heads
            causal=True,                     # auto-regressive
            nb_features=1,                   # number of random features
            feature_redraw_interval=1000,    # how often the projection matrix is redrawn
            generalized_attention=False,     # keep the softmax approximation
            kernel_fn=torch.nn.ReLU(),       # kernel used when generalized attention is on
            reversible=True,                 # reversible layers
            ff_chunks=10,                    # chunk the feedforward layer
            use_scalenorm=False,
            use_rezero=False,
            ff_glu=True,                     # GLU variant for feedforward
            emb_dropout=0.1,                 # embedding dropout
            ff_dropout=0.1,                  # feedforward dropout
            attn_dropout=0.1,                # post-attention dropout
            local_attn_heads=4,              # 4 of the 8 heads use local attention
            local_window_size=256,           # window size of local attention
            rotary_position_emb=True,        # rotary positional embedding
            shift_tokens=True,               # shift tokens by 1 along the sequence before each block
        )
        self.model = PerformerLM(**performer_config)

    def forward(self, x, return_loss=True):
        """Call the wrapped model on `x`, forwarding `return_loss` unchanged."""
        return self.model(x, return_loss=return_loss)

    def training_step(self, batch, batch_idx):
        """Average the model output for `batch['input_ids']`, log it as `train_loss`, and return it."""
        loss = self(batch['input_ids'], return_loss=True).mean()
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Build AdamW with weight decay disabled for parameters whose name matches a no-decay marker."""
        skip_decay_markers = ("bias", "LayerNorm.weight")

        decayed, undecayed = [], []
        for name, param in self.model.named_parameters():
            exempt = any(marker in name for marker in skip_decay_markers)
            (undecayed if exempt else decayed).append(param)

        param_groups = [
            {"params": decayed, "weight_decay": self.weight_decay},
            {"params": undecayed, "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(param_groups, lr=self.lr)
        # Lightning convention: (list of optimizers, list of schedulers).
        return [optimizer], []


In [None]:
import gc
# Run a garbage-collection pass, then ask PyTorch to empty its CUDA cache; written as a tuple so the cell displays both return values.
gc.collect(), torch.cuda.empty_cache()

In [None]:
# Optimizer hyperparameters handed to the Lightning module.
weight_decay = 0.01
lr = 2e-5

autoencoder = LitGpt(weight_decay=weight_decay, lr=lr)

In [None]:
# Number of training epochs; read by the Trainer and the W&B run name in the cells below.
N_EPOCHS=10


# Move the Lightning module to the selected device and bind the result to `model`.
model = autoencoder.to(DEVICE)

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

# Weights & Biases logger; the run name records the main training settings.
wb_logger = pl.loggers.WandbLogger(
    name=f"Informer|n_epochs={N_EPOCHS}|batch_size={BATCH_SIZE}|window_size={dec_seq_len}",
    project='sequential_data'
)

# Rank checkpoints by 'train_loss', the only metric LitGpt logs. Monitoring or
# formatting a key that is never logged leaves the callback with nothing to
# compare and produces misleading file names.
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    monitor='train_loss',
    filename='{epoch:02d}-{train_loss:.3f}',
    mode='min'
)

# NOTE(review): the next cell constructs another Trainer that replaces this one.
trainer = pl.Trainer(
    max_epochs=N_EPOCHS,
    logger=wb_logger,
    accelerator='gpu',
    devices=1,
    benchmark=True,
    callbacks=[checkpoint_callback]
)

In [None]:
import wandb

# Select the GPU through accelerator/devices, as the Trainer in the previous
# cell does; the older `gpus=` argument is not accepted by current Lightning
# releases, and the install above is unpinned.
trainer = pl.Trainer(
    max_epochs=N_EPOCHS,
    accelerator='gpu',
    devices=1,
    logger=wb_logger,
    callbacks=[checkpoint_callback]
    )

# NOTE(review): train_loader / val_loader were replaced earlier by dicts that
# hold a single batch tensor — confirm that is what fit() should iterate over.
trainer.fit(model, train_loader, val_loader)

# Close the W&B run once training has finished.
wandb.finish()



In [None]:
import random

model.eval()
with torch.no_grad():
    # Pick one random element of the cached validation batch.
    inp = random.choice(val_loader['input_ids'])
    print(inp.shape)
    # Feed the input on whichever device the module currently lives on, and
    # pass the loss flag by keyword: the second positional parameter of
    # forward() is `return_loss`, not a device.
    pred = autoencoder(inp.to(autoencoder.device), return_loss=True)
    print(f'validation loss: {pred}')

In [None]:
# torch.squeeze(pred).shape

In [None]:
# Sum over the first axis, drop singleton dimensions, and convert to NumPy.
# detach() and cpu() keep the conversion working when `pred` lives on the GPU or tracks gradients.
pred_data = torch.squeeze(torch.sum(pred, dim = 0)).detach().cpu().numpy()

In [None]:
# Negate the values and normalise by the largest one.
# ndarray.max() reduces inside NumPy instead of iterating element by element like the builtin max().
exp = -1 * pred_data
scaled_pred = exp / exp.max()
print(scaled_pred)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the [0, 1] scaling on the training split only, then apply it to the validation split.
# fit() is enough here: the transformed training values were never kept.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train_data.values.reshape(-1, 1))
scaled_val_data = scaler.transform(val_data.values.reshape(-1, 1)).reshape(-1)

In [None]:
# Display the validation values after min-max scaling.
scaled_val_data

In [None]:
# Display the first 100 scaled prediction values (the same slice the plot below uses).
scaled_pred[:100]

In [None]:
# Display the first 100 scaled validation values (the same slice the plot below uses).
scaled_val_data[:100]

In [None]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10,10)

# Overlay the first 100 points of each series: predictions in red, real values in green.
for series, colour, legend_name in (
    (scaled_pred, 'r', 'predictions'),
    (scaled_val_data, 'g', 'real'),
):
    plt.plot(range(100), series[:100], color=colour, label=legend_name)

plt.title("Autoregressive Predictions")
plt.xlabel("time")
plt.ylabel("Value")
plt.legend()

plt.show()