* Install pytorch-lightning
* install tqdm

In [1]:
!nvidia-smi

Thu Dec  2 16:10:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 462.80       Driver Version: 462.80       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 206... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   48C    P8     6W /  N/A |    510MiB /  6144MiB |     10%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import matplotlib
import math

import numpy as np
import pandas as pd
from tqdm import tqdm
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.autograd as autograd
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from pandas.plotting import register_matplotlib_converters

In [3]:

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#93D30C", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 14, 10
register_matplotlib_converters()

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x22c908986f0>

In [4]:
pl.seed_everything(42)

Global seed set to 42


42

### Load Data

In [5]:
df = pd.read_csv('Binance_BTCUSDT_minute.csv', parse_dates=['date'])
# Sort the date
df = df.sort_values(by="date").reset_index(drop=True)
df.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount
0,1575170040000,2019-12-01 03:14:00,BTC/USDT,7278.0,7282.7,7269.11,7282.23,175.907,1279841.0,513
1,1575170100000,2019-12-01 03:15:00,BTC/USDT,7281.08,7282.0,7272.47,7278.9,124.766,908115.4,435
2,1575170160000,2019-12-01 03:16:00,BTC/USDT,7279.06,7297.38,7279.0,7297.37,311.67,2270862.0,729
3,1575170220000,2019-12-01 03:17:00,BTC/USDT,7296.14,7298.99,7280.31,7288.0,194.617,1418390.0,632
4,1575170280000,2019-12-01 03:18:00,BTC/USDT,7286.01,7289.7,7275.74,7275.74,83.405,607379.5,316


In [6]:
df.tail(3)

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount
1048571,1638408780000,2021-12-02 01:33:00,BTC/USDT,57233.42,57233.43,57178.41,57190.17,21.15174,1209836.0,856
1048572,1638408840000,2021-12-02 01:34:00,BTC/USDT,57195.67,57216.48,57160.9,57187.41,21.67796,1239699.0,724
1048573,1638408900000,2021-12-02 01:35:00,BTC/USDT,57187.4,57228.98,57184.39,57185.99,17.35244,992535.4,539


In [7]:
df.dtypes

unix                    int64
date           datetime64[ns]
symbol                 object
open                  float64
high                  float64
low                   float64
close                 float64
Volume BTC            float64
Volume USDT           float64
tradecount              int64
dtype: object

In [8]:
df.shape

(1048574, 10)

In [9]:
df.isna().sum()

unix           0
date           0
symbol         0
open           0
high           0
low            0
close          0
Volume BTC     0
Volume USDT    0
tradecount     0
dtype: int64

## Preprocessing

In [10]:
df.close

0           7282.23
1           7278.90
2           7297.37
3           7288.00
4           7275.74
             ...   
1048569    57317.69
1048570    57233.43
1048571    57190.17
1048572    57187.41
1048573    57185.99
Name: close, Length: 1048574, dtype: float64

In [11]:
df["prev_close"] = df.shift(1)["close"]

In [12]:
df.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount,prev_close
0,1575170040000,2019-12-01 03:14:00,BTC/USDT,7278.0,7282.7,7269.11,7282.23,175.907,1279841.0,513,
1,1575170100000,2019-12-01 03:15:00,BTC/USDT,7281.08,7282.0,7272.47,7278.9,124.766,908115.4,435,7282.23
2,1575170160000,2019-12-01 03:16:00,BTC/USDT,7279.06,7297.38,7279.0,7297.37,311.67,2270862.0,729,7278.9
3,1575170220000,2019-12-01 03:17:00,BTC/USDT,7296.14,7298.99,7280.31,7288.0,194.617,1418390.0,632,7297.37
4,1575170280000,2019-12-01 03:18:00,BTC/USDT,7286.01,7289.7,7275.74,7275.74,83.405,607379.5,316,7288.0


In [13]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [14]:
df["close_change"] = df.progress_apply(
    lambda row: 0 if np.isnan(row.prev_close) else row.close - row.prev_close,
    axis = 1
)

  0%|          | 0/1048574 [00:00<?, ?it/s]

In [15]:
df.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount,prev_close,close_change
0,1575170040000,2019-12-01 03:14:00,BTC/USDT,7278.0,7282.7,7269.11,7282.23,175.907,1279841.0,513,,0.0
1,1575170100000,2019-12-01 03:15:00,BTC/USDT,7281.08,7282.0,7272.47,7278.9,124.766,908115.4,435,7282.23,-3.33
2,1575170160000,2019-12-01 03:16:00,BTC/USDT,7279.06,7297.38,7279.0,7297.37,311.67,2270862.0,729,7278.9,18.47
3,1575170220000,2019-12-01 03:17:00,BTC/USDT,7296.14,7298.99,7280.31,7288.0,194.617,1418390.0,632,7297.37,-9.37
4,1575170280000,2019-12-01 03:18:00,BTC/USDT,7286.01,7289.7,7275.74,7275.74,83.405,607379.5,316,7288.0,-12.26


In [16]:
# Convert to feature data frame
rows = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    row_data = dict(
        day_of_week=row.date.dayofweek,
        day_of_month = row.date.day,
        week_of_year=row.date.week,
        month= row.date.month,
        _open=row.open,
        high=row.high,
        low=row.low,
        close_change=row.close_change,
        close=row.close
    )
    rows.append(row_data)
    
features_df = pd.DataFrame(rows)

  0%|          | 0/1048574 [00:00<?, ?it/s]

In [17]:
features_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,_open,high,low,close_change,close
0,6,1,48,12,7278.0,7282.7,7269.11,0.0,7282.23
1,6,1,48,12,7281.08,7282.0,7272.47,-3.33,7278.9
2,6,1,48,12,7279.06,7297.38,7279.0,18.47,7297.37
3,6,1,48,12,7296.14,7298.99,7280.31,-9.37,7288.0
4,6,1,48,12,7286.01,7289.7,7275.74,-12.26,7275.74


In [18]:
features_df.shape

(1048574, 9)

In [19]:
train_size = int(len(features_df)*0.9)
train_size

943716

In [20]:
train_df, test_df = features_df[:train_size], features_df[train_size:]
train_df.shape, test_df.shape

((943716, 9), (104858, 9))

In [21]:
943716 + 104858

1048574

## Scaling

In [22]:
scaler = MinMaxScaler(feature_range=(-1,1))
scaler = scaler.fit(train_df)

In [23]:
train_df = pd.DataFrame(scaler.transform(train_df),
                        index=train_df.index,
                        columns=train_df.columns
                       )

In [24]:
train_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,_open,high,low,close_change,close
0,1.0,-1.0,0.807692,1.0,-0.883095,-0.885266,-0.88054,0.453872,-0.882957
1,1.0,-1.0,0.807692,1.0,-0.882994,-0.885289,-0.88043,0.452852,-0.883066
2,1.0,-1.0,0.807692,1.0,-0.88306,-0.884785,-0.880217,0.459531,-0.882461
3,1.0,-1.0,0.807692,1.0,-0.882501,-0.884732,-0.880174,0.451002,-0.882768
4,1.0,-1.0,0.807692,1.0,-0.882833,-0.885036,-0.880323,0.450116,-0.883169


In [25]:
test_df = pd.DataFrame(scaler.transform(test_df),
                        index=test_df.index,
                        columns=test_df.columns
                       )

## Creating Squences


In [26]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length):
    
    sequences = []
    data_size = len(input_data)
    
    for i in tqdm(range(data_size - sequence_length)):
        
        sequence = input_data[i:i+sequence_length]
        
        label_position = i + sequence_length
        
        label = input_data.iloc[label_position][target_column]
        
        sequences.append((sequence, label))
        
    return sequences

In [27]:
# Creating simple data
sample_data = pd.DataFrame(dict(
    feature_1=[1, 2, 3, 4, 5],
    label=[6,7,8,9,10]
))

sample_data.head()

Unnamed: 0,feature_1,label
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [28]:
sample_seq = create_sequences(sample_data, "label", sequence_length=3)

  0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
len(sample_seq)

2

In [30]:
print(sample_seq[0][0])
print()
print(f"label: {sample_seq[0][1]}")

   feature_1  label
0          1      6
1          2      7
2          3      8

label: 9


In [31]:
print(sample_seq[1][0])
print()
print(f"label: {sample_seq[1][1]}")

   feature_1  label
1          2      7
2          3      8
3          4      9

label: 10


In [32]:
SQUENCE_LENGTH = 120

train_sequences = create_sequences(train_df, "close", SQUENCE_LENGTH)
test_sequences = create_sequences(test_df, "close", SQUENCE_LENGTH)

  0%|          | 0/943596 [00:00<?, ?it/s]

  0%|          | 0/104738 [00:00<?, ?it/s]

In [33]:
train_sequences[0][0].shape

(120, 9)

In [34]:
len(train_sequences), len(test_sequences)

(943596, 104738)

In [35]:
# Training DataFrame - train_sequences
(943716-943596) == SQUENCE_LENGTH

True

## PyTorch Dataset

In [36]:
class BTCDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences
        
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        sequence, label = self.sequences[idx]
        
        return dict(
            sequence=torch.tensor(sequence.to_numpy()),
            label=torch.tensor(label).float()
        )

In [37]:
class BTCPriceDataModule(pl.LightningDataModule):
    def __init__(self, train_seq, test_seq, batch_size=8):
        self.train_seq = train_seq
        self.test_seq = test_seq
        self.batch_size = batch_size
        
        
    def setup(self):
        self.train_dataset = BTCDataset(self.train_seq)
        self.test_dataset = BTCDataset(self.test_seq)
        
    
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=2
        )
    
    def val_dataloder(self):
        return DataLoader(
            self.train_dataset,
            batch_size=1,
            shuffle=False, #Because it's time series data
            num_workers=1
        )
    
    
        def test_dataloder(self):
            return DataLoader(
                self.train_dataset,
                batch_size=self.batch_size,
                shuffle=False, #Because it's time series data
                num_workers=1
        )
        
        
        
        

In [38]:
N_EPOCHS = 8
BATCH_SIZE = 64


data_module = BTCPriceDataModule(train_sequences, test_sequences, batch_size=BATCH_SIZE)
data_module.setup()

In [39]:
train_dataset = BTCDataset(train_sequences)

## Model

In [40]:
class PricePredictionModel(nn.Module):
    
    def __init__(self, n_feature, n_hidden=128, n_layers=2):
        super().__init__()
        
        self.n_hidden = n_hidden
        
        self.lstm = nn.LSTM(
            input_size = n_feature,
            hidden_size=n_hidden,
            batch_first=True,
            num_layers=n_layers,
            dropout=0.2
        )
         
        
        #Output
        self.regressor = nn.Linear(n_hidden, 1)
        
    
    def forward(self, x):
        self.lstm.flatten_parameters()
        
        _, (hidden, _) = self.lstm(x)
        out = hidden[-1]
        
        return self.regressor(out)

In [41]:
class BTCPricePredictor(pl.LightningDataModule):
    
    def __init__(self, n_features: int):
        super().__init__()
        self.model = PricePredictionModel(n_features)
        self.criterion = nn.MSELoss()
        
    
    def forward(self, x, labels=None):
        output = self.model(x)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels.unsqueeze(dim=1))
        return loss, output
    
    def training_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
        loss, outputs = self(sequences, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss
    
    
    def validation_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
        loss, outputs = self(sequences, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    
    
    def test_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
        loss, outputs = self(sequences, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    
    def configure_optimizer(self):
        return optim.AdamW(self.parameters(), lr=0.0001)
        

In [42]:
model = BTCPricePredictor(n_features=train_df.shape[1])

In [43]:
#for item in data_module.train_dataloader():
 #   print(item["sequences"].shape)
  #  print(item["label"].shape)
    # print(item["label"])
  #  break

In [68]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 11408), started 0:13:59 ago. (Use '!kill 11408' to kill it.)

In [93]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)


logger = TensorBoardLogger("Lightning_logs", name="btc-price")
early_stopping_callback = EarlyStopping(monitor="val_loss", patience=2)


trainer = pl.Trainer(
    logger=logger,
    checkpoint_callback=checkpoint_callback,
    callbacks=[early_stopping_callback],
    max_epochs=N_EPOCHS,
    gpus=[],
    #tpu_cores=1,
    progress_bar_refresh_rate=30
)

  rank_zero_deprecation(
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [94]:
trainer.fit(model, data_module)

ValueError: The parent should define the method