<a href="https://colab.research.google.com/github/PasechnikDarya/ML-practice/blob/main/lstm_bitcoin_price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet pytorch-lightning==1.2.5

[K     |████████████████████████████████| 826 kB 5.2 MB/s 
[K     |████████████████████████████████| 272 kB 47.6 MB/s 
[K     |████████████████████████████████| 118 kB 53.2 MB/s 
[K     |████████████████████████████████| 829 kB 33.1 MB/s 
[K     |████████████████████████████████| 269 kB 40.7 MB/s 
[K     |████████████████████████████████| 1.3 MB 33.4 MB/s 
[K     |████████████████████████████████| 294 kB 45.6 MB/s 
[K     |████████████████████████████████| 142 kB 49.9 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone


In [41]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import math

import pandas as pd
import numpy as np

import pytorch_lightning as pl
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from pylab import rcParams
from matplotlib import rc
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from tqdm.notebook import tqdm


In [42]:
# Render matplotlib figures inline, using the high-DPI 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Apply the seaborn theme first.
# NOTE(review): sns.set() also updates matplotlib's rcParams, so the figure
# size below is presumably set afterwards on purpose — keep this order.
sns.set(style='whitegrid', font_scale=1.2)

# Default figure size for every plot in the notebook.
rcParams['figure.figsize'] = 12, 8

# Register tqdm's progress_* methods (e.g. progress_apply) on pandas objects.
tqdm.pandas()

In [43]:
# Fix the global random seed up front so repeated runs are reproducible.
pl.seed_everything(42)

Global seed set to 42


42

## Load data

In [5]:
# Read the minute candles (skipping the first line of the file), parse the
# `date` column as timestamps, then order the rows chronologically and give
# them a fresh 0..n-1 index.
df = (
    pd.read_csv('Binance_BTCUSDT_minute.csv', skiprows=1, parse_dates=['date'])
    .sort_values(by='date')
    .reset_index(drop=True)
)

In [6]:
df.shape

(69459, 10)

In [7]:
df

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount
0,1625754420000,2021-07-08 14:27:00,BTC/USDT,32584.58,32586.74,32533.49,32553.76,6.000000,,
1,1625754480000,2021-07-08 14:28:00,BTC/USDT,32553.77,32612.44,32548.27,32607.59,55.069409,1.794269e+06,581.0
2,1625754540000,2021-07-08 14:29:00,BTC/USDT,32607.59,32650.00,32597.45,32642.31,51.685306,1.686271e+06,814.0
3,1625754600000,2021-07-08 14:30:00,BTC/USDT,32642.31,32666.00,32610.46,32632.85,52.325956,1.707966e+06,869.0
4,1625754660000,2021-07-08 14:31:00,BTC/USDT,32631.38,32649.73,32614.47,32645.33,25.485972,8.316624e+05,615.0
...,...,...,...,...,...,...,...,...,...,...
69454,1629937860000,2021-08-26 00:31:00,BTC/USDT,49034.00,49077.85,49003.30,49029.00,19.657713,9.641287e+05,945.0
69455,1629937920000,2021-08-26 00:32:00,BTC/USDT,49029.01,49041.79,48990.01,48999.80,14.685761,7.198294e+05,807.0
69456,1629937980000,2021-08-26 00:33:00,BTC/USDT,48999.80,49070.25,48983.09,49053.41,24.355314,1.193922e+06,991.0
69457,1629938040000,2021-08-26 00:34:00,BTC/USDT,49053.40,49053.41,49008.02,49038.72,8.390942,4.114275e+05,634.0


## Preprocessing

In [8]:
# Close price of the previous row, aligned to the current row (NaN for the
# first row). Only the `close` column is shifted; shifting the whole frame
# just to pick one column out of it is wasted work.
df['prev_close'] = df['close'].shift(1)
df.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount,prev_close
0,1625754420000,2021-07-08 14:27:00,BTC/USDT,32584.58,32586.74,32533.49,32553.76,6.0,,,
1,1625754480000,2021-07-08 14:28:00,BTC/USDT,32553.77,32612.44,32548.27,32607.59,55.069409,1794269.0,581.0,32553.76
2,1625754540000,2021-07-08 14:29:00,BTC/USDT,32607.59,32650.0,32597.45,32642.31,51.685306,1686271.0,814.0,32607.59
3,1625754600000,2021-07-08 14:30:00,BTC/USDT,32642.31,32666.0,32610.46,32632.85,52.325956,1707966.0,869.0,32642.31
4,1625754660000,2021-07-08 14:31:00,BTC/USDT,32631.38,32649.73,32614.47,32645.33,25.485972,831662.4,615.0,32632.85


In [9]:
# Row-over-row change of the close price, computed column-wise instead of
# with a Python-level apply over every row. Rows that have no previous close
# (the first row) get 0, exactly as the row-wise version did.
df['close_change'] = (df['close'] - df['prev_close']).where(
    df['prev_close'].notna(), 0
)

  0%|          | 0/69459 [00:00<?, ?it/s]

In [10]:
df.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount,prev_close,close_change
0,1625754420000,2021-07-08 14:27:00,BTC/USDT,32584.58,32586.74,32533.49,32553.76,6.0,,,,0.0
1,1625754480000,2021-07-08 14:28:00,BTC/USDT,32553.77,32612.44,32548.27,32607.59,55.069409,1794269.0,581.0,32553.76,53.83
2,1625754540000,2021-07-08 14:29:00,BTC/USDT,32607.59,32650.0,32597.45,32642.31,51.685306,1686271.0,814.0,32607.59,34.72
3,1625754600000,2021-07-08 14:30:00,BTC/USDT,32642.31,32666.0,32610.46,32632.85,52.325956,1707966.0,869.0,32642.31,-9.46
4,1625754660000,2021-07-08 14:31:00,BTC/USDT,32631.38,32649.73,32614.47,32645.33,25.485972,831662.4,615.0,32632.85,12.48


In [11]:
# Build one feature record per candle: calendar features derived from the
# timestamp plus the price columns. itertuples() yields lightweight tuples,
# whereas iterrows() constructs a full Series object for every row.
rows = [
    dict(
        day_of_week=candle.date.dayofweek,
        day_of_month=candle.date.day,
        week_of_year=candle.date.week,
        month=candle.date.month,
        open=candle.open,
        high=candle.high,
        low=candle.low,
        close_change=candle.close_change,
        close=candle.close,
    )
    for candle in tqdm(df.itertuples(index=False), total=df.shape[0])
]

  0%|          | 0/69459 [00:00<?, ?it/s]

In [12]:
# Assemble the per-row feature dicts into a DataFrame (one column per key)
# and preview the first rows.
features_df = pd.DataFrame(rows)
features_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,3,8,27,7,32584.58,32586.74,32533.49,0.0,32553.76
1,3,8,27,7,32553.77,32612.44,32548.27,53.83,32607.59
2,3,8,27,7,32607.59,32650.0,32597.45,34.72,32642.31
3,3,8,27,7,32642.31,32666.0,32610.46,-9.46,32632.85
4,3,8,27,7,32631.38,32649.73,32614.47,12.48,32645.33


In [13]:
features_df.shape

(69459, 9)

In [14]:
# Number of leading rows that go to training: 90% of the data, truncated
# to a whole number of rows.
train_size = int(0.9 * len(features_df))
train_size

62513

In [15]:
# Split by position without shuffling: the first `train_size` rows are the
# training set, everything after them is the test set.
train_df = features_df[:train_size]
test_df = features_df[train_size:]
train_df.shape, test_df.shape

((62513, 9), (6946, 9))

In [16]:
# Fit the [-1, 1] min-max scaler on the training split only; the test split
# is transformed later with these same fitted ranges.
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train_df)

In [18]:
# Scale the training split. The input is re-derived from `features_df`
# instead of being read back from `train_df`, so re-running this cell does
# not scale already-scaled values a second time.
train_unscaled = features_df[:train_size]
train_df = pd.DataFrame(
    scaler.transform(train_unscaled),
    index=train_unscaled.index,
    columns=train_unscaled.columns,
)

In [19]:
# Scale the test split with the scaler fitted on the training data. The
# input is re-derived from `features_df` instead of being read back from
# `test_df`, so re-running this cell does not scale the values twice.
test_unscaled = features_df[train_size:]
test_df = pd.DataFrame(
    scaler.transform(test_unscaled),
    index=test_unscaled.index,
    columns=test_unscaled.columns,
)

In [20]:
train_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,0.0,-0.533333,-1.0,-1.0,-0.672339,-0.682781,-0.674752,-0.40254,-0.675418
1,0.0,-0.533333,-1.0,-1.0,-0.675417,-0.680206,-0.673276,-0.360188,-0.67004
2,0.0,-0.533333,-1.0,-1.0,-0.67004,-0.676443,-0.668362,-0.375223,-0.666571
3,0.0,-0.533333,-1.0,-1.0,-0.666571,-0.67484,-0.667062,-0.409982,-0.667516
4,0.0,-0.533333,-1.0,-1.0,-0.667663,-0.67647,-0.666662,-0.392721,-0.666269


In [35]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length):
    """Slice a frame into overlapping windows paired with the next target value.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` (by position) and
    its label is ``target_column`` at row ``i + sequence_length``.

    All lookups are positional (``.iloc``). Plain ``frame[a:b]`` with integer
    bounds is positional as well, so offsetting the bounds by the first index
    label produces empty windows for any frame whose index does not start at
    0 (for example the tail part of a train/test split).

    Args:
        input_data: Feature frame, one row per time step.
        target_column: Name of the column whose next value is the label.
        sequence_length: Number of consecutive rows in each window.

    Returns:
        A list of ``(sequence, label)`` tuples, where ``sequence`` is a
        DataFrame of ``sequence_length`` rows and ``label`` is a scalar. The
        list has ``len(input_data) - sequence_length`` entries and is empty
        when the frame is not longer than ``sequence_length``.
    """
    sequences = []
    targets = input_data[target_column]

    for i in tqdm(range(len(input_data) - sequence_length)):
        label_position = i + sequence_length

        sequence = input_data.iloc[i:label_position]
        label = targets.iloc[label_position]

        sequences.append((sequence, label))

    return sequences
  

In [44]:
# Each sample is a window of 120 consecutive rows; the label is the scaled
# `close` value of the row that follows the window.
SEQUENCE_LENGTH = 120

train_sequences = create_sequences(
    input_data=train_df,
    target_column='close',
    sequence_length=SEQUENCE_LENGTH,
)
test_sequences = create_sequences(
    input_data=test_df,
    target_column='close',
    sequence_length=SEQUENCE_LENGTH,
)

  0%|          | 0/62393 [00:00<?, ?it/s]

  0%|          | 0/6826 [00:00<?, ?it/s]

In [46]:
len(train_sequences), len(test_sequences)

(62393, 6826)

## PyTorch dataset

In [48]:
class BTCDataset(Dataset):
    """Map-style dataset over pre-built ``(sequence, label)`` pairs.

    Each stored ``sequence`` must expose ``to_numpy()`` (e.g. a DataFrame
    window) and each ``label`` must be a scalar.
    """

    def __init__(self, sequences):
        """Keep a reference to the list of ``(sequence, label)`` tuples."""
        self.sequences = sequences

    def __len__(self):
        """Return the number of stored pairs."""
        # Must be `self`, not `this`: the latter is an undefined name and
        # made every len() call (and therefore every DataLoader) fail.
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with keys ``sequence`` and ``label``.

        ``sequence`` is converted with ``torch.Tensor`` and ``label`` is
        cast to float with ``.float()``.
        """
        sequence, label = self.sequences[idx]

        return dict(
            sequence=torch.Tensor(sequence.to_numpy()),
            label=torch.tensor(label).float(),
        )

In [50]:
class BTCPriceDataModule(pl.LightningDataModule):
    """Lightning data module serving the train/test sequence lists.

    The test sequences are used for both the validation and the test loader.
    No loader shuffles its data.
    """

    def __init__(self, train_sequences, test_sequences, batch_size=8):
        """Store the sequence lists and the batch size.

        Args:
            train_sequences: List of ``(sequence, label)`` pairs for training.
            test_sequences: List of ``(sequence, label)`` pairs for
                validation and testing.
            batch_size: Batch size of the train and test loaders.
        """
        super().__init__()
        self.train_sequences = train_sequences
        self.test_sequences = test_sequences
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Wrap the sequence lists in datasets.

        ``stage`` is accepted because Lightning passes it when it calls this
        hook; it is not used. The default keeps a bare ``setup()`` call valid.
        """
        self.train_dataset = BTCDataset(self.train_sequences)
        self.test_dataset = BTCDataset(self.test_sequences)

    def train_dataloader(self):
        """Return the training loader (``batch_size`` samples per batch)."""
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          shuffle=False,
                          num_workers=2)

    def val_dataloader(self):
        """Return the validation loader (one sample per batch)."""
        return DataLoader(self.test_dataset,
                          batch_size=1,
                          shuffle=False,
                          num_workers=1)

    def test_dataloader(self):
        """Return the test loader (``batch_size`` samples per batch)."""
        return DataLoader(self.test_dataset,
                          batch_size=self.batch_size,
                          shuffle=False,
                          num_workers=2)

    # Backward-compatible aliases for the earlier misspelled hook names.
    # Lightning itself only looks up the correctly spelled methods above.
    train_dataloaler = train_dataloader
    val_dataloaler = val_dataloader
    test_dataloaler = test_dataloader

In [None]:
# Training hyper-parameters.
N_EPOCHS = 8
BATCH_SIZE = 64

# Pass the constant defined above; `BATCH` was never defined and raised NameError.
data_module = BTCPriceDataModule(train_sequences, test_sequences, batch_size=BATCH_SIZE)