In [1]:
!pip install pandas==1.2.5
!pip install --quiet tqdm==4.62.2

Collecting pandas==1.2.5
  Downloading pandas-1.2.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (9.9 MB)
     |████████████████████████████████| 9.9 MB 4.4 MB/s            
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.3.5
    Uninstalling pandas-1.3.5:
      Successfully uninstalled pandas-1.3.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
beatrix-jupyterlab 3.1.6 requires google-cloud-bigquery-storage, which is not installed.
woodwork 0.13.0 requires pandas>=1.3.0, but you have pandas 1.2.5 which is incompatible.
tfx-bsl 1.5.0 requires absl-py<0.13,>=0.9, but you have absl-py 0.15.0 which is incompatible.
tfx-bsl 1.5.0 requires numpy<1.20,>=1.16, but you have numpy 1.20.3 which is incompatible.
tfx-bsl 1.5.0 requires pyarrow<6,>=1, but you have pyarrow 6.0.1 which 

In [2]:
!pip install --quiet pytorch-lightning==1.2.5



In [3]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pytorch_lightning as pl
from sklearn.preprocessing import MinMaxScaler
import os
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [4]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

rcParams['figure.figsize'] = 12, 8
tqdm.pandas()  # to use apply function from pandas

## Seeding Everything

In [5]:
pl.seed_everything(42)

42

## Loading Data

In [6]:
link = '../input/finance/Binance_BTCUSDT_minute1.csv'
data = pd.read_csv(link, parse_dates=["date"])

In [7]:
data = data.sort_values(by="date").reset_index(drop=True)
data.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount
0,1585700000000.0,2020-01-04 00:00:00,BTC/USDT,6407.1,6422.82,6407.1,6417.24,427.762,2744209.0,1082
1,1585700000000.0,2020-01-04 00:01:00,BTC/USDT,6417.23,6418.28,6410.24,6415.5,115.233,739144.6,375
2,1585700000000.0,2020-01-04 00:02:00,BTC/USDT,6415.36,6418.0,6411.5,6416.26,127.199,815992.2,360
3,1585700000000.0,2020-01-04 00:03:00,BTC/USDT,6416.26,6419.2,6414.4,6416.92,104.306,669463.9,278
4,1585700000000.0,2020-01-04 00:04:00,BTC/USDT,6416.92,6416.92,6410.55,6412.93,90.538,580586.8,267


In [8]:
data['prev_close'] = data.shift(1)['close']
data.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount,prev_close
0,1585700000000.0,2020-01-04 00:00:00,BTC/USDT,6407.1,6422.82,6407.1,6417.24,427.762,2744209.0,1082,
1,1585700000000.0,2020-01-04 00:01:00,BTC/USDT,6417.23,6418.28,6410.24,6415.5,115.233,739144.6,375,6417.24
2,1585700000000.0,2020-01-04 00:02:00,BTC/USDT,6415.36,6418.0,6411.5,6416.26,127.199,815992.2,360,6415.5
3,1585700000000.0,2020-01-04 00:03:00,BTC/USDT,6416.26,6419.2,6414.4,6416.92,104.306,669463.9,278,6416.26
4,1585700000000.0,2020-01-04 00:04:00,BTC/USDT,6416.92,6416.92,6410.55,6412.93,90.538,580586.8,267,6416.92


## Apply column function to create closing change variable

In [9]:
data['closing_change'] = data.progress_apply( 
    lambda row: 0 if np.isnan(row.prev_close) else row.close - row.prev_close, axis = 1)

  0%|          | 0/1048574 [00:00<?, ?it/s]

In [10]:
#data.drop("prev_change", axis=1, inplace=True)
data.head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount,prev_close,closing_change
0,1585700000000.0,2020-01-04 00:00:00,BTC/USDT,6407.1,6422.82,6407.1,6417.24,427.762,2744209.0,1082,,0.0
1,1585700000000.0,2020-01-04 00:01:00,BTC/USDT,6417.23,6418.28,6410.24,6415.5,115.233,739144.6,375,6417.24,-1.74
2,1585700000000.0,2020-01-04 00:02:00,BTC/USDT,6415.36,6418.0,6411.5,6416.26,127.199,815992.2,360,6415.5,0.76
3,1585700000000.0,2020-01-04 00:03:00,BTC/USDT,6416.26,6419.2,6414.4,6416.92,104.306,669463.9,278,6416.26,0.66
4,1585700000000.0,2020-01-04 00:04:00,BTC/USDT,6416.92,6416.92,6410.55,6412.93,90.538,580586.8,267,6416.92,-3.99


## Features Engineering

In [11]:
rows = []

for i, row in tqdm(data.iterrows(), total=data.shape[0]): # data.shape[0] = len(data)
    row_data = dict(
        day_of_week = row.date.dayofweek,
        day_of_month = row.date.day,
        week_of_year = row.date.week,
        month = row.date.month,
        open = row.open,
        high = row.high,
        low = row.low,
        close_change = row.closing_change,
        close = row.close
    )
    rows.append(row_data)
    
features_df = pd.DataFrame(rows)

  0%|          | 0/1048574 [00:00<?, ?it/s]

In [12]:
print(features_df.shape)
features_df.head()

(1048574, 9)


Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,5,4,1,1,6407.1,6422.82,6407.1,0.0,6417.24
1,5,4,1,1,6417.23,6418.28,6410.24,-1.74,6415.5
2,5,4,1,1,6415.36,6418.0,6411.5,0.76,6416.26
3,5,4,1,1,6416.26,6419.2,6414.4,0.66,6416.92
4,5,4,1,1,6416.92,6416.92,6410.55,-3.99,6412.93


## Train Test Split

In [13]:
split = 0.9
train_size = int(len(features_df) *split)
print(train_size)

943716


In [14]:
train_df, test_df = features_df[:train_size], features_df[train_size+1:]
train_df.shape, test_df.shape

((943716, 9), (104857, 9))

## Normalizing the Data using MinMaxScaler

In [15]:
scaler = MinMaxScaler(feature_range = (-1, 1))
scaler = scaler.fit(train_df)

## Tranform

In [16]:
train_df = pd.DataFrame(scaler.transform(train_df), 
                        index=train_df.index,
                        columns=train_df.columns)
train_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,0.666667,-0.8,-1.0,-1.0,-0.917292,-0.918927,-0.914516,0.259101,-0.916981
1,0.666667,-0.8,-1.0,-1.0,-0.916981,-0.919067,-0.914419,0.259015,-0.917034
2,0.666667,-0.8,-1.0,-1.0,-0.917039,-0.919075,-0.914381,0.259138,-0.917011
3,0.666667,-0.8,-1.0,-1.0,-0.917011,-0.919038,-0.914292,0.259133,-0.916991
4,0.666667,-0.8,-1.0,-1.0,-0.916991,-0.919108,-0.91441,0.258905,-0.917113


In [17]:
test_df = pd.DataFrame(scaler.transform(test_df), 
                        index=test_df.index,
                        columns=test_df.columns)
test_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
943717,0.0,0.466667,0.923077,1.0,0.370962,0.370268,0.375289,0.257672,0.37007
943718,0.0,0.466667,0.923077,1.0,0.370069,0.369374,0.374455,0.257627,0.369149
943719,0.0,0.466667,0.923077,1.0,0.369149,0.368762,0.373389,0.257397,0.368085
943720,0.0,0.466667,0.923077,1.0,0.368085,0.367688,0.373077,0.259521,0.368348
943721,0.0,0.466667,0.923077,1.0,0.368348,0.367777,0.372121,0.257017,0.367047


## Cutting the dataset into sequences

In [18]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length):
    
    sequences = []
    data_size = len(input_data)
    
    for i in tqdm(range(data_size - sequence_length)):
        sequence = input_data[i:i+sequence_length]
        
        label_position = i + sequence_length
        label = input_data.iloc[label_position][target_column]
        
        sequences.append((sequence, label))

    return sequences

## Example to understand sequence

In [19]:
sample1 = pd.DataFrame(dict(
    feature_1 = [1, 2, 3, 4, 5],
    label = [6, 7, 8, 9, 10]

))

sample1.head()

Unnamed: 0,feature_1,label
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [20]:
sample_sequences = create_sequences(sample1, "label", sequence_length=3)
sample_sequences

  0%|          | 0/2 [00:00<?, ?it/s]

[(   feature_1  label
  0          1      6
  1          2      7
  2          3      8,
  9),
 (   feature_1  label
  1          2      7
  2          3      8
  3          4      9,
  10)]

In [21]:
len(sample_sequences)

2

In [22]:
print(sample_sequences[0][0])
print()
print(f"label: {sample_sequences[0][1]}")

   feature_1  label
0          1      6
1          2      7
2          3      8

label: 9


In [23]:
print(sample_sequences[1][0])
print()
print(sample_sequences[1][1])


   feature_1  label
1          2      7
2          3      8
3          4      9

10


In [24]:
SEQUENCE_LENGTH = 60

train_sequences = create_sequences(train_df, "close", SEQUENCE_LENGTH)
test_sequences  = create_sequences(test_df, "close", SEQUENCE_LENGTH)

  0%|          | 0/943656 [00:00<?, ?it/s]

  0%|          | 0/104797 [00:00<?, ?it/s]

In [25]:
train_sequences[0][1]

-0.920541607497522