In [1]:
import os
import gc
# import warnings
# warnings.filterwarnings('ignore')

import torch
import pandas as pd
import numpy as np
import datetime as dt
import talib as tb
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from tensorboardX import SummaryWriter

%matplotlib inline

## load and preprocess data

In [2]:
path = './g-research-crypto-forecasting/'
asset_list = pd.read_csv(path+"asset_details.csv")
train_dataset = pd.read_csv(path+"train.csv")
validation_dataset = pd.read_csv(path+"supplemental_train.csv")

In [3]:
# purging to avoid leakage of information
train_dataset = train_dataset.iloc[:round(len(train_dataset)*0.95)]

val_len = len(validation_dataset)
test_dataset = validation_dataset.iloc[-round(0.45*val_len):]
validation_dataset = validation_dataset.iloc[:round(0.45*val_len)]

In [4]:
test_val_gap = (test_dataset['timestamp'].min() - validation_dataset['timestamp'].max())/3600/24
train_val_gap = (validation_dataset['timestamp'].min() - train_dataset['timestamp'].max())/3600/24
print(f"gap between train dataset and validation dataset: {train_val_gap:.2f} days")
print(f"gap between validation dataset and test_dataset: {test_val_gap:.2f} days")

gap between train dataset and validation dataset: 60.12 days
gap between validation dataset and test_dataset: 12.49 days


In [5]:
train_dataset.dropna(axis=0, subset="Target", inplace=True)
validation_dataset.dropna(axis=0, subset="Target", inplace=True)
test_dataset.dropna(axis=0, subset="Target", inplace=True)

train_dataset.reset_index(inplace=True)
validation_dataset.reset_index(inplace=True)
test_dataset.reset_index(inplace=True)

In [6]:
asset_names = [asset_list[asset_list['Asset_ID']==aid]['Asset_Name'].item() for aid in range(14)]
asset_names

['Binance Coin',
 'Bitcoin',
 'Bitcoin Cash',
 'Cardano',
 'Dogecoin',
 'EOS.IO',
 'Ethereum',
 'Ethereum Classic',
 'IOTA',
 'Litecoin',
 'Maker',
 'Monero',
 'Stellar',
 'TRON']

In [7]:
asset_list

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
3,5,1.386294,EOS.IO
4,7,2.079442,Ethereum Classic
5,6,5.894403,Ethereum
6,9,2.397895,Litecoin
7,11,1.609438,Monero
8,13,1.791759,TRON
9,12,2.079442,Stellar


In [8]:
def process_one_asset(asset):
    asset.sort_values(by='timestamp', inplace=True)
    scaler = StandardScaler()
    
    asset.loc[np.isinf(asset['VWAP']), 'VWAP'] = asset[~np.isinf(asset['VWAP'])]['VWAP'].mean()
    asset.loc[np.isnan(asset['VWAP']), 'VWAP'] = asset[~np.isnan(asset['VWAP'])]['VWAP'].mean()

    asset['Vol'] = (asset['Close'].diff()/asset['Close'])**2
    # add some time series feature
    primary_features = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    asset[primary_features] = scaler.fit_transform(asset[primary_features])

    asset['upper_shadow'] = asset['High'] - asset[['Open', 'Close']].max(axis=1)
    asset['lower_shadow'] = asset[['Open', 'Close']].min(axis=1) - asset['Low']
    asset['upper_ratio'] = asset['upper_shadow']/(asset['High'] - asset['Low'])
    asset['lower_ratio'] = asset['lower_shadow']/(asset['High'] - asset['Low'])
    asset['Average_volume'] = asset['Volume']/asset['Count']
    asset['Regret'] = asset['VWAP']/asset['Close']

    asset['MA5'] = asset['Close'].rolling(5).mean()
    asset['MA15'] = asset['Close'].rolling(15).mean()
    asset['MA30'] = asset['Close'].rolling(30).mean()
    asset['MA60'] = asset['Close'].rolling(60).mean()

    asset['VMA5'] = asset['Volume'].rolling(5).mean()
    asset['VMA15'] = asset['Volume'].rolling(15).mean()
    asset['VMA30'] = asset['Volume'].rolling(30).mean()
    asset['VMA60'] = asset['Volume'].rolling(60).mean()

    asset['RV5'] = asset['Vol'].rolling(5).sum()
    asset['RV15'] = asset['Vol'].rolling(15).sum()
    asset['RV30'] = asset['Vol'].rolling(30).sum()
    asset['RV60'] = asset['Vol'].rolling(60).sum()
    
    asset.fillna(0, inplace=True)
    
    secondary_feature = ['Vol', 'upper_shadow', 'lower_shadow', 'upper_ratio', 'lower_ratio', 
                         'Average_volume', 'Regret', 'MA5', 'MA15', 'MA30', 'MA60', 'VMA5', 
                         'VMA15', 'VMA30', 'VMA60', 'RV5', 'RV15', 'RV30', 'RV60']
    
    asset[secondary_feature] = scaler.fit_transform(asset[secondary_feature])
    
    return asset

In [9]:
def process_time_series_feature(data, verbose=False):
    processed_assets = []
    for aid in range(14):
        if verbose:
            print(f"Processing: {asset_names[aid]}")
        asset = data[data['Asset_ID'] == aid].copy(deep=True)
        asset = process_one_asset(asset)
        processed_assets.append(asset)
    
    if verbose:
        print("Concat processed single asset data")
    processed_assets = pd.concat(processed_assets)
    processed_assets.sort_values(by='index', inplace=True)
    processed_assets.drop(['index'], axis=1, inplace=True)

    if verbose:
        print("One-Hot encoding")

    onehot = pd.get_dummies(processed_assets['Asset_ID'], dtype=int)
    for i in range(14):
        if i not in onehot.columns:
            onehot[i] = 0
    onehot = onehot.reindex(columns=list(range(14)))
    onehot.columns = [asset_names[aid] for aid in onehot.columns]
    processed_assets = pd.concat([processed_assets, onehot], axis=1)
    
    return processed_assets

In [10]:
print("------------------------------------\nprocess training data\n------------------------------------")
processed_train = process_time_series_feature(train_dataset, verbose=True)
del train_dataset
print("------------------------------------\nprocess validation data\n------------------------------------")
processed_val = process_time_series_feature(validation_dataset, verbose=True)
del validation_dataset
print("------------------------------------\nprocess test data\n------------------------------------")
processed_test = process_time_series_feature(test_dataset, verbose=True)
del test_dataset

------------------------------------
process training data
------------------------------------
Processing: Binance Coin
Processing: Bitcoin
Processing: Bitcoin Cash
Processing: Cardano
Processing: Dogecoin
Processing: EOS.IO
Processing: Ethereum
Processing: Ethereum Classic
Processing: IOTA
Processing: Litecoin
Processing: Maker
Processing: Monero
Processing: Stellar
Processing: TRON
Concat processed single asset data
One-Hot encoding
------------------------------------
process validation data
------------------------------------
Processing: Binance Coin
Processing: Bitcoin
Processing: Bitcoin Cash
Processing: Cardano
Processing: Dogecoin
Processing: EOS.IO
Processing: Ethereum
Processing: Ethereum Classic
Processing: IOTA
Processing: Litecoin
Processing: Maker
Processing: Monero
Processing: Stellar
Processing: TRON
Concat processed single asset data
One-Hot encoding
------------------------------------
process test data
------------------------------------
Processing: Binance Coin
P

## Training Part

In [19]:
class MyDataset(Dataset):
    def __init__(self, df):
        self.groups = df.groupby(['timestamp'])
        self.timestamps = df['timestamp'].unique()
        self.feature_cols = ['Count', 'Open', 'High', 'Low', 'Close','Volume', 'VWAP', 'Vol', 
                             'upper_shadow', 'lower_shadow', 'upper_ratio', 'lower_ratio', 
                             'Average_volume', 'Regret', 'MA5', 'MA15', 'MA30', 'MA60', 'VMA5', 
                             'VMA15', 'VMA30', 'VMA60', 'RV5', 'RV15', 'RV30', 'RV60', 'Binance Coin', 
                             'Bitcoin', 'Bitcoin Cash', 'Cardano', 'Dogecoin', 'EOS.IO', 'Ethereum', 
                             'Ethereum Classic', 'IOTA', 'Litecoin', 'Maker', 'Monero', 'Stellar', 'TRON']
    
    def __len__(self):
        return len(self.timestamps)
    
    def __getitem__(self, idx):
        ts = self.timestamps[idx]
        data = self.groups.get_group(ts)
        x = data.loc[:, self.feature_cols].values
        num_assets = x.shape[0]
        if num_assets < 14:
            padding = np.zeros((14-num_assets, x.shape[1]))
            x = np.concatenate((x, padding), axis=0)
        x = torch.tensor(x, dtype=torch.float32)
        
        y = data.loc[:, 'Target'].values
        y = torch.tensor(y, dtype=torch.float32)
        y = F.pad(y, (0, 14-num_assets))
        return x, y, num_assets

In [15]:
class CryptoTransformer(nn.Module):
    def __init__(self, model_dim=64, num_heads=2, num_layers=2, dropout=0.2, ffn_dim=128, num_features=40):
        super().__init__()
        
        self.input_up = nn.Linear(num_features, model_dim)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model = model_dim, 
                                                        nhead = num_heads,
                                                        dim_feedforward = ffn_dim,
                                                        dropout = dropout,
                                                        batch_first = True)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.output = nn.Linear(model_dim, 1)
        
    def forward(self, x):
        emb_x = self.input_up(x)
        enc_x = self.encoder(emb_x)
        output = self.output(enc_x)
        return output

In [78]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.)

In [79]:
def train_loop(dataloader, net, loss_fn, optimizer, device):
    running_loss = 0
    current = 0
    net.train()

    with tqdm(dataloader) as t:
        for batch, (X, y, num_assets) in enumerate(t):
            batch_size = X.shape[0]
            X = X.to(device)
            y = y.to(device)
            num_assets = num_assets.to(device)
            y_pred = net(X)
            y_pred = y_pred.view(batch_size, 14)

            mask = torch.arange(14).expand(batch_size, 14).to(device)
            mask = mask < num_assets.unsqueeze(1)
            y_pred = y_pred * mask.float()
            y = y * mask.float()
            
            loss = loss_fn(y_pred, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss = (batch_size * loss.item() + running_loss * current) / (batch_size + current)
            current += batch_size
            t.set_postfix({'running_loss':running_loss})
    
    return running_loss

In [80]:
def val_loop(dataloader, net, loss_fn, device):
    running_loss = 0.0
    current = 0
    net.eval()
    
    with torch.no_grad():
        with tqdm(dataloader) as t:
            for batch, (X, y, num_assets) in enumerate(t):
                batch_size = X.shape[0]
                X = X.to(device)
                y = y.to(device)
                num_assets = num_assets.to(device)
                y_pred = net(X)
                y_pred = y_pred.view(batch_size, 14)

                mask = torch.arange(14).expand(batch_size, 14).to(device)
                mask = mask < num_assets.unsqueeze(1)
                y_pred = y_pred * mask.float()
                y = y * mask.float()

                loss = loss_fn(y_pred, y)
                running_loss = (batch_size * loss.item() + running_loss * current) / (batch_size + current)
                t.set_postfix({'running_loss':running_loss})
                
    return running_loss

In [81]:
if not os.path.exists('./checkpoint'):
    os.mkdir('./checkpoint')
if not os.path.exists('./logs'):
    os.mkdir('./logs')

In [82]:
# hyperparameters
BATCH_SIZE = 128
NUM_FEATURES = 40
MODEL_DIM = 32
FFN_DIM = 64
DROPOUT = 0.3
NUM_HEADS = 2
NUM_LAYERS = 2
MAX_EPOCH = 10
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [88]:
train_dataset = MyDataset(processed_train)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = MyDataset(processed_val)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [89]:
net = CryptoTransformer(model_dim=MODEL_DIM, num_heads=NUM_HEADS, num_layers=NUM_LAYERS, dropout=DROPOUT, ffn_dim=FFN_DIM).to(device)
net.apply(init_weights)
loss_fn = nn.MSELoss(reduction='mean')
optimizer = torch.optim.AdamW(net.parameters())
total_num = sum(p.numel() for p in net.parameters())
trainable_num = sum(p.numel() for p in net.parameters() if p.requires_grad)
print(f"Total number of parameters: {total_num/1e3:.0f}K, number of trainable parameters: {trainable_num/1e3:.0f}K")

Total number of parameters: 27K, number of trainable parameters: 27K


In [90]:
min_val_loss = float('inf')
best_epoch = 1

tb = SummaryWriter(log_dir='./logs/experiment2/')

for t in range(1, MAX_EPOCH+1):
    print(f"Epoch {t}\n-------------------------------")
    train_loss = train_loop(train_dataloader, net, loss_fn, optimizer, device)
    val_loss = val_loop(val_dataloader, net, loss_fn, device)
    tb.add_scalar("Train Loss", train_loss, t)
    tb.add_scalar("Val Loss", val_loss, t)
    
    torch.save(net, f"./checkpoint/experiment2/epoch_{t}.pt")
    if val_loss < min_val_loss:
        best_epoch = t
        min_val_loss = val_loss
print(f"best epoch: {best_epoch}, minimun validations loss: {min_val_loss:.2e}")

Epoch 1
-------------------------------


  8%|▊         | 1166/14609 [01:44<20:02, 11.18it/s, running_loss=0.0384]


KeyboardInterrupt: 

## Test Part

In [75]:
net = torch.load('./logs/experiment1/epoch_1.pt')
test_dataset = MyDataset(processed_test)

In [24]:
predictions = []
net.eval()
net = net.cpu()
with torch.no_grad():
    with tqdm(test_dataset) as t:
        for x, y, num_assets in t:
            y_pred = net(x)
            y_pred = y_pred[:num_assets]
            predictions.append(y_pred)

  0%|          | 0/81010 [00:00<?, ?it/s]

100%|██████████| 81010/81010 [01:02<00:00, 1288.12it/s]


In [25]:
predictions = torch.cat(predictions)
predictions = predictions.numpy()
processed_test['Predictions'] = predictions
asset_list.set_index('Asset_ID', inplace=True)
processed_test['Weight'] = processed_test['Asset_ID'].apply(lambda x: asset_list.loc[x]['Weight'])

In [70]:
def weighted_mean(x, w):
    return np.sum(x*w)/np.sum(w)

def weighted_cov(x, y, w):
    mean_x = weighted_mean(x, w)
    mean_y = weighted_mean(y, w)
    return weighted_mean((x-mean_x)*(y-mean_y), w)

def weighted_corr(x, y, w):
    return weighted_cov(x, y, w)/np.sqrt(weighted_cov(x, x, w) * weighted_cov(y, y, w))

In [77]:
corr = weighted_corr(processed_test['Predictions'], processed_test['Target'], processed_test['Weight'])
print(f'Weighted Pearson Coefficient: {corr:.2f}')

-0.0008556166236553724