In [None]:
!pip install pytorch-forecasting

In [2]:
import pandas as pd
import numpy as np
import gc
import sys
import os
import torch
from tqdm import tqdm

In [17]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
# import dataset, network to train and metric to optimize
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer, QuantileLoss, DeepAR

from pytorch_forecasting.data.encoders import NaNLabelEncoder 

In [4]:
if False:
    !kaggle datasets download -d robikscube/ubiquant-parquet -p /home/ubuntu/data
    !unzip -q /home/ubuntu/data/ubiquant-parquet.zip -d /home/ubuntu/data

In [39]:
dir_train = '/home/ubuntu/data'
data = pd.read_parquet(os.path.join(dir_train, 'train_low_mem.parquet'))

# pytorch-forecasting has no built-in methods for dealing with large dataset
# when dataset is large, memory will run out
# we only use a small fragment of the data to familiarize us with the package
# for manual dealing of large dataset
# search 'large datasets' on this page: https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html
data = data[data['time_id'] <= 200]


Unnamed: 0,row_id,time_id,investment_id,target,f_0,f_1,f_2,f_3,f_4,f_5,...,f_290,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299
0,0_1,0,1,-0.300875,0.932573,0.113691,-0.402206,0.378386,-0.203938,-0.413469,...,0.366028,-1.09562,0.200075,0.819155,0.941183,-0.086764,-1.087009,-1.044826,-0.287605,0.321566
1,0_2,0,2,-0.23104,0.810802,-0.514115,0.742368,-0.616673,-0.194255,1.77121,...,-0.154193,0.912726,-0.734579,0.819155,0.941183,-0.387617,-1.087009,-0.929529,-0.97406,-0.343624
2,0_6,0,6,0.568807,0.393974,0.615937,0.567806,-0.607963,0.068883,-1.083155,...,-0.13802,0.912726,-0.551904,-1.220772,-1.060166,-0.219097,-1.087009,-0.612428,-0.113944,0.243608
3,0_7,0,7,-1.06478,-2.343535,-0.01187,1.874606,-0.606346,-0.586827,-0.815737,...,0.382201,0.912726,-0.266359,-1.220772,0.941183,-0.609113,0.104928,-0.783423,1.15173,-0.773309
4,0_8,0,8,-0.53194,0.842057,-0.262993,2.33003,-0.583422,-0.618392,-0.742814,...,-0.170365,0.912726,-0.741355,-1.220772,0.941183,-0.588445,0.104928,0.753279,1.345611,-0.737624
5,0_9,0,9,1.505904,0.608855,1.369305,-0.761515,0.86586,-0.359269,-1.835762,...,0.333684,-1.09562,-0.335999,0.819155,-1.060166,-0.343812,-1.087009,0.077862,0.142943,-0.05555
6,0_10,0,10,-0.260731,-1.863797,0.113691,1.573864,-0.598433,-0.569936,0.398784,...,0.82156,0.912726,0.476309,-1.220772,0.941183,-0.434315,1.296864,0.171329,1.051288,-0.745335
7,0_12,0,12,-0.469207,0.408954,-0.765238,0.26143,-0.591895,-0.03726,0.668721,...,0.82156,-1.09562,-0.864354,-1.220772,-1.060166,-0.300218,1.296864,-0.779556,0.274961,-0.18252
8,0_13,0,13,0.094525,0.861187,2.373796,-1.148977,0.752205,-0.050502,-2.249047,...,-0.658241,0.912726,0.718282,0.819155,0.941183,4.198117,1.296864,1.854434,0.0,-0.68834
9,0_14,0,14,-0.25112,-2.476555,0.239253,2.222353,-0.582276,-0.618236,0.386263,...,0.82156,-1.09562,-0.615709,-1.220772,-1.060166,-0.647769,0.104928,-0.849789,0.805876,-0.820165


In [29]:
# TimeSeriesDataSet requires that 
# we turn investment_id to str, time_id to int

data.investment_id = data.investment_id.astype(str)
data.time_id = data.time_id.astype(int)
data.dtypes

row_id            object
time_id            int64
investment_id     object
target           float32
f_0              float32
                  ...   
f_295            float32
f_296            float32
f_297            float32
f_298            float32
f_299            float32
Length: 304, dtype: object

In [7]:
# col_names of features, to 
f_cols = [f'f_{i}' for i in range(300)]

In [9]:
max_encoder_length = 12
max_prediction_length = 1
training_cutoff = 180

In [32]:
training = TimeSeriesDataSet(
    data[lambda x: x.time_id <= training_cutoff],
    time_idx='time_id',  # column name of time of observation
    target='target',  # column name of target to predict
    group_ids=['investment_id'],  # column name(s) for timeseries IDs
    max_encoder_length=max_encoder_length,  # how much history to use
    max_prediction_length=max_prediction_length,  # how far to predict into future
    # covariates static for a timeseries ID
    static_categoricals=[],
    static_reals=[],
    # investment_id as categorical covariates that are known in the future for preddiction
    time_varying_known_categoricals=['investment_id'],
    # put in f_cols and time_id as real covariates that are known in the future for prediction
    time_varying_known_reals = f_cols + ['time_id'],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[],
    allow_missing_timesteps=True,
    # having add_nan=True in Encoder allows us to predict unseen investments
    categorical_encoders = {'__group_id__investment_id': NaNLabelEncoder(add_nan=True), 'investment_id': NaNLabelEncoder(add_nan=True)}
)



In [37]:
train_dataloader = training.to_dataloader(train=True, batch_size=32, num_workers=4)

In [None]:
#training.get_parameters()

In [34]:
validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True,
    categorical_encoders = {'__group_id__investment_id': NaNLabelEncoder(add_nan=True), 'investment_id': NaNLabelEncoder(add_nan=True)})



In [36]:
val_dataloader = validation.to_dataloader(train=False, batch_size=32 * 10, num_workers=4)

In [35]:
training.save("training.pkl")
validation.save("validation.pkl")


In [38]:
pl.seed_everything(42)

trainer = pl.Trainer(
    gpus=1,
    # clipping gradients is a hyperparameter and important to prevent divergance
    # of the gradient for recurrent neural networks
    gradient_clip_val=0.1,
)


deepar = DeepAR.from_dataset(
    training,
    # not meaningful for finding the learning rate but otherwise very important
    learning_rate=0.03,
    hidden_size=16,  # most important hyperparameter apart from learning rate
    # number of attention heads. Set to up to 4 for large datasets
#    attention_head_size=1,
    dropout=0.1,  # between 0.1 and 0.3 are good values
#    hidden_continuous_size=8,  # set to <= hidden_size
#    output_size=7,  # 7 quantiles by default
    loss=QuantileLoss(),
    # reduce learning rate if no improvement in validation loss after x epochs
    reduce_on_plateau_patience=4,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")
# run into error saying 
# AssertionError: target target has to be real
# working on it

Global seed set to 42
  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
  rank_zero_warn(


AssertionError: target target has to be real