In [None]:
!pip install pytorch-forecasting

In [None]:
import pandas as pd
import numpy as np
import gc
import sys
import os
import torch
from tqdm import tqdm

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
# import dataset, network to train and metric to optimize
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer, QuantileLoss, DeepAR

from pytorch_forecasting.data.encoders import NaNLabelEncoder 

In [None]:
# One-time data download, disabled by default: change the condition to True to run it.
# Fetches the robikscube/ubiquant-parquet Kaggle dataset into /home/ubuntu/data and
# unzips the archive in place (needs the `kaggle` and `unzip` command-line tools;
# presumably also configured Kaggle credentials -- TODO confirm).
if False:
    !kaggle datasets download -d robikscube/ubiquant-parquet -p /home/ubuntu/data
    !unzip -q /home/ubuntu/data/ubiquant-parquet.zip -d /home/ubuntu/data

In [None]:
dir_train = '/home/ubuntu/data'

# pytorch-forecasting has no built-in way of handling datasets that do not fit
# in memory, so only a small fragment of the data is kept here to get familiar
# with the package. For handling large datasets manually, search for
# 'large datasets' on this page:
# https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html
MAX_TIME_ID = 200  # keep only rows with time_id <= this value

data = pd.read_parquet(os.path.join(dir_train, 'train_low_mem.parquet'))

# .copy() detaches the subset from the full frame, so that column assignments
# made on `data` afterwards do not raise pandas' SettingWithCopyWarning.
data = data[data['time_id'] <= MAX_TIME_ID].copy()


In [None]:
# TimeSeriesDataSet requires that
# we turn investment_id to str, time_id to int.
#
# A single astype() with a column->dtype mapping returns a new frame instead of
# assigning columns through attribute access on a filtered frame, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
data = data.astype({'investment_id': str, 'time_id': int})
data.dtypes

In [None]:
# Column names of the 300 feature columns: 'f_0', 'f_1', ..., 'f_299'
f_cols = ['f_%d' % idx for idx in range(300)]

In [None]:
# Window sizes: number of history steps, and number of steps to predict ahead
max_encoder_length, max_prediction_length = 12, 1
# time_id threshold used to select the training rows
training_cutoff = 180

In [None]:
# Training dataset: all rows up to and including `training_cutoff`.
training = TimeSeriesDataSet(
    data[lambda x: x.time_id <= training_cutoff],
    time_idx='time_id',  # column name of time of observation
    target='target',  # column name of target to predict
    group_ids=['investment_id'],  # column name(s) for timeseries IDs
    max_encoder_length=max_encoder_length,  # how much history to use
    max_prediction_length=max_prediction_length,  # how far to predict into future
    # covariates static for a timeseries ID
    static_categoricals=[],
    static_reals=[],
    # investment_id as categorical covariate that is known in the future for prediction
    time_varying_known_categoricals=['investment_id'],
    # put in f_cols and time_id as real covariates that are known in the future for prediction
    time_varying_known_reals=f_cols + ['time_id'],
    time_varying_unknown_categoricals=[],
    # The target itself must be listed as an unknown real: autoregressive models
    # such as DeepAR feed past target values to the encoder and otherwise fail
    # with "AssertionError: target target has to be real".
    time_varying_unknown_reals=['target'],
    allow_missing_timesteps=True,
    # having add_nan=True in Encoder allows us to predict unseen investments
    categorical_encoders={
        '__group_id__investment_id': NaNLabelEncoder(add_nan=True),
        'investment_id': NaNLabelEncoder(add_nan=True),
    },
)

In [None]:
# Training-mode dataloader over the training dataset
train_dataloader = training.to_dataloader(
    train=True,
    batch_size=32,
    num_workers=4,
)

In [None]:
#training.get_parameters()

In [None]:
# Build the validation set from the training dataset's parameters.
# No `categorical_encoders` override is passed: from_dataset() re-uses the
# encoders already fitted on the training data, so every investment_id keeps
# the same integer code in both datasets. Passing fresh, unfitted encoders
# would make them be re-fitted on `data` and could shift the codes relative to
# training. The training encoders were created with add_nan=True, so ids not
# seen during training are still handled.
validation = TimeSeriesDataSet.from_dataset(
    training,
    data,
    predict=True,
    stop_randomization=True,
)

In [None]:
# Non-training dataloader over the validation dataset
val_dataloader = validation.to_dataloader(
    train=False,
    batch_size=320,  # 10x the training batch size of 32
    num_workers=4,
)

In [None]:
# Save both datasets under their respective .pkl file names
for dataset_to_save, pickle_name in ((training, "training.pkl"), (validation, "validation.pkl")):
    dataset_to_save.save(pickle_name)


In [None]:
# Seed all random number generators for reproducible runs
pl.seed_everything(42)

trainer = pl.Trainer(
    gpus=1,
    # clipping gradients is a hyperparameter and important to prevent divergence
    # of the gradient for recurrent neural networks
    gradient_clip_val=0.1,
)

# Build a DeepAR network whose inputs and encoders are derived from `training`.
# NOTE: DeepAR requires the target column to be among the dataset's
# time_varying_unknown_reals; if it is not, from_dataset() raises
# "AssertionError: target target has to be real".
deepar = DeepAR.from_dataset(
    training,
    # not meaningful for finding the learning rate but otherwise very important
    learning_rate=0.03,
    hidden_size=16,  # most important hyperparameter apart from learning rate
    dropout=0.1,  # between 0.1 and 0.3 are good values
    # No `loss` is passed on purpose: DeepAR only accepts distribution losses
    # and defaults to NormalDistributionLoss; QuantileLoss is not one of them.
    # reduce learning rate if no improvement in validation loss after x epochs
    reduce_on_plateau_patience=4,
)
# Report the size of the model that was just built (was `tft`, an undefined name)
print(f"Number of parameters in network: {deepar.size()/1e3:.1f}k")

### PyTorch Forecasting (Han Li 2022-4-17 updates)
* [Data Passing](https://pytorch-forecasting.readthedocs.io/en/stable/tutorials/building.html#passing-data)

In [17]:
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer

# Used Yujie's cleaned version
DIR_BYID = '/media/user/12TB1/HanLi/GitHub/CMU11785-project/local_data/content/databyid'

# Investment ids are the stems of the per-id '<investment_id>.npy' files in the
# 'target' sub-directory. Only .npy entries are considered, so stray entries
# (e.g. Jupyter's '.ipynb_checkpoints' directory) cannot make int() raise
# ValueError.
ls_all_invest_ids = sorted(
    int(os.path.splitext(fn)[0])
    for fn in os.listdir(os.path.join(DIR_BYID, 'target'))
    if fn.endswith('.npy')
)

In [21]:
f_cols = [f"f_{i}" for i in range(300)]

# Read a subset for testing
n = 5
ls_dfs = []
# The loop variable is `invest_id`, not `id`: in a notebook the loop variable
# stays in the kernel namespace, and `id` would shadow the builtin id().
for invest_id in ls_all_invest_ids[:n]:
    # Per-investment feature matrix and target vector, stored as .npy files
    df_f_id = pd.DataFrame(np.load(os.path.join(DIR_BYID, f'feats/{invest_id}.npy')), columns=f_cols)
    df_t_id = pd.DataFrame(np.load(os.path.join(DIR_BYID, f'target/{invest_id}.npy')), columns=['target'])
    df_f_id['investment_id'] = invest_id
    # Put target and features side by side (target first)
    ls_dfs.append(pd.concat([df_t_id, df_f_id], axis=1))

# Each per-id frame has its own 0..N-1 row index; after stacking, that index
# becomes the `time_id` column.
df = pd.concat(ls_dfs).reset_index().rename(columns={'index': 'time_id'})
df


Unnamed: 0,time_id,target,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,...,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299,investment_id
0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
2,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
4,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6095,1215,0.968079,-1.554290,-0.838289,-2.424482,0.915358,0.222851,0.305951,-0.223640,-0.234155,...,-0.694756,-0.566056,-1.300175,-1.033787,0.228409,-0.082722,0.180542,0.319280,0.456568,4
6096,1216,-0.514005,-4.977215,-0.494941,-2.558681,1.110870,0.228895,-0.377394,-0.378233,-0.229393,...,1.368321,-1.107128,-1.161238,0.858016,1.042636,0.944268,0.940109,-0.645547,0.501274,4
6097,1217,-0.207239,-3.204356,-0.023446,-1.407762,1.220369,0.237594,-0.714554,-0.571917,-0.201502,...,0.605606,-1.116755,-2.465713,0.675163,0.800786,1.959251,1.591608,-0.853658,0.463734,4
6098,1218,0.318535,-2.848888,0.337431,-1.288697,1.025864,0.236846,-0.315953,-0.539815,-0.121384,...,-0.819054,-0.266339,0.482149,0.967866,0.098972,-1.386759,-0.426667,0.430580,0.457497,4


In [24]:
# Build a TimeSeriesDataSet directly from the pandas dataframe
dataset = TimeSeriesDataSet(
    df,
    time_idx="time_id",
    target="target",
    group_ids=["investment_id"],
    # min == max for both window lengths: 5 encoder steps and 2 prediction steps
    min_encoder_length=5,
    max_encoder_length=5,
    min_prediction_length=2,
    max_prediction_length=2,
    # the 300 feature columns f_0 ... f_299
    time_varying_unknown_reals=["f_%d" % idx for idx in range(300)],
)

# Show the parameters the dataset was created with
dataset.get_parameters()

{'time_idx': 'time_id',
 'target': 'target',
 'group_ids': ['investment_id'],
 'weight': None,
 'max_encoder_length': 5,
 'min_encoder_length': 5,
 'min_prediction_idx': 0,
 'min_prediction_length': 2,
 'max_prediction_length': 2,
 'static_categoricals': [],
 'static_reals': [],
 'time_varying_known_categoricals': [],
 'time_varying_known_reals': [],
 'time_varying_unknown_categoricals': [],
 'time_varying_unknown_reals': ['f_0',
  'f_1',
  'f_2',
  'f_3',
  'f_4',
  'f_5',
  'f_6',
  'f_7',
  'f_8',
  'f_9',
  'f_10',
  'f_11',
  'f_12',
  'f_13',
  'f_14',
  'f_15',
  'f_16',
  'f_17',
  'f_18',
  'f_19',
  'f_20',
  'f_21',
  'f_22',
  'f_23',
  'f_24',
  'f_25',
  'f_26',
  'f_27',
  'f_28',
  'f_29',
  'f_30',
  'f_31',
  'f_32',
  'f_33',
  'f_34',
  'f_35',
  'f_36',
  'f_37',
  'f_38',
  'f_39',
  'f_40',
  'f_41',
  'f_42',
  'f_43',
  'f_44',
  'f_45',
  'f_46',
  'f_47',
  'f_48',
  'f_49',
  'f_50',
  'f_51',
  'f_52',
  'f_53',
  'f_54',
  'f_55',
  'f_56',
  'f_57',
  'f_

In [25]:
# Wrap the dataset in a dataloader with a batch size of 4
dataloader = dataset.to_dataloader(batch_size=4)

# Fetch the first batch and print its contents
batch_iterator = iter(dataloader)
x, y = next(batch_iterator)
print("x =", x)
print("\ny =", y)
print("\nsizes of x =")
# x is a mapping of names to tensors; list the size of each entry
for name, tensor_value in x.items():
    print(f"\t{name} = {tensor_value.size()}")

x = {'encoder_cat': tensor([], size=(4, 5, 0), dtype=torch.int64), 'encoder_cont': tensor([[[ 0.8952,  1.1545,  1.4696,  ..., -0.2996, -1.6060,  5.2190],
         [ 0.8705,  0.8604,  1.5183,  ..., -0.5999, -1.5334,  5.2132],
         [ 0.9316,  0.2112,  1.5212,  ..., -0.6561, -1.3178,  1.6526],
         [ 0.9680, -0.4409,  1.5621,  ..., -0.3906, -1.3156,  1.5707],
         [ 1.0579, -0.9110,  1.5842,  ..., -0.1730, -1.2395,  1.4460]],

        [[ 1.2796,  0.9712, -0.1817,  ...,  1.5526, -1.6673,  0.1927],
         [ 1.1332,  0.7899, -0.2313,  ...,  1.1810,  1.3822,  0.6540],
         [ 0.8378,  0.9057, -0.2258,  ..., -1.9277, -0.8170,  0.6802],
         [ 0.7789,  1.1605,  0.0203,  ..., -0.4066,  1.2564,  1.3275],
         [ 0.3965,  1.2627, -1.7200,  ..., -2.3754,  1.1802,  3.1274]],

        [[-0.1336, -0.4002, -0.1538,  ...,  0.4063,  0.9587,  1.7518],
         [ 0.0537, -0.3221,  0.1672,  ..., -0.1246, -1.1590,  1.7976],
         [-0.1631,  0.0553,  0.2934,  ..., -0.5409, -1.3804, 