### Data Exploration

To make data loading faster, use the [Parquet version](https://www.kaggle.com/c/ubiquant-market-prediction/discussion/301724) of the data; an example notebook can be found [here](https://www.kaggle.com/code/robikscube/fast-data-loading-and-low-mem-with-parquet-files/notebook).

In [14]:
import pandas as pd
import numpy as np
import gc
import sys
import os
import torch
from tqdm import tqdm

In [15]:
# Root folder holding the downloaded competition data. Set the
# UBIQUANT_DATA_ROOT environment variable to run this notebook on another
# machine; the default is the original author's local layout.
DATA_ROOT = os.environ.get(
    'UBIQUANT_DATA_ROOT',
    '/media/user/12TB1/HanLi/GitHub/CMU11785-project/local_data',
)
# The data was downloaded and extracted into the `archive` sub-folder.
dir_train = os.path.join(DATA_ROOT, 'archive')
# Folder read later for the by-time target table.
dir_by_time = os.path.join(DATA_ROOT, 'by_time')
# Reading the parquet file should take 10~30 seconds.
train = pd.read_parquet(os.path.join(dir_train, 'train_low_mem.parquet'))


In [None]:
# Column dtypes / memory footprint, followed by the number of distinct ids.
train.info()
n_unique_investments = train['investment_id'].nunique(dropna=False)
print(f"Unique investment_id: {n_unique_investments}")
# 1211 distinct time steps: probably a daily or monthly interval? (unconfirmed)
n_unique_times = train['time_id'].nunique(dropna=False)
print(f"Unique time_id: {n_unique_times}")

In [None]:
# Preview the first rows of `train`.
train.head()

### Convert data to a by-`time_id` layout:
* Convert feature data into shape=(T, N_features), i.e. (1211, 3579*300)
* Convert target data into shape=(T, N_targets), i.e. (1211, 3579)

In [None]:
# Pivot `train` (rows keyed by time_id and investment_id) into one row per
# time_id:
#   * df_f: features, one column per (investment_id, feature) pair
#   * df_t: targets, one column per investment_id
# Investments that are absent at a time step are filled with zeros.
f_cols = [f'f_{i}' for i in range(300)]
ls_invest_ids = sorted(train['investment_id'].unique())

# Column names use the same investment-major order in which each row is built.
all_feats_cols = [f"{invst_id}_{f}" for invst_id in ls_invest_ids for f in f_cols]
all_tgts_cols = [f"{invst_id}_target" for invst_id in ls_invest_ids]

# The conversion loop is skipped by default, which leaves df_t / df_f empty.
# Set this to True for the real run.
RUN_CONVERSION = False

# Placeholder feature vector for an investment missing at a time step.
empty_feats = [0] * len(f_cols)

ls_all_step_fs = []
ls_all_step_tgts = []
dict_all_step_map = {}  # time_id -> {row_id: position of its investment in ls_invest_ids}
for time_id, df_step in train.groupby('time_id'):
    if not RUN_CONVERSION:
        break
    # The groupby key already is the time_id; no need to re-derive it from the
    # column (int() on a 1-element array is deprecated in recent NumPy).
    time_id = int(time_id)

    # Build per-step lookups once instead of re-scanning df_step for every
    # investment. Only the first row per investment_id is kept.
    first_rows = df_step.drop_duplicates(subset='investment_id')
    ids_present = first_rows['investment_id'].tolist()
    feats_by_id = dict(zip(ids_present, first_rows[f_cols].values.tolist()))
    tgt_by_id = dict(zip(ids_present, first_rows['target'].values))
    row_id_by_id = dict(zip(ids_present, first_rows['row_id'].values))

    ls_step_fs = []  # all features at the current time step
    ls_step_tgts = []  # all target values at the current time step
    dict_step_map = {}  # row_id -> position, used to query predictions later
    for k, invest_id in enumerate(ls_invest_ids):
        if invest_id in feats_by_id:
            # Investment is present at this time step: copy its values.
            ls_step_fs.extend(feats_by_id[invest_id])
            ls_step_tgts.append(tgt_by_id[invest_id])
            dict_step_map[row_id_by_id[invest_id]] = k
        else:
            # Otherwise pad with zeros.
            ls_step_fs.extend(empty_feats)
            ls_step_tgts.append(0)  # NOTE: Should we use 0 for empty target values?
    dict_all_step_map[time_id] = dict_step_map
    ls_all_step_fs.append(ls_step_fs)
    ls_all_step_tgts.append(ls_step_tgts)

df_t = pd.DataFrame(ls_all_step_tgts, columns=all_tgts_cols)
df_f = pd.DataFrame(ls_all_step_fs, columns=all_feats_cols)

In [None]:
# Load the by-time target table from disk and display it.
path_target_by_time = os.path.join(dir_by_time, 'target_by_time.parquet')
df_targets = pd.read_parquet(path_target_by_time)
df_targets

In [53]:
# Folder with per-file arrays under `feats/` and `target/`.
DIR_BYID = '/media/user/12TB1/HanLi/GitHub/CMU11785-project/local_data/content/databyid'

# Load file 0 of each kind as a sample (presumably investment 0; verify
# against how the files were written) and report the array shapes.
sample_f = np.load(os.path.join(DIR_BYID, 'feats/0.npy'))
sample_t = np.load(os.path.join(DIR_BYID, 'target/0.npy'))
print("==>> sample_f.shape: ", sample_f.shape)
print("==>> sample_t.shape: ", sample_t.shape)

==>> sample_f.shape:  (1220, 300)
==>> sample_t.shape:  (1220,)


In [8]:
# Shape of the sample feature array.
sample_f.shape

(1220, 300)

### Pytorch-Forecasting

#### 1. TimeSeriesDataSet

In [18]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer

In [66]:
# Build one table from the first `n` per-file arrays: one row per
# (file, position) with the target, the 300 features and the investment_id.
f_cols = [f"f_{i}" for i in range(300)]

n = 5  # number of files (treated as investment ids 0..n-1) to include
ls_dfs = []
# Named `invest_id` rather than `id` so the builtin `id` is not shadowed in
# the kernel namespace.
for invest_id in range(n):
    df_f_id = pd.DataFrame(np.load(os.path.join(DIR_BYID, f'feats/{invest_id}.npy')), columns=f_cols)
    df_t_id = pd.DataFrame(np.load(os.path.join(DIR_BYID, f'target/{invest_id}.npy')), columns=['target'])
    df_f_id['investment_id'] = invest_id
    ls_dfs.append(pd.concat([df_t_id, df_f_id], axis=1))

# Each per-file frame has a 0..len-1 index; reset_index() turns that position
# into the `time_id` column (assumes array row i corresponds to time step i;
# TODO confirm against how the .npy files were written).
df = pd.concat(ls_dfs).reset_index().rename(columns={'index': 'time_id'})
df


Unnamed: 0,time_id,target,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,...,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299,investment_id
0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
2,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
4,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6095,1215,0.968079,-1.554290,-0.838289,-2.424482,0.915358,0.222851,0.305951,-0.223640,-0.234155,...,-0.694756,-0.566056,-1.300175,-1.033787,0.228409,-0.082722,0.180542,0.319280,0.456568,4
6096,1216,-0.514005,-4.977215,-0.494941,-2.558681,1.110870,0.228895,-0.377394,-0.378233,-0.229393,...,1.368321,-1.107128,-1.161238,0.858016,1.042636,0.944268,0.940109,-0.645547,0.501274,4
6097,1217,-0.207239,-3.204356,-0.023446,-1.407762,1.220369,0.237594,-0.714554,-0.571917,-0.201502,...,0.605606,-1.116755,-2.465713,0.675163,0.800786,1.959251,1.591608,-0.853658,0.463734,4
6098,1218,0.318535,-2.848888,0.337431,-1.288697,1.025864,0.236846,-0.315953,-0.539815,-0.121384,...,-0.819054,-0.266339,0.482149,0.967866,0.098972,-1.386759,-0.426667,0.430580,0.457497,4


In [68]:

# Wrap the pandas dataframe in a pytorch-forecasting dataset: one series per
# investment_id, indexed by time_id, with encoder and prediction windows both
# fixed at length 1.
feature_reals = [f"f_{i}" for i in range(300)]
dataset = TimeSeriesDataSet(
    df,
    time_idx="time_id",
    target="target",
    group_ids=["investment_id"],
    # encoder window
    min_encoder_length=1,
    max_encoder_length=1,
    # prediction window
    min_prediction_length=1,
    max_prediction_length=1,
    time_varying_unknown_reals=feature_reals,
)

dataset

<pytorch_forecasting.data.timeseries.TimeSeriesDataSet at 0x7f1327aa6c40>