### Data Exploration

To make data loading faster, use the [Parquet version](https://www.kaggle.com/c/ubiquant-market-prediction/discussion/301724) of the dataset; an example notebook can be found [here](https://www.kaggle.com/code/robikscube/fast-data-loading-and-low-mem-with-parquet-files/notebook).

In [1]:
import pandas as pd
import numpy as np
import gc
import sys
import os
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# I downloaded the data and extracted it to a local_data/archive folder.
# The data root can be overridden with the UBIQUANT_DATA_ROOT environment
# variable so the notebook also runs on other machines; the default is the
# original local location.
DATA_ROOT = os.environ.get(
    'UBIQUANT_DATA_ROOT',
    '/media/user/12TB1/HanLi/GitHub/CMU11785-project/local_data')
dir_train = os.path.join(DATA_ROOT, 'archive')    # raw competition data
dir_by_time = os.path.join(DATA_ROOT, 'by_time')  # per-time_id (wide) data
# this should take 10~30 seconds.
train = pd.read_parquet(os.path.join(dir_train, 'train_low_mem.parquet'))


In [None]:
# Quick overview of the raw frame: schema/memory first, then how many
# distinct investments and time steps it contains.
train.info()
n_unique_investments = len(train['investment_id'].unique())
n_unique_time_ids = len(train['time_id'].unique())
print(f"Unique investment_id: {n_unique_investments}")
# 1211 time steps: probably a daily or monthly interval?
print(f"Unique time_id: {n_unique_time_ids}")

### Convert the data to a per-`time_id` layout:
* Convert feature data into shape=(T, N_investments * N_features), i.e. (1211, 3579*300)
* Convert target data into shape=(T, N_targets), i.e. (1211, 3579)

In [None]:
# Reshape the long `train` frame (one row per investment per time step) into a
# wide layout with one row per time_id:
#   * df_f: all features of all investments side by side
#   * df_t: the targets of all investments side by side
# Investments that are absent at a time step are filled with zeros.
f_cols = [f'f_{i}' for i in range(300)]
ls_invest_ids = sorted(list(train['investment_id'].unique()))

# Set to True for the real run. The full conversion is slow, so it is skipped
# by default and df_t / df_f come out empty (columns only).
RUN_CONVERSION = False

# Column names of the wide layout, in sorted investment_id order.
all_feats_cols = []
all_tgts_cols = []
for invst_id in ls_invest_ids:
    all_feats_cols.extend([f"{invst_id}_{f}" for f in f_cols])
    all_tgts_cols.append(f"{invst_id}_target")

ls_all_step_fs = []
ls_all_step_tgts = []
dict_all_step_map = {} # to map target values at each steps so we can query the predictions
for time_key, df_step in train.groupby('time_id'):
    if not RUN_CONVERSION:
        break
    # The group key already is the time_id of every row in this group.
    time_id = int(time_key)

    # Pull the step's data out of pandas once instead of re-filtering the
    # frame for every investment id.
    step_feats = df_step[f_cols].values.tolist()
    step_tgts = df_step['target'].values
    step_row_ids = df_step['row_id'].values
    # investment_id -> position inside this step; keep the first occurrence
    # in case an id shows up more than once.
    pos_of_invest = {}
    for pos, inv in enumerate(df_step['investment_id'].tolist()):
        pos_of_invest.setdefault(inv, pos)

    ls_step_fs = [] # to hold all features at current timestep
    ls_step_tgts = [] # to hold all target values at current timestep
    dict_step_map = {} # row_id -> column position of its investment
    for k, invest_id in enumerate(ls_invest_ids):
        pos = pos_of_invest.get(invest_id)
        if pos is not None:
            # If investment_id is in current timestep, include its features
            ls_invest_id_fs = step_feats[pos]
            ls_step_tgts.append(step_tgts[pos])
            dict_step_map[step_row_ids[pos]] = k
        else:
            # Otherwise append zeros
            ls_invest_id_fs = [0]*len(f_cols) # all features are empty
            ls_step_tgts.append(0) # NOTE: Should we use 0 for empty target values?
        ls_step_fs.extend(ls_invest_id_fs)
    dict_all_step_map[time_id] = dict_step_map
    ls_all_step_fs.append(ls_step_fs)
    ls_all_step_tgts.append(ls_step_tgts)

df_t = pd.DataFrame(ls_all_step_tgts , columns=all_tgts_cols)
df_f = pd.DataFrame(ls_all_step_fs , columns=all_feats_cols)

In [5]:
# Load the per-time_id target table from the by_time directory and display it.
path_targets_by_time = os.path.join(dir_by_time, 'target_by_time.parquet')
df_targets = pd.read_parquet(path_targets_by_time)
df_targets

Unnamed: 0,0_target,1_target,2_target,3_target,4_target,6_target,7_target,8_target,9_target,10_target,...,3763_target,3764_target,3765_target,3766_target,3767_target,3768_target,3769_target,3770_target,3772_target,3773_target
0,0.000000,-0.300875,-0.231040,0.000000,0.000000,0.568807,-1.064780,-0.531940,1.505904,-0.260731,...,0.000000,0.000000,0.302557,0.003156,0.000000,-0.392297,-0.877746,-0.284696,0.202003,0.000000
1,0.000000,-0.917045,-0.472108,0.000000,0.000000,-0.147971,-0.372692,-0.105693,0.622500,-0.400038,...,0.000000,0.000000,-0.560079,0.250396,1.318857,-0.227782,-0.684049,-0.894825,-0.286612,0.000000
2,0.000000,-0.480234,-0.661659,0.000000,0.000000,0.243674,0.318899,-0.260137,-0.610705,-1.030857,...,0.000000,0.000000,-0.305467,2.031675,-0.040981,-0.018971,-0.250995,-0.323800,0.300915,0.000000
3,0.000000,-0.323562,-0.055215,0.000000,0.000000,1.816745,-0.711446,-0.640987,5.271096,-0.636719,...,0.000000,0.000000,-0.730791,0.857357,0.386379,-0.708491,-0.165561,0.836601,0.076417,0.000000
4,0.000000,2.494479,0.341267,0.000000,0.000000,0.470476,-1.581650,-0.592970,0.000000,-0.435524,...,0.000000,0.000000,-0.656495,1.897659,-1.476258,-0.210125,-0.206145,0.126859,-0.387297,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1206,0.579622,0.078338,-0.361899,1.624498,0.968079,-0.353440,0.000000,-0.665880,-0.344785,-0.818674,...,-0.074509,0.608381,-0.872156,-0.768846,0.000000,0.045190,0.227717,-0.811419,0.292205,-0.049360
1207,0.064896,-0.290113,-0.688692,-0.070665,-0.514005,-0.095650,0.000000,0.095912,-0.161798,-0.489038,...,0.163917,0.666394,-0.918939,1.095260,0.000000,-0.104977,-0.230372,-1.410046,0.651031,0.526346
1208,2.204664,-0.097136,0.007041,1.532567,-0.207239,0.287350,0.000000,0.567659,-1.444997,0.313066,...,0.351041,0.511106,0.261462,-0.467051,0.000000,0.577800,0.345404,-0.477728,-0.422950,1.492833
1209,1.037581,-0.080295,-0.082326,0.384600,0.318535,-0.187984,0.000000,-0.586860,-0.371770,0.534930,...,-0.029728,0.990797,-0.154512,-0.254095,0.000000,-0.164939,0.644711,-0.119869,0.513739,-0.386294


In [67]:
class UbiquantDatasetByTime(torch.utils.data.Dataset):
    """Sliding-window dataset over per-time-step feature files.

    Item ``ind`` is a pair ``(xx, yy)``: ``xx`` stacks the features of
    ``lookback`` consecutive time steps starting at position ``ind`` (in
    sorted time_id order), and ``yy`` is the row of the target table
    ``horizon`` steps after the end of that window.

    Args:
        dir_steps: Directory holding one ``<time_id>.parquet`` file per step.
        dir_target_parquet: Path of the parquet file with the target table.
            Its columns must be named ``<investment_id>_<suffix>``.
            NOTE(review): rows are addressed by position, so they are assumed
            to be ordered like the sorted time_ids -- confirm.
        lookback: Number of consecutive time steps in each input window.
        horizon: Offset of the target row after the last step of the window.
    """

    def __init__(self,
                 dir_steps,
                 dir_target_parquet,
                 lookback=7,
                 horizon=1):

        self.X_dir = dir_steps
        self.Y_dir = dir_target_parquet
        self.X_files = os.listdir(self.X_dir) # unsorted
        self.Y_df = pd.read_parquet(dir_target_parquet)
        # Only '<time_id>.parquet' entries are time steps; skipping anything
        # else keeps stray files (e.g. hidden files) from breaking int().
        self.time_ids = sorted(int(s.split('.')[0]) for s in self.X_files
                               if s.endswith('.parquet'))
        self.invst_ids = [int(col.split('_')[0]) for col in self.Y_df.columns]
        # Number of complete (window, target) pairs; never negative so that
        # len() stays valid when there are too few steps.
        self.length = max(0, len(self.time_ids) - lookback - horizon + 1)
        self.lookback = lookback
        self.horizon = horizon

    def __len__(self):
        """Return the number of (window, target) samples."""
        return self.length

    @staticmethod
    def _stack_steps(steps):
        """Stack per-step tables into one array with a leading time axis.

        Singleton axes are squeezed out of each step individually, so the
        time axis survives even when there is only a single step.
        """
        if len(steps) == 0:
            return np.empty((0,))
        return np.stack([np.asarray(step).squeeze() for step in steps])

    def __getitem__(self, ind):
        """Return ``(xx, yy)`` for sample ``ind``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``ind`` is outside ``[-len(self), len(self))``.
        """
        # Validate before touching the disk: without this a negative index
        # would silently build a wrapped-around window.
        if ind < 0:
            ind += self.length
        if not 0 <= ind < self.length:
            raise IndexError(f"index out of range for dataset of length {self.length}")
        # TODO: check if we can improve performance by reading parquet without pandas
        ls_x = [pd.read_parquet(os.path.join(self.X_dir, f"{self.time_ids[i]}.parquet"),
                                engine='fastparquet') for i in range(ind, ind+self.lookback)]
        xx = self._stack_steps(ls_x)
        yy = self.Y_df.iloc[ind+self.lookback+self.horizon-1].to_numpy()
        return xx, yy


# Smoke test: build the dataset with a 10-step window and a 1-step horizon,
# fetch one sample and report the shapes of its input and target arrays.
steps_dir = os.path.join(dir_by_time, 'steps')
targets_parquet = os.path.join(dir_by_time, 'target_by_time.parquet')
dd = UbiquantDatasetByTime(
    dir_steps=steps_dir,
    dir_target_parquet=targets_parquet,
    lookback=10,
    horizon=1,
)

xx, yy = dd[1]
print("==>> xx.shape:\n", xx.shape)
print("==>> yy.shape:\n", yy.shape)


==>> xx.shape:
 (10, 1073700)
==>> yy.shape:
 (3579,)
