### Data Exploration

To make data loading faster, use the [Parquet version](https://www.kaggle.com/c/ubiquant-market-prediction/discussion/301724) of the dataset; an example notebook can be found [here](https://www.kaggle.com/code/robikscube/fast-data-loading-and-low-mem-with-parquet-files/notebook).

In [None]:
import pandas as pd
import numpy as np
import gc
import sys
import os
import torch
from tqdm import tqdm

In [None]:
# I downloaded the data and extracted it to a local_data/archive folder.
# Both folders live under one root; set the UBIQUANT_DATA_ROOT environment
# variable to run the notebook on another machine without editing this cell.
# The default keeps the original location.
dir_data_root = os.environ.get(
    'UBIQUANT_DATA_ROOT',
    '/media/user/12TB1/HanLi/GitHub/CMU11785-project/local_data',
)
dir_train = os.path.join(dir_data_root, 'archive')      # raw competition files
dir_by_time = os.path.join(dir_data_root, 'by_time')    # per-time_id (wide) files
# this should take 10~30 seconds.
train = pd.read_parquet(os.path.join(dir_train, 'train_low_mem.parquet'))


In [None]:
# Column dtypes and memory footprint of the raw training frame.
train.info()

# Distinct-value counts; dropna=False counts a missing value as its own
# entry, exactly like len(series.unique()) does.
n_unique_investments = train['investment_id'].nunique(dropna=False)
n_unique_times = train['time_id'].nunique(dropna=False)

print(f"Unique investment_id: {n_unique_investments}")
# 1211 -- probably a daily or monthly interval?
print(f"Unique time_id: {n_unique_times}")

In [None]:
# Preview the first five rows of the raw training frame.
train.head(n=5)

### Convert the data to a per-`time_id` (wide) layout:
* Convert feature data into shape=(T, N_features) = (1211, 3579*300)
* Convert target data into shape=(T, N_targets) = (1211, 3579)

In [None]:
# The conversion below materialises one row of N_investments * 300 Python
# floats per time step, which is very expensive in time and memory.
# It is therefore skipped by default; set this to True for the real run.
RUN_CONVERSION = False

f_cols = [f'f_{i}' for i in range(300)]
ls_invest_ids = sorted(list(train['investment_id'].unique()))

# Wide column names: investments in ascending id order, and for each
# investment its 300 features in order (same layout as the flattened rows).
all_feats_cols = []
all_tgts_cols = []
for invst_id in ls_invest_ids:
    all_feats_cols.extend([f"{invst_id}_{f}" for f in f_cols])
    all_tgts_cols.append(f"{invst_id}_target")


def _pivot_time_step(df_step, invest_index, feature_cols):
    """Flatten the rows of one time step into fixed-width (wide) lists.

    Parameters
    ----------
    df_step : pd.DataFrame
        Rows of `train` that share one time_id. Must contain the columns
        'investment_id', 'row_id', 'target' and every name in `feature_cols`.
    invest_index : pd.Index
        All investment ids in the output order.
    feature_cols : list of str
        Feature column names, in the output order.

    Returns
    -------
    step_fs : list of float
        Features of every investment, concatenated investment by investment;
        investments absent from this step contribute zeros.
    step_tgts : list of float
        Target of every investment; 0 for investments absent from this step.
        NOTE: Should we use 0 for empty target values?
    step_map : dict
        Maps row_id -> position of its investment in `invest_index`, so that
        predictions can be looked up later.
    """
    # If an investment appears more than once, only its first row is used.
    # Sorting by id makes step_map's insertion order follow the output order.
    first_rows = (
        df_step
        .drop_duplicates('investment_id')
        .set_index('investment_id')
        .sort_index()
    )
    # reindex aligns the rows to the full investment list in one pass;
    # fill_value=0 only fills investments that are absent from this step
    # (values that are already NaN in the data are left untouched).
    step_fs = (
        first_rows[feature_cols]
        .reindex(invest_index, fill_value=0)
        .to_numpy()
        .ravel()      # row-major: all features of investment 0, then 1, ...
        .tolist()
    )
    step_tgts = first_rows['target'].reindex(invest_index, fill_value=0).tolist()
    positions = invest_index.get_indexer(first_rows.index)
    step_map = dict(zip(first_rows['row_id'].tolist(), positions.tolist()))
    return step_fs, step_tgts, step_map


ls_all_step_fs = []     # one flattened feature row per time step
ls_all_step_tgts = []   # one target row per time step
dict_all_step_map = {}  # to map target values at each steps so we can query the predictions
if RUN_CONVERSION:
    invest_index = pd.Index(ls_invest_ids)
    # groupby yields the groups in ascending time_id order; the key is the time_id.
    for time_id, df_step in train.groupby('time_id'):
        ls_step_fs, ls_step_tgts, dict_step_map = _pivot_time_step(
            df_step, invest_index, f_cols
        )
        dict_all_step_map[int(time_id)] = dict_step_map
        ls_all_step_fs.append(ls_step_fs)
        ls_all_step_tgts.append(ls_step_tgts)

# When the conversion is skipped, both frames are empty but still carry
# the full set of wide column names.
df_t = pd.DataFrame(ls_all_step_tgts, columns=all_tgts_cols)
df_f = pd.DataFrame(ls_all_step_fs, columns=all_feats_cols)

In [None]:
# Load the per-time_id target table from the by_time folder and display it.
path_targets_by_time = os.path.join(dir_by_time, 'target_by_time.parquet')
df_targets = pd.read_parquet(path_targets_by_time)
df_targets