### Sample investments from the full data and create PyTorch Forecasting `TimeSeriesDataSet` objects

In [5]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import pytorch_lightning as pl
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer

import warnings
# Silence every warning for the whole session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Root of the local data. The default is the original absolute path; set the
# CMU11785_DATA_DIR environment variable to point at the data on another machine.
DIR_DATA = os.environ.get(
    'CMU11785_DATA_DIR',
    '/media/user/12TB1/HanLi/GitHub/CMU11785-project/local_data',
)
# Per-investment arrays: create_dataset reads `feats/{id}.npy` and `target/{id}.npy` from here.
DIR_BYID = os.path.join(DIR_DATA, 'content/databyid')

def create_dataset(
    ls_ids,
    dir_pf_dataset,
    max_prediction_length=3,
    max_encoder_length=14,
    min_time_id=400,
    n_features=300,
    dir_byid=None,
):
    """Build train/val/test ``TimeSeriesDataSet`` objects for some investments and save them.

    For every id in ``ls_ids`` the arrays ``feats/{id}.npy`` and ``target/{id}.npy``
    are loaded from ``dir_byid``. The row position inside each array becomes
    ``time_id``. Rows with ``time_id < min_time_id`` are dropped, the remaining rows
    are ordered by ``time_id`` and split *without shuffling* into
    Train : Val : Test = 7 : 2 : 1 (the last 10% of rows form the test set, then 2/9
    of the rest form the validation set).

    NOTE(review): the split is row-based, so rows that share the ``time_id`` at a
    split boundary can end up in different splits.

    Args:
        ls_ids: Sized collection of investment ids; each id must have a matching
            ``feats/{id}.npy`` and ``target/{id}.npy`` file.
        dir_pf_dataset: Directory the three ``.pf`` files are written to. It is
            created if it does not exist.
        max_prediction_length: Maximum decoder (prediction) length.
        max_encoder_length: Maximum encoder length; the minimum encoder length is
            half of it (integer division).
        min_time_id: Smallest ``time_id`` that is kept.
        n_features: Number of feature columns, named ``f_0`` ... ``f_{n_features-1}``.
        dir_byid: Directory holding the ``feats`` and ``target`` sub-directories.
            Defaults to the module-level ``DIR_BYID``.

    Returns:
        None. Writes ``pf_train_{k}_samples.pf``, ``pf_val_{k}_samples.pf`` and
        ``pf_test_{k}_samples.pf`` into ``dir_pf_dataset``, where ``k = len(ls_ids)``.

    Raises:
        ValueError: If ``ls_ids`` is empty.
    """
    # Fail early with a clear message instead of an opaque error from pd.concat([]).
    if len(ls_ids) == 0:
        raise ValueError("ls_ids must contain at least one investment id")
    if dir_byid is None:
        dir_byid = DIR_BYID
    # Make sure the save calls at the end cannot fail on a missing directory.
    os.makedirs(dir_pf_dataset, exist_ok=True)

    f_cols = [f"f_{i}" for i in range(n_features)]
    ls_dfs = []
    for invest_id in ls_ids:
        df_f_id = pd.DataFrame(np.load(os.path.join(dir_byid, f'feats/{invest_id}.npy')), columns=f_cols)
        df_t_id = pd.DataFrame(np.load(os.path.join(dir_byid, f'target/{invest_id}.npy')), columns=['target'])
        df_id = pd.concat([df_t_id, df_f_id], axis=1)
        # The group id is stored as a string because it is also used as a categorical below.
        df_id['investment_id'] = str(invest_id)
        ls_dfs.append(df_id)

    # Each per-investment frame keeps its own 0..T-1 index; that index becomes time_id.
    df = pd.concat(ls_dfs).reset_index().rename(columns={'index': 'time_id'})
    df = df.sort_values(by=['time_id'])  # sort by time before splitting

    df = df.loc[df['time_id'] >= min_time_id]

    # Chronological 7 : 2 : 1 split: 10% test first, then 2/9 of the remaining 90% as validation.
    df_train, df_test = train_test_split(df, test_size=0.1, shuffle=False)
    df_train, df_val = train_test_split(df_train, test_size=2/9, shuffle=False)

    print(f"Create and save new dataset with {len(ls_ids)} samples...")
    # create the dataset from the pandas dataframe
    train_dataset = TimeSeriesDataSet(
        df_train,
        group_ids=["investment_id"],
        target="target",
        time_idx="time_id",
        min_encoder_length=max_encoder_length // 2,
        max_encoder_length=max_encoder_length,
        min_prediction_length=1,
        max_prediction_length=max_prediction_length,
        static_categoricals=["investment_id"],
        static_reals=[],
        time_varying_known_categoricals=[],
        time_varying_known_reals=f_cols,
        time_varying_unknown_categoricals=[],
        time_varying_unknown_reals=['target'],
        # Normalize the target separately for each investment_id.
        # NOTE: do not use softplus or relu for encoder normalization with DeepAR.
        target_normalizer=GroupNormalizer(
            groups=["investment_id"],
            transformation=None,
        ),
        # Add additional features
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
    )
    # Validation and test sets reuse the configuration of the training dataset.
    # Per the PyTorch Forecasting docs, predict=True predicts on the last entries of
    # each group's time index.
    val_dataset = TimeSeriesDataSet.from_dataset(train_dataset, df_val, predict=True, stop_randomization=True)
    test_dataset = TimeSeriesDataSet.from_dataset(train_dataset, df_test, predict=True, stop_randomization=True)
    # Save the datasets so later runs can load them instead of rebuilding.
    train_dataset.save(os.path.join(dir_pf_dataset, f'pf_train_{len(ls_ids)}_samples.pf'))
    val_dataset.save(os.path.join(dir_pf_dataset, f'pf_val_{len(ls_ids)}_samples.pf'))
    test_dataset.save(os.path.join(dir_pf_dataset, f'pf_test_{len(ls_ids)}_samples.pf'))



In [12]:
# Directory the datasets created below are written to. exist_ok avoids the
# check-then-create race of a separate os.path.exists test.
dir_pf_dataset = os.path.join(DIR_DATA, 'pf_dataset_deepar')
os.makedirs(dir_pf_dataset, exist_ok=True)

# Sorted integer ids of every investment that has a saved target array. Only `*.npy`
# entries are parsed, so stray entries (e.g. `.ipynb_checkpoints`) cannot break int().
ls_all_invest_ids = sorted(
    int(fn.split('.')[0])
    for fn in os.listdir(os.path.join(DIR_BYID, 'target'))
    if fn.endswith('.npy')
)

### Create dataset

In [13]:
import random

# Seed the global RNG so the sampled investment ids are reproducible.
random.seed(11785)

# Subset sizes to generate; one 10-investment subset is built here.
# Alternative sweep: [500, 1000, 1500, 2000].
n_samples = [10]
for n_sample in n_samples:
    ls_subset = random.sample(ls_all_invest_ids, n_sample)
    create_dataset(ls_subset, dir_pf_dataset)


Create and save new dataset with 10 samples...


### Create test datasets

In [18]:
# Directory holding the pickled id collections (presumably lists of investment ids,
# since they are passed to create_dataset as `ls_ids` — confirm).
dir_test_ids = '/media/user/12TB1/HanLi/GitHub/CMU11785-project/src/data'

# NOTE: pd.read_pickle unpickles arbitrary objects; only load files you trust.
ls_test_500 = pd.read_pickle(os.path.join(dir_test_ids, 'test_500_ids.pkl'))
ls_test_1000 = pd.read_pickle(os.path.join(dir_test_ids, 'test_1000_ids.pkl'))
ls_test_1500 = pd.read_pickle(os.path.join(dir_test_ids, 'test_1500_ids.pkl'))
ls_test_2000 = pd.read_pickle(os.path.join(dir_test_ids, 'test_2000_ids.pkl'))
ls_test_all = pd.read_pickle(os.path.join(dir_test_ids, 'test_all_ids.pkl'))

# Output directory for this run; exist_ok avoids the check-then-create race.
dir_pf_dataset = os.path.join(DIR_DATA, 'pf_dataset_tft')
os.makedirs(dir_pf_dataset, exist_ok=True)

create_dataset(ls_test_1000, dir_pf_dataset, max_prediction_length=3, max_encoder_length=14)

Create and save new dataset with 1000 samples...


In [4]:
# NOTE: pd.read_pickle unpickles arbitrary objects; only load files you trust.
ls_test_500 = pd.read_pickle('/media/user/12TB1/HanLi/GitHub/CMU11785-project/src/data/test_500_ids.pkl')

# Output directory for this run; exist_ok avoids the check-then-create race.
dir_pf_dataset = os.path.join(DIR_DATA, 'pf_dataset_test')
os.makedirs(dir_pf_dataset, exist_ok=True)

create_dataset(ls_test_500, dir_pf_dataset, max_prediction_length=3, max_encoder_length=14)

Create and save new dataset with 500 samples...


### Example: load the saved datasets

In [None]:
# Load the three splits saved by create_dataset for an `n`-investment subset.
# NOTE(review): relies on `dir_pf_dataset` as set by an earlier cell — confirm it
# points at the directory that holds the `n`-sample files.
n = 500
train_dataset = TimeSeriesDataSet.load(os.path.join(dir_pf_dataset, f'pf_train_{n}_samples.pf'))
val_dataset = TimeSeriesDataSet.load(os.path.join(dir_pf_dataset, f'pf_val_{n}_samples.pf'))
test_dataset = TimeSeriesDataSet.load(os.path.join(dir_pf_dataset, f'pf_test_{n}_samples.pf'))

# Create dataloader from dataset
batch_size = 64  # set this between 32 to 128
# Never request more loader workers than there are CPUs (upper bound stays at 32).
num_workers = min(32, os.cpu_count() or 1)
train_dataloader = train_dataset.to_dataloader(train=True, batch_size=batch_size, num_workers=num_workers)
val_dataloader = val_dataset.to_dataloader(train=False, batch_size=batch_size * 5, num_workers=num_workers)

In [24]:

# Draw 10 investment ids from the global RNG (the result depends on the earlier seed
# and on every draw made since) and save their datasets into the current `dir_pf_dataset`.
ls_subset = random.sample(ls_all_invest_ids, 10)
create_dataset(ls_subset, dir_pf_dataset)

Create and save new dataset with 10 samples...
