In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import sys

import random
import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_lightning.loggers import WandbLogger

from sklearn.model_selection import train_test_split

def create_dir(dir_to_add):
    """Create the directory `dir_to_add`, including missing parents, if nothing exists there yet.

    Args:
        dir_to_add: Path of the directory to create.

    Returns:
        None. The call is a no-op when the path already exists.
    """
    if os.path.exists(dir_to_add):
        return
    # exist_ok=True closes the window between the check above and the creation:
    # if another process (e.g. a parallel worker) creates the directory in
    # between, we no longer fail with FileExistsError.
    os.makedirs(dir_to_add, exist_ok=True)

# Fix the global seed once, up front, so repeated runs start from the same RNG state
# (Lightning reports this as "Global seed set to 11785").
pl.seed_everything(11785)


# Notebook-wide configuration.
NUM_WORKERS = 8                                                  # Use 4 for AWS
# Project root. Point the CMU11785_PROJECT_DIR environment variable at your own
# checkout instead of editing this cell; the original path remains the fallback
# when the variable is unset or empty.
DIR_PROJECT = (
    os.environ.get('CMU11785_PROJECT_DIR')
    or '/media/user/12TB1/HanLi/GitHub/CMU11785-project/'
)
DIR_TRAINED = os.path.join(DIR_PROJECT, 'local_trained')         # Don't change
DIR_DATA = os.path.join(DIR_PROJECT, 'src/data')                 # Don't change
DIR_LOCAL_DATA = os.path.join(DIR_PROJECT, 'local_data')         # Don't change
DIR_BYID = os.path.join(DIR_LOCAL_DATA, 'content/databyid')      # Might need to change

# Create DIR_TRAINED and DIR_LOCAL_DATA up front; create_dir does nothing for a
# path that already exists, so re-running this cell is harmless.
create_dir(DIR_TRAINED)
create_dir(DIR_LOCAL_DATA)


Global seed set to 11785


In [2]:
# Function to create timeseries dataset
# TODO: Miao to add PCA and AR pre-processing steps
def create_dataset(
    ls_train_val_ids,
    ls_test_ids, # Should be a subset of ls_train_val_ids
    starting_time_id=400, # Use data after 400 time idx
    max_prediction_length=3,
    max_encoder_length=14,
    save_path=os.path.join(DIR_LOCAL_DATA, 'tft_246'),
):
    """Build train / validation / test `TimeSeriesDataSet`s from the per-investment .npy files.

    Train : Val : Test = 7 : 2 : 1. The rows are ordered by `time_id`; the last
    10% of rows become the test frame and 2/9 of the remaining 90% (i.e. 20% of
    the total) become the validation frame. Neither split is shuffled.

    Args:
        ls_train_val_ids: Investment ids to load. For each id the files
            `feats/<id>.npy` (300 feature columns) and `target/<id>.npy`
            under `DIR_BYID` are read.
        ls_test_ids: Investment ids kept in the test frame. Must be a subset
            of `ls_train_val_ids`.
        starting_time_id: Only rows with `time_id >= starting_time_id` are used.
        max_prediction_length: Maximum decoder (prediction) length.
        max_encoder_length: Maximum encoder length; the minimum encoder length
            is half of it (integer division).
        save_path: Directory the three datasets are saved into, or `None` to
            skip saving.

    Returns:
        Tuple `(train_dataset, val_dataset, test_dataset)`.

    Raises:
        AssertionError: If `ls_test_ids` is not a subset of `ls_train_val_ids`.
        ValueError: If `ls_train_val_ids` is empty.
    """
    assert set(ls_test_ids).issubset(set(ls_train_val_ids))
    print(f"Reading raw data {len(ls_train_val_ids)} train/val samples and {len(ls_test_ids)} test samples...")
    f_cols = [f"f_{i}" for i in range(300)]

    # One frame per investment: target column first, then the features, then the id.
    # The row position inside each .npy file becomes the time index below.
    frames = []
    for investment_id in ls_train_val_ids:
        df_feats = pd.DataFrame(
            np.load(os.path.join(DIR_BYID, f'feats/{investment_id}.npy')), columns=f_cols
        )
        df_target = pd.DataFrame(
            np.load(os.path.join(DIR_BYID, f'target/{investment_id}.npy')), columns=['target']
        )
        df_feats['investment_id'] = investment_id
        frames.append(pd.concat([df_target, df_feats], axis=1))

    if not frames:
        # pd.concat would raise the same exception type with a less helpful message.
        raise ValueError("ls_train_val_ids is empty: no investment data to load")

    # The per-investment row index is turned into the `time_id` column.
    df = pd.concat(frames).reset_index().rename(columns={'index': 'time_id'})
    df = df.sort_values(by=['time_id']) # sort by time before splitting
    df = df.loc[df['time_id'] >= starting_time_id]

    # Chronological split by row position (no shuffling).
    # NOTE(review): because the cut is made on rows, a single time_id can end up
    # on both sides of a boundary - confirm this is acceptable.
    df_train, df_test = train_test_split(df, test_size=0.1, shuffle=False)
    df_train, df_val = train_test_split(df_train, test_size=2/9, shuffle=False)
    df_test = df_test.loc[df_test['investment_id'].isin(ls_test_ids)].reset_index(drop=True)

    # The id is used as a categorical, so cast it to str. `assign` returns new
    # frames instead of writing into the slices produced above, which avoids the
    # chained-assignment warnings that the notebook's global warning filter hides.
    df_train, df_val, df_test = (
        frame.assign(investment_id=frame['investment_id'].astype(str))
        for frame in (df_train, df_val, df_test)
    )

    print('Dataframes read complete, creating TimeSeriesDataSet...')

    # create the dataset from the pandas dataframe
    train_dataset = TimeSeriesDataSet(
        df_train,
        group_ids=["investment_id"],
        target="target",
        time_idx="time_id",
        min_encoder_length=max_encoder_length // 2,
        max_encoder_length=max_encoder_length,
        min_prediction_length=1,
        max_prediction_length=max_prediction_length,
        static_categoricals=["investment_id"],
        static_reals=[],
        time_varying_known_categoricals=[],
        time_varying_known_reals=f_cols,
        time_varying_unknown_categoricals=[],
        time_varying_unknown_reals=['target'],
        # Normalize the target separately for each investment_id.
        target_normalizer=GroupNormalizer(
            groups=["investment_id"],
            # No extra transformation; "softplus" was tried and left disabled.
            # (Original author's note: do not use softplus or relu for encoder
            # normalization with DeepAR.)
            transformation=None,
        ),
        # Add additional features
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
    )
    # Validation and test sets reuse the training set's configuration and are
    # built in prediction mode with randomization turned off.
    val_dataset = TimeSeriesDataSet.from_dataset(train_dataset, df_val, predict=True, stop_randomization=True)
    test_dataset = TimeSeriesDataSet.from_dataset(train_dataset, df_test, predict=True, stop_randomization=True)

    if save_path is not None:
        print(f"Save datasets with {len(ls_train_val_ids)} train/val samples and {len(ls_test_ids)} test samples...")
        # Save dataset so we can use it next time
        create_dir(save_path)
        train_dataset.save(os.path.join(save_path, f'tft_train_{len(ls_train_val_ids)}_samples.pf'))
        val_dataset.save(os.path.join(save_path, f'tft_val_{len(ls_train_val_ids)}_samples.pf'))
        test_dataset.save(os.path.join(save_path, f'tft_test_{len(ls_test_ids)}_samples.pf'))

    return train_dataset, val_dataset, test_dataset


# Read subset list: the investment ids to model, stored as a pickle under DIR_DATA.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
ls_train_val_ids = pd.read_pickle(os.path.join(DIR_DATA, 'test_246_ids.pkl'))
# Draw 200 of those ids without replacement for the test subset (random.sample
# raises ValueError if fewer than 200 ids are available).
# NOTE(review): presumably the draw is only repeatable when this cell runs right
# after the seeding cell; re-running it on its own picks a different subset - confirm.
ls_test_ids = random.sample(ls_train_val_ids, 200)
# Build the datasets with create_dataset's defaults, which also saves them
# under DIR_LOCAL_DATA/tft_246.
train_dataset, val_dataset, test_dataset = create_dataset(ls_train_val_ids, ls_test_ids)


Create and save new dataset with 246 train/val samples and 200 test samples...
Dataframes read complete, creating TimeSeriesDataSet...
