In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.api import VAR

from sklearn.model_selection import train_test_split

import pytorch_lightning as pl
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer

import warnings
# Suppress every Python warning raised after this point; remove this line
# while debugging so deprecation and convergence warnings stay visible.
warnings.filterwarnings('ignore')

class V_AR():
    r'''
    Vector Autoregressions (VAR)
    Y_t = \sum_{i=1}^{p} A_i * Y_{t-i} + Res
    Y: T * K
    T: number of time steps
    K: number of features

    '''
    def __init__(self, data, p=None):
        '''
        Fit a VAR model on ``data`` at construction time.

        arg data: features (T * K), handed unchanged to statsmodels' VAR
            p: max lag order, forwarded as ``maxlags`` to ``VAR.fit``.
               None (default) calls ``fit()`` without arguments, i.e. the
               library's default lag selection.

        '''
        self.data = data
        model = VAR(data)
        # Only pass maxlags when the caller asked for it, so the default path
        # is exactly the argument-less fit().
        self.results = model.fit() if p is None else model.fit(maxlags=p)

    def residual(self):
        '''
        transform features to its VAR residuals
        return: VAR residuals (data minus the fitted values)

        With pandas inputs the subtraction aligns on the index, so any row of
        ``data`` that has no fitted value comes out as NaN (presumably the
        first ``lag order`` rows -- confirm against statsmodels).

        '''
        fit = self.results.fittedvalues
        return self.data - fit

In [None]:
class data_process():
    '''
    Per-investment feature preprocessing: min-max scaling, VAR residuals
    and PCA.

    Every transform is applied separately to the feature rows of each
    investment id in ``self.s``; the pieces are stacked under an
    'investment_id' index level and the target column is joined back on.

    '''
    def __init__(self, train_1000_address, inv_list_address, good_inv_list_address):
        '''
        arg train_1000_address: pickle of a DataFrame with a 'target' column;
                every other column is treated as a feature
            inv_list_address: pickle of the candidate investment ids
            good_inv_list_address: pickle of the ids that may be kept

        '''
        # NOTE(security): pd.read_pickle can execute arbitrary code while
        # loading; only point these paths at files you produced yourself.
        inv = pd.read_pickle(inv_list_address)
        inv_good = pd.read_pickle(good_inv_list_address)
        # Order of `inv` is preserved; only ids also found in `inv_good` stay.
        self.s = [i for i in inv if i in inv_good]
        df = pd.read_pickle(train_1000_address)
        self.target = df['target']
        self.feature = df.drop('target', axis=1)

    def _per_investment(self, transform):
        '''
        Apply ``transform`` to each investment's NaN-free feature rows.

        arg transform: callable taking one investment's feature DataFrame and
                returning a DataFrame
        return: the transformed pieces concatenated under an 'investment_id'
                level, joined with the target

        '''
        # assumes self.feature.loc[id] selects that investment's rows,
        # i.e. investment id is the outer index level -- TODO confirm
        frames = [transform(self.feature.loc[x].dropna()) for x in self.s]
        stacked = pd.concat(frames, keys=list(self.s), names=['investment_id'])
        return stacked.join(self.target)

    def scale(self):
        '''
        Min-max scale the features of every investment independently.

        return: scaled features (columns f_0 ... f_{K-1}) joined with target

        '''
        scaler = MinMaxScaler()

        def _scale_one(f):
            # Name columns from the actual feature count instead of assuming
            # exactly 300 features.
            columns = ['f_{}'.format(i) for i in range(f.shape[1])]
            return pd.DataFrame(scaler.fit_transform(f), index=f.index, columns=columns)

        return self._per_investment(_scale_one)

    def ar(self):
        '''
        Replace the features of every investment by their VAR residuals.

        return: residuals joined with target

        '''
        return self._per_investment(lambda f: V_AR(f).residual())

    def pca(self, var_threshold=0.9, max_components=150):
        '''
        Project the features of every investment on its leading principal
        components.

        arg var_threshold: keep the smallest number of components whose
                cumulative explained-variance ratio exceeds this value
            max_components: upper bound on the number of components examined
        return: components (columns fpca_0 ...) joined with target; the
                number of components may differ between investments, in which
                case the concatenation leaves NaN in the missing columns

        '''
        def _reduce(f):
            # n_components may exceed neither the number of rows nor the
            # number of features.
            n = min(max_components, f.shape[0], f.shape[1])
            probe = PCA(n_components=n)
            probe.fit(f)
            cum = np.cumsum(probe.explained_variance_ratio_)
            above = np.where(cum > var_threshold)[0]
            # above[0] is a zero-based position, so the component count is
            # one more; fall back to all n components if the threshold is
            # never reached.
            n_comp = int(above[0]) + 1 if above.size else n
            reducer = PCA(n_components=n_comp)
            columns = ['fpca_{}'.format(i) for i in range(n_comp)]
            return pd.DataFrame(reducer.fit_transform(f), index=f.index, columns=columns)

        return self._per_investment(_reduce)

In [None]:
class dataset():
    '''
    Build a dense (investment_id, time_id) panel from a pickled feature
    frame and convert it into pytorch-forecasting TimeSeriesDataSet objects.

    '''
    # Defaults used by data_generate() when no location / tag is supplied.
    DEFAULT_SAVE_DIR = 'C:/Users/miaoy/Desktop/11785/HwData/project'
    DEFAULT_TAG = '246_samples_arpca'

    def __init__(self, df_address, inv_list_address, start_time, end_time=1220):
        '''
        arg df_address: pickle of the feature/target DataFrame
                (assumed to be indexed by (investment_id, time_id) so it can
                be joined on the grid below -- TODO confirm)
            inv_list_address: pickle of the investment ids to keep
            start_time: first time_id of the panel (inclusive)
            end_time: last time_id of the panel (exclusive), default 1220

        '''
        # NOTE(security): pd.read_pickle can execute arbitrary code while
        # loading; only use it on trusted files.
        f = pd.read_pickle(df_address)
        inv_id = pd.read_pickle(inv_list_address)
        index = pd.MultiIndex.from_product(
            [inv_id, range(start_time, end_time)],
            names=['investment_id', 'time_id'],
        )
        # Left-join on the full id x time grid so every combination has a
        # row; combinations absent from `f` become NaN and are then set to 0.
        df = pd.DataFrame(index=index).join(f)
        df = df.fillna(0).reset_index()
        df['investment_id'] = df['investment_id'].astype(str)
        self.df = df.sort_values(by=['time_id'])
        # To cache the panel, pickle it here, e.g. self.df.to_pickle(<path>).

    def data_generate(self, max_encoder_length=14, max_prediction_length=3,
                      save_dir=None, tag=None):
        '''
        Split the panel chronologically (70% / 20% / 10%), build the train,
        validation and test TimeSeriesDataSet objects and save them to disk.

        arg max_encoder_length: longest history window (minimum is half of it)
            max_prediction_length: longest forecast horizon
            save_dir: directory the three '.pf' files are written to;
                None uses DEFAULT_SAVE_DIR
            tag: suffix used in the file names; None uses DEFAULT_TAG
        return: (train_dataset, val_dataset, test_dataset)

        '''
        save_dir = self.DEFAULT_SAVE_DIR if save_dir is None else save_dir
        tag = self.DEFAULT_TAG if tag is None else tag

        # self.df is sorted by time_id, so unshuffled splits are chronological:
        # 10% test, then 2/9 of the remaining 90% (= 20% overall) validation.
        df_train, df_test = train_test_split(self.df, test_size=0.1, shuffle=False)
        df_train, df_val = train_test_split(df_train, test_size=2/9, shuffle=False)

        # Every column that is not an identifier or the target is a feature.
        reserved = ('investment_id', 'time_id', 'target')
        known_reals = [c for c in self.df.columns if c not in reserved]

        train_dataset = TimeSeriesDataSet(
            df_train,
            group_ids=["investment_id"],
            target="target",
            time_idx="time_id",
            min_encoder_length=max_encoder_length // 2,
            max_encoder_length=max_encoder_length,
            min_prediction_length=1,
            max_prediction_length=max_prediction_length,
            static_categoricals=["investment_id"],
            static_reals=[],
            time_varying_known_categoricals=[],
            time_varying_known_reals=known_reals,
            time_varying_unknown_categoricals=[],
            time_varying_unknown_reals=['target'],
            target_normalizer=GroupNormalizer(  # normalize the target for each investment_id along corresponding time_idx
                groups=["investment_id"],
                transformation=None  # NOTE: do not use softplus or relu for encoder normalization with DeepAR
            ),
            # Add additional features
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
            allow_missing_timesteps=True,
        )
        # Validation / test reuse the encoders and normalizers fitted on train.
        val_dataset = TimeSeriesDataSet.from_dataset(train_dataset, df_val, predict=True, stop_randomization=True)
        test_dataset = TimeSeriesDataSet.from_dataset(train_dataset, df_test, predict=True, stop_randomization=True)

        # Save dataset to accelerate
        train_dataset.save('{}/pf_train_{}.pf'.format(save_dir, tag))
        val_dataset.save('{}/pf_val_{}.pf'.format(save_dir, tag))
        test_dataset.save('{}/pf_test_{}.pf'.format(save_dir, tag))
        return train_dataset, val_dataset, test_dataset