In [None]:
import pandas as pd
import numpy as np

import datetime
from dateutil.relativedelta import relativedelta

from tqdm import tqdm

from joblib import delayed, Parallel 

import sys
sys.path.append('../')
from utils import CHARAS_LIST

import warnings
warnings.filterwarnings('ignore')

In [None]:
class modelBase:
    """Shared scaffold for factor models evaluated on a train / valid / test split.

    Concrete models are expected to supply `calBeta`, `calFactor` and
    `cal_delayed_Factor`; the versions defined here are placeholders that
    return None.
    """

    def __init__(self, name):
        self.name = name
        self.train_idx = 0

        # Default [start, end] sample windows, chosen according to the original paper.
        self.train_period = [19570101, 19741231]
        self.valid_period = [19750101, 19861231]
        self.test_period = [19870101, 19871231]

    def train_model(self):
        """Training hook; the base implementation does nothing and returns None."""
        return None

    def calBeta(self, month):
        """
        Beta (loadings) for the given month. To be provided by each concrete model.
        -> expected return: np.array, dim = (N, K)
        """

    def calFactor(self, month):
        """
        Factor realisation for the given month. To be provided by each concrete model.
        -> expected return: np.array, dim = (K, 1)
        """

    def cal_delayed_Factor(self, month):
        """
        Lagged factor for the given month, i.e. the average of the factors up to t-1.
        To be provided by each concrete model.
        -> expected return: np.array, dim = (K, 1)
        """

    def inference(self, month):
        """
        Return Beta(month) @ Factor(month).

        Only the lower bound of the test period is checked here.
        """
        period_start = self.test_period[0]
        assert period_start <= month, f"Month error, {month} is not in test period {self.test_period}"

        # Factor is evaluated before beta, same order as the hooks were always called in.
        mon_factor = self.calFactor(month)
        mon_beta = self.calBeta(month)

        n_loadings, n_factors = mon_beta.shape[1], mon_factor.shape[0]
        assert n_loadings == n_factors, f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}"

        # R_{N*1} = Beta_{N*K} @ F_{K*1}
        return mon_beta @ mon_factor

    def predict(self, month):
        """
        Return Beta(month) @ lagged-average factor.

        `month` must lie inside the current test period (both bounds inclusive).
        """
        period_start, period_end = self.test_period[0], self.test_period[1]
        assert period_start <= month <= period_end, f"Month error, {month} is not in test period {self.test_period}"

        lag_factor = self.cal_delayed_Factor(month)
        mon_beta = self.calBeta(month)

        n_loadings, n_factors = mon_beta.shape[1], lag_factor.shape[0]
        assert n_loadings == n_factors, f"Dimension mismatch between lag_factor: {lag_factor.shape} and mon_beta: {mon_beta.shape}"

        # R_{N*1} = Beta_{N*K} @ lag_F_avg{K*1}
        return mon_beta @ lag_factor

    def refit(self):
        """
        Roll the sample windows forward by 10000 (one year, if the bounds are
        YYYYMMDD integers): the training window only has its end extended,
        while the validation and test windows are shifted as a whole.
        """
        step = 10000
        self.train_period[1] += step
        self.valid_period = [bound + step for bound in self.valid_period]
        self.test_period = [bound + step for bound in self.test_period]

In [None]:
from ipca import InstrumentedPCA

class IPCA(modelBase):
    """
    Instrumented PCA model wrapping `ipca.InstrumentedPCA`.

    The constructor loads the characteristics and portfolio-return pickles;
    `train_model` fits the estimator and stores it on `self.regr`;
    `inference` / `predict` return the estimator's predictions for a month.

    Parameters
    ----------
    K : int
        Number of latent factors passed to the estimator.
    portfolio : bool, default True
        Stored on the instance; not used by the code in this class.
    """

    def __init__(self, K, portfolio=True):
        super(IPCA, self).__init__(f'IPCA_{K}')
        self.K = K
        self.portfolio = portfolio
        self.__prepare_data()

    @staticmethod
    def _to_panel(frame):
        """Move the row index into an 'index' column, then index by ('index', 'DATE') and sort."""
        return frame.reset_index().set_index(['index', 'DATE']).sort_index()

    def __prepare_data(self):
        """
        Load the pickled inputs and build the panel used for fitting.

        Sets `self.p_charas` (characteristics plus a zero-filled 'p_ret'
        placeholder column) and `self.train_p_charas` (the rows dated up to
        `self.test_period[1]`, with 'p_ret' filled from the returns pickle).
        """
        # NOTE(review): pd.read_pickle executes arbitrary code on load; only use trusted files.
        self.p_charas = pd.read_pickle('../data/p_charas.pkl')
        portfolio_ret = pd.read_pickle('../data/portfolio_ret.pkl')
        self.p_charas['p_ret'] = np.zeros(self.p_charas.shape[0])

        # NOTE(review): the cut-off is test_period[1], not train_period[1], so the
        # fitting panel contains test-period rows — confirm this is intended.
        cutoff = self.test_period[1]
        self.train_p_charas = self._to_panel(
            self.p_charas.loc[self.p_charas.DATE <= cutoff].copy(deep=False)
        )

        # Loop-invariant slice of the returns, hoisted out of the loop.
        ret_in_sample = portfolio_ret.loc[portfolio_ret.DATE <= cutoff]
        # CHARAS_LIST is imported from utils at the top of the notebook (the name
        # `charas` used previously is not defined anywhere in the notebook).
        # Presumably it holds the column names of portfolio_ret — TODO confirm.
        for chara in CHARAS_LIST:
            self.train_p_charas.loc[f'p_{chara}', 'p_ret'] = ret_in_sample[chara].values

    def train_model(self):
        """Fit the estimator on the training panel; sets `regr`, `Gamma` and `Factors`."""
        y = self.train_p_charas['p_ret']
        X = self.train_p_charas.drop('p_ret', axis=1)

        # Use the requested number of factors instead of a hard-coded 1.
        self.regr = InstrumentedPCA(n_factors=self.K, intercept=True)
        self.regr = self.regr.fit(X=X, y=y)
        self.Gamma, self.Factors = self.regr.get_factors(label_ind=False)

    def inference(self, month):
        """
        Predict for the rows dated exactly `month`, using the fitted estimator
        with `mean_factor=True`. Requires `train_model()` to have been called.
        """
        X_pred = self._to_panel(
            self.p_charas.loc[self.p_charas.DATE == month].copy(deep=False)
        )
        # 'p_ret' is the placeholder target column; the estimator was fitted
        # without it, so it must not be passed as a characteristic.
        X_pred = X_pred.drop(columns='p_ret')
        return self.regr.predict(X_pred, mean_factor=True)  # (N, 1)

    def predict(self, month):
        """
        Predict from the per-'index' average of all rows dated before `month`,
        labelled with the last such date, using `mean_factor=True`.
        Requires `train_model()` to have been called.

        Raises
        ------
        ValueError
            If there are no rows dated before `month`.
        """
        history = self.p_charas.loc[self.p_charas.DATE < month]
        if history.empty:
            raise ValueError(f"No observations dated before {month}")

        lag_X = history.copy(deep=False).reset_index().groupby('index').mean()
        # .iloc[-1] is an explicit positional lookup; a bare [-1] is a label
        # lookup on integer indexes and a deprecated fallback otherwise.
        lag_X['DATE'] = history.DATE.drop_duplicates().iloc[-1]
        lag_X = lag_X.reset_index().set_index(['index', 'DATE']).sort_index()
        # Drop the placeholder target column, as in `inference`.
        lag_X = lag_X.drop(columns='p_ret')
        return self.regr.predict(lag_X, mean_factor=True)  # (N, 1)

In [None]:
# Build an IPCA model with K=1; the constructor also loads the pickled inputs.
ipca_1 = IPCA(1)

In [None]:
# Notebook-level copy of the characteristics frame for ad-hoc inspection
# (same pickle path that IPCA reads in its constructor).
p_charas = pd.read_pickle('../data/p_charas.pkl')

In [None]:
# Date key used by the exploratory cells below; looks like a YYYYMMDD integer,
# matching the period bounds in modelBase — TODO confirm this date exists in the data.
month = 19870130

In [None]:
# Display the rows dated `month` with the 'acc' column removed (nothing is stored).
p_charas.loc[p_charas.DATE == month].drop('acc', axis=1)

In [None]:
# NOTE(review): `lag_X` is not defined in any visible notebook-level cell, and
# `ipca_1.regr` is only created by `ipca_1.train_model()`, which is called in a
# later cell — this cell relies on hidden kernel state and fails on a fresh run.
ipca_1.regr.predict(lag_X, mean_factor=True).shape

In [None]:
# NOTE(review): `X_pred` is never assigned at notebook level in the visible cells
# (it only appears as a local inside the IPCA class) — leftover from interactive debugging?
X_pred

In [None]:
# Fit the model; this sets ipca_1.regr, ipca_1.Gamma and ipca_1.Factors.
ipca_1.train_model()

In [None]:
# Reads the same pickle path as the earlier `p_charas` cell, under a second name.
p_chara = pd.read_pickle('../data/p_charas.pkl')

In [None]:
# Display the upper bound of the model's current test period.
ipca_1.test_period[1]

In [None]:
# Rows dated 19871231 (the default upper bound of the test period set in modelBase).
X = p_chara.loc[p_chara.DATE==19871231]

In [None]:
# NOTE(review): neither `portfolio_ret` nor `charas` is defined at notebook level in
# the visible cells (`portfolio_ret` is only a local inside the IPCA class, and the
# imported name is CHARAS_LIST) — this cell relies on hidden kernel state.
ipca_1.regr.predict(portfolio_ret.loc[portfolio_ret.DATE == 19871231][charas].values).shape