In [1]:
import pandas as pd
import numpy as np

import datetime
from dateutil.relativedelta import relativedelta

from tqdm import tqdm

from joblib import delayed, Parallel 

import sys
sys.path.append('../')
from utils import charas

import warnings
warnings.filterwarnings('ignore')

In [143]:
class modelBase:
    def __init__(self, name):
        self.name = name
        self.train_idx = 0
        
        # initial train, valid and test periods are default accroding to original paper
        self.train_period = [19570101, 19741231]
        self.valid_period = [19750101, 19861231]
        self.test_period  = [19870101, 19871231]
    
    
    def train_model(self):
        # print('trained')
        pass

    
    def calBeta(self, month):
        """
        Calculate specific month's beta. Should be specified by different models
        -> return np.array, dim = (N, K)
        """
        # return np.zeros([13000, 3])
        pass
    
        
    def calFactor(self, month):
        """
        Calculate specific month's factor. Should be specified by different models
        -> return np.array, dim = (K, 1)
        """
        # return np.zeros([3, 1])
        pass    
       
    
    def cal_delayed_Factor(self, month):
        """
        Calculate delayed month's factor, i.e. mean average of factors up to t-1. Should be specified by different models
        -> return np.array, dim = (K, 1)
        """
        pass
    
    
    def inference(self, month):       
        assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}"
        
        mon_factor, mon_beta = self.calFactor(month), self.calBeta(month)
        
        assert mon_beta.shape[1] == mon_factor.shape[0], f"Dimension mismatch between mon_factor: {mon_factor.shape} and mon_beta: {mon_beta.shape}"
        
        # R_{N*1} = Beta_{N*K} @ F_{K*1}
        return mon_beta @ mon_factor
        
    
    def predict(self, month):
        assert month >= self.test_period[0] and month <= self.test_period[1], f"Month error, {month} is not in test period {self.test_period}"
        
        lag_factor, mon_beta = self.cal_delayed_Factor(month), self.calBeta(month)
        
        assert mon_beta.shape[1] == lag_factor.shape[0], f"Dimension mismatch between lag_factor: {lag_factor.shape} and mon_beta: {mon_beta.shape}"
        
        # R_{N*1} = Beta_{N*K} @ lag_F_avg{K*1}  
        return mon_beta @ lag_factor
    
    
    def refit(self):
        self.train_period[1] += 10000
        self.valid_period = (pd.Series(self.valid_period) + 10000).to_list()
        self.test_period = (pd.Series(self.test_period) + 10000).to_list()
        

In [18]:
def stock_R_matrix(start_date, end_date):
    R_matrix = pd.read_pickle('../data/stock_R_matrix.pkl')
    return R_matrix.T.loc[start_date: end_date].T

def portfolio_R_matrix(start_date, end_date):
    portfolio_ret = pd.read_pickle('../data/portfolio_ret.pkl')
    return portfolio_ret.loc[(portfolio_ret['DATE'] >= start_date) & (portfolio_ret['DATE'] <= end_date)].set_index('DATE').T

In [19]:
class PCA(modelBase):
    def __init__(self, K, portfolio=True):
        super(PCA, self).__init__(f'PCA_{K}')
        self.K = K
        self.portfolio = portfolio
        
        
    def __col_de_mean(self, matrix):
        return (matrix - matrix.mean()).fillna(0)
    
        
    def inference(self, month):
        if self.portfolio:
            r_matrix = self.__col_de_mean(portfolio_R_matrix(self.train_period[0], month)).astype(np.float32)
        else:
            r_matrix = self.__col_de_mean(stock_R_matrix(self.train_period[0], month))   
        u, sigma, vt = np.linalg.svd(r_matrix)
        # B_{N*K}
        B = u[:, :self.K]
        # F_{K*1}
        F = np.diag(sigma[:self.K]) @ vt[:self.K, -1]
        return B @ F
        
            
    def predict(self, month):
        if self.portfolio:
            r_matrix = self.__col_de_mean(portfolio_R_matrix(self.train_period[0], month)).astype(np.float32)
        else:
            r_matrix = self.__col_de_mean(stock_R_matrix(self.train_period[0], month))
        u, sigma, vt = np.linalg.svd(r_matrix)
        # B_{N*K}
        B = u[:, :self.K]
        # F_{K*1}
        lag_F = np.diag(sigma[:self.K]) @ vt[:self.K, :-1]
        return B @ np.mean(lag_F, axis=1)

In [23]:
pca_1 = PCA(1)

In [47]:
ff5 = pd.read_csv('../data/ff5.csv', index_col=0)
UMD = pd.read_csv('../data/UMD.csv', index_col=0)
UMD.columns = ['UMD']
FFf = pd.concat([ff5, UMD.loc[196307:]], axis=1)
FFf = FFf[['Mkt-RF', 'SMB', 'HML', 'CMA', 'RMW', 'UMD']]

In [88]:
FFf[FFf.columns[:3]].loc[20160101//100].values

array([-5.77, -3.48,  2.09])

201512

In [148]:
import statsmodels.api as sm

class FF(modelBase):
    def __init__(self, K, portfolio=True):
        super(FF, self).__init__(f'FF_{K}')
        self.K = K
        self.portfolio = portfolio
        self.train_period[0] = 19630731 # ff5 data from FF website is only available from 196307
        self.__prepare_FFf()
        
    
    def __prepare_FFf(self):
        ff5 = pd.read_csv('../data/ff5.csv', index_col=0)
        UMD = pd.read_csv('../data/UMD.csv', index_col=0)
        UMD.columns = ['UMD']
        FFf = pd.concat([ff5, UMD.loc[196307:]], axis=1)
        self.fname = ['Mkt-RF', 'SMB', 'HML', 'CMA', 'RMW', 'UMD']
        self.FFf = FFf[self.fname]
        self.portfolio_ret = pd.read_pickle('../data/portfolio_ret.pkl')
        self.portfolio_ret['DATE'] = self.portfolio_ret['DATE'].apply(lambda x: x//100)
        
    
    def train(self):
        self.beta_matrix = []
        X = self.FFf[self.fname[:self.K]].loc[self.train_period[0]//100:self.train_period[1]//100]
        for col in charas:
            y = self.portfolio_ret.set_index('DATE')[col].loc[self.train_period[0]//100:self.train_period[1]//100]
            model = sm.OLS(y.values, X.values).fit()
            self.beta_matrix.append(model.params)
        self.beta_matrix = pd.DataFrame(self.beta_matrix, columns=self.fname[:self.K], index=charas)
    
        
    def calBeta(self, month):
        return self.beta_matrix # N * K
        
            
    def calFactor(self, month):
        return self.FFf[self.fname[:self.K]].loc[month//100] # K * 1
        
        
    def cal_delayed_Factor(self, month):
        last_mon = int(str(pd.to_datetime(str(month)) - relativedelta(months=1)).split(' ')[0].replace('-', '')[:-2])
        return self.FFf[self.fname[:self.K]].loc[self.valid_period[0]:last_mon].mean()
        


In [151]:
ff3.FFf[ff3.fname[:3]].loc[]

Unnamed: 0,Mkt-RF,SMB,HML
196307,-0.39,-0.41,-0.97
196308,5.07,-0.80,1.80
196309,-1.57,-0.52,0.13
196310,2.53,-1.39,-0.10
196311,-0.85,-0.88,1.75
...,...,...,...
201608,0.49,1.70,3.13
201609,0.25,1.86,-1.23
201610,-2.02,-4.04,4.12
201611,4.86,7.04,8.19


In [146]:
ff3 = FF(3) 

In [149]:
ff3.cal_delayed_Factor(19870201)

Mkt-RF   NaN
SMB      NaN
HML      NaN
dtype: float64

In [141]:
ff3.FFf[ff3.fname[:ff3.K]].loc[19870201//100]

Mkt-RF    4.39
SMB       3.40
HML      -5.91
Name: 198702, dtype: float64

In [129]:
ff3.train()

In [131]:
ff3.calFactor(19870201)

In [28]:
mon_list = pd.read_pickle('../data/mon_list.pkl')

In [29]:
mon_list.to_csv()

0      19570329
1      19570430
2      19570531
3      19570628
4      19570731
         ...   
713    20160831
714    20160930
715    20161031
716    20161130
717    20161230
Name: DATE, Length: 718, dtype: int64

In [24]:
def model_inference_and_predict(model):
    mon_list = pd.read_pickle('../data/mon_list.pkl')
    test_mons = mon_list.loc[mon_list >= model.test_period[0]]
    inference_result = []
    predict_result = []
    T_bar = tqdm(test_mons.groupby(test_mons.apply(lambda x: x//10000)), colour='red', desc=f'{model.name} Inferencing & Predicting')
    
    for g in T_bar: # rolling train
        T_bar.set_postfix({'Year': g[0]})
        model.train()
        
        for m in g[1].to_list():
            inference_result.append(model.inference(m))
            predict_result.append(model.predict(m))
        # model refit (change train period and valid period)
        model.refit()

    inference_result = pd.DataFrame(inference_result, index=test_mons, columns=charas)
    inference_result.to_csv(f'../results/inference/{model.name}_inference.csv')
    
    predict_result = pd.DataFrame(predict_result, index=test_mons, columns=charas)
    predict_result.to_csv(f'../results/predict/{model.name}_predict.csv')

In [25]:
model_inference_and_predict(pca_1)

PCA_1 Inferencing & Predicting: 100%|[31m██████████[0m| 30/30 [00:24<00:00,  1.24it/s, Year=2016]


In [15]:
model_predict(pca_1)

PCA_1 Predicting: 0it [00:00, ?it/s]


In [16]:
print('A')

A
