In [1]:
import pandas as pd
import numpy as np

import datetime
from dateutil.relativedelta import relativedelta

from tqdm import tqdm

from joblib import delayed, Parallel 

import sys
sys.path.append('../')
from utils import charas

import warnings
warnings.filterwarnings('ignore')

In [2]:
class modelBase:
    """Shared scaffold for the factor models defined in this notebook.

    Stores the model name and the train / valid / test windows (integer
    dates that look like YYYYMMDD), and combines a month's betas with a
    month's factors into returns. Subclasses are expected to override
    ``calBeta``, ``calFactor`` and ``cal_delayed_Factor``; the versions here
    are placeholders that return ``None``.
    """

    def __init__(self, name):
        self.name = name
        self.train_idx = 0

        # Default train / valid / test windows, chosen according to the original paper.
        self.train_period = [19570101, 19741231]
        self.valid_period = [19750101, 19861231]
        self.test_period = [19870101, 19871231]

    def train_model(self):
        """Training hook; a no-op in the base class."""
        return None

    def calBeta(self, month):
        """
        Beta for the given month; to be provided by each concrete model.
        -> return np.array, dim = (N, K)
        """
        return None

    def calFactor(self, month):
        """
        Factor for the given month; to be provided by each concrete model.
        -> return np.array, dim = (K, 1)
        """
        return None

    def cal_delayed_Factor(self, month):
        """
        Lagged factor for the given month, i.e. the average of factors up to t-1;
        to be provided by each concrete model.
        -> return np.array, dim = (K, 1)
        """
        return None

    def inference(self, month):
        """Return Beta_{N*K} @ F_{K*1} for `month` (only the lower test bound is checked)."""
        assert month >= self.test_period[0], f"Month error, {month} is not in test period {self.test_period}"

        # Factor is computed before beta, matching the original call order.
        factor = self.calFactor(month)
        beta = self.calBeta(month)

        assert beta.shape[1] == factor.shape[0], f"Dimension mismatch between mon_factor: {factor.shape} and mon_beta: {beta.shape}"

        return beta @ factor

    def predict(self, month):
        """Return Beta_{N*K} @ lag_F_avg_{K*1} for a `month` inside the test window."""
        assert self.test_period[0] <= month <= self.test_period[1], f"Month error, {month} is not in test period {self.test_period}"

        lagged = self.cal_delayed_Factor(month)
        beta = self.calBeta(month)

        assert beta.shape[1] == lagged.shape[0], f"Dimension mismatch between lag_factor: {lagged.shape} and mon_beta: {beta.shape}"

        return beta @ lagged

    def refit(self):
        """Extend the training end by 10000 and shift the valid/test windows by 10000.

        With YYYYMMDD-style integers, adding 10000 moves a date one year forward.
        """
        self.train_period[1] = self.train_period[1] + 10000
        self.valid_period = [bound + 10000 for bound in self.valid_period]
        self.test_period = [bound + 10000 for bound in self.test_period]
        

In [3]:
def stock_R_matrix(start_date, end_date):
    """Load the stock return matrix and keep the columns labelled start_date..end_date.

    Reads '../data/stock_R_matrix.pkl' on every call and slices its columns by
    label, inclusive on both ends. Presumably rows are stocks and columns are
    dates - confirm against the code that writes the pickle.

    Parameters
    ----------
    start_date, end_date : column labels bounding the slice (inclusive).

    Returns
    -------
    pandas.DataFrame with the same rows and only the selected columns.
    """
    R_matrix = pd.read_pickle('../data/stock_R_matrix.pkl')
    # Slice the column axis directly: the former `.T.loc[a:b].T` copied the whole
    # matrix twice and upcast mixed column dtypes on the way.
    return R_matrix.loc[:, start_date:end_date]

def portfolio_R_matrix(start_date, end_date):
    """Load portfolio returns for start_date <= DATE <= end_date, transposed.

    Reads '../data/portfolio_ret.pkl', keeps the rows whose 'DATE' lies in the
    inclusive window, indexes them by 'DATE' and transposes, so the result has
    one column per date and one row per remaining column of the pickle.
    """
    table = pd.read_pickle('../data/portfolio_ret.pkl')
    not_too_early = table['DATE'] >= start_date
    not_too_late = table['DATE'] <= end_date
    window = table.loc[not_too_early & not_too_late]
    by_date = window.set_index('DATE')
    return by_date.T

In [None]:
class PCA(modelBase):
    """PCA benchmark: K-factor SVD of the demeaned return matrix.

    Parameters
    ----------
    K : number of singular components kept.
    portfolio : if True the return matrix comes from ``portfolio_R_matrix``
        (cast to float32), otherwise from ``stock_R_matrix``.

    ``inference`` and ``predict`` override the base-class versions and do not
    apply the base-class test-period assertions.
    """

    def __init__(self, K, portfolio=True):
        super(PCA, self).__init__(f'PCA_{K}')
        self.K = K
        self.portfolio = portfolio


    def __col_de_mean(self, matrix):
        """Subtract each column's mean and replace remaining NaN with 0."""
        return (matrix - matrix.mean()).fillna(0)


    def __decompose(self, month):
        """Return (B, F) from the SVD of returns between train start and `month`.

        B is the first K left singular vectors (N x K); F is
        diag(sigma[:K]) @ vt[:K] (K x T), one column per date in the window.
        """
        if self.portfolio:
            r_matrix = self.__col_de_mean(portfolio_R_matrix(self.train_period[0], month)).astype(np.float32)
        else:
            r_matrix = self.__col_de_mean(stock_R_matrix(self.train_period[0], month))
        # Reduced SVD: only the leading K components are used, so the full
        # N x N `u` of the default full_matrices=True is never needed.
        u, sigma, vt = np.linalg.svd(r_matrix, full_matrices=False)
        # B_{N*K}
        B = u[:, :self.K]
        # F_{K*T}
        F = np.diag(sigma[:self.K]) @ vt[:self.K, :]
        return B, F


    def inference(self, month):
        """Return B @ F for the last date in the window (length-N vector)."""
        B, F = self.__decompose(month)
        # F_{K*1}: factor realisation of the last column, i.e. `month` itself
        return B @ F[:, -1]


    def predict(self, month):
        """Return B @ mean of the factors over all dates before the last one."""
        B, F = self.__decompose(month)
        # exclude the last column so only lagged factors enter the average
        lag_F = F[:, :-1]
        return B @ np.mean(lag_F, axis=1)

In [None]:
# One-factor PCA model; `portfolio` keeps its default of True.
pca_1 = PCA(K=1)

In [4]:
import statsmodels.api as sm

class FF(modelBase):
    """Observable-factor benchmark using the first K of six Fama-French style factors.

    The factor order is ['Mkt-RF', 'SMB', 'HML', 'CMA', 'RMW', 'UMD']; betas are
    estimated once by OLS of each column named in ``charas`` of the portfolio
    return table on the first K factors.

    Parameters
    ----------
    K : number of leading factors used.
    portfolio : stored on the instance; not read elsewhere in this class.
    """

    def __init__(self, K, portfolio=True):
        super(FF, self).__init__(f'FF_{K}')
        self.K = K
        self.portfolio = portfolio
        self.train_period[0] = 19630731 # ff5 data from FF website is only available from 196307
        self.__prepare_FFf()


    def __prepare_FFf(self):
        """Load the factor files and the portfolio returns (DATE reduced to YYYYMM)."""
        ff5 = pd.read_csv('../data/ff5.csv', index_col=0)
        UMD = pd.read_csv('../data/UMD.csv', index_col=0)
        UMD.columns = ['UMD']
        # Align UMD with the ff5 start month (196307, derived from train_period[0]).
        FFf = pd.concat([ff5, UMD.loc[self.train_period[0]//100:]], axis=1)
        self.fname = ['Mkt-RF', 'SMB', 'HML', 'CMA', 'RMW', 'UMD']
        self.FFf = FFf[self.fname]
        self.portfolio_ret = pd.read_pickle('../data/portfolio_ret.pkl')
        # YYYYMMDD -> YYYYMM so returns can be matched to the monthly factor index
        self.portfolio_ret['DATE'] = self.portfolio_ret['DATE'] // 100


    def train(self):
        """Estimate betas by OLS over the training window and store them in ``beta_matrix``.

        No constant column is added to X, so each regression has no intercept.
        The result is a DataFrame indexed by ``charas`` with one column per factor.
        """
        factors = self.fname[:self.K]
        first_mon, last_mon = self.train_period[0]//100, self.train_period[1]//100
        X = self.FFf[factors].loc[first_mon:last_mon]
        # Index by month once instead of once per regression.
        ret_by_month = self.portfolio_ret.set_index('DATE')
        betas = []
        for col in charas:
            y = ret_by_month[col].loc[first_mon:last_mon]
            model = sm.OLS(y.values, X.values).fit()
            betas.append(model.params)
        self.beta_matrix = pd.DataFrame(betas, columns=factors, index=charas)


    def train_model(self):
        """Base-class training hook; delegates to ``train``.

        Without this override the inherited ``train_model`` is a no-op and
        ``beta_matrix`` would never be created for callers using the base API.
        """
        self.train()


    def calBeta(self, month):
        """Return the fitted beta matrix (same for every month). Requires ``train`` first."""
        return self.beta_matrix # N * K


    def calFactor(self, month):
        """Return the first K factor values for the YYYYMM of `month`."""
        return self.FFf[self.fname[:self.K]].loc[month//100] # K * 1


    def cal_delayed_Factor(self, month):
        """Return the mean of the first K factors from valid_period start to the month before `month`."""
        prev = pd.to_datetime(str(month), format='%Y%m%d') - relativedelta(months=1)
        # YYYYMM of the previous calendar month
        last_mon = prev.year * 100 + prev.month
        return self.FFf[self.fname[:self.K]].loc[self.valid_period[0]//100:last_mon].mean()
        


In [None]:
import numpy as np
from statsmodels.datasets import grunfeld
# Grunfeld panel from statsmodels, with the `year` column stored as int64.
data = grunfeld.load_pandas().data
data['year'] = data['year'].astype(np.int64)

In [None]:
# Replace each firm label with an integer code 1..N (codes follow np.unique's sorted order).
N = np.unique(data.firm).shape[0]
ID = {
    firm_label: firm_code
    for firm_label, firm_code in zip(np.unique(data.firm).tolist(), np.arange(1, N + 1))
}
data['firm'] = data['firm'].apply(lambda firm_label: ID[firm_label])

In [None]:
# Index the panel by (firm, year). Not re-runnable: a second run fails
# because both columns have already moved into the index.
data = data.set_index(keys=['firm', 'year'])

In [None]:
# Show the panel as the cell output (bare last expression -> rich display).
data

In [None]:
import pandas as pd
# Load the pickled portfolio return table; path is relative to the notebook's directory.
portfolio_ret = pd.read_pickle('../data/portfolio_ret.pkl')

In [3]:
# Load the characteristics table and the monthly return table, then attach
# characteristics to every return row (right join keeps all rows of month_ret).
datashare_re = pd.read_pickle('../data/datashare_re.pkl')
month_ret = pd.read_pickle('../data/month_ret.pkl')
datashare_ret = pd.merge(
    left=datashare_re,
    right=month_ret,
    how='right',
    left_on=['permno', 'DATE'],
    right_on=['permno', 'date'],
)

In [None]:
# Build the (permno, DATE) panel index, then order rows by it.
# Not re-runnable: the key columns are consumed by the first run.
datashare_ret = datashare_ret.set_index(['permno', 'DATE'])
datashare_ret = datashare_ret.sort_index()

In [None]:
from ipca import InstrumentedPCA

# Target: excess return column of the merged panel.
y = datashare_ret['ret-rf']
# Instruments: everything except the target. 'date' and 'month' come from
# month_ret via the merge and are calendar keys, not characteristics, so they
# must not be passed to IPCA as regressors.
X = datashare_ret.drop(columns=['ret-rf', 'date', 'month'], errors='ignore')

In [None]:
# Fit a one-factor IPCA without intercept on (X, y) and pull out the estimates.
regr = InstrumentedPCA(n_factors=1, intercept=False).fit(X=X, y=y)
Gamma, Factors = regr.get_factors(label_ind=True)

In [None]:
from ipca import InstrumentedPCA

# Same one-factor IPCA fit, this time on the Grunfeld panel:
# 'invest' is the target and every other column is an instrument.
X = data.drop(columns='invest')
y = data['invest']

regr = InstrumentedPCA(n_factors=1, intercept=False).fit(X=X, y=y)
Gamma, Factors = regr.get_factors(label_ind=True)

In [6]:
# Load the monthly return table; cal_portfolio_ret reads this global by name.
mon_ret = pd.read_pickle('../data/month_ret.pkl')

In [7]:
# Show the table as the cell output (columns shown below: permno, date, month, ret-rf).
mon_ret

Unnamed: 0,permno,date,month,ret-rf
0,10006,19570329,195703,1.6105
1,10014,19570329,195703,-0.2300
2,10022,19570329,195703,-0.6146
3,10030,19570329,195703,7.5607
4,10057,19570329,195703,-2.0030
...,...,...,...,...
3780454,93427,20161230,201612,-5.8711
3780455,93428,20161230,201612,-0.6324
3780456,93429,20161230,201612,7.2124
3780457,93434,20161230,201612,-4.1967


In [4]:
def cal_portfolio_ret(it, df, ret_df=None):
    """Return half the long-minus-short return of one characteristic-sorted portfolio.

    Stocks observed on date ``d`` are ranked by characteristic ``f`` in
    descending order; the top tenth is the long leg and the bottom tenth the
    short leg. Each leg's return is the mean 'ret-rf' of its stocks on ``d``,
    ignoring stocks with no return row.

    Parameters
    ----------
    it : pair ``(d, f)`` - date value and characteristic column name.
    df : DataFrame with columns 'DATE', 'permno' and ``f``.
    ret_df : optional DataFrame with columns 'permno', 'date', 'ret-rf'.
        Defaults to the notebook-level ``mon_ret``.

    Returns
    -------
    float : 0.5 * (long_ret - short_ret); NaN when a leg is empty
    (fewer than 10 stocks on ``d``, or no returns found).
    """
    d, f = it[0], it[1]
    if ret_df is None:
        ret_df = mon_ret

    # Filter and rank the cross-section once instead of four times.
    ranked = df.loc[df.DATE == d, ['permno', f]].sort_values(by=f, ascending=False)
    n_stocks = ranked.shape[0]
    # Both legs use the same size. The old `[-n//10:]` parsed as `(-n)//10`,
    # which rounds away from zero and made the short leg one stock larger
    # whenever n was not a multiple of 10.
    decile_size = n_stocks // 10

    # long portfolio, quantile 0.0~0.1; short portfolio, quantile 0.9~1.0
    long_portfolio = ranked['permno'].iloc[:decile_size].to_list()
    short_portfolio = ranked['permno'].iloc[n_stocks - decile_size:].to_list()

    # long-short portfolio return
    ret_by_permno = ret_df.loc[ret_df.date == d].drop_duplicates('permno').set_index('permno')['ret-rf']
    long_ret = ret_by_permno.reindex(long_portfolio).dropna().mean()
    short_ret = ret_by_permno.reindex(short_portfolio).dropna().mean()
    chara_ret = 0.5*(long_ret - short_ret)

    return chara_ret

In [5]:
# Load the characteristics table; cal_portfolio_charas reads this global by name.
datashare_re = pd.read_pickle('../data/datashare_re.pkl')

In [7]:
# Load the list of months iterated over by the parallel run below.
mon_list = pd.read_pickle('../data/mon_list.pkl')

In [6]:
def cal_portfolio_charas(month, data=None, chara_names=None):
    """Compute long-short portfolio characteristics for one month.

    For every characteristic the month's stocks are ranked in descending
    order; the top tenth is the long leg and the bottom tenth the short leg.
    The portfolio's characteristics are half the difference between the two
    legs' mean characteristics.

    Parameters
    ----------
    month : value matched against the 'DATE' column.
    data : optional DataFrame with 'DATE', 'permno' and the characteristic
        columns. Defaults to the notebook-level ``datashare_re``.
    chara_names : optional list of characteristic column names.
        Defaults to the imported ``charas``.

    Returns
    -------
    list of pandas.Series, one per sorting characteristic (same order as
    ``chara_names``), each indexed by ``chara_names``. Entries are NaN when
    the month has fewer than 10 stocks.
    """
    if data is None:
        data = datashare_re
    if chara_names is None:
        chara_names = charas

    # Filter the month once; it was previously re-filtered four times per characteristic.
    month_data = data.loc[data.DATE == month]
    n_stocks = month_data.shape[0]
    # Both legs use the same size. The old `[-n//10:]` parsed as `(-n)//10`,
    # which rounds away from zero and enlarged the short leg by one stock
    # whenever n was not a multiple of 10.
    decile_size = n_stocks // 10

    mon_portfolio_chara = []
    for sort_key in chara_names:
        ranked = month_data.sort_values(by=sort_key, ascending=False)
        long_charas = ranked.iloc[:decile_size][chara_names]
        short_charas = ranked.iloc[n_stocks - decile_size:][chara_names]

        mon_portfolio_chara.append(0.5*(long_charas.mean() - short_charas.mean()))
    return mon_portfolio_chara

In [9]:
# Run cal_portfolio_charas for every month in mon_list across all cores (n_jobs=-1),
# with a tqdm progress bar. The saved output below shows this run was interrupted.
_portfolio_chara_set = Parallel(n_jobs=-1)(delayed(cal_portfolio_charas)(mon) for mon in tqdm(mon_list, colour='yellow')) 



KeyboardInterrupt: 

In [37]:
month = 19570329
chara = 'acc'

# Rank the month's cross-section once by the chosen characteristic (descending).
month_ranked = datashare_re.loc[datashare_re.DATE == month].sort_values(by=chara, ascending=False).reset_index(drop=True)
# Leg size comes from `month` (it was hard-coded to 19570329), and both legs
# use the same size: the old `[-n//10:]` parsed as `(-n)//10`, giving the
# short leg one extra stock whenever n was not a multiple of 10.
decile_size = month_ranked.shape[0] // 10

long_portfolio = month_ranked['permno'].iloc[:decile_size].to_list()
short_portfolio = month_ranked['permno'].iloc[month_ranked.shape[0] - decile_size:].to_list()

In [53]:
# Characteristics of the stocks in each leg for the selected month,
# looked up by permno in the order of the portfolio lists.
long_charas = datashare_re[datashare_re['DATE'] == month].set_index('permno').loc[long_portfolio, charas]
short_charas = datashare_re[datashare_re['DATE'] == month].set_index('permno').loc[short_portfolio, charas]

In [25]:
# Load the precomputed portfolio characteristics table.
p_charas = pd.read_pickle('../data/p_charas.pkl')

In [31]:
# Display only: view the table indexed and sorted by (index, DATE); p_charas itself is not modified.
p_charas.reset_index().set_index(['index', 'DATE']).sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,absacc,acc,age,agr,bm,bm_ia,cashdebt,cashpr,cfp,cfp_ia,...,mom1m,mom36m,mom6m,mvel1,pricedelay,retvol,std_dolvol,std_turn,turn,zerotrade
index,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
p_absacc,19570329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018037,-0.005475,0.027948,-0.010210,-0.011654,-0.000605,0.009889,-0.002442,0.011167,-0.006901
p_absacc,19570430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.006178,-0.025917,0.057461,-0.018304,-0.004248,0.015557,0.009324,0.005110,0.032214,0.014822
p_absacc,19570531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.026820,-0.015947,0.043440,-0.015315,-0.006155,-0.013798,0.020463,-0.007109,0.015675,-0.006894
p_acc,19570329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018037,-0.005475,0.027948,-0.010210,-0.011654,-0.000605,0.009889,-0.002442,0.011167,-0.006901
p_acc,19570430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.006178,-0.025917,0.057461,-0.018304,-0.004248,0.015557,0.009324,0.005110,0.032214,0.014822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p_turn,19570430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.006911,0.063130,0.134569,-0.136311,0.002165,0.167582,0.179557,0.352256,0.564968,-0.231896
p_turn,19570531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.081043,0.103107,0.112576,-0.152782,0.006164,0.126101,0.135037,0.287221,0.554865,-0.248049
p_zerotrade,19570329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.024183,-0.080916,0.015711,-0.082324,0.021639,-0.024549,-0.042611,-0.024593,-0.109095,0.682537
p_zerotrade,19570430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.031378,-0.058987,0.005707,-0.089725,0.022144,-0.019079,-0.061245,-0.046601,-0.133047,0.713167
