# Shrinking Factor Dimension: A Reduced-Rank Approach

In [1]:
import numpy as np
from numpy.linalg import inv, eig
from scipy.linalg import fractional_matrix_power as power
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()  # shared scaler instance used for standardization (fit_transform) across the notebook
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from collections import namedtuple
import os

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [11]:
# OLS regression with HAC (Newey-West style) covariance
def olsnw(Y,X,nwlags=0):
    """Regress Y on a constant plus X by OLS, with HAC standard errors.

    Parameters
    ----------
    Y : dependent variable, handed to ``sm.OLS`` as-is.
    X : regressors; ``sm.add_constant`` is applied before fitting.
    nwlags : value passed as ``maxlags`` to the HAC covariance (default 0).

    Returns
    -------
    namedtuple ``olsnw`` with the fields, in order:
    ``beta_est`` (params), ``tstat`` (t-values), ``p_value`` (p-values),
    ``NW_est`` (standard errors), ``adjR2`` (adjusted R-squared) and
    ``y_fitted`` (fitted values).
    """
    design = sm.add_constant(X)  # prepend the intercept column

    fitted = sm.OLS(Y, design).fit(
        cov_type='HAC',
        cov_kwds={'maxlags': nwlags},
        use_t=True,
    )

    # The result type keeps the name 'olsnw'; only the local binding differs,
    # so the function name is not shadowed inside its own body.
    ResultTuple = namedtuple(
        'olsnw',
        ['beta_est','tstat','p_value','NW_est','adjR2','y_fitted'],
    )

    return ResultTuple(
        fitted.params,
        fitted.tvalues,
        fitted.pvalues,
        fitted.bse,
        fitted.rsquared_adj,
        fitted.fittedvalues,
    )

In [2]:
def _rra_loadings(R, G, Z, X, W1, W2, K):
    """Run one RRA pass for the given weighting matrices.

    Returns ``(Phi, P0, P)`` where ``Phi`` (L-by-K) maps the proxies G to the
    K extracted factors, and ``P0``/``P`` are the T-by-T projection-type
    matrices built from Z and W2 (P is P0 with the constant X partialled out).
    """
    T = G.shape[0]
    P0 = Z @ W2 @ Z.T                                   # `@` is matrix multiplication
    P = P0 - P0 @ X @ inv(X.T @ P0 @ X) @ X.T @ P0
    Q = (G.T @ P @ G)/T**2

    # fractional_matrix_power is a *matrix* power (Q^{-1/2}), not an
    # elementwise power; compute it once and reuse it.
    Q_inv_sqrt = power(Q, -.5)
    C = G.T @ P @ R/T**2

    A = Q_inv_sqrt.T @ C @ W1 @ C.T @ Q_inv_sqrt
    v, E = eig(A)
    order = v.argsort()[::-1]      # indices of eigenvalues in DESCENDING order
    E = E[:, order]                # reorder eigenvectors to match
    Phi = Q_inv_sqrt @ E[:, :K]    # keep the K leading eigenvectors
    return Phi, P0, P


def rraff(R,G,K):
    """ Using R=a+BG+u to extract K factors
    ___________
    Parameters:
    R: T-by-N, excess returns of N assets (portfolio returns)
    G: T-by-L, L proxies (factor returns)
    K: the number of factors to be extracted from the L proxies (G)
    _______
    Return:
    namedtuple ``rraff`` with fields
    rraf: T-by-K factors via RRA (real part of the second-pass G @ Phi)
    pcaf: T-by-K factors via PCA on the standardized proxies
    plsf: T-by-K factors via PLS (x-scores) of R on the standardized proxies
    """
    T = G.shape[0]
    N = R.shape[1]              # number of test assets
    Z = sm.add_constant(G)      # Z = [ones G]
    M = Z.shape[1]              # taken from Z so the weight matrix always conforms
    X = np.ones((T,1))

    # ---- Pass 1: identity weighting matrices --------------------------------
    Phi, P0, P = _rra_loadings(R, G, Z, X, np.identity(N), np.identity(M), K)
    Gstar = G @ Phi

    # Original author's note: the first-pass Gstar could already serve as the
    # RRA result, but it is only consistent, not efficient; a second pass with
    # estimated weights follows.
    Beta = inv(Gstar.T @ P @ Gstar) @ Gstar.T @ P @ R
    Theta = Phi @ Beta
    Alpha = inv(X.T @ P0 @ X) @ X.T @ P0 @ (R - G @ Theta)   # 1-by-N intercepts
    U = R - X @ Alpha - G @ Theta                            # first-pass residuals

    # Diagonal weighting matrices: only the diagonals of U'U/T and Z'Z/T are kept.
    S1 = np.diag(np.diag(U.T@U/T))
    S2 = np.diag(np.diag(Z.T@Z/T))

    # ---- Pass 2: inverse-diagonal weights ------------------------------------
    Phi, _, _ = _rra_loadings(R, G, Z, X, inv(S1), inv(S2), K)
    rraf = (G @ Phi).real       # eig may return complex dtype; keep the real part

    # ---- PCA / PLS benchmarks on the standardized proxies --------------------
    # A local scaler is used (instead of the notebook-global one) and the
    # proxies are standardized once; the original standardized them a second
    # time before PLS, which only re-centres an already standardized matrix.
    G_std = StandardScaler().fit_transform(G)
    pcaf = PCA(n_components=K).fit_transform(G_std)

    # Arguments are passed positionally: newer scikit-learn releases renamed
    # the target keyword of PLSRegression.fit.
    plsf = PLSRegression(n_components=K,scale=False).fit(G_std, R).x_scores_

    Factors = namedtuple('rraff',['rraf','pcaf','plsf'])

    return Factors(rraf,pcaf,plsf)

In [3]:
# Sample window used to slice every sheet: 1974-01 .. 2016-12
t0 = pd.to_datetime('197401',format='%Y%m')
t1 = pd.to_datetime('201612',format='%Y%m')

# 'FF5' sheet (five-factor data); the first column holds YYYYMM dates
z1 = pd.read_excel('Portfolio_ret.xlsx',sheet_name='FF5',index_col=0)
z1.index = pd.to_datetime(z1.index,format="%Y%m")

# risk-free rate over the sample window, as a single column
rf = z1.loc[t0:t1,'RF'].values.reshape(-1,1)

# excess market return over the sample window, as a single column
mkt = z1.loc[t0:t1,'Mkt-RF'].values.reshape(-1,1)

In [4]:
# 'Anomaly' sheet (anomaly factor data); the header is on the second row
z2 = pd.read_excel('Portfolio_ret.xlsx',sheet_name='Anomaly',header=1,index_col=0)
z2.index = pd.to_datetime(z2.index,format="%Y%m")
z2 = z2.loc[t0:t1]

# Benchmark factors: the first 10 columns, standardized. Prefixes of
# 1, 3, 5, 6 and 10 of these columns are used in the regressions below.
# fit_transform estimates each column's mean/std and applies them in one step.
f = sc.fit_transform(z2.iloc[:,0:10])

# All columns of the sheet, unstandardized, serve as the proxies G
G = z2.values

# T = number of rows (months) of G, L = number of columns (factors) of G
T = G.shape[0]
L = G.shape[1]

In [5]:
# Factors: 48 Industry Portfolios
# 'FF48vw' sheet (industry portfolios); the header is on the second row
z3 = pd.read_excel('Portfolio_ret.xlsx',sheet_name='FF48vw',header=1,index_col=0)
z3.index = pd.to_datetime(z3.index,format="%Y%m")
z3 = z3.loc[t0:t1]

N = z3.shape[1] # number of portfolios

# Excess returns: subtract the risk-free column from every portfolio column
# (rf is tiled to T-by-N), then drop down to a plain ndarray.
R48 = (z3 - rf@np.ones((1,N))).values

In [6]:
# Extract 10 factors with each method (RRA, PCA, PLS -- in that order) and
# standardize each factor matrix column-wise with the shared scaler.
rraf, pcaf, plsf = (sc.fit_transform(factors) for factors in rraff(R48,G,10))

In [7]:
R = R48/100   # rescale returns by 1/100 (presumably percent -> decimal; results are multiplied by 100 again when tabulated)
NF = [1,3,5,6,10]   # numbers of leading factors evaluated for each method


def _fit_stats(Ri, F, k):
    """Regress one return series on the first k columns of F.

    Returns ``[ssr, tss, mspe]``:
    ssr  -- residual sum of squares scaled by (T-1)/(T-1-k),
    tss  -- sum of squared (uncentred) returns,
    mspe -- mean over T of the squared (residual + intercept), with the same scaling.
    """
    b, _, _, _, _, yhat = olsnw(Y=Ri, X=F[:, :k])
    n_obs = len(Ri)
    adj = (n_obs-1)/(n_obs-1-k)          # degrees-of-freedom scaling
    ssr = np.sum((Ri-yhat)**2)*adj
    tss = np.sum(Ri**2)
    mspe = np.sum((Ri-yhat+b[0])**2)*adj/n_obs   # b[0] is the intercept added by olsnw
    return [ssr, tss, mspe]


# One (len(NF), N, 3) result array per factor set; the last axis holds
# [ssr, tss, mspe] as returned by _fit_stats.
_factor_sets = {'f': f, 'rra': rraf, 'pca': pcaf, 'pls': plsf}
_results = {name: np.zeros((len(NF),N,3)) for name in _factor_sets}

for i in range(N):
    Ri = R[:,i]                          # the asset's series does not depend on j
    for j, k in enumerate(NF):
        for name, F in _factor_sets.items():
            _results[name][j,i,:] = _fit_stats(Ri, F, k)

res_f = _results['f']
res_rra = _results['rra']
res_pca = _results['pca']
res_pls = _results['pls']

In [8]:
def _total_r2_and_rmspe(res):
    """Collapse a (len(NF), N, 3) result array into a 2-by-len(NF) summary.

    Row 0: 1 - (sum over assets of slot 0) / (sum over assets of slot 1).
    Row 1: mean over assets of sqrt(slot 2).
    """
    summary = np.zeros((2,len(NF)))
    for j in range(len(NF)):
        total_r2 = 1-res[j,:,0].sum()/res[j,:,1].sum()
        rmspe = np.sqrt(res[j,:,2]).mean()
        summary[:,j] = [total_r2, rmspe]
    return summary


TRf = _total_r2_and_rmspe(res_f)
TRrra = _total_r2_and_rmspe(res_rra)
TRpca = _total_r2_and_rmspe(res_pca)
TRpls = _total_r2_and_rmspe(res_pls)


In [9]:
# Two-level column header for the results table: performance measure on the
# top level, model size on the second level. The factor counts listed here
# must stay in sync with the counts used in the regressions (NF).
ls1 = ['TotalR2(%)']*5 + ['RMSPE (%)']*5
ls2 = [str(k)+' factors' for k in (1,3,5,6,10)]
ls = [ls1, ls2*2]
tuples = [(measure, model) for measure, model in zip(ls[0], ls[1])]   # pair level-0 and level-1 labels
col = pd.MultiIndex.from_tuples(tuples, names=['Performance','Model'])
index = ['FF','PCA','PLS','RRA']   # row labels, one per factor-extraction method

In [10]:
# Factors: 48 Industry Portfolios
# Rows follow `index` (FF, PCA, PLS, RRA); row 0 of each TR* array is the
# total R2, row 1 the RMSPE.
TotalR2_0 = np.vstack((TRf[0,:], TRpca[0,:], TRpls[0,:], TRrra[0,:]))
PE_0 = np.vstack((TRf[1,:], TRpca[1,:], TRpls[1,:], TRrra[1,:]))
data = np.hstack((TotalR2_0, PE_0))   # total-R2 columns first, then RMSPE columns

# target assets: 48 industry portfolios; values are expressed in percent
res0 = pd.DataFrame(100*data, index=index, columns=col)
#res0.to_excel('Panel A: 48 Industry Portfolios.xls')
res0
# Note (original author): part of these results differ from those shown in the
# paper, but they match the results in the accompanying Excel attachment exactly.

Performance,TotalR2(%),TotalR2(%),TotalR2(%),TotalR2(%),TotalR2(%),RMSPE (%),RMSPE (%),RMSPE (%),RMSPE (%),RMSPE (%)
Model,1 factors,3 factors,5 factors,6 factors,10 factors,1 factors,3 factors,5 factors,6 factors,10 factors
FF,51.393095,55.569281,57.765512,58.340052,59.622064,4.459831,4.244148,4.122504,4.091064,4.027708
PCA,13.996545,24.589281,29.109066,29.336403,57.342061,6.089327,5.706049,5.541712,5.532191,4.186298
PLS,29.535977,55.514035,58.637457,59.382431,61.220821,5.497749,4.274815,4.092748,4.063146,3.964251
RRA,53.892899,58.790617,60.852092,61.121727,62.051293,4.341182,4.076555,3.974426,3.957533,3.912889
