In [138]:
import pandas as pd
import calendar
import numpy as np
from sklearn.decomposition import PCA
import statsmodels.api as sm
from sklearn.cross_decomposition import PLSRegression
from datetime import datetime

In [2]:
# loading dataframes 

r_daily = pd.read_csv('managed_portfolios_anom_d_55.csv')

# drop all momentum factors or factors that are constructed based on momentum

factor_drop_list = ['r_mom', 'r_indmom', 'r_valmom', 'r_valmomprof', 'r_mom12', 'r_momrev', 'r_indmomrev', 'r_exchsw', 'rme', 're_ew']

r_daily.drop(columns=factor_drop_list, inplace=True)

# set date to datetime format and set the date to the index 

r_daily['date'] = pd.to_datetime(r_daily['date'])
r_daily.set_index('date', inplace=True)

# following the procedure in the paper, if there are observations missing we set them to 0. 

r_daily.fillna(0, inplace=True) 

# create a list of factors for later analysis purposes 

factors = [col for col in r_daily.columns if col.startswith('r_')]

# create a monthly return dataframe for later analysis purposes 

r_monthly = r_daily.resample('M').sum()
r_monthly.index = r_monthly.index.strftime('%Y-%m')


           r_size   r_value    r_prof     r_dur  r_valprof  r_fscore  \
date                                                                   
1963-07  0.016922 -0.007892 -0.004998  0.006404  -0.008006  0.037811   
1963-08  0.009713 -0.000007  0.040164  0.000896   0.032660  0.016748   
1963-09 -0.000737  0.017398 -0.008914 -0.001776   0.009029 -0.000537   
1963-10  0.002907  0.019215  0.052118  0.004416   0.060489  0.046196   
1963-11  0.024772  0.009646 -0.016352 -0.016861  -0.014267  0.012256   
...           ...       ...       ...       ...        ...       ...   
2019-08  0.054356 -0.070987  0.038609  0.059314  -0.096482 -0.001678   
2019-09  0.020590  0.116108 -0.069150 -0.116748   0.111385  0.098645   
2019-10 -0.000495 -0.021761  0.056857  0.002966   0.062767  0.016516   
2019-11 -0.003029 -0.052016  0.019764  0.074420  -0.034936 -0.044870   
2019-12  0.003786  0.040248 -0.019682 -0.044288   0.047963  0.040852   

         r_debtiss  r_repurch   r_nissa  r_accruals  ...    r_i

In [34]:
# initialize pca model 

pca = PCA(n_components=len(factors))

# select our start date 

start_date = pd.to_datetime("1963-07-01")

# create an empty dataframe to store the average return for each PC from t until t-11

pc_avg_df = pd.DataFrame()

# create an empty list for the pc return dataframes. These will be concated in a later stage to one large dataframe

pc_return_dfs = []

# create our loop set up, this is actually an expanding PCA analysis. In each iteration a new month is added to the dataset and the return is computed. 

for year in range(1973, 2020):
    # as our out of sample procedure starts July 1973, we start in the 7th month in 1973
    for mo in range(6,13) if year == 1973 else range(1, 13):
        # first we have to find the last month of the day. For this we use the calender function with inputs from the loop variables
        last_day = calendar.monthrange(year, mo)[1]

        # we select our new end_date variable for which the PCA analysis is done, also with inputs from our loop and the last_day variable
        end_date = pd.to_datetime(f'{year}-{mo}-{last_day}')

        t_dt = pd.to_datetime(f'{year}-{mo}')
        t = t_dt.strftime('%Y-%m')
        
        # we select the datarange from our dataset (July 1963 = start_date until our defined end_date) and we fit the model
        pca_data = r_daily.loc[start_date:end_date]
        pca.fit(pca_data)

        # we extract the principal components. These principal components are put in a new dataframe for later analysis. 
        
        principal_components = pca.components_
        components_df = pd.DataFrame(data=principal_components.T, index=factors, columns=[f"PC{i+1}" for i in range(len(factors))])

        # calculating return for month t+1. If mo = 12, then year will increment with 1. 
        
        t_plus_1_year = year + 1 if mo == 12 else year
        t_plus_1_month = (mo % 12) + 1
 
        # creating a datetime variable for the month t+1 and storing this in our pc_return_data variable
        
        t_plus_1_dt =pd.to_datetime(f'{t_plus_1_year}-{t_plus_1_month}')
        t_plus_1 = t_plus_1_dt.strftime('%Y-%m')
        
        pc_return_data = {'date': t_plus_1}

        
        # in this loop we calculate the monthly factor returns (f) using the principal components and returns
        
        for f in range(len(factors)):
            # select our factor and extract its principal component from principal_df and its return from r_daily for all observations in month mo 
            pc = components_df.iloc[:, f]
            r_month = r_monthly.loc[t]
            # multiply the principal components with the returns and sum them up to get PC factor return for month mo 
            pc_return = (pc*r_month).sum()
            
            # place this in our dictionary for later transposing to dataframe
            
            pc_return_data[components_df.columns[f]] = pc_return

            r_pc_month_n_list = []
            
            for n in range(1, 12):
                # calculate the datetime for t - n
                t_minus_n_dt = t_dt - pd.DateOffset(months=n)

                # transpose it to our YYYY-MM format
                t_minus_n = t_minus_n_dt.strftime('%Y-%m')

                # select the return corresponding to our month t-n
                r_month_n = r_monthly.shift(n).loc[t_minus_n]
                
                pc_return_n = (pc*r_month_n).sum()
                
                r_pc_month_n_list.append(pc_return_n)

            r_pc_month_mean = (np.mean(r_pc_month_n_list))
            pc_avg_df.loc[t, f'PC{f+1}'] = r_pc_month_mean
                

        pc_return_df = pd.DataFrame.from_dict(pc_return_data, orient='index').T
        pc_return_df.set_index('date', inplace=True)
        pc_return_dfs.append(pc_return_df)

r_pc = pd.concat(pc_return_dfs)
print(r_pc)


              PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
date                                                                            
1973-07 -0.107861  0.064028 -0.001788  0.058085 -0.068429 -0.028958  0.098368   
1973-08  0.528398  0.069094  0.025934  0.029249  0.298858   0.11805  0.150817   
1973-09 -0.057159 -0.025929  0.013906  0.017154  0.024258  0.022785  0.004781   
1973-10  0.026133 -0.091035  0.143031   0.10927  0.051393 -0.002975  0.062329   
1973-11 -0.249047  -0.05311  0.091882  0.073315 -0.039798  0.089521  0.081162   
...           ...       ...       ...       ...       ...       ...       ...   
2019-09  0.064238 -0.266023 -0.044962  0.143513  0.054072 -0.064805 -0.022804   
2019-10 -0.416507  0.057316  0.239891  -0.06507 -0.072235  0.014179  0.070646   
2019-11  0.036042 -0.110662  0.011783 -0.062658  0.021307   0.08512  0.050528   
2019-12  0.203826  0.043237 -0.041169  0.013195 -0.030844  0.045996 -0.009277   
2020-01  -0.07257  0.050186 

### Leveraging and demeaning our returns

In [146]:
start_date_dt = pd.to_datetime("1963-07-01")
start_date = start_date_dt.strftime('%Y-%m')
lev_dfs = []

for year in range(1973, 2020):
    # as our out of sample procedure starts July 1973, we start in the 8th month in 1973 as we need to demean our PC factor up to month t
    for mo in range(7,13) if year == 1973 else range(1, 13):
        
        # first we set our t variable to the current year and month from our loop
        
        t_dt = pd.to_datetime(f'{year}-{mo}')
        t_minus_one_dt = t_dt - pd.DateOffset(months=1)
        t = t_dt.strftime('%Y-%m')
        t_minus_one = t_minus_one_dt.strftime('%Y-%m')
        
        
        

        # calculate the variance of the individual factor returns up until month t 
        r_indiv_f_t = r_monthly.loc[start_date:t_minus_one]
        var_indiv_f_t = r_indiv_f_t.var(axis=0)
        avg_var_indiv_f_t = var_indiv_f_t.mean()
        #print(r_indiv_f_t.mean())

        # calculate the mean and variance of the PC factors up until month t 

        r_pc_t = r_pc.loc[:t]
        #print(r_pc_t)
        demeaned_r_pc_t = r_pc.loc[t].to_frame().T - r_pc_t.mean()

        # calculate the leverage factor and multiply this with the demeaned 
        
        leverage_t = np.sqrt(avg_var_indiv_f_t / r_pc_t.var(axis=0))
        lev_r_pc_t = demeaned_r_pc_t * leverage_t
        lev_df = lev_r_pc_t.loc[t].to_frame().T
        lev_dfs.append(lev_df)

lev_r_pc = pd.concat(lev_dfs)
lev_r_pc.fillna(0, inplace=True)
lev_r_pc_clean = lev_r_pc.drop(lev_r_pc.index[:1])
print(lev_r_pc_clean)

              PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
1973-08  0.027544  0.027544  0.027544 -0.027544  0.027544  0.027544  0.027544   
1973-09 -0.019582 -0.044798  0.003415 -0.032646 -0.012330 -0.007556 -0.041936   
1973-10 -0.009463 -0.048015  0.057435  0.052938 -0.006226 -0.018304 -0.010595   
1973-11 -0.036417 -0.024998  0.023872  0.016959 -0.024968  0.031305  0.001225   
1973-12 -0.045197 -0.031865 -0.057385 -0.012954 -0.058990  0.000823 -0.070704   
...           ...       ...       ...       ...       ...       ...       ...   
2019-08  0.015143 -0.010710 -0.003863  0.023006 -0.031903  0.014737  0.029563   
2019-09  0.018044 -0.104763 -0.028228  0.073866  0.032851 -0.048214 -0.019140   
2019-10 -0.084967  0.018893  0.137633 -0.045922 -0.051184  0.010909  0.054842   
2019-11  0.012115 -0.045354  0.004716 -0.044446  0.011123  0.063923  0.038819   
2019-12  0.048035  0.013562 -0.026114 -0.000838 -0.023604  0.034562 -0.008584   

              PC8       PC9

In [147]:

positive_returns_PC = pc_avg_df > 0
negative_returns_PC = pc_avg_df < 0 


long_portfolio_PC = positive_returns_PC.astype(int)  
short_portfolio_PC = negative_returns_PC.astype(int) 

mom_1_10 = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']
mom_11_20 = ['PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20']
mom_21_30 = ['PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30']
mom_31_40 = ['PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37', 'PC38', 'PC39', 'PC40']
mom_41_47 = ['PC41', 'PC42', 'PC43', 'PC44', 'PC45', 'PC46', 'PC47']

mom_list = [mom_1_10, mom_11_20, mom_21_30, mom_31_40, mom_41_47]

r_mean_set_dict = {}

for i, mom in enumerate(mom_list):
    r_PC_set_mom = (long_portfolio_PC[mom] * lev_r_pc_clean[mom]) - (short_portfolio_PC[mom] * lev_r_pc_clean[mom])
    r_PC_set_mean = r_PC_set_mom.mean(axis=1)
    r_mean_set_dict[f'mom_set_{i + 1}'] = r_PC_set_mean
    
mom_strategy = pd.concat(r_mean_set_dict, axis=1)

mom_strategy.index = pd.to_datetime(mom_strategy.index)
mom_strategy.index = mom_strategy.index.strftime('%Y-%m')
mom_strategy.dropna(inplace=True)
print(mom_strategy)
print(mom_strategy.mean(axis=0))


         mom_set_1     mom_set_2  mom_set_3  mom_set_4  mom_set_5
1973-08  -0.011018  5.898060e-18   0.011018  -0.011018  -0.011805
1973-09  -0.015165 -7.858623e-03  -0.008116   0.010338  -0.010157
1973-10   0.000377  4.387026e-03   0.014923   0.001492   0.006727
1973-11  -0.001341  7.608363e-03   0.009026  -0.007895  -0.011754
1973-12   0.025343 -1.724545e-02   0.014856   0.035678  -0.013378
...            ...           ...        ...        ...        ...
2019-08   0.007581  1.172042e-02   0.014178   0.013147   0.013921
2019-09   0.018852  1.682888e-02  -0.012982   0.006816   0.010975
2019-10  -0.030116 -2.536283e-02  -0.010249  -0.006112   0.002297
2019-11   0.005415 -1.020430e-02   0.025782   0.012907  -0.017823
2019-12   0.013376  3.304429e-03   0.027092  -0.008758   0.015674

[557 rows x 5 columns]
mom_set_1    0.002533
mom_set_2    0.002681
mom_set_3    0.001949
mom_set_4    0.000442
mom_set_5   -0.000223
dtype: float64


### Replicate table 3 panel A 

In [121]:
mom_strategy_1 = mom_strategy.loc[:'1996-09']
mom_strategy_2 = mom_strategy.loc['1996-09':]

print(f'the mean of every subset of PCs is:\n')
print(mom_strategy.mean(axis=0))

means = mom_strategy.mean(axis=0).tolist()
std = mom_strategy.std(axis=0).tolist()
N = mom_strategy.shape[0]

print(f'the t-statistic of every subset of PCS is:\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)

print(f'the mean of every subset of PCs is (first half):\n')
print(mom_strategy_1.mean(axis=0))

means = mom_strategy_1.mean(axis=0).tolist()
std = mom_strategy_1.std(axis=0).tolist()
N = mom_strategy_1.shape[0]

print(f'the t-statistic of every subset of PCS is (first half):\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)

print(f'the mean of every subset of PCs is (second half):\n')
print(mom_strategy_2.mean(axis=0))

means = mom_strategy_2.mean(axis=0).tolist()
std = mom_strategy_2.std(axis=0).tolist()
N = mom_strategy_2.shape[0]

print(f'the t-statistic of every subset of PCS is (second half):\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)


the mean of every subset of PCs is:

mom_set_1    0.002540
mom_set_2    0.002673
mom_set_3    0.001956
mom_set_4    0.000438
mom_set_5   -0.000226
dtype: float64
the t-statistic of every subset of PCS is:

3.585772448496227
3.468917444909932
2.7802012640740026
0.6697822738369887
-0.2417122592478961
the mean of every subset of PCs is (first half):

mom_set_1    0.001906
mom_set_2    0.004615
mom_set_3    0.002420
mom_set_4    0.001383
mom_set_5   -0.000233
dtype: float64
the t-statistic of every subset of PCS is (first half):

2.4854716083950255
5.702453102875741
3.181301799181065
1.7031293806586758
-0.20684235761539926
the mean of every subset of PCs is (second half):

mom_set_1    0.003142
mom_set_2    0.000711
mom_set_3    0.001447
mom_set_4   -0.000526
mom_set_5   -0.000247
dtype: float64
the t-statistic of every subset of PCS is (second half):

2.6481426736425693
0.5485571997527239
1.226554306180192
-0.5162715197920651
-0.16512329368185433


### Replicatie tabel 3 panel B en C 

In [137]:
ff = pd.read_stata("fffactors.dta")

ff.set_index('yyyymm', inplace=True)


ff.index = pd.to_datetime(ff.index, format='%Y%m')


ff.index = ff.index.strftime('%Y-%m')

ff5 = ff[['mktrf', 'smb', 'hml', 'rmw', 'cma']].loc['1973-08':'2019-12']

mom_strategy_ff5 = pd.concat([mom_strategy, ff5], axis=1)

mom_strategy_ff5['P1'] = 0
mom_strategy_ff5['P2'] = 0
mom_strategy_ff5.loc[mom_strategy_ff5.index <= '1996-09', 'P1'] = 1
mom_strategy_ff5.loc[mom_strategy_ff5.index >= '1996-09', 'P2'] = 1




### Panel B

In [149]:
independent_vars = ['mom_set_1', 'P1', 'P2', 'mktrf', 'smb', 'hml', 'rmw', 'cma']
X = mom_strategy_ff5[independent_vars]

for i in range(2, 6):
    Y = mom_strategy_ff5[f'mom_set_{i}']
    modelB = sm.OLS(y, X).fit()
    print(modelB.summary())



                                 OLS Regression Results                                
Dep. Variable:              mom_set_1   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          7.239e+31
Date:                Thu, 22 Feb 2024   Prob (F-statistic):                        0.00
Time:                        10:01:20   Log-Likelihood:                          20735.
No. Observations:                 557   AIC:                                 -4.145e+04
Df Residuals:                     549   BIC:                                 -4.142e+04
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### Panel C

In [140]:
y = mom_strategy_ff5['mom_set_1']

for i in range(2, 6):
    independent_vars = [f'mom_set_{i}', 'P1', 'P2', 'mktrf', 'smb', 'hml', 'rmw', 'cma']
    X = sm.add_constant(mom_strategy_ff5[independent_vars])
    modelC = sm.OLS(y, X).fit()
    print(modelC.summary())


independent_vars = ['P1', 'P2', 'mom_set_2', 'mom_set_3', 'mom_set_4', 'mom_set_5', 'mktrf', 'smb', 'hml', 'rmw', 'cma']
   

                            OLS Regression Results                            
Dep. Variable:              mom_set_1   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     1.945
Date:                Thu, 22 Feb 2024   Prob (F-statistic):             0.0514
Time:                        00:33:08   Log-Likelihood:                 1496.7
No. Observations:                 557   AIC:                            -2975.
Df Residuals:                     548   BIC:                            -2936.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0093      0.017      0.555      0.5