In [1]:
import pandas as pd
import calendar
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from datetime import datetime

In [2]:
# loading dataframes 

r_daily = pd.read_csv('managed_portfolios_anom_d_55.csv')

# drop all momentum factors or factors that are constructed based on momentum

factor_drop_list = ['r_mom', 'r_indmom', 'r_valmom', 'r_valmomprof', 'r_mom12', 'r_momrev', 'r_indmomrev', 'r_exchsw', 'rme', 're_ew']

r_daily.drop(columns=factor_drop_list, inplace=True)

# set date to datetime format and set the date to the index 

r_daily['date'] = pd.to_datetime(r_daily['date'])
r_daily.set_index('date', inplace=True)

# following the procedure in the paper, if there are observations missing we set them to 0. 

r_daily.fillna(0, inplace=True) 

# create a list of factors for later analysis purposes 

factors = [col for col in r_daily.columns if col.startswith('r_')]

# create a monthly return dataframe for later analysis purposes 

r_monthly = r_daily.resample('M').sum()
r_monthly.index = r_monthly.index.strftime('%Y-%m')


           r_size   r_value    r_prof     r_dur  r_valprof  r_fscore  \
date                                                                   
1963-07  0.016922 -0.007892 -0.004998  0.006404  -0.008006  0.037811   
1963-08  0.009713 -0.000007  0.040164  0.000896   0.032660  0.016748   
1963-09 -0.000737  0.017398 -0.008914 -0.001776   0.009029 -0.000537   
1963-10  0.002907  0.019215  0.052118  0.004416   0.060489  0.046196   
1963-11  0.024772  0.009646 -0.016352 -0.016861  -0.014267  0.012256   
...           ...       ...       ...       ...        ...       ...   
2019-08  0.054356 -0.070987  0.038609  0.059314  -0.096482 -0.001678   
2019-09  0.020590  0.116108 -0.069150 -0.116748   0.111385  0.098645   
2019-10 -0.000495 -0.021761  0.056857  0.002966   0.062767  0.016516   
2019-11 -0.003029 -0.052016  0.019764  0.074420  -0.034936 -0.044870   
2019-12  0.003786  0.040248 -0.019682 -0.044288   0.047963  0.040852   

         r_debtiss  r_repurch   r_nissa  r_accruals  ...    r_i

In [34]:
# initialize pca model 

pca = PCA(n_components=len(factors))

# select our start date 

start_date = pd.to_datetime("1963-07-01")

# create an empty dataframe to store the average return for each PC from t until t-11

pc_avg_df = pd.DataFrame()

# create an empty list for the pc return dataframes. These will be concated in a later stage to one large dataframe

pc_return_dfs = []

# create our loop set up, this is actually an expanding PCA analysis. In each iteration a new month is added to the dataset and the return is computed. 

for year in range(1973, 2020):
    # as our out of sample procedure starts July 1973, we start in the 7th month in 1973
    for mo in range(6,13) if year == 1973 else range(1, 13):
        # first we have to find the last month of the day. For this we use the calender function with inputs from the loop variables
        last_day = calendar.monthrange(year, mo)[1]

        # we select our new end_date variable for which the PCA analysis is done, also with inputs from our loop and the last_day variable
        end_date = pd.to_datetime(f'{year}-{mo}-{last_day}')

        t_dt = pd.to_datetime(f'{year}-{mo}')
        t = t_dt.strftime('%Y-%m')
        
        # we select the datarange from our dataset (July 1963 = start_date until our defined end_date) and we fit the model
        pca_data = r_daily.loc[start_date:end_date]
        pca.fit(pca_data)

        # we extract the principal components. These principal components are put in a new dataframe for later analysis. 
        
        principal_components = pca.components_
        components_df = pd.DataFrame(data=principal_components.T, index=factors, columns=[f"PC{i+1}" for i in range(len(factors))])

        # calculating return for month t+1. If mo = 12, then year will increment with 1. 
        
        t_plus_1_year = year + 1 if mo == 12 else year
        t_plus_1_month = (mo % 12) + 1
 
        # creating a datetime variable for the month t+1 and storing this in our pc_return_data variable
        
        t_plus_1_dt =pd.to_datetime(f'{t_plus_1_year}-{t_plus_1_month}')
        t_plus_1 = t_plus_1_dt.strftime('%Y-%m')
        
        pc_return_data = {'date': t_plus_1}

        
        # in this loop we calculate the monthly factor returns (f) using the principal components and returns
        
        for f in range(len(factors)):
            # select our factor and extract its principal component from principal_df and its return from r_daily for all observations in month mo 
            pc = components_df.iloc[:, f]
            r_month = r_monthly.loc[t]
            # multiply the principal components with the returns and sum them up to get PC factor return for month mo 
            pc_return = (pc*r_month).sum()
            
            # place this in our dictionary for later transposing to dataframe
            
            pc_return_data[components_df.columns[f]] = pc_return

            r_pc_month_n_list = []
            
            for n in range(1, 12):
                # calculate the datetime for t - n
                t_minus_n_dt = t_dt - pd.DateOffset(months=n)

                # transpose it to our YYYY-MM format
                t_minus_n = t_minus_n_dt.strftime('%Y-%m')

                # select the return corresponding to our month t-n
                r_month_n = r_monthly.shift(n).loc[t_minus_n]
                
                pc_return_n = (pc*r_month_n).sum()
                
                r_pc_month_n_list.append(pc_return_n)

            r_pc_month_mean = (np.mean(r_pc_month_n_list))
            pc_avg_df.loc[t, f'PC{f+1}'] = r_pc_month_mean
                

        pc_return_df = pd.DataFrame.from_dict(pc_return_data, orient='index').T
        pc_return_df.set_index('date', inplace=True)
        pc_return_dfs.append(pc_return_df)

r_pc = pd.concat(pc_return_dfs)
print(r_pc)


              PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
date                                                                            
1973-07 -0.107861  0.064028 -0.001788  0.058085 -0.068429 -0.028958  0.098368   
1973-08  0.528398  0.069094  0.025934  0.029249  0.298858   0.11805  0.150817   
1973-09 -0.057159 -0.025929  0.013906  0.017154  0.024258  0.022785  0.004781   
1973-10  0.026133 -0.091035  0.143031   0.10927  0.051393 -0.002975  0.062329   
1973-11 -0.249047  -0.05311  0.091882  0.073315 -0.039798  0.089521  0.081162   
...           ...       ...       ...       ...       ...       ...       ...   
2019-09  0.064238 -0.266023 -0.044962  0.143513  0.054072 -0.064805 -0.022804   
2019-10 -0.416507  0.057316  0.239891  -0.06507 -0.072235  0.014179  0.070646   
2019-11  0.036042 -0.110662  0.011783 -0.062658  0.021307   0.08512  0.050528   
2019-12  0.203826  0.043237 -0.041169  0.013195 -0.030844  0.045996 -0.009277   
2020-01  -0.07257  0.050186 

### Leveraging and demeaning our returns

In [58]:
start_date_dt = pd.to_datetime("1963-07-01")
start_date = start_date_dt.strftime('%Y-%m')
lev_dfs = []

for year in range(1973, 2020):
    # as our out of sample procedure starts July 1973, we start in the 8th month in 1973 as we need to demean our PC factor up to month t
    for mo in range(7,13) if year == 1973 else range(1, 13):
        
        # first we set our t variable to the current year and month from our loop
        
        t_dt = pd.to_datetime(f'{year}-{mo}')
        t = t_dt.strftime('%Y-%m')
        
        # calculate the variance of the individual factor returns up until month t 
        r_indiv_f_t = r_monthly.loc[start_date:t]
        var_indiv_f_t = r_indiv_f_t.var(axis=0)
        avg_var_indiv_f_t = var_indiv_f_t.mean()
        #print(r_indiv_f_t.mean())

        # calculate the mean and variance of the PC factors up until month t 

        r_pc_t = r_pc.loc[:t]
        #print(r_pc_t)
        demeaned_r_pc_t = r_pc.loc[t].to_frame().T - r_pc_t.mean()

        # calculate the leverage factor and multiply this with the demeaned 
        
        leverage_t = np.sqrt(avg_var_indiv_f_t / r_pc_t.var(axis=0))
        lev_r_pc_t = demeaned_r_pc_t * leverage_t
        lev_df = lev_r_pc_t.loc[t].to_frame().T
        lev_dfs.append(lev_df)

lev_r_pc = pd.concat(lev_dfs)
lev_r_pc.fillna(0, inplace=True)
print(lev_r_pc)

              PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
1973-07  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1973-08  0.027464  0.027464  0.027464 -0.027464  0.027464  0.027464  0.027464   
1973-09 -0.019595 -0.044828  0.003417 -0.032669 -0.012339 -0.007561 -0.041964   
1973-10 -0.009503 -0.048216  0.057675  0.053160 -0.006252 -0.018380 -0.010639   
1973-11 -0.037041 -0.025426  0.024280  0.017249 -0.025396  0.031840  0.001246   
...           ...       ...       ...       ...       ...       ...       ...   
2019-08  0.015146 -0.010712 -0.003863  0.023011 -0.031910  0.014741  0.029570   
2019-09  0.018056 -0.104836 -0.028248  0.073917  0.032874 -0.048247 -0.019153   
2019-10 -0.084930  0.018884  0.137572 -0.045902 -0.051161  0.010904  0.054817   
2019-11  0.012112 -0.045340  0.004715 -0.044433  0.011119  0.063904  0.038808   
2019-12  0.048009  0.013554 -0.026100 -0.000838 -0.023591  0.034543 -0.008579   

              PC8       PC9

In [17]:
lev_r_pc.fillna(0, inplace=True) 

In [59]:

positive_returns_PC = pc_avg_df > 0
negative_returns_PC = pc_avg_df < 0 


long_portfolio_PC = positive_returns_PC.astype(int)  
short_portfolio_PC = negative_returns_PC.astype(int) 

r_pc_1_10 = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']
r_pc_1_10 = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']
r_pc_11_20 = ['PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20']
r_pc_21_30 = ['PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30']
r_pc_31_40 = ['PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37', 'PC38', 'PC39', 'PC40']
r_pc_41_47 = ['PC41', 'PC42', 'PC43', 'PC44', 'PC45', 'PC46', 'PC47']





pc_mom_return_1_10 = (long_portfolio_PC[r_pc_1_10].shift(-1) * 
                      lev_r_pc[r_pc_1_10]).mean(axis=1) - (short_portfolio_PC[r_pc_1_10].shift(-1) * lev_r_pc[r_pc_1_10]).mean(axis=1)

pc_mom_return_1_10 = pc_mom_return_1_10.shift(1)

pc_mom_return_11_21 = (long_portfolio_PC[r_pc_11_20].shift(-1) * 
                      lev_r_pc[r_pc_11_20]).mean(axis=1)- (short_portfolio_PC[r_pc_11_20].shift(-1) * lev_r_pc[r_pc_11_20]).mean(axis=1)

pc_mom_return_11_21 = pc_mom_return_11_21.shift(1)

pc_mom_return_21_31 = (long_portfolio_PC[r_pc_21_30].shift(-1) * 
                      lev_r_pc[r_pc_21_30]).mean(axis=1)- (short_portfolio_PC[r_pc_21_30].shift(-1) * lev_r_pc[r_pc_21_30]).mean(axis=1)

pc_mom_return_21_31 = pc_mom_return_21_31.shift(1)

pc_mom_return_31_41 = (long_portfolio_PC[r_pc_31_40].shift(-1) * 
                      lev_r_pc[r_pc_31_40]).mean(axis=1)- (short_portfolio_PC[r_pc_31_40].shift(-1) * lev_r_pc[r_pc_31_40]).mean(axis=1)

pc_mom_return_31_41 = pc_mom_return_31_41.shift(1)

pc_mom_return_41_47 = (long_portfolio_PC[r_pc_41_43].shift(-1) * 
                      lev_r_pc[r_pc_41_43]).mean(axis=1)- (short_portfolio_PC[r_pc_41_43].shift(-1) * lev_r_pc[r_pc_41_43]).mean(axis=1)

pc_mom_return_41_47 = pc_mom_return_41_47.shift(1)





In [60]:
print(pc_mom_return_1_10)
print(pc_mom_return_1_10.mean())

print(pc_mom_return_11_21)
print(pc_mom_return_11_21.mean())

print(pc_mom_return_21_31)
print(pc_mom_return_21_31.mean())

print(pc_mom_return_31_41)
print(pc_mom_return_31_41.mean())

print(pc_mom_return_41_47)
print(pc_mom_return_41_47.mean())


1973-06         NaN
1973-07         NaN
1973-08    0.000000
1973-09    0.010985
1973-10   -0.001007
             ...   
2019-08   -0.013044
2019-09    0.010410
2019-10    0.016689
2019-11    0.010368
2019-12   -0.000977
Length: 559, dtype: float64
0.009377004276008545
1973-06         NaN
1973-07         NaN
1973-08    0.000000
1973-09    0.010985
1973-10   -0.013493
             ...   
2019-08    0.014568
2019-09    0.011058
2019-10    0.018373
2019-11   -0.014468
2019-12   -0.001557
Length: 559, dtype: float64
0.009581521773544288
1973-06         NaN
1973-07         NaN
1973-08    0.000000
1973-09    0.010985
1973-10   -0.004693
             ...   
2019-08    0.010943
2019-09    0.014181
2019-10   -0.012991
2019-11    0.015000
2019-12    0.029836
Length: 559, dtype: float64
0.009616053974115451
1973-06         NaN
1973-07         NaN
1973-08    0.000000
1973-09   -0.005493
1973-10    0.001787
             ...   
2019-08   -0.002340
2019-09    0.011139
2019-10    0.019030
2019-11   -0.