In [23]:
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
import calendar
import numpy as np
import statsmodels.api as sm

In [8]:

# loading dataframes 

r_daily = pd.read_csv('managed_portfolios_anom_d_55.csv')
r_daily['date'] = pd.to_datetime(r_daily['date'])
r_daily.set_index('date', inplace=True)

y_daily = r_daily['r_mom']


# drop all momentum factors or factors that are constructed based on momentum

factor_drop_list = ['r_mom', 'r_indmom', 'r_valmom', 'r_valmomprof', 'r_mom12', 'r_momrev', 'r_indmomrev', 'r_exchsw', 'rme', 're_ew']

r_daily.drop(columns=factor_drop_list, inplace=True)


# following the procedure in the paper, if there are observations missing we set them to 0. 

r_daily.fillna(0, inplace=True) 

# create a list of factors for later analysis purposes 

factors = [col for col in r_daily.columns if col.startswith('r_')]

# create a monthly return dataframe for later analysis purposes 

r_monthly = r_daily.resample('M').sum()
r_monthly.index = r_monthly.index.strftime('%Y-%m')
y_monthly = y_daily.resample('M').sum()
y_monthly.index = y_monthly.index.strftime('%Y-%m')




In [9]:

# select our start date 

start_date = pd.to_datetime("1963-07-01")

pls_avg_df = pd.DataFrame()

# create an empty list for the pls return dataframes. These will be concated in a later stage to one large dataframe

pls_return_dfs = []


# Create an empty dataframe for monthly PLS returns
pls_return_df = pd.DataFrame(columns=['date'] + [f"PLS{i+1}" for i in range(len(factors))])

# Initialize an empty list to store PLS weights
pls_weights_list = []

# Loop through years and months
for year in range(1973, 2020):
    # as our out of sample procedure starts July 1973, we start in the 7th month in 1973
    for mo in range(6,13) if year == 1973 else range(1, 13):
        # first we have to find the last month of the day. For this we use the calender function with inputs from the loop variables
        last_day = calendar.monthrange(year, mo)[1]

        # we select our new end_date variable for which the Pls analysis is done, also with inputs from our loop and the last_day variable
        end_date = pd.to_datetime(f'{year}-{mo}-{last_day}')

        t_dt = pd.to_datetime(f'{year}-{mo}')
        t = t_dt.strftime('%Y-%m')

        # Select data range for PLS analysis
        pls_data = r_daily.loc[start_date:end_date]
        pls_data_y = y_daily.loc[start_date:end_date]

        # Separate features (X) and target (y)
        X = pls_data
        y = pls_data_y

        # Fit PLS model
        pls = PLSRegression(n_components=47)  
        pls.fit(X, y)


        # Extract and store PLS weights
        pls_weights = pd.DataFrame(
            np.array(pls.x_weights_),  
            index=factors,
            columns=[f"PLS{i+1}" for i in range(47)] 
        )
        # calculating return for month t+1. If mo = 12, then year will increment with 1. 

        t_plus_1_year = year + 1 if mo == 12 else year
        t_plus_1_month = (mo % 12) + 1

        # creating a datetime variable for the month t+1 and storing this in our pls_return_data variable

        t_plus_1_dt =pd.to_datetime(f'{t_plus_1_year}-{t_plus_1_month}')
        t_plus_1 = t_plus_1_dt.strftime('%Y-%m')

        pls_return_data = {'date': t_plus_1}


        # in this loop we calculate the monthly factor returns (f) using the principal components and returns

        for f in range(len(factors)):
            # select our factor and extract its principal component from principal_df and its return from r_daily for all observations in month mo 
            pls = pls_weights.iloc[:, f]
            r_month = r_monthly.loc[t]

            # multiply the principal components with the returns and sum them up to get Pls factor return for month mo 
            pls_return = (pls*r_month).sum()

            # place this in our dictionary for later transposing to dataframe

            pls_return_data[pls_weights.columns[f]] = pls_return

            r_pls_month_n_list = []

            for n in range(1, 12):
                # calculate the datetime for t - n
                t_minus_n_dt = t_dt - pd.DateOffset(months=n)

                # transpose it to our YYYY-MM format
                t_minus_n = t_minus_n_dt.strftime('%Y-%m')

                # select the return corresponding to our month t-n
                r_month_n = r_monthly.shift(n).loc[t_minus_n]
                pls_return_n = (pls*r_month_n).sum()
                r_pls_month_n_list.append(pls_return_n)

            r_pls_month_mean = (np.mean(r_pls_month_n_list))
            pls_avg_df.loc[t, f'PLS{f+1}'] = r_pls_month_mean


        pls_return_df = pd.DataFrame.from_dict(pls_return_data, orient='index').T
        pls_return_df.set_index('date', inplace=True)
        pls_return_dfs.append(pls_return_df)

r_pls = pd.concat(pls_return_dfs)
print(r_pls)
print(pls_avg_df)


             PLS1      PLS2      PLS3      PLS4      PLS5      PLS6      PLS7  \
date                                                                            
1973-07 -0.048384  0.127402  0.045078 -0.032426 -0.060105  0.042384  0.039795   
1973-08  0.021698  -0.49487 -0.224569  0.007259  0.113017  0.206656 -0.176765   
1973-09 -0.007387  0.052631  0.019876 -0.032449  0.000953  0.002695 -0.042352   
1973-10  -0.02852 -0.095301  0.084225 -0.193214  0.060742   0.05589 -0.015636   
1973-11 -0.006779  0.236759  0.084113 -0.155524  0.017817  0.053698  0.032168   
...           ...       ...       ...       ...       ...       ...       ...   
2019-09   0.11085 -0.069182  0.202737  0.148629 -0.011815  0.076098 -0.115031   
2019-10 -0.290273  0.344427  -0.04857  0.107533 -0.135575 -0.068866  0.052702   
2019-11  0.057685 -0.029784  0.064527  0.064735 -0.022589  -0.03115  0.054327   
2019-12   0.07589 -0.182885 -0.044704 -0.055343  0.040831  0.007139 -0.002603   
2020-01 -0.079979  0.054259 

In [15]:

# Define the start date
start_date_dt = pd.to_datetime("1963-07-01")
start_date = start_date_dt.strftime('%Y-%m')

# Create an empty list to store leverage-adjusted PLS factors
lev_pls_dfs = []

for year in range(1973, 2020):
    # Loop through months
    for mo in range(7, 13) if year == 1973 else range(1, 13):
        # Define the current date
        t_dt = pd.to_datetime(f'{year}-{mo}')
        t = t_dt.strftime('%Y-%m')

        # Calculate the variance of the individual factor returns up until month t 
        r_indiv_f_t = r_monthly.loc[start_date:t]
        var_indiv_f_t = r_indiv_f_t.var(axis=0)
        avg_var_indiv_f_t = var_indiv_f_t.mean()

        # Calculate the mean and variance of the PLS factors up until month t 
        r_pls_t = r_pls.loc[:t]
        demeaned_r_pls_t = r_pls_t.loc[t].to_frame().T - r_pls_t.mean()

        # Calculate the leverage factor
        leverage_t = np.sqrt(avg_var_indiv_f_t / r_pls_t.var(axis=0))

        # Multiply leverage factor with the demeaned PLS factors
        lev_r_pls_t = demeaned_r_pls_t * leverage_t

        # Append the leveraged PLS factors to the list
        lev_pls_dfs.append(lev_r_pls_t)

# Concatenate the leveraged PLS factors into a single DataFrame
lev_r_pls = pd.concat(lev_pls_dfs)
lev_r_pls.fillna(0, inplace=True)
lev_r_pls_clean = lev_r_pls.drop(lev_r_pls.index[:1])

print(lev_r_pls_clean)

             PLS1      PLS2      PLS3      PLS4      PLS5      PLS6      PLS7  \
1973-08  0.027464 -0.027464 -0.027464  0.027464  0.027464  0.027464 -0.027464   
1973-09  0.004383  0.018026  0.019071 -0.022459 -0.007526 -0.029191  0.006193   
1973-10 -0.016745  0.001018  0.028800 -0.057234  0.016740 -0.009175  0.014051   
1973-11  0.010716  0.037917  0.025262 -0.033672 -0.005296 -0.009436  0.029376   
1973-12 -0.047765  0.046592  0.030252  0.039875 -0.069820 -0.070193  0.043161   
...           ...       ...       ...       ...       ...       ...       ...   
2019-08  0.006557 -0.018416 -0.000553  0.010668  0.006508  0.021635 -0.034623   
2019-09  0.029820 -0.024998  0.100544  0.086421 -0.005268  0.055454 -0.083816   
2019-10 -0.077505  0.095664 -0.027388  0.062683 -0.095848 -0.053885  0.035622   
2019-11  0.015656 -0.013584  0.030172  0.038095 -0.012967 -0.025384  0.036719   
2019-12  0.020501 -0.058161 -0.025439 -0.030668  0.033510  0.003519 -0.003842   

             PLS8      PLS9

In [18]:
positive_returns_PLS = pls_avg_df > 0
negative_returns_PLS = pls_avg_df < 0 


long_portfolio_PLS = positive_returns_PLS.astype(int)  
short_portfolio_PLS = negative_returns_PLS.astype(int) 

mom_1_10 = ['PLS1', 'PLS2', 'PLS3', 'PLS4', 'PLS5', 'PLS6', 'PLS7', 'PLS8', 'PLS9', 'PLS10']
mom_11_20 = ['PLS11', 'PLS12', 'PLS13', 'PLS14', 'PLS15', 'PLS16', 'PLS17', 'PLS18', 'PLS19', 'PLS20']
mom_21_30 = ['PLS21', 'PLS22', 'PLS23', 'PLS24', 'PLS25', 'PLS26', 'PLS27', 'PLS28', 'PLS29', 'PLS30']
mom_31_40 = ['PLS31', 'PLS32', 'PLS33', 'PLS34', 'PLS35', 'PLS36', 'PLS37', 'PLS38', 'PLS39', 'PLS40']
mom_41_47 = ['PLS41', 'PLS42', 'PLS43', 'PLS44', 'PLS45', 'PLS46', 'PLS47']

mom_list = [mom_1_10, mom_11_20, mom_21_30, mom_31_40, mom_41_47]

r_mean_set_dict = {}

for i, mom in enumerate(mom_list):
    r_PLS_set_mom = (long_portfolio_PLS[mom] * lev_r_pls_clean[mom]) - (short_portfolio_PLS[mom] * lev_r_pls_clean[mom])
    r_PLS_set_mean = r_PLS_set_mom.mean(axis=1)
    r_mean_set_dict[f'mom_set_{i + 1}'] = r_PLS_set_mean
    
mom_strategy = pd.concat(r_mean_set_dict, axis=1)

mom_strategy.index = pd.to_datetime(mom_strategy.index)
mom_strategy.index = mom_strategy.index.strftime('%Y-%m')
mom_strategy.dropna(inplace=True)
print(mom_strategy)
print(mom_strategy.mean(axis=0))



         mom_set_1  mom_set_2  mom_set_3     mom_set_4  mom_set_5
1973-08  -0.010985  -0.010985   0.005493  3.469447e-19  -0.003923
1973-09  -0.011532   0.006194  -0.004389  1.219057e-02  -0.007634
1973-10   0.008205   0.010577   0.005911 -8.102748e-03  -0.018663
1973-11  -0.000739   0.004773  -0.009324  9.673564e-03  -0.009205
1973-12   0.009930   0.003775   0.013661 -7.707549e-03  -0.009618
...            ...        ...        ...           ...        ...
2019-08   0.012136   0.007764   0.016175  2.787349e-02   0.005757
2019-09   0.036439  -0.004081  -0.000596 -1.408228e-03   0.023456
2019-10  -0.023593  -0.028676  -0.011334 -1.380217e-02   0.006452
2019-11  -0.009807   0.022844  -0.007627  3.226388e-02  -0.023963
2019-12   0.012088  -0.001273  -0.001551  1.615843e-02  -0.013521

[557 rows x 5 columns]
mom_set_1    0.003487
mom_set_2    0.003115
mom_set_3    0.002105
mom_set_4    0.003559
mom_set_5    0.000046
dtype: float64


In [19]:
mom_strategy_1 = mom_strategy.loc[:'1996-09']
mom_strategy_2 = mom_strategy.loc['1996-09':]

print(f'the mean of every subset of PCs is:\n')
print(mom_strategy.mean(axis=0))

means = mom_strategy.mean(axis=0).tolist()
std = mom_strategy.std(axis=0).tolist()
N = mom_strategy.shape[0]

print(f'the t-statistic of every subset of PCS is:\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)

print(f'the mean of every subset of PCs is (first half):\n')
print(mom_strategy_1.mean(axis=0))

means = mom_strategy_1.mean(axis=0).tolist()
std = mom_strategy_1.std(axis=0).tolist()
N = mom_strategy_1.shape[0]

print(f'the t-statistic of every subset of PCS is (first half):\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)

print(f'the mean of every subset of PCs is (second half):\n')
print(mom_strategy_2.mean(axis=0))

means = mom_strategy_2.mean(axis=0).tolist()
std = mom_strategy_2.std(axis=0).tolist()
N = mom_strategy_2.shape[0]

print(f'the t-statistic of every subset of PCS is (second half):\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)

the mean of every subset of PCs is:

mom_set_1    0.003487
mom_set_2    0.003115
mom_set_3    0.002105
mom_set_4    0.003559
mom_set_5    0.000046
dtype: float64
the t-statistic of every subset of PCS is:

4.547693100697605
4.065621235068799
3.007214245176625
5.129749331295277
0.0630969208706754
the mean of every subset of PCs is (first half):

mom_set_1    0.003509
mom_set_2    0.003185
mom_set_3    0.003261
mom_set_4    0.005617
mom_set_5    0.000734
dtype: float64
the t-statistic of every subset of PCS is (first half):

4.34368373540407
3.490356564791017
4.094469971925525
7.220917272474888
0.8358982560949173
the mean of every subset of PCs is (second half):

mom_set_1    0.003416
mom_set_2    0.003070
mom_set_3    0.000934
mom_set_4    0.001460
mom_set_5   -0.000611
dtype: float64
the t-statistic of every subset of PCS is (second half):

2.627974691278017
2.5007907472288124
0.8163633386188286
1.28857895827996
-0.530983485009442


In [21]:
ff = pd.read_stata("fffactors.dta")

ff.set_index('yyyymm', inplace=True)


ff.index = pd.to_datetime(ff.index, format='%Y%m')


ff.index = ff.index.strftime('%Y-%m')

ff5 = ff[['mktrf', 'smb', 'hml', 'rmw', 'cma']].loc['1973-08':'2019-12']

mom_strategy_ff5 = pd.concat([mom_strategy, ff5], axis=1)

mom_strategy_ff5['P1'] = 0
mom_strategy_ff5['P2'] = 0
mom_strategy_ff5.loc[mom_strategy_ff5.index <= '1996-09', 'P1'] = 1
mom_strategy_ff5.loc[mom_strategy_ff5.index >= '1996-09', 'P2'] = 1


In [25]:
independent_vars = ['mom_set_1', 'P1', 'P2', 'mktrf', 'smb', 'hml', 'rmw', 'cma']
X = mom_strategy_ff5[independent_vars]

for i in range(2, 6):
    Y = mom_strategy_ff5[f'mom_set_{i}']
    modelB = sm.OLS(Y, X).fit()
    print(modelB.summary())

                                 OLS Regression Results                                
Dep. Variable:              mom_set_2   R-squared (uncentered):                   0.058
Model:                            OLS   Adj. R-squared (uncentered):              0.045
Method:                 Least Squares   F-statistic:                              4.245
Date:                Thu, 22 Feb 2024   Prob (F-statistic):                    5.84e-05
Time:                        11:16:45   Log-Likelihood:                          1453.8
No. Observations:                 557   AIC:                                     -2892.
Df Residuals:                     549   BIC:                                     -2857.
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [27]:
y = mom_strategy_ff5['mom_set_1']

for i in range(2, 6):
    independent_vars = [f'mom_set_{i}', 'P1', 'P2', 'mktrf', 'smb', 'hml', 'rmw', 'cma']
    X = sm.add_constant(mom_strategy_ff5[independent_vars])
    modelC = sm.OLS(y, X).fit()
    print(modelC.summary())


independent_vars = ['P1', 'P2', 'mom_set_2', 'mom_set_3', 'mom_set_4', 'mom_set_5', 'mktrf', 'smb', 'hml', 'rmw', 'cma']

X = sm.add_constant(mom_strategy_ff5[independent_vars])
modelC1 = sm.OLS(y, X).fit()
print(modelC1.summary())
   

                            OLS Regression Results                            
Dep. Variable:              mom_set_1   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     2.050
Date:                Thu, 22 Feb 2024   Prob (F-statistic):             0.0390
Time:                        12:14:09   Log-Likelihood:                 1453.1
No. Observations:                 557   AIC:                            -2888.
Df Residuals:                     548   BIC:                            -2849.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0143      0.018      0.789      0.4

In [None]:
X = sm.add_constant(mom_strategy_ff5[independent_vars])
modelC1 = sm.OLS(y, X).fit()
print(modelC1.summary())