In [4]:
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
import calendar
import numpy as np
import statsmodels.api as sm

### data loading and pre-processing

In [6]:
url = 'https://github.com/Sebasleen/Seminargroup/raw/Seminar/Data/managed_portfolios_anom_d_55.csv'

r_daily = pd.read_csv(url)
momentum_list = ['r_mom', 'r_indmom', 'r_valmom', 'r_valmomprof', 'r_mom12', 'r_momrev', 'r_indmomrev', 'r_exchsw']
r_daily.drop(columns=momentum_list, inplace=True)

# set date to datetime format and set the date to the index 
r_daily['date'] = pd.to_datetime(r_daily['date'])
r_daily.set_index('date', inplace=True)

# following the procedure in the paper, if there are observations missing we set them to 0. 
r_daily.fillna(0, inplace=True)

# create a list of factors for later analysis purposes 
factors = [col for col in r_daily.columns if col.startswith('r_')]
r_daily.drop(columns=['rme', 're_ew'], inplace=True)

# create a monthly return dataframe for later analysis purposes 
r_monthly = r_daily.resample("M").sum()

### constructing the y variable 

In [8]:
# we first calculate the average return of each factor in the period t until t-11 (and drop our missing year)
r_monthly_average = r_monthly.rolling(window=12).mean()
r_monthly_average.dropna(inplace=True)

# create a boolean dataframe with True or False (True for positive returns or True for negative returns)
positive_returns = r_monthly_average > 0
negative_returns = r_monthly_average < 0

# create from the boolean dataframe the binary dataframe 
long_portfolio = positive_returns.astype(int)
short_portfolio = negative_returns.astype(int)

# shift 1 month ahead to calculate the return of strategy at t+1 
long_portfolio = long_portfolio.shift(1)
short_portfolio = short_portfolio.shift(1)

# reindex the long_portfolio dataframe to the daily frequency and the missing observations
long_portfolio_daily = long_portfolio.reindex(r_daily.index)
long_portfolio_daily = long_portfolio_daily.bfill()
long_portfolio_daily = long_portfolio_daily.loc['1964-07-01':]

# reindex the short_portfolio dataframe to the daily frequency and fill the missing observations

short_portfolio_daily = short_portfolio.reindex(r_daily.index)
short_portfolio_daily = short_portfolio_daily.bfill()
short_portfolio_daily = short_portfolio_daily.loc['1964-07-01':]

# create dataframe for returns that matches length of portfolio

r_daily_strategy = r_daily.loc['1964-07-01':]

# compute the return of the momentum strategy (this will be used as Y variable)
r_strategy = ((r_daily_strategy * long_portfolio_daily) - (r_daily_strategy * short_portfolio_daily)).sum(axis=1)


In [9]:
# first set our r_monthly back to expected YYYY-MM format 
r_monthly.index = r_monthly.index.strftime('%Y-%m')

# select our start date 
start_date = pd.to_datetime("1964-07-01")

# create an empty dataframe for our calculations of the average return over the period t-1 to t-11
pls_avg_df = pd.DataFrame()

# create an empty list for the pls return dataframes. These will be concated in a later stage to one large dataframe
pls_return_dfs = []

# Create an empty dataframe for monthly PLS returns
pls_return_df = pd.DataFrame(columns=['date'] + [f"PLS{i+1}" for i in range(len(factors))])

# Initialize an empty list to store PLS weights
pls_weights_list = []

# Loop through years and months
for year in range(1973, 2020):
    # as our out of sample procedure starts July 1973, we start in the 7th month in 1973
    for mo in range(6,13) if year == 1973 else range(1, 13):
        # first we have to find the last month of the day. For this we use the calender function with inputs from the loop variables
        last_day = calendar.monthrange(year, mo)[1]

        # we select our new end_date variable for which the Pls analysis is done, also with inputs from our loop and the last_day variable
        end_date = pd.to_datetime(f'{year}-{mo}-{last_day}')

        t_dt = pd.to_datetime(f'{year}-{mo}')
        t = t_dt.strftime('%Y-%m')

        # select data range for PLS analysis
        pls_data = r_daily.loc[start_date:end_date]
        pls_data_y = r_strategy[start_date:end_date]

        # assign features (X) and target (y)
        X = pls_data
        y = pls_data_y

        # fit PLS model
        pls = PLSRegression(n_components=47)
        pls.fit(X, y)


        # extract and store PLS weights
        pls_weights = pd.DataFrame(
            np.array(pls.x_weights_),
            index=factors,
            columns=[f"PLS{i+1}" for i in range(47)]
        )
        # calculating return for month t+1. If mo = 12, then year will increment with 1. 

        t_plus_1_year = year + 1 if mo == 12 else year
        t_plus_1_month = (mo % 12) + 1

        # creating a datetime variable for the month t+1 and storing this in our pls_return_data variable

        t_plus_1_dt =pd.to_datetime(f'{t_plus_1_year}-{t_plus_1_month}')
        t_plus_1 = t_plus_1_dt.strftime('%Y-%m')

        pls_return_data = {'date': t_plus_1}


        # in this loop we calculate the monthly factor returns (f) using the principal components and returns

        for f in range(len(factors)):
            # select our factor and extract its principal component from principal_df and its return from r_daily for all observations in month mo 
            pls = pls_weights.iloc[:, f]
            r_month = r_monthly.loc[t]

            # multiply the principal components with the returns and sum them up to get Pls factor return for month mo 
            pls_return = (pls*r_month).sum()

            # place this in our dictionary for later transposing to dataframe

            pls_return_data[pls_weights.columns[f]] = pls_return

            r_pls_month_n_list = []

            for n in range(1, 12):
                # calculate the datetime for t - n
                t_minus_n_dt = t_dt - pd.DateOffset(months=n)

                # transpose it to our YYYY-MM format
                t_minus_n = t_minus_n_dt.strftime('%Y-%m')

                # select the return corresponding to our month t-n
                r_month_n = r_monthly.shift(n).loc[t_minus_n]
                pls_return_n = (pls*r_month_n).sum()
                r_pls_month_n_list.append(pls_return_n)

            r_pls_month_mean = (np.mean(r_pls_month_n_list))
            pls_avg_df.loc[t, f'PLS{f+1}'] = r_pls_month_mean


        pls_return_df = pd.DataFrame.from_dict(pls_return_data, orient='index').T
        pls_return_df.set_index('date', inplace=True)
        pls_return_dfs.append(pls_return_df)

r_pls = pd.concat(pls_return_dfs)
print(r_pls)

             PLS1      PLS2      PLS3      PLS4      PLS5      PLS6      PLS7  \
date                                                                            
1973-07  0.052722  0.085601  0.121708 -0.057921 -0.011599 -0.077036  0.075644   
1973-08 -0.413006 -0.322405 -0.106428  0.040936  0.001378 -0.156146 -0.297679   
1973-09    0.0475  0.031771  0.015164  -0.01536  0.017781 -0.013689 -0.019541   
1973-10 -0.011489  0.020539 -0.124029 -0.172455 -0.044708 -0.002751  0.007047   
1973-11   0.20267  0.149573  0.053795 -0.122876  -0.01748  0.090709   0.01037   
...           ...       ...       ...       ...       ...       ...       ...   
2019-09  0.047111 -0.002464   0.20786  0.190474  0.000893 -0.089765 -0.107152   
2019-10  -0.05774 -0.365649 -0.294384  0.048312 -0.105279  0.083312  0.026218   
2019-11  0.017438  0.011799  0.076095  0.077808 -0.024021  0.055363  0.041651   
2019-12 -0.033881  0.192923  0.065169 -0.037057  0.036164 -0.016166  0.012103   
2020-01 -0.044939 -0.062202 

### leveraging and demeaing resulsts

In [11]:
# Define the start date
start_date_dt = pd.to_datetime("1964-07-01")
start_date = start_date_dt.strftime('%Y-%m')

# Create an empty list to store leverage-adjusted PLS factors
lev_pls_dfs = []

for year in range(1973, 2020):
    # Loop through months
    for mo in range(7, 13) if year == 1973 else range(1, 13):
        # Define the current date
        t_dt = pd.to_datetime(f'{year}-{mo}')
        t = t_dt.strftime('%Y-%m')

        # Calculate the variance of the individual factor returns up until month t 
        r_indiv_f_t = r_monthly.loc[start_date:t]
        var_indiv_f_t = r_indiv_f_t.var(axis=0)
        avg_var_indiv_f_t = var_indiv_f_t.mean()

        # Calculate the mean and variance of the PLS factors up until month t 
        r_pls_t = r_pls.loc[:t]
        demeaned_r_pls_t = r_pls_t.loc[t].to_frame().T - r_pls_t.mean()

        # Calculate the leverage factor
        nonzero_std = np.where(r_pls_t.std(axis=0) != 0, r_pls_t.std(axis=0), 1)
        leverage_t = np.sqrt(avg_var_indiv_f_t) / nonzero_std

        # Multiply leverage factor with the demeaned PLS factors
        lev_r_pls_t = demeaned_r_pls_t * leverage_t

        # Append the leveraged PLS factors to the list
        lev_pls_dfs.append(lev_r_pls_t)

# Concatenate the leveraged PLS factors into a single DataFrame
lev_r_pls = pd.concat(lev_pls_dfs)
lev_r_pls.fillna(0, inplace=True)
lev_r_pls_clean = lev_r_pls.drop(lev_r_pls.index[:1])

print(lev_r_pls_clean)

             PLS1      PLS2      PLS3      PLS4      PLS5      PLS6      PLS7  \
1973-08 -0.028377 -0.028377 -0.028377  0.028377  0.028377 -0.028377 -0.028377   
1973-09  0.022788  0.018135  0.001764 -0.003707  0.041618  0.038592  0.012623   
1973-10  0.012569  0.014423 -0.035332 -0.054076 -0.053897  0.034091  0.016115   
1973-11  0.040278  0.034986  0.024067 -0.027808 -0.011573  0.054763  0.015553   
1973-12  0.043524  0.048148  0.043131  0.028180  0.076442 -0.024679  0.035596   
...           ...       ...       ...       ...       ...       ...       ...   
2019-08 -0.004107  0.009902  0.012937  0.012454  0.003349 -0.023928 -0.028337   
2019-09  0.011448 -0.003078  0.097022  0.111564  0.007171 -0.064767 -0.074670   
2019-10 -0.017527 -0.110220 -0.138214  0.028787 -0.069780  0.060559  0.024574   
2019-11  0.003270  0.001345  0.035011  0.045868 -0.010783  0.040241  0.036006   
2019-12 -0.010921  0.054779  0.029847 -0.020947  0.032868 -0.011563  0.013980   

             PLS8      PLS9

### constructing the momentum strategy

In [13]:
# create two boolean dataframes: one for positive average returns and one for negative average returns
positive_returns_PLS = pls_avg_df > 0
negative_returns_PLS = pls_avg_df < 0

# convert the boolean dataframes to integers and 0's, one for long positions and one for short positions
long_portfolio_PLS= positive_returns_PLS.astype(int)
short_portfolio_PLS = negative_returns_PLS.astype(int)

# create the 5 subsets of PCs
mom_1_10 = ['PLS1', 'PLS2', 'PLS3', 'PLS4', 'PLS5', 'PLS6', 'PLS7', 'PLS8', 'PLS9', 'PLS10']
mom_11_20 = ['PLS11', 'PLS12', 'PLS13', 'PLS14', 'PLS15', 'PLS16', 'PLS17', 'PLS18', 'PLS19', 'PLS20']
mom_21_30 = ['PLS21', 'PLS22', 'PLS23', 'PLS24', 'PLS25', 'PLS26', 'PLS27', 'PLS28', 'PLS29', 'PLS30']
mom_31_40 = ['PLS31', 'PLS32', 'PLS33', 'PLS34', 'PLS35', 'PLS36', 'PLS37', 'PLS38', 'PLS39', 'PLS40']
mom_41_47 = ['PLS41', 'PLS42', 'PLS43', 'PLS44', 'PLS45', 'PLS46', 'PLS47']


# create a list of the subsets for our loop
mom_list = [mom_1_10, mom_11_20, mom_21_30, mom_31_40, mom_41_47]

# create an empty dictionary 
r_mean_set_dict = {}

# create a loop where the dummy dataframe is multiplied with the leveraged PC return dataframe. We shift the portfolio indicator with one, as we need to calculate the returns of t+1. 
for i, mom in enumerate(mom_list):
    # create the strategy: the return of the long positions minus the return of the short positions (accounting for the fact that negative short returns need to become positive
    r_PLS_set_mom = (long_portfolio_PLS[mom].shift(1) * lev_r_pls_clean[mom]) - (short_portfolio_PLS[mom].shift(1) * lev_r_pls_clean[mom])
    # we take the mean of the returns of the 10 PC subsets
    r_PLS_set_mean = r_PLS_set_mom.mean(axis=1)
    # we append it to our dictionary
    r_mean_set_dict[f'mom_set_{i + 1}'] = r_PLS_set_mean

# create the dataframe with the series of returns for each subset of PCS
mom_strategy = pd.concat(r_mean_set_dict, axis=1)

mom_strategy.index = pd.to_datetime(mom_strategy.index)
mom_strategy.index = mom_strategy.index.strftime('%Y-%m')
mom_strategy.dropna(inplace=True)

print(mom_strategy.mean(axis=0))

mom_set_1    0.003606
mom_set_2    0.002919
mom_set_3    0.003152
mom_set_4    0.001370
mom_set_5    0.002284
dtype: float64


### replicate Panel A table 3 using data from PLS

In [15]:
# select the full sample dataframe in the paper and create the two splitted periods 
mom_strategy_full = mom_strategy.loc['1973-07':'2019-12']
mom_strategy_1 = mom_strategy.loc['1973-07':'1996-09']
mom_strategy_2 = mom_strategy.loc['1996-09':]

print(f'the mean of every subset of PCs is:\n')
print(mom_strategy_full.mean(axis=0))

means = mom_strategy.mean(axis=0).tolist()
std = mom_strategy.std(axis=0).tolist()
N = mom_strategy.shape[0]

print(f'\n')
print(f'the t-statistic of every subset of PCS is:\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)

print(f'\n')
print(f'the mean of every subset of PCs is (first half):\n')
print(mom_strategy_1.mean(axis=0))

means = mom_strategy_1.mean(axis=0).tolist()
std = mom_strategy_1.std(axis=0).tolist()
N = mom_strategy_1.shape[0]

print(f'\n')
print(f'the t-statistic of every subset of PCS is (first half):\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)

print(f'\n')
print(f'the mean of every subset of PCs is (second half):\n')
print(mom_strategy_2.mean(axis=0))

means = mom_strategy_2.mean(axis=0).tolist()
std = mom_strategy_2.std(axis=0).tolist()
N = mom_strategy_2.shape[0]

print(f'\n')
print(f'the t-statistic of every subset of PCS is (second half):\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)

the mean of every subset of PCs is:

mom_set_1    0.003606
mom_set_2    0.002919
mom_set_3    0.003152
mom_set_4    0.001370
mom_set_5    0.002284
dtype: float64


the t-statistic of every subset of PCS is:

4.338962178300348
3.4745586689435353
4.397096412048947
1.8486377789706578
3.096690261297527


the mean of every subset of PCs is (first half):

mom_set_1    0.004435
mom_set_2    0.003392
mom_set_3    0.004651
mom_set_4    0.002242
mom_set_5    0.002307
dtype: float64


the t-statistic of every subset of PCS is (first half):

5.381555157559653
3.5898111220841367
6.217532618013721
2.444767509204075
2.5751388706993823


the mean of every subset of PCs is (second half):

mom_set_1    0.002810
mom_set_2    0.002445
mom_set_3    0.001672
mom_set_4    0.000534
mom_set_5    0.002238
dtype: float64


the t-statistic of every subset of PCS is (second half):

1.9554474313284218
1.7661372039678263
1.379195935243663
0.4602978766948807
1.9145087931927494
