### Imports

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import calendar
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler





# Table 1


### Loading data

In [3]:
url = 'https://github.com/Sebasleen/Seminargroup/raw/Seminar/Data/anomalies.dta'

Anomalies = pd.read_stata(url)

# display unique values in the 'anomaly' column
print(Anomalies['anomaly'].unique())

# delete the global factors from the dataframe and create Anomalies US
column_name = 'anomaly'
values_to_dropUS = ['glbab', 'glcma', 'glhml', 'glqmj', 'glrmw', 'glsmb', 'glumd']
ElementsUS = Anomalies[column_name].isin(values_to_dropUS)
Anomalies_US = Anomalies[~ElementsUS]

# delete the US factors from dataframe and create Anomalies Global 
column_name = 'anomaly'
values_to_dropGF = ['ac', 'bab', 'cfp', 'cma', 'ep', 'hml', 'liq', 'ltrev', 'nsi', 'qmj', 'rmw', 'rvar',
                    'smb', 'strev', 'umd']
ElementsGF = Anomalies[column_name].isin(values_to_dropGF)
Anomalies_GF = Anomalies[~ElementsGF]

# print both anomalies US and anomalies global 
print(Anomalies_US)
print(Anomalies_GF)

['ac' 'bab' 'cfp' 'cma' 'ep' 'hml' 'liq' 'ltrev' 'nsi' 'qmj' 'rmw' 'rvar'
 'smb' 'strev' 'umd' 'glbab' 'glcma' 'glhml' 'glqmj' 'glrmw' 'glsmb'
 'glumd']
       year  month anomaly    ret   time  global
0      1963      7      ac  2.170   42.0     0.0
1      1963      8      ac -0.197   43.0     0.0
2      1963      9      ac  0.600   44.0     0.0
3      1963     10      ac  6.463   45.0     0.0
4      1963     11      ac -2.260   46.0     0.0
...     ...    ...     ...    ...    ...     ...
10111  2019      8     umd  7.600  715.0     0.0
10112  2019      9     umd -6.850  716.0     0.0
10113  2019     10     umd  0.240  717.0     0.0
10114  2019     11     umd -2.620  718.0     0.0
10115  2019     12     umd -2.130  719.0     0.0

[10116 rows x 6 columns]
       year  month anomaly       ret   time  global
10116  1987      2   glbab  2.236918  325.0     1.0
10117  1987      3   glbab  1.828450  326.0     1.0
10118  1987      4   glbab -5.521739  327.0     1.0
10119  1987      5   glba

### replicating table 1

### US factors

In [5]:
# calculate the mean and stand deviation of US factors
AnomaliesUS = Anomalies_US.groupby(['anomaly']).agg({'ret': ['mean', 'std', 'count']}).reset_index()
AnomaliesUS = Anomalies_US.pivot_table(index='anomaly', values='ret', aggfunc=['mean', 'std', 'count'])
AnomaliesUS.columns = ['Mean', 'SD', 'ret_number']
AnomaliesUS.reset_index(inplace=True)
AnomaliesUS.columns = ['anomaly', 'Mean', 'SD', 'ret_number']

# calculate additional statistics
AnomaliesUS['ret_semean'] = AnomaliesUS['SD'] / np.sqrt(AnomaliesUS['ret_number'])

# multiply by 12 to create annualized returns
AnomaliesUS['ret'] = AnomaliesUS['Mean'] * 12

# multiply by sqrt(12) to create annualized standard deviation
AnomaliesUS['sd'] = AnomaliesUS['SD'] * np.sqrt(12)

# calculate the t-stat by dividing the return by the standard error of the mean. Divide by 12 to annualize it
AnomaliesUS['tstat'] = AnomaliesUS['ret'] / AnomaliesUS['ret_semean'] /12

# format the table to correct decimals
AnomaliesUS[['Mean']] = AnomaliesUS[['ret']].apply(lambda x: x.map("{:.1f}%".format))
AnomaliesUS[['SD']] = AnomaliesUS[['sd']].apply(lambda x: x.map("{:.1f}%".format))
AnomaliesUS[['T-value']] = AnomaliesUS[['tstat']].apply(lambda x: x.map("{:.2f}".format))

print(AnomaliesUS[['anomaly','Mean','SD','T-value']])

   anomaly  Mean     SD T-value
0       ac  2.8%   6.6%    3.19
1      bab  9.8%  11.2%    6.55
2      cfp  3.4%   8.6%    2.94
3      cma  3.3%   6.9%    3.59
4       ep  3.5%   8.9%    2.95
5      hml  3.6%   9.7%    2.82
6      liq  4.4%  11.6%    2.77
7    ltrev  2.5%   8.7%    2.16
8      nsi  2.8%   8.2%    2.52
9      qmj  4.6%   7.7%    4.47
10     rmw  3.1%   7.5%    3.13
11    rvar  1.6%  17.3%    0.68
12     smb  2.7%  10.4%    1.97
13   strev  6.0%  10.6%    4.21
14     umd  7.8%  14.5%    4.02


### Global factors

In [7]:
# calculate the mean and stand deviation of Global factors
AnomaliesGF = Anomalies_GF.groupby(['anomaly']).agg({'ret': ['mean', 'std', 'count']}).reset_index()
AnomaliesGF = Anomalies_GF.pivot_table(index='anomaly', values='ret', aggfunc=['mean', 'std', 'count'])
AnomaliesGF.columns = ['Mean', 'SD', 'ret_number']
AnomaliesGF.reset_index(inplace=True)
AnomaliesGF.columns = ['anomaly', 'Mean', 'SD', 'ret_number']

# calculate additional statistics
AnomaliesGF['ret_semean'] = AnomaliesGF['SD'] / np.sqrt(AnomaliesGF['ret_number'])

# multiply by 12 to create annualized returns
AnomaliesGF['ret'] = AnomaliesGF['Mean'] * 12

# multiply by sqrt(12) to create annualized standard deviation
AnomaliesGF['sd'] = AnomaliesGF['SD'] * np.sqrt(12)

# calculate the t-stat by dividing the return by the standard error of the mean. Divide by 12 to annualize it
AnomaliesGF['tstat'] = AnomaliesGF['ret'] / AnomaliesGF['ret_semean'] /12

# format the table to correct decimals
AnomaliesGF[['Mean']] = AnomaliesGF[['ret']].apply(lambda x: x.map("{:.1f}%".format))
AnomaliesGF[['SD']] = AnomaliesGF[['sd']].apply(lambda x: x.map("{:.1f}%".format))
AnomaliesGF[['T-value']] = AnomaliesGF[['tstat']].apply(lambda x: x.map("{:.2f}".format))

print(AnomaliesGF[['anomaly','Mean','SD','T-value']])

  anomaly  Mean     SD T-value
0   glbab  9.6%   9.7%    5.70
1   glcma  1.9%   6.0%    1.74
2   glhml  4.0%   7.4%    2.92
3   glqmj  6.2%   6.8%    5.06
4   glrmw  4.3%   4.7%    4.91
5   glsmb  1.1%   7.1%    0.83
6   glumd  7.9%  12.1%    3.54


# Table 2 

### replicating table 2 (uses the same dataset as table 1)

In [9]:
# create an empty list to store results
results_list = []

# create a loop which iterates over each anomly in our dataset 
for anomaly in Anomalies['anomaly'].unique():
    subset = Anomalies[Anomalies['anomaly'] == anomaly]
    subset = subset.sort_values(by='time')

    # create a binary variable for positive returns in the past 12 months (the signal variable)
    subset['positive_return'] = subset['ret'].rolling(window=12, min_periods=12).mean().shift(1) > 0

    # drop the first 12 observations in the subset after the rolling window has been applied (so dropping N/A values)
    subset = subset.iloc[12:]

    # select our OLS model and fit our data
    y = subset['ret']
    X = sm.add_constant(subset['positive_return'].astype(int))
    model = sm.OLS(y, X)
    
    # select the correct covariance type (as used in the paper)
    results = model.fit(cov_type='cluster', cov_kwds={'groups': subset['time']})

    # append the results to our dictionary to create the results
    results_list.append({
        'anomaly': anomaly,
        'alpha': results.params['const'],
        'T-stat_alpha': results.tvalues['const'],
        'slope': results.params['positive_return'],
        'T-stat_slope': results.tvalues['positive_return'],
    })

results_table = pd.DataFrame(results_list)
print(results_table)


   anomaly     alpha  T-stat_alpha     slope  T-stat_slope
0       ac  0.150195      1.184450  0.101410      0.649822
1      bab -0.221412     -0.632211  1.319041      3.534152
2      cfp  0.127745      0.781292  0.235454      1.157989
3      cma  0.120082      0.974474  0.244693      1.545819
4       ep  0.101357      0.616107  0.302075      1.458207
5      hml  0.038477      0.204762  0.410255      1.780679
6      liq  0.157215      0.741922  0.356063      1.291807
7    ltrev -0.252989     -1.663307  0.757680      3.850110
8      nsi  0.172982      1.324451  0.089249      0.486779
9      qmj  0.086832      0.650364  0.434757      2.507550
10     rmw  0.040360      0.222250  0.337185      1.673841
11    rvar -0.463569     -1.638345  1.061609      2.737366
12     smb -0.104191     -0.615583  0.583455      2.508982
13   strev  0.485098      1.427336  0.013888      0.038600
14     umd  0.716042      2.697340 -0.094969     -0.288098
15   glbab  0.190820      0.577502  0.837610      2.3039

# Table 3

### Loading dataset

In [ ]:
# loading dataframes 

url = 'https://github.com/Sebasleen/Seminargroup/raw/Seminar/Data/managed_portfolios_anom_d_55.csv'

r_daily = pd.read_csv(url)

# drop all momentum factors or factors that are constructed based on momentum (including market return variables)

factor_drop_list = ['r_mom', 'r_indmom', 'r_valmom', 'r_valmomprof', 'r_mom12', 'r_momrev', 'r_indmomrev', 'r_exchsw', 'rme', 're_ew']

r_daily.drop(columns=factor_drop_list, inplace=True)

# set date to datetime format and set the date to the index 

r_daily['date'] = pd.to_datetime(r_daily['date'])
r_daily.set_index('date', inplace=True)

# following the procedure in the paper, if there are observations missing we set them to 0. (footnote 16)

r_daily.fillna(0, inplace=True)

# create a list of factors for later analysis purposes 

factors = [col for col in r_daily.columns if col.startswith('r_')]

# create a monthly return dataframe for later analysis purposes (by summing the daily returns)

#In case this doesn't run because of frequency "M": pandas has updated, so change 'M' to "ME"

r_monthly = r_daily.resample('M').sum()
r_monthly.index = r_monthly.index.strftime('%Y-%m')


### perform the PCA analysis

In [ ]:
# initialize pca model 

pca = PCA(n_components=len(factors))

scaler = StandardScaler()

# select our start date 

start_date = pd.to_datetime("1963-07-01")

# create an empty dataframe to store the average return for each PC from t until t-11. We need this to create the momentum signal for our strategy

pc_avg_df = pd.DataFrame()

# create an empty list for the pc return dataframes. These will be concated in a later stage to one large dataframe

pc_return_dfs = []

# create our loop set up, this is actually an expanding PCA analysis. In each iteration a new month is added to the dataset and the return is computed. 

for year in range(1973, 2020):
    # the sample of the paper starts from July 1973, but we use January to June 1973 to calculate the returns in order to obtain stable means for later demeaning purposes 
    for mo in range(1,13):
        # first we have to find the last month of the day. For this we use the calender function with inputs from the loop variables
        last_day = calendar.monthrange(year, mo)[1]

        # we select our new end_date variable for which the PCA analysis is done, also with inputs from our loop and the last_day variable
        end_date = pd.to_datetime(f'{year}-{mo}-{last_day}')

        t_dt = pd.to_datetime(f'{year}-{mo}')
        t = t_dt.strftime('%Y-%m')

        # we select the datarange from our dataset (July 1963 = start_date until our defined end_date) and we fit the model
        pca_data = r_daily.loc[start_date:end_date]
        scaled_data = scaler.fit_transform(pca_data)
        pca.fit(scaled_data)

        # we extract the principal components. These principal components are put in a new dataframe for later analysis. 

        principal_components = pca.components_
        components_df = pd.DataFrame(data=principal_components.T, index=factors, columns=[f"PC{i+1}" for i in range(len(factors))])

        # calculating return for month t+1. If mo = 12, then year will increment with 1. 

        t_plus_1_year = year + 1 if mo == 12 else year
        t_plus_1_month = (mo % 12) + 1

        # creating a datetime variable for the month t+1 and storing this in our pc_return_data variable

        t_plus_1_dt =pd.to_datetime(f'{t_plus_1_year}-{t_plus_1_month}')
        t_plus_1 = t_plus_1_dt.strftime('%Y-%m')

        pc_return_data = {'date': t_plus_1}


        # in this loop we calculate the monthly factor returns (f) using the principal components and returns

        for f in range(len(factors)):
            # select our factor and extract its principal component from principal_df and its return from r_daily for all observations in month mo 
            pc = components_df.iloc[:, f]
            r_month = r_monthly.loc[t]
            # multiply the principal components with the returns and sum them up to get PC factor return for month mo 
            pc_return = (pc*r_month).sum()

            # place this in our dictionary for later transposing to dataframe

            pc_return_data[components_df.columns[f]] = pc_return

            r_pc_month_n_list = []
            
            # in this loop we calculate the average return of the eigenvector at time t, for the period t until t-11. We store these results in a dataframe for later use.

            for n in range(0, 12):
                # calculate the datetime for t - n
                t_minus_n_dt = t_dt - pd.DateOffset(months=n)

                # transpose it to our YYYY-MM format
                t_minus_n = t_minus_n_dt.strftime('%Y-%m')

                # select the return corresponding to our month t-n
                r_month_n = r_monthly.shift(n).loc[t_minus_n]

                # calculate the dot product for month t-n
                pc_return_n = (pc*r_month_n).sum()
                
                # append this to our list to calculate the mean 
                r_pc_month_n_list.append(pc_return_n)
            
            # calculate the mean and append it to our average return dataframe
            r_pc_month_mean = (np.mean(r_pc_month_n_list))
            pc_avg_df.loc[t, f'PC{f+1}'] = r_pc_month_mean

        # append the PC returns to our PC_return dataframe for later analysis 
        pc_return_df = pd.DataFrame.from_dict(pc_return_data, orient='index').T
        pc_return_df.set_index('date', inplace=True)
        pc_return_dfs.append(pc_return_df)

r_pc = pd.concat(pc_return_dfs)
print(r_pc)

### demeaning and leveraging our PC returns

In [ ]:
# define our start date and create an empty list for our leveraged dataframes
start_date_dt = pd.to_datetime("1963-07-01")
start_date = start_date_dt.strftime('%Y-%m')
lev_dfs = []

for year in range(1973, 2020):
    # as we lost one month in calculating the t+1 PC return, we start the loop from february 1973. We try to use the full year of 1973 in order to obtain stable demeaned results. Hence we will cut of our sample later from July 1973 in order to match the dataset of the original paper. 
    for mo in range(2,13) if year == 1973 else range(1, 13):
        
        # first we set our t variable to the current year and month from our loop
        t_dt = pd.to_datetime(f'{year}-{mo}')
        t = t_dt.strftime('%Y-%m')
        
        # we also create a t_minus_one variable, because we have to calculate the variance up to month t (so excluding month t)
        t_minus_one_dt = t_dt - pd.DateOffset(months=1)
        t_minus_one = t_minus_one_dt.strftime('%Y-%m')

        # calculate the variance of the individual factor returns up until month t-1 
        r_indiv_f_t = r_monthly.loc[start_date:t_minus_one]
        var_indiv_f_t = r_indiv_f_t.var(axis=0)
        avg_var_indiv_f_t = var_indiv_f_t.mean()

        # calculate the mean and variance of the PC factors up until month t 
        r_pc_t = r_pc.loc[:t]
        demeaned_r_pc_t = r_pc.loc[t].to_frame().T - r_pc_t.mean()

        # calculate the leverage factor and multiply this with the demeaned 

        # To handle the divisions by zero for some cases we set those to 1, won't affect the outcome.
        nonzero_std = np.where(r_pc_t.std(axis=0) != 0, r_pc_t.std(axis=0), 1)
        leverage_t = np.sqrt(avg_var_indiv_f_t) / nonzero_std

        # Demeaning
        lev_r_pc_t = demeaned_r_pc_t * leverage_t
        lev_df = lev_r_pc_t.loc[t].to_frame().T
        lev_dfs.append(lev_df)

lev_r_pc = pd.concat(lev_dfs)
lev_r_pc.fillna(0, inplace=True)
lev_r_pc_clean = lev_r_pc.drop(lev_r_pc.index[:1])
print(lev_r_pc_clean)

### constructing the momentum strategy

In [ ]:
# create two boolean dataframes: one for positive average returns and one for negative average returns
positive_returns_PC = pc_avg_df > 0
negative_returns_PC = pc_avg_df < 0

# convert the boolean dataframes to integers and 0's, one for long positions and one for short positions
long_portfolio_PC = positive_returns_PC.astype(int)
short_portfolio_PC = negative_returns_PC.astype(int)

# create the 5 subsets of PCs
mom_1_10 = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']
mom_11_20 = ['PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20']
mom_21_30 = ['PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30']
mom_31_40 = ['PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37', 'PC38', 'PC39', 'PC40']
mom_41_47 = ['PC41', 'PC42', 'PC43', 'PC44', 'PC45', 'PC46', 'PC47']

# create a list of the subsets for our loop
mom_list = [mom_1_10, mom_11_20, mom_21_30, mom_31_40, mom_41_47]

# create an empty dictionary 
r_mean_set_dict = {}

# create a loop where the dummy dataframe is multiplied with the leveraged PC return dataframe. We shift the portfolio indicator with one, as we need to calculate the returns of t+1. 
for i, mom in enumerate(mom_list):
    # create the strategy: the return of the long positions minus the return of the short positions (accounting for the fact that negative short returns need to become positive
    r_PC_set_mom = (long_portfolio_PC[mom] * lev_r_pc_clean[mom]) - (short_portfolio_PC[mom] * lev_r_pc_clean[mom])
    # we take the mean of the returns of the 10 PC subsets
    r_PC_set_mean = r_PC_set_mom.mean(axis=1)
    # we append it to our dictionary
    r_mean_set_dict[f'mom_set_{i + 1}'] = r_PC_set_mean

# create the dataframe with the series of returns for each subset of PCS
mom_strategy = pd.concat(r_mean_set_dict, axis=1)

mom_strategy.index = pd.to_datetime(mom_strategy.index)
mom_strategy.index = mom_strategy.index.strftime('%Y-%m')
mom_strategy.dropna(inplace=True)

print(mom_strategy.mean(axis=0))

### Replicating table 3

### replicating panel A

In [ ]:
# select the full sample dataframe in the paper and create the two splitted periods 
mom_strategy_full = mom_strategy.loc['1973-07':'2019-12']
mom_strategy_1 = mom_strategy.loc['1973-07':'1996-09']
mom_strategy_2 = mom_strategy.loc['1996-09':]

print(f'the mean of every subset of PCs is:\n')
print(mom_strategy_full.mean(axis=0))

means = mom_strategy.mean(axis=0).tolist()
std = mom_strategy.std(axis=0).tolist()
N = mom_strategy.shape[0]

print(f'the t-statistic of every subset of PCS is:\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)

print(f'the mean of every subset of PCs is (first half):\n')
print(mom_strategy_1.mean(axis=0))

means = mom_strategy_1.mean(axis=0).tolist()
std = mom_strategy_1.std(axis=0).tolist()
N = mom_strategy_1.shape[0]

print(f'the t-statistic of every subset of PCS is (first half):\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)

print(f'the mean of every subset of PCs is (second half):\n')
print(mom_strategy_2.mean(axis=0))

means = mom_strategy_2.mean(axis=0).tolist()
std = mom_strategy_2.std(axis=0).tolist()
N = mom_strategy_2.shape[0]

print(f'the t-statistic of every subset of PCS is (second half):\n')
for m, s in zip(means, std):
    t_statistic = m / (s / (N**0.5))
    print(t_statistic)

### Replicating panel B and C

### Loading data and merging datasets

In [ ]:
url = 'https://github.com/Sebasleen/Seminargroup/raw/Seminar/Data/fffactors.dta'

ff = pd.read_stata(url)

# set index to date column
ff.set_index('yyyymm', inplace=True)

# set it to datetime format and correct format
ff.index = pd.to_datetime(ff.index, format='%Y%m')
ff.index = ff.index.strftime('%Y-%m')

# select the needed factors over our sample 
ff5 = ff[['mktrf', 'smb', 'hml', 'rmw', 'cma']].loc['1973-07':'2019-12']

# merge the dataframes together
mom_strategy_ff5 = pd.concat([mom_strategy_full, ff5], axis=1)

# construct a dummy variable for period 1 and for period 2 
mom_strategy_ff5['P1'] = 0
mom_strategy_ff5['P2'] = 0
mom_strategy_ff5.loc[mom_strategy_ff5.index <= '1996-09', 'P1'] = 1
mom_strategy_ff5.loc[mom_strategy_ff5.index >= '1996-09', 'P2'] = 1


### conducting the regressions - Panel B

In [ ]:
# selecting our independent variables from our dataframe
independent_vars = ['mom_set_1', 'P1', 'P2', 'mktrf', 'smb', 'hml', 'rmw', 'cma']
X = mom_strategy_ff5[independent_vars]

# constructing a loop that iterates the regressions
for i in range(2, 6):
    Y = mom_strategy_ff5[f'mom_set_{i}']
    modelB = sm.OLS(Y, X, hasconst=False).fit()
    print(modelB.summary())



### conducting the regressions - Panel C

In [43]:
# selecting our dependent variable
y = mom_strategy_ff5['mom_set_1']

# constructing a loop that iterates the regressions
for i in range(2, 6):
    independent_vars = [f'mom_set_{i}', 'P1', 'P2', 'mktrf', 'smb', 'hml', 'rmw', 'cma']
    X = mom_strategy_ff5[independent_vars]
    modelC = sm.OLS(y, X, hasconst=False).fit()
    print(modelC.summary())

# last regression is one with all independent variables (panel C regression 5)
independent_vars = ['P1', 'P2', 'mom_set_2', 'mom_set_3', 'mom_set_4', 'mom_set_5', 'mktrf', 'smb', 'hml', 'rmw', 'cma']
X = mom_strategy_ff5[independent_vars]
modelC1 = sm.OLS(y, X, hasconst=False).fit()
print(modelC1.summary())

                                 OLS Regression Results                                
Dep. Variable:              mom_set_1   R-squared (uncentered):                   0.090
Model:                            OLS   Adj. R-squared (uncentered):              0.076
Method:                 Least Squares   F-statistic:                              6.772
Date:                Mon, 26 Feb 2024   Prob (F-statistic):                    1.72e-08
Time:                        20:24:44   Log-Likelihood:                          1481.5
No. Observations:                 558   AIC:                                     -2947.
Df Residuals:                     550   BIC:                                     -2912.
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

# Table 4

### loading data and merging dataframes

In [45]:
merged_data = []

# set url for the datafiles

url_P10umd = 'https://github.com/Sebasleen/Seminargroup/raw/Seminar/Data/P10UMD.dta'
url_tsfactor = 'https://github.com/Sebasleen/Seminargroup/raw/Seminar/Data/TSFactor.dta'
url_umd = 'https://github.com/Sebasleen/Seminargroup/raw/Seminar/Data/FactorUMD.dta'

# import the datafiles and the needed columns
p10umd = pd.read_stata(url_P10umd)
tsfactor = pd.read_stata(url_tsfactor)[['year', 'month', 'TSMom', 'yyyymm']]
factor_umd = pd.read_stata(url_umd)[['year', 'month', 'umd']]
#oos_tsmom = pd.read_stata("oos_tsmom_scs.dta")[['yyyymm', 'tsmom1', 'tsmom2', 'tsmom3', 'tsmom4', 'tsmom5']]

# create yyyy-mm date indexes for tsfactor, p10umd and factor_umd

tsfactor.set_index('yyyymm', inplace=True)
tsfactor.index = pd.to_datetime(tsfactor.index, format='%Y%m')
tsfactor.index = tsfactor.index.strftime('%Y-%m')
tsfactor.drop(columns =['year', 'month'], inplace=True)

p10umd['date'] = pd.to_datetime(p10umd['year'].astype(str) + '-' + p10umd['month'].astype(str))
p10umd.set_index('date', inplace=True)
p10umd.index = pd.to_datetime(p10umd.index, format='%Y%m')
p10umd.index = p10umd.index.strftime('%Y-%m')
p10umd.drop(columns =['year', 'month'], inplace=True)

factor_umd['date'] = pd.to_datetime(factor_umd['year'].astype(str) + '-' + factor_umd['month'].astype(str))
factor_umd.set_index('date', inplace=True)
factor_umd.index = pd.to_datetime(factor_umd.index, format='%Y%m')
factor_umd.index = factor_umd.index.strftime('%Y-%m')
factor_umd.drop(columns =['year', 'month'], inplace=True)

# select our appropriate time frame for table 4: July 1964 until December 2019 (except for our PC column, which has the data range from July 1973 until December 2019

p10umd_range = p10umd.loc['1964-07':'2019-12']
factor_umd_range = factor_umd.loc['1964-07':'2019-12']
tsfactor_range = tsfactor.loc['1964-07':'2019-12']

# we already loaded the ff5 in the previous cell, therefore we only adjust the time range to match table 4. 
ff5_range = ff[['rf', 'mktrf', 'smb', 'hml', 'rmw', 'cma']].loc['1964-07':'2019-12']

# we multiply by 100, because our other datasets are in percentage
ff5_range = ff5_range * 100

# merge all datasets except for principal component return
merged_data = pd.concat([p10umd_range, factor_umd_range, tsfactor_range, ff5_range], axis=1)
print(merged_data)


# multiply the columns of mom_strategy_full by 100 (to obtain percentages)
mom_strategy_decile = mom_strategy_full * 100

# calculate the excess returns for the decile portfolios
for i in range(1, 11):
    merged_data[f'ExcessP{i}'] = merged_data[f'p{i}'] - merged_data['rf']

# assign portfolio 11 for high-minus-low
merged_data['ExcessP11'] = merged_data['ExcessP10'] - merged_data['ExcessP1']

# Create a second dataframe for the Principal Component model, which has less observations (July 1973 unitl December 2019). Hence we first select all observations within this timeframe from our merged data set
merged_data_partially = merged_data.loc['1973-07':'2019-12']
merged_data_pca = pd.concat([merged_data_partially, mom_strategy_decile], axis=1)

print(merged_data)


            p1    p2    p3    p4    p5    p6    p7    p8    p9   p10   umd  \
1964-07   1.93  2.01  3.41  3.03  3.45  2.52  0.10  3.06  2.06  0.70 -0.37   
1964-08  -1.32  0.49 -1.13 -0.07 -1.32 -1.93 -1.51 -1.08 -1.72 -0.91 -0.19   
1964-09   5.77  3.02  3.81  3.03  2.58  2.31  1.81  3.74  2.24  6.37 -0.39   
1964-10   1.52  1.79  1.31  0.50  0.74 -0.48  1.91  0.71  1.01  0.01  0.08   
1964-11  -3.05  0.33  0.83 -0.24 -0.32  0.20  0.00  1.79 -0.18  0.04  1.08   
...        ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   
2019-08 -12.06 -9.91 -8.14 -6.61 -3.51 -5.80 -1.72 -0.48  0.91 -0.31  7.60   
2019-09   5.32  5.26  5.89  5.16  0.36  3.03  4.44  1.42  0.34 -3.03 -6.85   
2019-10  -3.92  2.38  4.04  4.50  3.45  4.79  1.92  1.53  1.01 -0.40  0.24   
2019-11   3.88  6.00  6.84  5.82  5.18  3.79  4.48  1.69  2.95  3.43 -2.62   
2019-12  11.06  5.18  3.22  3.69  1.60  2.85  2.42  2.75  2.12  4.60 -2.13   

            TSMom    rf  mktrf   smb       hml   rmw   cma  
19

### replicating panel A

### the fama french 5 factor model (ff5)

In [47]:
# create an empty lists for the estimate results of the fama french 5 factor model 
estimatesFF5_list = []

# momentum sorted portfolios with FF5
for i in range(1, 12):
    X = sm.add_constant(merged_data[['mktrf', 'smb', 'hml', 'cma', 'rmw']])
    model = sm.OLS(merged_data[f'ExcessP{i}'], X).fit()
    estimatesFF5_list.append(model)

for i, est in enumerate(estimatesFF5_list, start=1):
    print(f"\nRegression Results for Excess Portfolio Returns with FF5 - Portfolio {i}:\n")
    print(est.summary())


Regression Results for Excess Portfolio Returns with FF5 - Portfolio 1:

                            OLS Regression Results                            
Dep. Variable:               ExcessP1   R-squared:                       0.681
Model:                            OLS   Adj. R-squared:                  0.679
Method:                 Least Squares   F-statistic:                     281.9
Date:                Mon, 26 Feb 2024   Prob (F-statistic):          4.13e-161
Time:                        20:41:58   Log-Likelihood:                -1956.1
No. Observations:                 666   AIC:                             3924.
Df Residuals:                     660   BIC:                             3951.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------

In [49]:
print("Results for FF5 Model:\n")
resultsFF5_list = []

# calculating the results for the Fama French 5 factor model 
for i, est_FF5 in enumerate(estimatesFF5_list, start=1):
    result_row = {
        'Decile': f'{i}',
        'FF5_Alpha': f'{est_FF5.params["const"]:.2f}',
    }
    t_stat_row = {
        'Decile': f'',
        'FF5_Alpha': f'({est_FF5.tvalues["const"]:.2f})',
    }
    resultsFF5_list.append(result_row)
    resultsFF5_list.append(t_stat_row)

resultsFF5_df = pd.DataFrame(resultsFF5_list)
print(resultsFF5_df)

Results for FF5 Model:

   Decile FF5_Alpha
0       1     -0.76
1           (-4.06)
2       2     -0.36
3           (-2.74)
4       3     -0.20
5           (-1.90)
6       4     -0.15
7           (-1.89)
8       5     -0.16
9           (-2.41)
10      6     -0.13
11          (-2.00)
12      7     -0.11
13          (-1.90)
14      8      0.04
15           (0.65)
16      9      0.09
17           (1.08)
18     10      0.56
19           (4.71)
20     11      1.32
21           (4.88)


### the umd model (including ff5)

In [51]:
estimatesUMD_list = []

# Momentum sorted portfolios with FF5 + UMD
for i in range(1, 12):
    X = sm.add_constant(merged_data[['mktrf', 'smb', 'hml', 'cma', 'rmw', 'umd']])
    model = sm.OLS(merged_data[f'ExcessP{i}'], X).fit()
    estimatesUMD_list.append(model)

for i, est in enumerate(estimatesUMD_list, start=1):
    print(f"\nRegression Results for Excess Portfolio Returns with FF5 + UMD - Portfolio {i}:\n")
    print(est.summary())



Regression Results for Excess Portfolio Returns with FF5 + UMD - Portfolio 1:

                            OLS Regression Results                            
Dep. Variable:               ExcessP1   R-squared:                       0.894
Model:                            OLS   Adj. R-squared:                  0.893
Method:                 Least Squares   F-statistic:                     930.5
Date:                Mon, 26 Feb 2024   Prob (F-statistic):          8.06e-318
Time:                        20:42:17   Log-Likelihood:                -1588.0
No. Observations:                 666   AIC:                             3190.
Df Residuals:                     659   BIC:                             3222.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------

In [53]:
print("Results for FF5 + UMD Model:\n")
resultsUMD_list = []

# Results for FF5 + UMD
for i, est_umd in enumerate(estimatesUMD_list, start=1):
    result_row = {
        'Decile': f'{i}',
        'UMD_Alpha': f'{est_umd.params["const"]:.2f}',
        'UMD_Coefficient': f'{est_umd.params["umd"]:.2f}'
    }
    t_stat_row = {
        'Decile': f'',
        'UMD_Alpha': f'({est_umd.tvalues["const"]:.2f})',
        'UMD_Coefficient': f'({est_umd.tvalues["umd"]:.2f})'
    }
    resultsUMD_list.append(result_row)
    resultsUMD_list.append(t_stat_row)

resultsUMD_df = pd.DataFrame(resultsUMD_list)
print(resultsUMD_df)


Results for FF5 + UMD Model:

   Decile UMD_Alpha UMD_Coefficient
0       1     -0.11           -0.93
1           (-1.02)        (-36.49)
2       2      0.13           -0.70
3            (2.00)        (-46.80)
4       3      0.17           -0.54
5            (2.88)        (-38.35)
6       4      0.08           -0.33
7            (1.22)        (-22.75)
8       5     -0.04           -0.17
9           (-0.63)        (-12.29)
10      6     -0.09           -0.05
11          (-1.42)         (-3.50)
12      7     -0.16            0.07
13          (-2.67)          (4.75)
14      8     -0.11            0.22
15          (-2.00)         (16.97)
16      9     -0.14            0.33
17          (-2.42)         (23.81)
18     10      0.16            0.57
19           (2.20)         (32.54)
20     11      0.28            1.51
21           (2.44)         (56.81)


### factor momentum (using factors from table 1) including ff5

In [55]:
estimatesFMOMind_list = []

# Momentum sorted portfolios with FF5 + MOMind
for i in range(1, 12):
    X = sm.add_constant(merged_data[['mktrf', 'smb', 'hml', 'cma', 'rmw', 'TSMom']])
    model = sm.OLS(merged_data[f'ExcessP{i}'], X).fit()
    estimatesFMOMind_list.append(model)

for i, est in enumerate(estimatesFMOMind_list, start=1):
    print(f"\nRegression Results for Excess Portfolio Returns with FF5 + FMOMind - Portfolio {i}:\n")
    print(est.summary())


Regression Results for Excess Portfolio Returns with FF5 + FMOMind - Portfolio 1:

                            OLS Regression Results                            
Dep. Variable:               ExcessP1   R-squared:                       0.802
Model:                            OLS   Adj. R-squared:                  0.800
Method:                 Least Squares   F-statistic:                     444.0
Date:                Mon, 26 Feb 2024   Prob (F-statistic):          1.04e-227
Time:                        20:42:49   Log-Likelihood:                -1797.9
No. Observations:                 666   AIC:                             3610.
Df Residuals:                     659   BIC:                             3641.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------

In [57]:
print("Results for FF5 + FMOMind Model:\n")
resultsFMOMind_list = []

# Results for FF5 + FMOMind
for i, est_FMOMind in enumerate(estimatesFMOMind_list, start=1):
    result_row = {
        'Decile': f'{i}',
        'FMOMind_Alpha': f'{est_FMOMind.params["const"]:.2f}',
        'FMOMind_Coefficient': f'{est_FMOMind.params["TSMom"]:.2f}'
    }
    t_stat_row = {
        'Decile': f'',
        'FMOMind_Alpha': f'({est_FMOMind.tvalues["const"]:.2f})',
        'FMOMind_Coefficient': f'({est_FMOMind.tvalues["TSMom"]:.2f})'
    }
    resultsFMOMind_list.append(result_row)
    resultsFMOMind_list.append(t_stat_row)

resultsFMOMind_df = pd.DataFrame(resultsFMOMind_list)
print(resultsFMOMind_df)

Results for FF5 + FMOMind Model:

   Decile FMOMind_Alpha FMOMind_Coefficient
0       1         -0.06               -2.46
1               (-0.37)            (-20.02)
2       2          0.15               -1.78
3                (1.47)            (-21.29)
4       3          0.17               -1.30
5                (1.88)            (-17.79)
6       4          0.12               -0.95
7                (1.67)            (-16.64)
8       5         -0.02               -0.47
9               (-0.39)             (-9.03)
10      6         -0.06               -0.22
11              (-1.00)             (-4.19)
12      7         -0.14                0.09
13              (-2.28)              (1.88)
14      8         -0.08                0.44
15              (-1.31)              (8.49)
16      9         -0.11                0.67
17              (-1.43)             (11.10)
18     10          0.16                1.42
19               (1.53)             (17.11)
20     11          0.21                3.8

### factor momentum (using the PC factors 1-10) including ff5

In [None]:
estimatesFMOMpc_list = []

# Momentum sorted portfolios with FF5 + UMD
for i in range(1, 12):
    X = sm.add_constant(merged_data_pca[['mktrf', 'smb', 'hml', 'cma', 'rmw', 'mom_set_1']])
    model = sm.OLS(merged_data_pca[f'ExcessP{i}'], X).fit()
    estimatesFMOMpc_list.append(model)

for i, est in enumerate(estimatesFMOMpc_list, start=1):
    print(f"\nRegression Results for Excess Portfolio Returns with FF5 + FMOMpc - Portfolio {i}:\n")
    print(est.summary())

In [None]:
print("Results for FF5 + FMOMpc Model:\n")
resultsFMOMpc_list = []

# Append results for FF5 + FMOMind
for i, est_FMOMpc in enumerate(estimatesFMOMpc_list, start=1):
    result_row = {
        'Decile': f'{i}',
        'FMOMpc_Alpha': f'{est_FMOMpc.params["const"]:.2f}',
        'FMOMpc_Coefficient': f'{est_FMOMpc.params["mom_set_1"]:.2f}'
    }
    t_stat_row = {
        'Decile': f'',
        'FMOMpc_Alpha': f'({est_FMOMpc.tvalues["const"]:.2f})',
        'FMOMpc_Coefficient': f'({est_FMOMpc.tvalues["mom_set_1"]:.2f})'
    }
    resultsFMOMpc_list.append(result_row)
    resultsFMOMpc_list.append(t_stat_row)

# convert the list of results to a DataFrame
resultsFMOMpc_df = pd.DataFrame(resultsFMOMpc_list)

# Display the results DataFrame
print(resultsFMOMpc_df)

### merging results into onde dataframe

In [None]:
resultsUMD_df = resultsUMD_df.drop(columns=['Decile'])
resultsFMOMind_df = resultsFMOMind_df.drop(columns=['Decile'])
resultsFMOMpc_df = resultsFMOMpc_df.drop(columns=['Decile'])

# Merge the results
table_results = pd.concat([resultsFF5_df, resultsUMD_df, resultsFMOMind_df, resultsFMOMpc_df], axis=1)

print(table_results)

### calculating alphas for the respective models

In [None]:
alpha_means_FF5 = []
alpha_means_UMD = []
alpha_means_FMOMind = []
alpha_means_FMOMpc = []

# calculate the absolute values of the alphas
for i, est_ff5 in enumerate(estimatesFF5_list, start=1):
    alpha_mean_ff5 = np.abs(est_ff5.params['const']).mean()
    alpha_means_FF5.append(alpha_mean_ff5)

for i, est_umd in enumerate(estimatesUMD_list, start=1):
    alpha_mean_umd = np.abs(est_umd.params['const']).mean()
    alpha_means_UMD.append(alpha_mean_umd)

for i, est_fmomind in enumerate(estimatesFMOMind_list, start=1):
    alpha_mean_fmomind = np.abs(est_fmomind.params['const']).mean()
    alpha_means_FMOMind.append(alpha_mean_fmomind)

for i, est_fmompc in enumerate(estimatesFMOMpc_list, start=1):
    alpha_mean_fmompc = np.abs(est_fmompc.params['const']).mean()
    alpha_means_FMOMpc.append(alpha_mean_fmompc)

# calculate the absolute mean alphas for the different models except for the winners-losers portfolio
Avg_alpha_FF5 = np.mean(alpha_means_FF5[:-1])
Avg_alpha_UMD = np.mean(alpha_means_UMD[:-1])
Avg_alpha_FMOMind = np.mean(alpha_means_FMOMind[:-1])
Avg_alpha_FMOMpc = np.mean(alpha_means_FMOMpc[:-1])

# create a DataFrame for average alphas
avg_alphas_df = pd.DataFrame({
    'Model': ['FF5', 'UMD', 'FMOMind', 'FMOMpc'],
    'Avg_alpha': [Avg_alpha_FF5, Avg_alpha_UMD, Avg_alpha_FMOMind, Avg_alpha_FMOMpc]
})

# print the results
print(avg_alphas_df)

### replicating panel B

In [None]:
# selecting our independent variables (using umd as our dependent variable)
independent_vars_ff5 = ['mktrf', 'smb', 'hml', 'cma', 'rmw']

# Augment the FF5 model with different subsets of PC factors
independent_vars_mom1 = ['mktrf', 'smb', 'hml', 'cma', 'rmw', 'mom_set_1']
independent_vars_mom2 = ['mktrf', 'smb', 'hml', 'cma', 'rmw', 'mom_set_2']
independent_vars_mom3 = ['mktrf', 'smb', 'hml', 'cma', 'rmw', 'mom_set_3']
independent_vars_mom4 = ['mktrf', 'smb', 'hml', 'cma', 'rmw', 'mom_set_4']
independent_vars_mom5 = ['mktrf', 'smb', 'hml', 'cma', 'rmw', 'mom_set_5']

# create a list to store the results
results_list = []

# fit the FF5 model
X_ff5 = sm.add_constant(merged_data_pca[independent_vars_ff5])
model_ff5 = sm.OLS(merged_data_pca['umd'], X_ff5)
results_ff5 = model_ff5.fit()

# record the results for the FF5 model (only alpha)
results_list.append({
    'Model': 'FF5',
    'Alpha': results_ff5.params['const'],
    'Alpha T-stat': results_ff5.tvalues['const'],
    'FMom Slope':' ',
    'FMom Slope T-stat':' ',
    'R-squared_adj': results_ff5.rsquared_adj
})

# Fit models with different subsets of PC factors
for subset_vars in [independent_vars_mom1, independent_vars_mom2, independent_vars_mom3, independent_vars_mom4, independent_vars_mom5]:
    X_subset = sm.add_constant(merged_data_pca[subset_vars])
    model_subset = sm.OLS(merged_data_pca['umd'], X_subset)
    results_subset = model_subset.fit()

    # Record the results for each subset, including the alpha and slope for pctsmom factor
    results_list.append({
        'Model': ' + '.join(subset_vars[-1:]),
        'Alpha': results_subset.params['const'],
        'Alpha T-stat': results_subset.tvalues['const'],
        'FMom Slope': results_subset.params[subset_vars[-1:][0]],
        'FMom Slope T-stat': results_subset.tvalues[subset_vars[-1:][0]],
        'R-squared_adj': results_subset.rsquared_adj
    })

# Create a DataFrame from the list of results
results_df = pd.DataFrame(results_list)

# Display the results
print(results_df)
