### Imports

In [19]:
import pandas as pd
import numpy as np
import statsmodels.api as sm


# Table 1


### Loading data

In [11]:
# Remote Stata file holding the anomaly returns used for Tables 1 and 2.
url = 'https://github.com/Sebasleen/Seminargroup/raw/Seminar/Data/anomalies.dta'

Anomalies = pd.read_stata(url)

# Show every distinct label found in the 'anomaly' column.
print(Anomalies['anomaly'].unique())

# Column that carries the factor label; both splits below filter on it.
column_name = 'anomaly'

# US sample: keep every row whose label is NOT one of the global ('gl*') factors.
values_to_dropUS = [
    'glbab', 'glcma', 'glhml', 'glqmj', 'glrmw', 'glsmb', 'glumd',
]
ElementsUS = Anomalies[column_name].isin(values_to_dropUS)
Anomalies_US = Anomalies.loc[~ElementsUS]

# Global sample: keep every row whose label is NOT one of the US factors.
values_to_dropGF = [
    'ac', 'bab', 'cfp', 'cma', 'ep', 'hml', 'liq', 'ltrev', 'nsi', 'qmj',
    'rmw', 'rvar', 'smb', 'strev', 'umd',
]
ElementsGF = Anomalies[column_name].isin(values_to_dropGF)
Anomalies_GF = Anomalies.loc[~ElementsGF]

# Print the US frame first, then the global frame.
print(Anomalies_US, Anomalies_GF, sep='\n')

['ac' 'bab' 'cfp' 'cma' 'ep' 'hml' 'liq' 'ltrev' 'nsi' 'qmj' 'rmw' 'rvar'
 'smb' 'strev' 'umd' 'glbab' 'glcma' 'glhml' 'glqmj' 'glrmw' 'glsmb'
 'glumd']
       year  month anomaly    ret   time  global
0      1963      7      ac  2.170   42.0     0.0
1      1963      8      ac -0.197   43.0     0.0
2      1963      9      ac  0.600   44.0     0.0
3      1963     10      ac  6.463   45.0     0.0
4      1963     11      ac -2.260   46.0     0.0
...     ...    ...     ...    ...    ...     ...
10111  2019      8     umd  7.600  715.0     0.0
10112  2019      9     umd -6.850  716.0     0.0
10113  2019     10     umd  0.240  717.0     0.0
10114  2019     11     umd -2.620  718.0     0.0
10115  2019     12     umd -2.130  719.0     0.0

[10116 rows x 6 columns]
       year  month anomaly       ret   time  global
10116  1987      2   glbab  2.236918  325.0     1.0
10117  1987      3   glbab  1.828450  326.0     1.0
10118  1987      4   glbab -5.521739  327.0     1.0
10119  1987      5   glba

### Replicating Table 1

### US factors

In [12]:
# Per-anomaly summary of the US returns: mean, standard deviation and number
# of non-missing observations of 'ret'. A single named aggregation replaces
# the earlier groupby + pivot_table pair, whose first result was discarded.
AnomaliesUS = (
    Anomalies_US.groupby('anomaly')['ret']
    .agg(Mean='mean', SD='std', ret_number='count')
    .reset_index()
)

# standard error of the mean return
AnomaliesUS['ret_semean'] = AnomaliesUS['SD'] / np.sqrt(AnomaliesUS['ret_number'])

# multiply by 12 to create annualized returns
AnomaliesUS['ret'] = AnomaliesUS['Mean'] * 12

# multiply by sqrt(12) to create annualized standard deviation
AnomaliesUS['sd'] = AnomaliesUS['SD'] * np.sqrt(12)

# t-stat of the mean return: mean divided by its standard error. It is computed
# on the un-annualized figures; scaling the mean by 12 and dividing by 12 again
# (as was done before) gives the same number.
AnomaliesUS['tstat'] = AnomaliesUS['Mean'] / AnomaliesUS['ret_semean']

# format the table to correct decimals
# NOTE: 'Mean' and 'SD' are overwritten with formatted strings of the
# annualized values; the numeric versions remain in 'ret', 'sd' and 'tstat'.
AnomaliesUS['Mean'] = AnomaliesUS['ret'].map("{:.1f}%".format)
AnomaliesUS['SD'] = AnomaliesUS['sd'].map("{:.1f}%".format)
AnomaliesUS['T-value'] = AnomaliesUS['tstat'].map("{:.2f}".format)

print(AnomaliesUS[['anomaly','Mean','SD','T-value']])

   anomaly  Mean     SD T-value
0       ac  2.8%   6.6%    3.19
1      bab  9.8%  11.2%    6.55
2      cfp  3.4%   8.6%    2.94
3      cma  3.3%   6.9%    3.59
4       ep  3.5%   8.9%    2.95
5      hml  3.6%   9.7%    2.82
6      liq  4.4%  11.6%    2.77
7    ltrev  2.5%   8.7%    2.16
8      nsi  2.8%   8.2%    2.52
9      qmj  4.6%   7.7%    4.47
10     rmw  3.1%   7.5%    3.13
11    rvar  1.6%  17.3%    0.68
12     smb  2.7%  10.4%    1.97
13   strev  6.0%  10.6%    4.21
14     umd  7.8%  14.5%    4.02


### Global factors

In [13]:
# Per-anomaly summary of the global returns: mean, standard deviation and
# number of non-missing observations of 'ret'. A single named aggregation
# replaces the earlier groupby + pivot_table pair, whose first result was
# discarded.
AnomaliesGF = (
    Anomalies_GF.groupby('anomaly')['ret']
    .agg(Mean='mean', SD='std', ret_number='count')
    .reset_index()
)

# standard error of the mean return
AnomaliesGF['ret_semean'] = AnomaliesGF['SD'] / np.sqrt(AnomaliesGF['ret_number'])

# multiply by 12 to create annualized returns
AnomaliesGF['ret'] = AnomaliesGF['Mean'] * 12

# multiply by sqrt(12) to create annualized standard deviation
AnomaliesGF['sd'] = AnomaliesGF['SD'] * np.sqrt(12)

# t-stat of the mean return: mean divided by its standard error. It is computed
# on the un-annualized figures; scaling the mean by 12 and dividing by 12 again
# (as was done before) gives the same number.
AnomaliesGF['tstat'] = AnomaliesGF['Mean'] / AnomaliesGF['ret_semean']

# format the table to correct decimals
# NOTE: 'Mean' and 'SD' are overwritten with formatted strings of the
# annualized values; the numeric versions remain in 'ret', 'sd' and 'tstat'.
AnomaliesGF['Mean'] = AnomaliesGF['ret'].map("{:.1f}%".format)
AnomaliesGF['SD'] = AnomaliesGF['sd'].map("{:.1f}%".format)
AnomaliesGF['T-value'] = AnomaliesGF['tstat'].map("{:.2f}".format)

print(AnomaliesGF[['anomaly','Mean','SD','T-value']])

  anomaly  Mean     SD T-value
0   glbab  9.6%   9.7%    5.70
1   glcma  1.9%   6.0%    1.74
2   glhml  4.0%   7.4%    2.92
3   glqmj  6.2%   6.8%    5.06
4   glrmw  4.3%   4.7%    4.91
5   glsmb  1.1%   7.1%    0.83
6   glumd  7.9%  12.1%    3.54


# Table 2 

### Replicating Table 2 (uses the same dataset as Table 1)

In [15]:
# One dict of regression output per anomaly is collected here.
results_list = []

# Run a separate time-series regression for every anomaly label in the data.
for anomaly in Anomalies['anomaly'].unique():
    subset = (
        Anomalies.loc[Anomalies['anomaly'] == anomaly]
        .sort_values(by='time')
        # Signal: True when the mean return over the 12 rows preceding the
        # current one (rolling mean shifted by one row) is positive.
        .assign(
            positive_return=lambda frame: (
                frame['ret'].rolling(window=12, min_periods=12).mean().shift(1) > 0
            )
        )
        # The first 12 rows have no complete 12-row history (the rolling mean
        # is missing there), so they are discarded.
        .iloc[12:]
    )

    # Regress the return on a constant and the 0/1 signal.
    y = subset['ret']
    X = sm.add_constant(subset['positive_return'].astype(int))
    model = sm.OLS(y, X)

    # Cluster-robust covariance with clusters given by the 'time' column
    # (per the original note, the covariance type used in the paper).
    results = model.fit(cov_type='cluster', cov_kwds={'groups': subset['time']})

    # Store the intercept, the slope and their t-statistics for this anomaly.
    results_list.append(dict([
        ('anomaly', anomaly),
        ('alpha', results.params['const']),
        ('T-stat_alpha', results.tvalues['const']),
        ('slope', results.params['positive_return']),
        ('T-stat_slope', results.tvalues['positive_return']),
    ]))

results_table = pd.DataFrame(results_list)
print(results_table)


   Anomaly     Alpha  T-stat_Alpha     Slope  T-stat_Slope
0       ac  0.150195      1.184450  0.101410      0.649822
1      bab -0.221412     -0.632211  1.319041      3.534152
2      cfp  0.127745      0.781292  0.235454      1.157989
3      cma  0.120082      0.974474  0.244693      1.545819
4       ep  0.101357      0.616107  0.302075      1.458207
5      hml  0.038477      0.204762  0.410255      1.780679
6      liq  0.157215      0.741922  0.356063      1.291807
7    ltrev -0.252989     -1.663307  0.757680      3.850110
8      nsi  0.172982      1.324451  0.089249      0.486779
9      qmj  0.086832      0.650364  0.434757      2.507550
10     rmw  0.040360      0.222250  0.337185      1.673841
11    rvar -0.463569     -1.638345  1.061609      2.737366
12     smb -0.104191     -0.615583  0.583455      2.508982
13   strev  0.485098      1.427336  0.013888      0.038600
14     umd  0.716042      2.697340 -0.094969     -0.288098
15   glbab  0.190820      0.577502  0.837610      2.3039

# Table 3

### Loading dataset

In [20]:
# Location of the daily managed-portfolio returns (CSV).
url = 'https://github.com/Sebasleen/Seminargroup/raw/Seminar/Data/managed_portfolios_anom_d_55.csv'

# drop all momentum factors or factors that are constructed based on momentum
# (including market return variables)
factor_drop_list = ['r_mom', 'r_indmom', 'r_valmom', 'r_valmomprof', 'r_mom12', 'r_momrev', 'r_indmomrev', 'r_exchsw', 'rme', 're_ew']

# Build the daily frame in one chain instead of mutating it in place:
#   1. read the CSV,
#   2. drop the excluded columns (raises KeyError if one of them is missing),
#   3. parse 'date' and make it the index,
#   4. following the procedure in the paper (footnote 16), set missing
#      observations to 0.
r_daily = (
    pd.read_csv(url)
    .drop(columns=factor_drop_list)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
    .fillna(0)
)

# create a list of factors for later analysis purposes
factors = [col for col in r_daily.columns if col.startswith('r_')]

# create a monthly return dataframe for later analysis purposes (by summing the
# daily returns). 'MS' (month start) bins by calendar month exactly like the
# month-end alias 'M', but 'M' is deprecated for resampling in pandas >= 2.2;
# only the year-month part of the label is kept, so the result is unchanged.
r_monthly = r_daily.resample('MS').sum()
r_monthly.index = r_monthly.index.strftime('%Y-%m')


ImportError: cannot import name 'freq_to_period_freqstr' from 'pandas._libs.tslibs.dtypes' (/Users/sjoerd/anaconda3/lib/python3.11/site-packages/pandas/_libs/tslibs/dtypes.cpython-311-darwin.so)