In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
file_url = 'https://raw.githubusercontent.com/Sebasleen/Seminargroup/Seminar/US%20Factors/anomalies.dta'

# Read the Stata file content
Anomalies = pd.read_stata(file_url)
print(Anomalies)

       year  month anomaly    ret   time  global
0      1963      7      ac  2.170   42.0     0.0
1      1963      8      ac -0.197   43.0     0.0
2      1963      9      ac  0.600   44.0     0.0
3      1963     10      ac  6.463   45.0     0.0
4      1963     11      ac -2.260   46.0     0.0
...     ...    ...     ...    ...    ...     ...
12638  2019      8   glumd  2.990  715.0     1.0
12639  2019      9   glumd -3.260  716.0     1.0
12640  2019     10   glumd -0.940  717.0     1.0
12641  2019     11   glumd  0.000  718.0     1.0
12642  2019     12   glumd  0.740  719.0     1.0

[12643 rows x 6 columns]


<font size='4' face='Times new Roman'>    

US factors:

- **-ac**: Accruals
- **-bab**: Betting against Beta
- **-cfp**: Cash-flow to price
- **-cma**: Investment (Conservative minus aggressive)
- **-ep**: Earnings to price
- **-hml**: Value
- **-liq**: Liquidity
- **-ltrev**: Long-term reversal
- **-nsi**: Net share issue
- **-qmj**: Quality minus junk
- **-rmw**: Profitability
- **-rvar**: Residual Variance
- **-smb**: Size
- **-strev**: Short-term reversals
- **-umd**: Momentum

Global factors:

- **glbab**: Betting against beta
- **glcma**: Investment (Conservative minus aggressive)
- **glhml**: Value (high minus low)
- **glqmj**: Quality minus junk
- **glrmw**: Profitability (robust minus weak)
- **glumd**: Momentum (up minus down)
- **glsmb**: Size (small minus big)

</font>

**Replicating the time series regressions**

In [5]:
# Create an empty list to store results
results_list = []

for anomaly in Anomalies['anomaly'].unique():
    subset = Anomalies[Anomalies['anomaly'] == anomaly]
    subset = subset.sort_values(by='time')
    
    # Create a binary variable for positive returns in the past 12 months
    subset['positive_return'] = subset['ret'].rolling(window=12, min_periods=12).mean().shift(1) > 0
    
    # Drop the first 12 observations in the subset after the rolling window has been applied
    subset = subset.iloc[12:]
    
    # Set up and fit the model
    y = subset['ret']
    X = sm.add_constant(subset['positive_return'].astype(int))
    model = sm.OLS(y, X)
    results = model.fit(cov_type='cluster', cov_kwds={'groups': subset['time']})
    
    # Add the results to a list
    results_list.append({
        'Anomaly': anomaly,
        'Alpha': results.params['const'],
        'T-stat_Alpha': results.tvalues['const'],
        'Slope': results.params['positive_return'],
        'T-stat_Slope': results.tvalues['positive_return'],
    })

results_table = pd.DataFrame(results_list)
print(results_table)

   Anomaly     Alpha  T-stat_Alpha     Slope  T-stat_Slope
0       ac  0.150195      1.184450  0.101410      0.649822
1      bab -0.221412     -0.632211  1.319041      3.534152
2      cfp  0.127745      0.781292  0.235454      1.157989
3      cma  0.120082      0.974474  0.244693      1.545819
4       ep  0.101357      0.616107  0.302075      1.458207
5      hml  0.038477      0.204762  0.410255      1.780679
6      liq  0.157215      0.741922  0.356063      1.291807
7    ltrev -0.252989     -1.663307  0.757680      3.850110
8      nsi  0.172982      1.324451  0.089249      0.486779
9      qmj  0.086832      0.650364  0.434757      2.507550
10     rmw  0.040360      0.222250  0.337185      1.673841
11    rvar -0.463569     -1.638345  1.061609      2.737366
12     smb -0.104191     -0.615583  0.583455      2.508982
13   strev  0.485098      1.427336  0.013888      0.038600
14     umd  0.716042      2.697340 -0.094969     -0.288098
15   glbab  0.190820      0.577502  0.837610      2.3039

**Replicating the pooled regression results**

In [7]:
# Sort the DataFrame based on 'time'
Anomalies_filtered = Anomalies.sort_values(by=['anomaly', 'time']).copy()

# Create an indicator variable for positive returns in the past 12 months
Anomalies_filtered['positive_return'] = (
    Anomalies_filtered.groupby('anomaly')['ret']
    .rolling(window=12, min_periods=12)
    .mean()
    .shift(1)
    .reset_index(level=0, drop=True)  # Reset the index for proper alignment
    > 0
)

# Drop the first 12 observations in the subset
Anomalies_filtered = Anomalies_filtered.groupby('anomaly').apply(lambda x: x.iloc[12:]).reset_index(drop=True)

# Extract dependent and explanatory variables
y = Anomalies_filtered['ret']
X = sm.add_constant(Anomalies_filtered['positive_return'].astype(int))

# Fit the pooled model
model = sm.OLS(y, X)
results_pooled = model.fit(cov_type='cluster', cov_kwds={'groups': Anomalies_filtered['time']})

# Display the regression results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    ret   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                  0.001553
Date:                Sun, 18 Feb 2024   Prob (F-statistic):              0.969
Time:                        01:11:01   Log-Likelihood:                -900.78
No. Observations:                 338   AIC:                             1806.
Df Residuals:                     336   BIC:                             1813.
Df Model:                           1                                         
Covariance Type:              cluster                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.6687      0.377     

In [8]:
# Combine the pooled regression results and the individual
# time series results.

results_pooled_df = pd.DataFrame({
    'Anomaly': ['Pooled'],
    'Alpha': [results_pooled.params['const']],
    'T-stat_Alpha': [results_pooled.tvalues['const']],
    'Slope': [results_pooled.params['positive_return']],
    'T-stat_Slope': [results_pooled.tvalues['positive_return']],
})

results_table_2_combined = pd.concat([results_pooled_df, results_table], ignore_index=True)

# Display the combined results table
print(results_table_2_combined)

   Anomaly     Alpha  T-stat_Alpha     Slope  T-stat_Slope
0   Pooled  0.057428      0.719687  0.453466      4.219117
1       ac  0.150195      1.184450  0.101410      0.649822
2      bab -0.221412     -0.632211  1.319041      3.534152
3      cfp  0.127745      0.781292  0.235454      1.157989
4      cma  0.120082      0.974474  0.244693      1.545819
5       ep  0.101357      0.616107  0.302075      1.458207
6      hml  0.038477      0.204762  0.410255      1.780679
7      liq  0.157215      0.741922  0.356063      1.291807
8    ltrev -0.252989     -1.663307  0.757680      3.850110
9      nsi  0.172982      1.324451  0.089249      0.486779
10     qmj  0.086832      0.650364  0.434757      2.507550
11     rmw  0.040360      0.222250  0.337185      1.673841
12    rvar -0.463569     -1.638345  1.061609      2.737366
13     smb -0.104191     -0.615583  0.583455      2.508982
14   strev  0.485098      1.427336  0.013888      0.038600
15     umd  0.716042      2.697340 -0.094969     -0.2880

In [9]:
# List the factors in the same order as table 2
desired_order = [
    'Pooled',
    'smb', 'hml', 'rmw', 'cma', 'umd', 'ac',
    'bab', 'cfp', 'ep', 'liq', 'ltrev', 'nsi', 'qmj', 'rvar', 'strev',
    'glbab', 'glcma', 'glhml', 'glqmj', 'glrmw', 'glumd', 'glsmb'
]

cat_dtype = pd.CategoricalDtype(categories=desired_order, ordered=True)
results_table_2_combined['Anomaly'] = results_table_2_combined['Anomaly'].astype(cat_dtype)
results_table_2_combined = results_table_2_combined.sort_values(by='Anomaly').reset_index(drop=True)

print(results_table_2_combined)

   Anomaly     Alpha  T-stat_Alpha     Slope  T-stat_Slope
0   Pooled  0.057428      0.719687  0.453466      4.219117
1      smb -0.104191     -0.615583  0.583455      2.508982
2      hml  0.038477      0.204762  0.410255      1.780679
3      rmw  0.040360      0.222250  0.337185      1.673841
4      cma  0.120082      0.974474  0.244693      1.545819
5      umd  0.716042      2.697340 -0.094969     -0.288098
6       ac  0.150195      1.184450  0.101410      0.649822
7      bab -0.221412     -0.632211  1.319041      3.534152
8      cfp  0.127745      0.781292  0.235454      1.157989
9       ep  0.101357      0.616107  0.302075      1.458207
10     liq  0.157215      0.741922  0.356063      1.291807
11   ltrev -0.252989     -1.663307  0.757680      3.850110
12     nsi  0.172982      1.324451  0.089249      0.486779
13     qmj  0.086832      0.650364  0.434757      2.507550
14    rvar -0.463569     -1.638345  1.061609      2.737366
15   strev  0.485098      1.427336  0.013888      0.0386

In [10]:
# Define the factor_mapping before checking if labels have already been assigned
factor_mapping = {
    'Pooled': 'Pooled',
    'smb': 'Size',
    'hml': 'Value',
    'rmw': 'Profitability',
    'cma': 'Investment',
    'umd': 'Momentum',
    'ac': 'Accruals',
    'bab': 'Betting against Beta',
    'cfp': 'Cash-flow to price',
    'ep': 'Earnings to price',
    'liq': 'Liquidity',
    'ltrev': 'Long-term reversal',
    'nsi': 'Net share issue',
    'qmj': 'Quality minus junk',
    'rvar': 'Residual Variance',
    'strev': 'Short-term reversals',
    'glbab': 'Betting against beta (gl)',
    'glcma': 'Investment (gl)',
    'glhml': 'Value (gl)',
    'glqmj': 'Quality minus junk (gl)',
    'glrmw': 'Profitability (gl)',
    'glumd': 'Momentum (gl)',
    'glsmb': 'Size (gl)'
}

results_table_2_combined['Anomaly'] = results_table_2_combined['Anomaly'].map(factor_mapping)
print(results_table_2_combined)

                      Anomaly     Alpha  T-stat_Alpha     Slope  T-stat_Slope
0                      Pooled  0.057428      0.719687  0.453466      4.219117
1                        Size -0.104191     -0.615583  0.583455      2.508982
2                       Value  0.038477      0.204762  0.410255      1.780679
3               Profitability  0.040360      0.222250  0.337185      1.673841
4                  Investment  0.120082      0.974474  0.244693      1.545819
5                    Momentum  0.716042      2.697340 -0.094969     -0.288098
6                    Accruals  0.150195      1.184450  0.101410      0.649822
7        Betting against Beta -0.221412     -0.632211  1.319041      3.534152
8          Cash-flow to price  0.127745      0.781292  0.235454      1.157989
9           Earnings to price  0.101357      0.616107  0.302075      1.458207
10                  Liquidity  0.157215      0.741922  0.356063      1.291807
11         Long-term reversal -0.252989     -1.663307  0.757680 