In [174]:
import numpy as np
import seaborn as sns
import pandas as pd
import statsmodels.api as sm

In [176]:
file_url = 'https://raw.githubusercontent.com/Sebasleen/Seminargroup/Seminar/US%20Factors/anomalies.dta'

# Read the Stata file content
Anomalies = pd.read_stata(file_url)
print(Anomalies)

       year  month anomaly    ret   time  global
0      1963      7      ac  2.170   42.0     0.0
1      1963      8      ac -0.197   43.0     0.0
2      1963      9      ac  0.600   44.0     0.0
3      1963     10      ac  6.463   45.0     0.0
4      1963     11      ac -2.260   46.0     0.0
...     ...    ...     ...    ...    ...     ...
12638  2019      8   glumd  2.990  715.0     1.0
12639  2019      9   glumd -3.260  716.0     1.0
12640  2019     10   glumd -0.940  717.0     1.0
12641  2019     11   glumd  0.000  718.0     1.0
12642  2019     12   glumd  0.740  719.0     1.0

[12643 rows x 6 columns]


<font size='4' face='Times new Roman'>    

US factors:

- **-ac**: Accruals
- **-bab**: Betting against Beta
- **-cfp**: Cash-flow to price
- **-cma**: Investment (Conservative minus aggressive)
- **-ep**: Earnings to price
- **-hml**: Value
- **-liq**: Liquidity
- **-ltrev**: Long-term reversal
- **-nsi**: Net share issue
- **-qmj**: Quality minus junk
- **-rmw**: Profitability
- **-rvar**: Residual Variance
- **-smb**: Size
- **-strev**: Short-term reversals
- **-umd**: Momentum

Global factors:

- **glbab**: Betting against beta
- **glcma**: Investment (Conservative minus aggressive)
- **glhml**: Value (high minus low)
- **glqmj**: Quality minus junk
- **glrmw**: Profitability (robust minus weak)
- **glumd**: Momentum (up minus down)
- **glsmb**: Size (small minus big)

</font>

In [270]:
# Assuming your dataset is named 'Anomalies'
# Create an empty list to store results
results_list = []

# Iterate through unique anomalies
for anomaly in Anomalies['anomaly'].unique():
    # Filter the data for the current anomaly
    subset = Anomalies[Anomalies['anomaly'] == anomaly]
    
    # Sort the subset based on the 'time' column
    subset = subset.sort_values(by='time')
    
    # Create an indicator variable for positive returns in the past 12 months
    subset['positive_return'] = subset['ret'].rolling(window=12, min_periods=1).mean().shift(1) > 0
    
    # Drop the first 12 observations in the subset
    subset = subset.iloc[12:]
    
    # Extract dependent and explanatory variables
    y = subset['ret']
    X = sm.add_constant(subset['positive_return'].astype(int))
    
    # Fit the model
    model = sm.OLS(y, X)
    results = model.fit(cov_type='cluster', cov_kwds={'groups': subset['time']})
    
    # Append results to the list
    results_list.append({
        'Anomaly': anomaly,
        'Alpha': results.params['const'],
        'T-stat_Alpha': results.tvalues['const'],
        'Slope': results.params['positive_return'],
        'T-stat_Slope': results.tvalues['positive_return'],
    })

# Convert the list of dictionaries to a DataFrame
results_table = pd.DataFrame(results_list)

# Display the results table
print(results_table)

   Anomaly     Alpha  T-stat_Alpha     Slope  T-stat_Slope
0       ac  0.150195      1.184450  0.101410      0.649822
1      bab -0.221412     -0.632211  1.319041      3.534152
2      cfp  0.127745      0.781292  0.235454      1.157989
3      cma  0.120082      0.974474  0.244693      1.545819
4       ep  0.101357      0.616107  0.302075      1.458207
5      hml  0.038477      0.204762  0.410255      1.780679
6      liq  0.157215      0.741922  0.356063      1.291807
7    ltrev -0.252989     -1.663307  0.757680      3.850110
8      nsi  0.172982      1.324451  0.089249      0.486779
9      qmj  0.086832      0.650364  0.434757      2.507550
10     rmw  0.040360      0.222250  0.337185      1.673841
11    rvar -0.463569     -1.638345  1.061609      2.737366
12     smb -0.104191     -0.615583  0.583455      2.508982
13   strev  0.485098      1.427336  0.013888      0.038600
14     umd  0.716042      2.697340 -0.094969     -0.288098
15   glbab  0.190820      0.577502  0.837610      2.3039

In [274]:
# Assuming your dataset is named 'Anomalies'
# Create an empty list to store results
results_list = []

# Perform pooled regression
# Sort the entire dataset based on the 'time' column
Anomalies = Anomalies.sort_values(by='time')

# Exclude factors 'umd' and 'glumd'
Anomalies_pooled = Anomalies[~Anomalies['anomaly'].isin(['umd', 'glumd'])]

# Create an indicator variable for positive returns in the past 12 months for the entire dataset
Anomalies_pooled['positive_return'] = Anomalies_pooled['ret'].rolling(window=12, min_periods=1).mean().shift(1) > 0

# Extract dependent and explanatory variables for the entire dataset
y_pooled = Anomalies_pooled['ret']
X_pooled = sm.add_constant(Anomalies_pooled['positive_return'].astype(int))

# Fit the pooled model
model_pooled = sm.OLS(y_pooled, X_pooled)
results_pooled = model_pooled.fit(cov_type='cluster', cov_kwds={'groups': Anomalies_pooled['time']})

# Append pooled results to the list
results_list.append({
    'Anomaly': 'Pooled',
    'Alpha': results_pooled.params['const'],
    'T-stat_Alpha': results_pooled.tvalues['const'],
    'Slope': results_pooled.params['positive_return'],
    'T-stat_Slope': results_pooled.tvalues['positive_return'],
})

# Iterate through unique anomalies
for anomaly in Anomalies['anomaly'].unique():
    # Skip 'umd' and 'glumd'
    if anomaly in ['umd', 'glumd']:
        continue
    
    # Filter the data for the current anomaly
    subset = Anomalies[Anomalies['anomaly'] == anomaly]
    
    # Sort the subset based on the 'time' column
    subset = subset.sort_values(by='time')
    
    # Create an indicator variable for positive returns in the past 12 months
    subset['positive_return'] = subset['ret'].rolling(window=12, min_periods=1).mean().shift(1) > 0
    
    # Drop the first 12 observations in the subset
    subset = subset.iloc[12:]
    
    # Extract dependent and explanatory variables
    y = subset['ret']
    X = sm.add_constant(subset['positive_return'].astype(int))
    
    # Fit the model
    model = sm.OLS(y, X)
    results = model.fit(cov_type='cluster', cov_kwds={'groups': subset['time']})
    
    # Append results to the list
    results_list.append({
        'Anomaly': anomaly,
        'Alpha': results.params['const'],
        'T-stat_Alpha': results.tvalues['const'],
        'Slope': results.params['positive_return'],
        'T-stat_Slope': results.tvalues['positive_return'],
    })

# Convert the list of dictionaries to a DataFrame
results_table = pd.DataFrame(results_list)

# Display the results table
print(results_table)

   Anomaly     Alpha  T-stat_Alpha     Slope  T-stat_Slope
0   Pooled -0.108631     -1.727635  0.674769      8.876950
1       ac  0.150195      1.184450  0.101410      0.649822
2      hml  0.038477      0.204762  0.410255      1.780679
3    strev  0.485098      1.427336  0.013888      0.038600
4      cfp  0.127745      0.781292  0.235454      1.157989
5      smb -0.104191     -0.615583  0.583455      2.508982
6    ltrev -0.252989     -1.663307  0.757680      3.850110
7     rvar -0.463569     -1.638345  1.061609      2.737366
8      rmw  0.040360      0.222250  0.337185      1.673841
9      bab -0.221412     -0.632211  1.319041      3.534152
10      ep  0.101357      0.616107  0.302075      1.458207
11     qmj  0.086832      0.650364  0.434757      2.507550
12     nsi  0.172982      1.324451  0.089249      0.486779
13     cma  0.120082      0.974474  0.244693      1.545819
14     liq  0.157215      0.741922  0.356063      1.291807
15   glbab  0.190820      0.577502  0.837610      2.3039

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Anomalies_pooled['positive_return'] = Anomalies_pooled['ret'].rolling(window=12, min_periods=1).mean().shift(1) > 0
