In [1]:
!pip install pandasql
import os
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as sm
import numpy as np
import matplotlib.pyplot as plt
from sqlite3 import connect
from scipy.stats.mstats import winsorize, ks_2samp
from scipy import stats
import seaborn as sns
import pandasql as ps

### Declare Global Variables

conn = connect(':memory:', timeout = 10)
filePath = "/Users/aaron/AaronTuFIMA/Sloan"



In [2]:
### Import CRSP CSV

file = filePath + "/CRSPMonthly1990Through2022.csv"
crsp1990To2022 = pd.read_csv(file)
print(crsp1990To2022)
print(crsp1990To2022.columns)

  crsp1990To2022 = pd.read_csv(file)


         PERMNO      date  NAMEENDT  SHRCD  EXCHCD   SICCD    NCUSIP TICKER  \
0         10001  19900131       NaN   11.0     3.0    4920  39040610   GFGC   
1         10001  19900228       NaN   11.0     3.0    4920  39040610   GFGC   
2         10001  19900330       NaN   11.0     3.0    4920  39040610   GFGC   
3         10001  19900430       NaN   11.0     3.0    4920  39040610   GFGC   
4         10001  19900531       NaN   11.0     3.0    4920  39040610   GFGC   
...         ...       ...       ...    ...     ...     ...       ...    ...   
2975236   93436  20211130       NaN   11.0     3.0  9999.0  88160R10   TSLA   
2975237   93436  20211231       NaN   11.0     3.0  9999.0  88160R10   TSLA   
2975238   93436  20220131       NaN   11.0     3.0  9999.0  88160R10   TSLA   
2975239   93436  20220228       NaN   11.0     3.0  9999.0  88160R10   TSLA   
2975240   93436  20220331       NaN   11.0     3.0  9999.0  88160R10   TSLA   

                     COMNAM SHRCLS  ... CFACSHR    

In [3]:
### Clean CRSP Dataframe

#Parse relevant variables
crsp1990To2022 = crsp1990To2022.filter(['TICKER', 'date', 'SHRCD', 'EXCHCD', 'SICCD', 'SHROUT', 'ALTPRC', 'RET', 'DLRET', 'DLSTCD'])

#Rename columns
crsp1990To2022.columns = ['tic', 'date', 'shrcd', 'exchcd', 'siccd', 'shrout', 'altprc', 'ret', 'dlret', 'dlstcd']

#Convert variable types
crsp1990To2022['tic'] = crsp1990To2022['tic'].apply(str)
crsp1990To2022['date'] = pd.to_datetime(crsp1990To2022['date'], format = '%Y%m%d')
crsp1990To2022['shrcd'] = pd.to_numeric(crsp1990To2022['shrcd'], errors = 'coerce')
crsp1990To2022['exchcd'] = pd.to_numeric(crsp1990To2022['exchcd'], errors = 'coerce')
crsp1990To2022['siccd'] = pd.to_numeric(crsp1990To2022['siccd'], errors = 'coerce')
crsp1990To2022['shrout'] = pd.to_numeric(crsp1990To2022['shrout'], errors = 'coerce')
crsp1990To2022['altprc'] = pd.to_numeric(crsp1990To2022['altprc'], errors = 'coerce')
crsp1990To2022['ret'] = pd.to_numeric(crsp1990To2022['ret'], errors = 'coerce')
crsp1990To2022['dlret'] = pd.to_numeric(crsp1990To2022['dlret'], errors = 'coerce')
crsp1990To2022['dlstcd'] = pd.to_numeric(crsp1990To2022['dlstcd'], errors = 'coerce')

#Only keep dates after 2018
crsp2018To2022 = crsp1990To2022[crsp1990To2022['date'] >= "2018-01-01"]

#Only keep US-based common stocks
crsp2018To2022 = crsp2018To2022[(crsp2018To2022['shrcd'] == 10) | (crsp2018To2022['shrcd'] == 11)]

#Only keep NYSE, AMEX, and NASDAQ exchanges
crsp2018To2022 = crsp2018To2022[(crsp2018To2022['exchcd'] == 1) | (crsp2018To2022['exchcd'] == 2) | (crsp2018To2022['exchcd'] == 3) | (crsp2018To2022['exchcd'] == 31) | (crsp2018To2022['exchcd'] == 32) | (crsp2018To2022['exchcd'] == 33)]

#Convert SICCD to 2-digit level
crsp2018To2022['siccd'] = np.floor(crsp2018To2022['siccd'] / 100)

#Adjust delisting returns
crsp2018To2022['ret_adj'] = ""
for i in crsp2018To2022.index:
    if (pd.isnull(crsp2018To2022.loc[i, 'dlstcd'])) or (crsp2018To2022.loc[i, 'dlstcd'] == 100):
        crsp2018To2022.loc[i, 'ret_adj'] = crsp2018To2022.loc[i, 'ret']
    elif(pd.notnull(crsp2018To2022.loc[i, 'dlstcd'])) and (pd.notnull(crsp2018To2022.loc[i, 'dlret'])):
        crsp2018To2022.loc[i, 'ret_adj'] = crsp2018To2022.loc[i, 'dlret']
    elif(551 <= crsp2018To2022.loc[i, 'dlstcd'] <= 574) or (crsp2018To2022.loc[i, 'dlstcd'] in [500, 520, 580, 584]):
        crsp2018To2022.loc[i, 'ret_adj'] = -0.3
    else:
        crsp2018To2022.loc[i, 'ret_adj'] = -1
        
crsp2018To2022.drop('dlret', inplace = True, axis = 1)
crsp2018To2022.drop('dlstcd', inplace = True, axis = 1)

#Calculate market value
crsp2018To2022['mv'] = np.abs(crsp2018To2022['altprc'] * crsp2018To2022['shrout'])

#Add year and month columns
crsp2018To2022['date'] = pd.to_datetime(crsp2018To2022['date'])
crsp2018To2022['year'] = crsp2018To2022['date'].dt.year
crsp2018To2022['month'] = crsp2018To2022['date'].dt.month

#Sort DataFrame
crsp2018To2022 = crsp2018To2022.sort_values(by = ['tic', 'date'])

#Add deciles based on market value at the start of the year
g = crsp2018To2022.groupby(['year', 'tic'], as_index = False)['mv'].first()
g['size_quantile'] = g.groupby(['year'])['mv'].rank(pct = True)
g['size_decile'] = np.ceil(10 * g['size_quantile'])
g.drop(['mv', 'size_quantile'], inplace = True, axis = 1)
crsp2018To2022 = crsp2018To2022.merge(g, on = ['year', 'tic'], how = 'left')

#Drop missing values
crsp2018To2022 = crsp2018To2022.dropna()

#Reindex
crsp2018To2022 = crsp2018To2022.reset_index()
crsp2018To2022.drop('index', inplace=True, axis = 1)

print(crsp2018To2022)

         tic       date  shrcd  exchcd  siccd    shrout  altprc       ret  \
0          A 2018-01-31   11.0     1.0   38.0  323018.0   73.43  0.096461   
1          A 2018-02-28   11.0     1.0   38.0  322717.0   68.59 -0.065913   
2          A 2018-03-29   11.0     1.0   38.0  322477.0   66.90 -0.024639   
3          A 2018-04-30   11.0     1.0   38.0  322477.0   65.74 -0.015112   
4          A 2018-05-31   11.0     1.0   38.0  319952.0   61.92 -0.058108   
...      ...        ...    ...     ...    ...       ...     ...       ...   
191199  ZYXI 2021-12-31   11.0     3.0   99.0   39738.0    9.97 -0.221094   
191200  ZYXI 2022-01-31   11.0     3.0   99.0   43712.0    7.92 -0.116148   
191201  ZYXI 2022-01-31   11.0     3.0   99.0   43712.0    7.92 -0.116148   
191202  ZYXI 2022-02-28   11.0     3.0   99.0   43712.0    6.29 -0.205808   
191203  ZYXI 2022-03-31   11.0     3.0   99.0   39784.0    6.23 -0.009539   

         ret_adj           mv  year  month  size_decile  
0       0.096461 

In [4]:
#Import compustat CSV
file = filePath + "/CompustatAnnual1950To2022.csv"
columnsToUse = ['tic', 'datadate', 'fyear', 'indfmt', 'datafmt', 'che', 'act', 'lct', 'at', 'dp', 'ib', 'dlc', 'txp', 'oiadp']
compustat1950To2022 = pd.read_csv(file, usecols = columnsToUse)[columnsToUse]
print(compustat1950To2022)

          tic  datadate   fyear indfmt datafmt       che        act  \
0        AE.2  19611231  1961.0   INDL     STD       NaN        NaN   
1        AE.2  19621231  1962.0   INDL     STD       NaN        NaN   
2        AE.2  19631231  1963.0   INDL     STD       NaN      0.408   
3        AE.2  19641231  1964.0   INDL     STD     0.269      0.718   
4        AE.2  19651231  1965.0   INDL     STD     0.031      0.725   
...       ...       ...     ...    ...     ...       ...        ...   
591305  IVCGF  20201231  2020.0   INDL     STD   558.000  12100.000   
591306  IVCGF  20211231  2021.0   INDL     STD  1020.181  12385.477   
591307  DTRUY  20191231  2019.0   INDL     STD  6532.226  31890.187   
591308  DTRUY  20201231  2020.0   INDL     STD  9012.671  30632.474   
591309  DTRUY  20211231  2021.0   INDL     STD  8358.207  31787.117   

              lct         at        dp        ib        dlc      txp     oiadp  
0             NaN        NaN       NaN     0.050        NaN      N

In [5]:
#Clean compustat dataframe
compustat1950To2022.columns = ['tic', 'datadate', 'fyear', 'indfmt', 'datafmt', 'cash', 'ca', 'cl', 'ta', 'dep', 'ib', 'std', 'tp', 'oiadp']

#Convert variables
compustat1950To2022['tic'] = compustat1950To2022['tic'].apply(str)
compustat1950To2022['datadate'] = pd.to_datetime(compustat1950To2022['datadate'], format = '%Y%m%d')
compustat1950To2022['fyear'] = pd.to_numeric(compustat1950To2022['fyear'], errors = 'coerce')
compustat1950To2022['indfmt'] = compustat1950To2022['indfmt'].apply(str)
compustat1950To2022['datafmt'] = compustat1950To2022['datafmt'].apply(str)
compustat1950To2022['cash'] = pd.to_numeric(compustat1950To2022['cash'], errors = 'coerce')
compustat1950To2022['ca'] = pd.to_numeric(compustat1950To2022['ca'], errors = 'coerce')
compustat1950To2022['cl'] = pd.to_numeric(compustat1950To2022['cl'], errors = 'coerce')
compustat1950To2022['ta'] = pd.to_numeric(compustat1950To2022['ta'], errors = 'coerce')
compustat1950To2022['dep'] = pd.to_numeric(compustat1950To2022['dep'], errors = 'coerce')
compustat1950To2022['ib'] = pd.to_numeric(compustat1950To2022['ib'], errors = 'coerce')
compustat1950To2022['std'] = pd.to_numeric(compustat1950To2022['std'], errors = 'coerce')
compustat1950To2022['tp'] = pd.to_numeric(compustat1950To2022['tp'], errors = 'coerce')
compustat1950To2022['oiadp'] = pd.to_numeric(compustat1950To2022['oiadp'], errors = 'coerce')

#Only keep data after 2017
compustat2018To2022 = compustat1950To2022[compustat1950To2022['datadate'] >= "2017-01-01"]

#Only keep correct formats
compustat2018To2022 = compustat2018To2022[compustat2018To2022['indfmt'] == "INDL"]
compustat2018To2022 = compustat2018To2022[compustat2018To2022['datafmt'] == "STD"]

#Drop infinite/null values
compustat2018To2022.replace([np.inf, -np.inf], np.nan, inplace = True)
compustat2018To2022 = compustat2018To2022.dropna()

#Reindex
compustat2018To2022 = compustat2018To2022.reset_index()
compustat2018To2022.drop('index', inplace = True, axis = 1)

#Calclate change (delta) in variables
compustat2018To2022['delta_cash'] = compustat2018To2022.groupby(['tic'])['cash'].diff()
compustat2018To2022['delta_ca'] = compustat2018To2022.groupby(['tic'])['ca'].diff()
compustat2018To2022['delta_cl'] = compustat2018To2022.groupby(['tic'])['cl'].diff()
compustat2018To2022['delta_std'] = compustat2018To2022.groupby(['tic'])['std'].diff()
compustat2018To2022['delta_tp'] = compustat2018To2022.groupby(['tic'])['tp'].diff()

#Drop null values (first year no difference computed)
compustat2018To2022 = compustat2018To2022.dropna()

#calculate accruals
compustat2018To2022['accruals'] = (compustat2018To2022['delta_ca'] - compustat2018To2022['delta_cash']) - (compustat2018To2022['delta_cl'] - compustat2018To2022['delta_std'] - compustat2018To2022['delta_tp']) - compustat2018To2022['dep']
compustat2018To2022['accruals_subcomp_1'] = compustat2018To2022['delta_ca'] - compustat2018To2022['delta_cash']
compustat2018To2022['accruals_subcomp_2'] = -(compustat2018To2022['delta_cl'] - compustat2018To2022['delta_std'] - compustat2018To2022['delta_tp'])
compustat2018To2022['accruals_subcomp_3'] = -compustat2018To2022['dep']

In [6]:
#Calculate average total assets
avgATDF = compustat2018To2022.groupby(['tic'], as_index = False)['ta'].mean()
avgATDF.columns = ['tic', 'avg_at']
compustat2018To2022 = compustat2018To2022.merge(avgATDF, on = 'tic', how = 'left')

#Calculate earnings, accruals, and cash flow components
compustat2018To2022['earnings_comp'] = compustat2018To2022['oiadp'] / compustat2018To2022['avg_at']
compustat2018To2022['accruals_comp'] = compustat2018To2022['accruals'] / compustat2018To2022['avg_at']
compustat2018To2022['cash_flows_comp'] = compustat2018To2022['earnings_comp'] - compustat2018To2022['accruals_comp']

#Standardize sub-components
compustat2018To2022['accruals_subcomp_1'] /= compustat2018To2022['avg_at']
compustat2018To2022['accruals_subcomp_2'] /= compustat2018To2022['avg_at']
compustat2018To2022['accruals_subcomp_3'] /= compustat2018To2022['avg_at']

#Drop infinite/null values
compustat2018To2022.replace([np.inf, -np.inf], np.nan, inplace = True)
compustat2018To2022 = compustat2018To2022.dropna()

#Reindex
compustat2018To2022 = compustat2018To2022.reset_index()
compustat2018To2022.drop('index', inplace = True, axis = 1)

print(compustat2018To2022)

         tic   datadate   fyear indfmt datafmt      cash         ca  \
0        AIR 2018-05-31  2017.0   INDL     STD    41.600    942.700   
1        AIR 2019-05-31  2018.0   INDL     STD    41.100    952.500   
2        AIR 2020-05-31  2019.0   INDL     STD   424.700   1438.700   
3        AIR 2021-05-31  2020.0   INDL     STD    60.200    937.000   
4        AIR 2022-05-31  2021.0   INDL     STD    58.900   1007.200   
...      ...        ...     ...    ...     ...       ...        ...   
24318   KARO 2021-02-28  2020.0   INDL     STD    65.057     87.432   
24319   KARO 2022-02-28  2021.0   INDL     STD    47.427     72.275   
24320  IVCGF 2020-12-31  2020.0   INDL     STD   558.000  12100.000   
24321  DTRUY 2020-12-31  2020.0   INDL     STD  9012.671  30632.474   
24322  DTRUY 2021-12-31  2021.0   INDL     STD  8358.207  31787.117   

              cl         ta       dep  ...  delta_std  delta_tp  accruals  \
0        333.300   1524.700    40.500  ...     -2.000   -12.300   -30.

In [7]:
#Generate main dataframe with merged CRSP and Compustat data
crsp2018To2022.to_sql("crsp2018To2022", conn, if_exists = 'replace')
compustat2018To2022.to_sql("compustat2018To2022", conn, if_exists = 'replace')
query = '''
        SELECT DISTINCT crsp2018To2022.*, compustat2018To2022.earnings_comp, compustat2018To2022.accruals_comp, compustat2018To2022.cash_flows_comp,
                        compustat2018To2022.accruals_subcomp_1, compustat2018To2022.accruals_subcomp_2, compustat2018To2022.accruals_subcomp_3
        FROM crsp2018To2022
        LEFT JOIN compustat2018To2022
        ON crsp2018To2022.tic = compustat2018To2022.tic
        AND crsp2018To2022.date = compustat2018To2022.datadate
        '''
mainDF = pd.read_sql(query, conn)
mainDF.drop('index', inplace = True, axis = 1)

print(mainDF)

         tic                 date  shrcd  exchcd  siccd    shrout  altprc  \
0          A  2018-01-31 00:00:00   11.0     1.0   38.0  323018.0   73.43   
1          A  2018-02-28 00:00:00   11.0     1.0   38.0  322717.0   68.59   
2          A  2018-03-29 00:00:00   11.0     1.0   38.0  322477.0   66.90   
3          A  2018-04-30 00:00:00   11.0     1.0   38.0  322477.0   65.74   
4          A  2018-05-31 00:00:00   11.0     1.0   38.0  319952.0   61.92   
...      ...                  ...    ...     ...    ...       ...     ...   
191199  ZYXI  2021-12-31 00:00:00   11.0     3.0   99.0   39738.0    9.97   
191200  ZYXI  2022-01-31 00:00:00   11.0     3.0   99.0   43712.0    7.92   
191201  ZYXI  2022-01-31 00:00:00   11.0     3.0   99.0   43712.0    7.92   
191202  ZYXI  2022-02-28 00:00:00   11.0     3.0   99.0   43712.0    6.29   
191203  ZYXI  2022-03-31 00:00:00   11.0     3.0   99.0   39784.0    6.23   

             ret    ret_adj           mv  year  month  size_decile  \
0    

In [8]:
### Import fama frecnh data
file = filePath + "/F-F_Research_Data_5_Factors.csv"
ffDF = pd.read_csv(file)
print(ffDF)

       Date  Mkt_RF   SMB    HML   RMW   CMA    RF
0    196307   -0.39 -0.44  -0.89  0.68 -1.23  0.27
1    196308    5.07 -0.75   1.68  0.36 -0.34  0.25
2    196309   -1.57 -0.55   0.08 -0.71  0.29  0.27
3    196310    2.53 -1.37  -0.14  2.80 -2.02  0.29
4    196311   -0.85 -0.89   1.81 -0.51  2.31  0.27
..      ...     ...   ...    ...   ...   ...   ...
702  202201   -6.25 -3.95  12.74  0.73  7.73  0.00
703  202202   -2.29  2.90   3.09 -2.12  2.99  0.00
704  202203    3.06 -2.14  -1.82 -1.32  3.24  0.00
705  202204   -9.45 -0.38   6.16  3.51  5.87  0.00
706  202205   -0.34  0.02   8.38  1.61  3.82  0.03

[707 rows x 7 columns]


In [9]:
### Clean FF dataframe

#Change column names
ffDF.columns = ['date', 'rm_minus_rf', 'smb', 'hml', 'rmw', 'cma', 'rf']

##Add month and year columns
ffDF['year'] = ""
ffDF['month'] = ""
for i in ffDF.index:
    ffDF.loc[i, 'year'] = int(str(ffDF.loc[i, 'date'])[0:4])
    ffDF.loc[i, 'month'] = int(str(ffDF.loc[i, 'date'])[-2] + str(ffDF.loc[i, 'date'])[-1])
    
#Change values to decimal
ffDF['rm_minus_rf'] /= 100
ffDF['smb'] /= 100
ffDF['hml'] /= 100
ffDF['rmw'] /= 100
ffDF['cma'] /= 100
ffDF['rf'] /= 100

#Only keep dates after 2018
ffDF = ffDF[ffDF['date'] >= 201801]

#Reindex
ffDF = ffDF.reset_index()
ffDF.drop('index', inplace = True, axis = 1)

#Calculate annual market return - risk free rate
annualRmMinusRf = pd.DataFrame(index = [2018, 2019, 2020, 2021], columns = ['year', 'annual_rm_minus_rf'])
annualRmMinusRf['year'] = annualRmMinusRf.index
tempDF = ffDF.loc[0:11].copy()
tempDF['rm_minus_rf_compounded'] = (1 + tempDF['rm_minus_rf']).cumprod() - 1
annualRmMinusRf.loc[2018, 'annual_rm_minus_rf'] = tempDF.loc[11, 'rm_minus_rf_compounded']
tempDF = ffDF.loc[12:23].copy()
tempDF['rm_minus_rf_compounded'] = (1 + tempDF['rm_minus_rf']).cumprod() - 1
annualRmMinusRf.loc[2019, 'annual_rm_minus_rf'] = tempDF.loc[23, 'rm_minus_rf_compounded']
tempDF = ffDF.loc[24:35].copy()
tempDF['rm_minus_rf_compounded'] = (1 + tempDF['rm_minus_rf']).cumprod() - 1
annualRmMinusRf.loc[2020, 'annual_rm_minus_rf'] = tempDF.loc[35, 'rm_minus_rf_compounded']
tempDF = ffDF.loc[36:47].copy()
tempDF['rm_minus_rf_compounded'] = (1 + tempDF['rm_minus_rf']).cumprod() - 1
annualRmMinusRf.loc[2021, 'annual_rm_minus_rf'] = tempDF.loc[47, 'rm_minus_rf_compounded']


In [10]:
#Left join relevant FF (risk-free rate) data into main DataFrame
mainDF.to_sql("mainDF", conn, if_exists = 'replace')
ffDF.to_sql("ffDF", conn, if_exists = 'replace')
query = '''
        SELECT DISTINCT mainDF.*, ffDF.rf, ffDf.rm_minus_rf
        FROM mainDF
        LEFT JOIN ffDF
        ON mainDF.year = ffDF.year
        AND mainDF.month = ffDF.month
        '''
mainDF = pd.read_sql(query, conn)
mainDF['date'] = pd.to_datetime(mainDF['date'])
mainDF['ret'] = pd.to_numeric(mainDF['ret'], errors = 'coerce')
mainDF['ret_adj'] = pd.to_numeric(mainDF['ret_adj'], errors = 'coerce')
mainDF['size_decile'] = pd.to_numeric(mainDF['size_decile'], errors = 'coerce')
mainDF['ret_minus_rf'] = mainDF['ret_adj'] - mainDF['rf']

#Add abnormal return data
g = mainDF.groupby(['year', 'month', 'size_decile'], as_index = False).apply(lambda x: np.average(x['ret_adj']))
g.columns = ['year', 'month', 'size_decile', 'avg_ret_on_size_decile']
mainDF = mainDF.merge(g, on = ['year', 'month', 'size_decile'], how = 'left')
mainDF['abn_ret'] = mainDF['ret_adj'] - mainDF['avg_ret_on_size_decile']
mainDF.drop('index', inplace = True, axis = 1)

#Store mainDf for future use
tempMain = mainDF

print(mainDF)

         tic       date  shrcd  exchcd  siccd    shrout  altprc       ret  \
0          A 2018-01-31   11.0     1.0   38.0  323018.0   73.43  0.096461   
1          A 2018-02-28   11.0     1.0   38.0  322717.0   68.59 -0.065913   
2          A 2018-03-29   11.0     1.0   38.0  322477.0   66.90 -0.024639   
3          A 2018-04-30   11.0     1.0   38.0  322477.0   65.74 -0.015112   
4          A 2018-05-31   11.0     1.0   38.0  319952.0   61.92 -0.058108   
...      ...        ...    ...     ...    ...       ...     ...       ...   
191199  ZYXI 2021-12-31   11.0     3.0   99.0   39738.0    9.97 -0.221094   
191200  ZYXI 2022-01-31   11.0     3.0   99.0   43712.0    7.92 -0.116148   
191201  ZYXI 2022-01-31   11.0     3.0   99.0   43712.0    7.92 -0.116148   
191202  ZYXI 2022-02-28   11.0     3.0   99.0   43712.0    6.29 -0.205808   
191203  ZYXI 2022-03-31   11.0     3.0   99.0   39784.0    6.23 -0.009539   

         ret_adj           mv  ...  accruals_comp  cash_flows_comp  \
0    

In [11]:
### Replicate table 1A

mainDF = tempMain

#Generate DataFrame with non-null components (end of fiscal year data)
mainOnFYearEnd = mainDF.dropna()
mainOnFYearEnd['accruals_comp_quantile'] = mainOnFYearEnd['accruals_comp'].rank(pct = True)
mainOnFYearEnd['accruals_comp_decile'] = np.ceil(mainOnFYearEnd['accruals_comp_quantile'] * 10)

#Generate data for firms with December fiscal-year ends (For capm regressions)
mainOnFYearEndDecOnly = mainOnFYearEnd[mainOnFYearEnd['month'] == 12]

#Generate summary statistics (Table 1A)
table1A = pd.DataFrame(index = ['accruals_mean', 'accruals_median', 'cash_flows_mean', 'cash_flows_median', 'earnings_mean', 'earnings_median'], columns = range(1, 11))
for i in table1A.columns:
    tempDF = mainOnFYearEnd[mainOnFYearEnd['accruals_comp_decile'] == i]
    table1A.loc['accruals_mean', i] = tempDF['accruals_comp'].mean()
    table1A.loc['accruals_median', i] = tempDF['accruals_comp'].median()
    table1A.loc['cash_flows_mean', i] = tempDF['cash_flows_comp'].mean()
    table1A.loc['cash_flows_median', i] = tempDF['cash_flows_comp'].median()
    table1A.loc['earnings_mean', i] = tempDF['earnings_comp'].mean()
    table1A.loc['earnings_median', i] = tempDF['earnings_comp'].median()

print(table1A)

                         1         2         3         4         5         6   \
accruals_mean     -0.189894 -0.089031 -0.063193 -0.047538 -0.035422 -0.024555   
accruals_median   -0.147419 -0.088124 -0.062777 -0.047807 -0.035556 -0.024597   
cash_flows_mean   -0.029263  0.013161  0.035757  0.031993  0.003735 -0.007815   
cash_flows_median  0.115937  0.107742  0.104126  0.095763  0.084488  0.072694   
earnings_mean     -0.219157  -0.07587 -0.027436 -0.015545 -0.031686 -0.032371   
earnings_median   -0.056139  0.022564  0.040335  0.048498  0.047531  0.047679   

                         7         8         9         10  
accruals_mean     -0.013453 -0.000333  0.021679  0.142687  
accruals_median   -0.013712 -0.000731  0.020226  0.094279  
cash_flows_mean   -0.027452  -0.07524 -0.149409 -0.376474  
cash_flows_median  0.053451  0.014853 -0.030458 -0.194746  
earnings_mean     -0.040906 -0.075573  -0.12773 -0.233787  
earnings_median    0.039835  0.014194 -0.009105 -0.076039  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mainOnFYearEnd['accruals_comp_quantile'] = mainOnFYearEnd['accruals_comp'].rank(pct = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mainOnFYearEnd['accruals_comp_decile'] = np.ceil(mainOnFYearEnd['accruals_comp_quantile'] * 10)


In [12]:
#Replicate Table 1B

mainDF = tempMain

#Initialize empty DataFrames
table1B = pd.DataFrame(columns = range(1, 11))
annualRetMinusRf = pd.DataFrame(index = range(2018, 2022), columns = range(1, 11))
for i in annualRetMinusRf.index:
    for j in annualRetMinusRf.columns:
        annualRetMinusRf[j][i] = []
        
sizeOfPortfolios = pd.DataFrame(index = ['size', 'size_mean', 'size_median'], columns = range(1, 11))
for i in sizeOfPortfolios.index:
    for j in sizeOfPortfolios.columns:
        sizeOfPortfolios[j][i] = []
        
#Calculate compound ret-rf
for i in table1B.columns:
    tempDF = mainOnFYearEndDecOnly[mainOnFYearEndDecOnly['accruals_comp_decile'] == i]
    for j in tempDF.index:
        if ((j - 11) in mainDF.index) and (mainDF.loc[j - 11, 'tic'] == tempDF.loc[j, 'tic']):
            tempDF2 = mainDF.loc[(j-11) : j].copy()
            tempDF2['ret_minus_rf_compounded'] = (1 + tempDF2['ret_minus_rf']).cumprod() - 1
            yearOfData = tempDF2.loc[j, 'year']
            annualRetMinusRf.loc[yearOfData, i].append(tempDF2.loc[j, 'ret_minus_rf_compounded'])
            sizeOfPortfolios.loc['size', i].append(np.log(tempDF2.loc[j, 'mv']))
            
#Take average for each portfolio
for i in annualRetMinusRf.index:
    for j in annualRetMinusRf.columns:
        annualRetMinusRf.loc[i, j] = np.average(annualRetMinusRf.loc[i, j])
        
for j in sizeOfPortfolios.columns:
    sizeOfPortfolios.loc['size_mean', j] = np.average(sizeOfPortfolios.loc['size', j])
    sizeOfPortfolios.loc['size_median', j] = np.median(sizeOfPortfolios.loc['size', j])
    
sizeOfPortfolios.drop('size', inplace = True, axis = 0)
annualRetMinusRf['year'] = annualRetMinusRf.index

print(sizeOfPortfolios)

                    1          2          3          4          5          6   \
size_mean     12.68047  13.629503   14.05715  14.315762  14.411314  14.348843   
size_median  12.467989  13.694159  14.161276  14.379677  14.625634  14.476226   

                    7          8          9          10  
size_mean     13.95186  13.656998  13.208173  12.624617  
size_median  14.115196  13.700092   13.18852  12.450896  


In [13]:
#Generate DataFrame for table 1B regressions
dataForTable1BReg = annualRetMinusRf.merge(annualRmMinusRf, on = 'year', how = 'left')
dataForTable1BReg = dataForTable1BReg.astype(float)

#Run regressions to get alpha and beta
for i in range(1, 11):
    tempDF = dataForTable1BReg.filter([i, 'annual_rm_minus_rf'])
    tempDF.columns = ['current_decile_data', 'annual_rm_minus_rf']
    tempReg = sm.ols(formula = "current_decile_data ~ annual_rm_minus_rf", data = tempDF).fit()
    tempRegSummary = tempReg.summary()
    print("Accruals decile = " + str(i) + ":")
    print(tempRegSummary)

Accruals decile = 1:
                             OLS Regression Results                            
Dep. Variable:     current_decile_data   R-squared:                       0.860
Model:                             OLS   Adj. R-squared:                  0.789
Method:                  Least Squares   F-statistic:                     12.25
Date:                 Fri, 04 Aug 2023   Prob (F-statistic):             0.0728
Time:                         23:04:06   Log-Likelihood:                 3.8953
No. Observations:                    4   AIC:                            -3.791
Df Residuals:                        2   BIC:                            -5.018
Df Model:                            1                                         
Covariance Type:             nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Inter

Accruals decile = 10:
                             OLS Regression Results                            
Dep. Variable:     current_decile_data   R-squared:                       0.228
Model:                             OLS   Adj. R-squared:                 -0.159
Method:                  Least Squares   F-statistic:                    0.5892
Date:                 Fri, 04 Aug 2023   Prob (F-statistic):              0.523
Time:                         23:04:06   Log-Likelihood:                -1.6162
No. Observations:                    4   AIC:                             7.232
Df Residuals:                        2   BIC:                             6.005
Df Model:                            1                                         
Covariance Type:             nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Inte

  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "


In [14]:
### Replicate Table 1C

mainDF = tempMain

#Generate DataFrame for Table 1C
table1C = pd.DataFrame(index = ['current_asset_mean', 'current_asset_median', 'current_liability_mean', 'current_liability_median', 'depreciation_expense_mean', 'depreciation_expense_mean'], columns = range(1, 11))

for i in table1C.columns:
    tempDF = mainOnFYearEnd[mainOnFYearEnd['accruals_comp_decile'] == i]
    table1C.loc['current_asset_mean', i] = tempDF['accruals_subcomp_1'].mean()
    table1C.loc['current_asset_median', i] = tempDF['accruals_subcomp_1'].median()
    table1C.loc['current_liability_mean', i] = tempDF['accruals_subcomp_2'].mean()
    table1C.loc['current_liability_median', i] = tempDF['accruals_subcomp_2'].median()
    table1C.loc['depreciation_expense_mean', i] = tempDF['accruals_subcomp_3'].mean()
    table1C.loc['depreciation_expense_mean', i] = tempDF['accruals_subcomp_3'].median()
    
print(table1C)

                                 1         2         3         4         5   \
current_asset_mean        -0.037913 -0.008948  0.000491  0.007268  0.012919   
current_asset_median      -0.013133 -0.003138  0.001545  0.004984  0.008166   
current_liability_mean    -0.085546  -0.02935 -0.020173 -0.018244  -0.01685   
current_liability_median  -0.045761  -0.02044 -0.014878 -0.012381 -0.011936   
depreciation_expense_mean -0.049782 -0.046931 -0.042306 -0.036731 -0.031254   
depreciation_expense_mean -0.049782 -0.046931 -0.042306 -0.036731 -0.031254   

                                 6         7         8         9         10  
current_asset_mean          0.01468  0.022935  0.029691  0.046607  0.121372  
current_asset_median       0.008351   0.01502  0.020873   0.04089  0.097672  
current_liability_mean    -0.012846 -0.014066 -0.010749 -0.004676  0.043045  
current_liability_median   -0.00813  -0.01054 -0.006457 -0.004442  0.005297  
depreciation_expense_mean -0.025679 -0.020339 -0.015881 

In [15]:
### Replicate Table 2

#Regenerate mainOnFYearEnd (to allow us to rerun the same code block for debugging)
mainDF = tempMain
mainOnFYearEnd = mainDF.dropna()
mainOnFYearEnd['accruals_comp_quantile'] = mainOnFYearEnd['accruals_comp'].rank(pct = True)
mainOnFYearEnd['accruals_comp_decile'] = np.ceil(mainOnFYearEnd['accruals_comp_quantile'] * 10)
mainOnFYearEnd['earnings_comp_quantile'] = mainOnFYearEnd['earnings_comp'].rank(pct = True)
mainOnFYearEnd['earnings_comp_decile'] = np.ceil(mainOnFYearEnd['earnings_comp_quantile'] * 10)
mainOnFYearEnd['cash_flows_comp_quantile'] = mainOnFYearEnd['cash_flows_comp'].rank(pct = True)
mainOnFYearEnd['cash_flows_comp_decile'] = np.ceil(mainOnFYearEnd['cash_flows_comp_quantile'] * 10)

#Generate column for next year's earnings component and accruals decile
mainOnFYearEnd['accruals_comp_next_year'] = mainOnFYearEnd.groupby(['tic'])['accruals_comp'].shift(-1)
mainOnFYearEnd['earnings_comp_next_year'] = mainOnFYearEnd.groupby(['tic'])['earnings_comp'].shift(-1)
mainOnFYearEnd['cash_flows_comp_next_year'] = mainOnFYearEnd.groupby(['tic'])['cash_flows_comp'].shift(-1)
mainOnFYearEnd['accruals_comp_decile_next_year'] = mainOnFYearEnd.groupby(['tic'])['accruals_comp_decile'].shift(-1)
mainOnFYearEnd['earnings_comp_decile_next_year'] = mainOnFYearEnd.groupby(['tic'])['earnings_comp_decile'].shift(-1)
mainOnFYearEnd['cash_flows_comp_decile_next_year'] = mainOnFYearEnd.groupby(['tic'])['cash_flows_comp_decile'].shift(-1)

#Generate DataFrame for table 2 regressions (drops most recent year: no available next-year data)
dataForTable2Reg = mainOnFYearEnd.dropna()

#Run regressions
table2AReg = sm.ols(formula = "earnings_comp_next_year ~ earnings_comp", data = dataForTable2Reg).fit()
table2ARegSummary = table2AReg.summary()
table2BReg = sm.ols(formula = "earnings_comp_decile_next_year ~ earnings_comp_decile", data = dataForTable2Reg).fit()
table2BRegSummary = table2BReg.summary()

#Run regressions for firms grouped by industry codes
alpha0DistTable2A = []
alpha0DistTable2B = []
alpha1DistTable2A = []
alpha1DistTable2B = []

for i in dataForTable2Reg['siccd'].unique():
    tempDF = dataForTable2Reg[dataForTable2Reg['siccd'] == i]
    regA = sm.ols(formula = "earnings_comp_next_year ~ earnings_comp", data = tempDF).fit()
    alpha0DistTable2A.append(regA.params[0])
    alpha1DistTable2A.append(regA.params[1])
    regB = sm.ols(formula = "earnings_comp_decile_next_year ~ earnings_comp_decile", data = tempDF).fit()
    alpha0DistTable2B.append(regB.params[0])
    alpha1DistTable2B.append(regB.params[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mainOnFYearEnd['accruals_comp_quantile'] = mainOnFYearEnd['accruals_comp'].rank(pct = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mainOnFYearEnd['accruals_comp_decile'] = np.ceil(mainOnFYearEnd['accruals_comp_quantile'] * 10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mainOnFYearEnd['e

In [16]:
#Print Table 2A results
print("Table 2A")
print("Pooled regressions")
print(table2ARegSummary)
print("Industry Level regressions")
print("Mean - alpha_0: " + str(np.average(alpha0DistTable2A)) + " alpha_1: " + str(np.average(alpha1DistTable2A)))
print("Q1 - alpha_0: " + str(np.quantile(alpha0DistTable2A, 0.25)) + " alpha_1: " + str(np.quantile(alpha1DistTable2A, 0.25)))
print("Median - alpha_0: " + str(np.median(alpha0DistTable2A)) + " alpha_1: " + str(np.median(alpha1DistTable2A)))
print("Q3 - alpha_0: " + str(np.quantile(alpha0DistTable2A, 0.75)) + " alpha_1: " + str(np.quantile(alpha1DistTable2A, 0.75)))

#Print Table 2B Results
print("Table 2B")
print("Pooled regressions")
print(table2BRegSummary)
print("Industry Level regressions")
print("Mean - alpha_0: " + str(np.average(alpha0DistTable2B)) + " alpha_1: " + str(np.average(alpha1DistTable2B)))
print("Q1 - alpha_0: " + str(np.quantile(alpha0DistTable2B, 0.25)) + " alpha_1: " + str(np.quantile(alpha1DistTable2B, 0.25)))
print("Median - alpha_0: " + str(np.median(alpha0DistTable2B)) + " alpha_1: " + str(np.median(alpha1DistTable2B)))
print("Q3 - alpha_0: " + str(np.quantile(alpha0DistTable2B, 0.75)) + " alpha_1: " + str(np.quantile(alpha1DistTable2B, 0.75)))

Table 2A
Pooled regressions
                               OLS Regression Results                              
Dep. Variable:     earnings_comp_next_year   R-squared:                       0.714
Model:                                 OLS   Adj. R-squared:                  0.714
Method:                      Least Squares   F-statistic:                 1.655e+04
Date:                     Fri, 04 Aug 2023   Prob (F-statistic):               0.00
Time:                             23:04:07   Log-Likelihood:                 1756.1
No. Observations:                     6635   AIC:                            -3508.
Df Residuals:                         6633   BIC:                            -3495.
Df Model:                                1                                         
Covariance Type:                 nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------

In [17]:
#Store mainOnFYearEnd for future use
tempMainOnFYearEnd = mainOnFYearEnd

### Replicate Table 3 ###
dataForTable3Reg = dataForTable2Reg

#Run regressions
table3AReg = sm.ols(formula = "earnings_comp_next_year ~ accruals_comp + cash_flows_comp", data = dataForTable3Reg).fit()
table3ARegSummary = table3AReg.summary()
table3BReg = sm.ols(formula = "earnings_comp_decile_next_year ~ accruals_comp_decile + cash_flows_comp_decile", data = dataForTable3Reg).fit()
table3BRegSummary = table3BReg.summary()

#Run regressions for firms grouped by industry codes
gamma0DistTable3A = []
gamma1DistTable3A = []
gamma2DistTable3A = []
gamma0DistTable3B = []
gamma1DistTable3B = []
gamma2DistTable3B = []

for i in dataForTable3Reg['siccd'].unique():
    tempDF = dataForTable3Reg[dataForTable3Reg['siccd'] == i]
    regA = sm.ols(formula = "earnings_comp_next_year ~ accruals_comp + cash_flows_comp", data = tempDF).fit()
    gamma0DistTable3A.append(regA.params[0])
    gamma1DistTable3A.append(regA.params[1])
    gamma2DistTable3A.append(regA.params[2])
    regB = sm.ols(formula = "earnings_comp_decile_next_year ~ accruals_comp_decile + cash_flows_comp_decile", data = tempDF).fit()
    gamma0DistTable3B.append(regB.params[0])
    gamma1DistTable3B.append(regB.params[1])
    gamma2DistTable3B.append(regB.params[2])
    
#Remove min and max values
gamma0DistTable3A.sort()
gamma0DistTable3A = gamma0DistTable3A[1 : -1]
gamma1DistTable3A.sort()
gamma1DistTable3A = gamma1DistTable3A[1 : -1]
gamma2DistTable3A.sort()
gamma2DistTable3A = gamma2DistTable3A[1 : -1]
gamma0DistTable3B.sort()
gamma0DistTable3B = gamma0DistTable3B[1 : -1]
gamma1DistTable3B.sort()
gamma1DistTable3B = gamma1DistTable3B[1 : -1]
gamma2DistTable3B.sort()
gamma2DistTable3B = gamma2DistTable3B[1 : -1]

#Print Table 3A results
print("Table 3A")
print("Pooled Regressions")
print(table3ARegSummary)
print("Industry Level Regressions")
print("Mean - gamma_0: " + str(np.average(gamma0DistTable3A)) + " gamma_1: " + str(np.average(gamma1DistTable3A)) + " gamma_2 " + str(np.average(gamma2DistTable3A)))
print("Q1 - gamma_0: " + str(np.quantile(gamma0DistTable3A, 0.25)) + " gamma_1: " + str(np.quantile(gamma1DistTable3A, 0.25)) + " gamma_2 " + str(np.quantile(gamma2DistTable3A, 0.25)))
print("Median - gamma_0: " + str(np.median(gamma0DistTable3A)) + " gamma_1: " + str(np.median(gamma1DistTable3A)) + " gamma_2 " + str(np.median(gamma2DistTable3A)))
print("Q3 - gamma_0: " + str(np.quantile(gamma0DistTable3A, 0.75)) + " gamma_1: " + str(np.quantile(gamma1DistTable3A, 0.75)) + " gamma_2 " + str(np.quantile(gamma2DistTable3A, 0.75)))

#Print Table 3B results
print("Table 3B")
print("Pooled Regressions")
print(table3BRegSummary)
print("Industry Level Regressions")
print("Mean - gamma_0: " + str(np.average(gamma0DistTable3B)) + " gamma_1: " + str(np.average(gamma1DistTable3B)) + " gamma_2 " + str(np.average(gamma2DistTable3B)))
print("Q1 - gamma_0: " + str(np.quantile(gamma0DistTable3B, 0.25)) + " gamma_1: " + str(np.quantile(gamma1DistTable3B, 0.25)) + " gamma_2 " + str(np.quantile(gamma2DistTable3B, 0.25)))
print("Median - gamma_0: " + str(np.median(gamma0DistTable3B)) + " gamma_1: " + str(np.median(gamma1DistTable3B)) + " gamma_2 " + str(np.median(gamma2DistTable3B)))
print("Q3 - gamma_0: " + str(np.quantile(gamma0DistTable3B, 0.75)) + " gamma_1: " + str(np.quantile(gamma1DistTable3B, 0.75)) + " gamma_2 " + str(np.quantile(gamma2DistTable3B, 0.75)))

Table 3A
Pooled Regressions
                               OLS Regression Results                              
Dep. Variable:     earnings_comp_next_year   R-squared:                       0.715
Model:                                 OLS   Adj. R-squared:                  0.715
Method:                      Least Squares   F-statistic:                     8313.
Date:                     Fri, 04 Aug 2023   Prob (F-statistic):               0.00
Time:                             23:04:07   Log-Likelihood:                 1767.1
No. Observations:                     6635   AIC:                            -3528.
Df Residuals:                         6632   BIC:                            -3508.
Df Model:                                2                                         
Covariance Type:                 nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------

In [18]:
### Replicate Table 4 ###

#Add compound abnormal return data for next year
mainOnFYearEnd = tempMainOnFYearEnd
dataForTable4Reg = mainOnFYearEnd.dropna().copy()
dataForTable4Reg['abn_ret_compounded_next_year'] = np.nan

for i in dataForTable4Reg.index:
    if((i + 4) in mainDF.index) and ((i + 15) in mainDF.index) and (mainDF.loc[i + 4, 'tic'] == dataForTable4Reg.loc[i, 'tic']) and (mainDF.loc[i + 15, 'tic'] == dataForTable4Reg.loc[i, 'tic']):
            tempDF = mainDF.loc[(i + 4) : (i + 15)]
            tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
            dataForTable4Reg.loc[i, 'abn_ret_compounded_next_year'] = tempDF.loc[i + 15, 'abn_ret_compounded_next_year']
            
dataForTable4Reg = dataForTable4Reg.dropna()

#Rank abnormal return data
dataForTable4Reg['abn_ret_compounded_quantile_next_year'] = dataForTable4Reg['abn_ret_compounded_next_year'].rank(pct = True)
dataForTable4Reg['abn_ret_compounded_decile_next_year'] = np.ceil(dataForTable4Reg['abn_ret_compounded_quantile_next_year'] * 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + tempDF['abn_ret']).cumprod() - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempDF['abn_ret_compounded_next_year'] = (1 + t

In [19]:
print(dataForTable4Reg)

         tic       date  shrcd  exchcd  siccd    shrout  altprc       ret  \
9          A 2018-10-31   11.0     1.0   38.0  318770.0   64.79 -0.079402   
62        AA 2018-12-31   11.0     1.0   33.0  186494.0   26.58 -0.164414   
74        AA 2019-12-31   11.0     1.0   33.0  185573.0   21.51  0.057002   
86        AA 2020-12-31   11.0     1.0   33.0  185930.0   23.05  0.158291   
146      AAL 2018-12-31   11.0     3.0   45.0  460611.0   32.11 -0.200448   
...      ...        ...    ...     ...    ...       ...     ...       ...   
191126  ZYNE 2018-12-31   11.0     3.0   99.0   17627.0    2.97 -0.383817   
191138  ZYNE 2019-12-31   11.0     3.0   99.0   23211.0    6.04 -0.045814   
191150  ZYNE 2020-12-31   11.0     3.0   99.0   29975.0    3.30 -0.288793   
191175  ZYXI 2019-12-31   11.0     3.0   99.0   32792.0    7.87 -0.250476   
191187  ZYXI 2020-12-31   11.0     3.0   99.0   34792.0   13.46 -0.035817   

         ret_adj           mv  ...  cash_flows_comp_decile  \
9      -0.079

In [20]:
### Run first regression for Table 4A
table4AReg1 = sm.ols(formula = "earnings_comp_next_year ~ earnings_comp", data = dataForTable4Reg).fit()
table4AReg1Summary = table4AReg1.summary()
print(table4AReg1Summary)

                               OLS Regression Results                              
Dep. Variable:     earnings_comp_next_year   R-squared:                       0.713
Model:                                 OLS   Adj. R-squared:                  0.713
Method:                      Least Squares   F-statistic:                 1.616e+04
Date:                     Fri, 04 Aug 2023   Prob (F-statistic):               0.00
Time:                             23:04:14   Log-Likelihood:                 1713.8
No. Observations:                     6510   AIC:                            -3424.
Df Residuals:                         6508   BIC:                            -3410.
Df Model:                                1                                         
Covariance Type:                 nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

In [21]:
#Record data from previous regression
alpha_0_4A = table4AReg1.params[0]

dataForTable4Reg['earnings_comp_next_year_minus_alpha_0'] = dataForTable4Reg['earnings_comp_next_year'] - alpha_0_4A
#Run second regression for Table 4A
table4AReg2 = sm.ols(formula = "abn_ret_compounded_next_year ~ earnings_comp_next_year_minus_alpha_0 + earnings_comp - 1", data = dataForTable4Reg).fit()
table4AReg2Summary = table4AReg2.summary()
print(table4AReg2Summary)

                                      OLS Regression Results                                     
Dep. Variable:     abn_ret_compounded_next_year   R-squared (uncentered):                   0.013
Model:                                      OLS   Adj. R-squared (uncentered):              0.013
Method:                           Least Squares   F-statistic:                              42.67
Date:                          Fri, 04 Aug 2023   Prob (F-statistic):                    3.89e-19
Time:                                  23:04:14   Log-Likelihood:                         -10524.
No. Observations:                          6510   AIC:                                  2.105e+04
Df Residuals:                              6508   BIC:                                  2.107e+04
Df Model:                                     2                                                  
Covariance Type:                      nonrobust                                                  
                    

In [22]:
#Run first regression for Table 4B
table4BReg1 = sm.ols(formula = "earnings_comp_decile_next_year ~ earnings_comp_decile", data = dataForTable4Reg).fit()
table4BReg1Summary = table4BReg1.summary()
print(table4BReg1Summary)

#Record data from previous regression
alpha_0_4B = table4BReg1.params[0]

dataForTable4Reg['earnings_comp_decile_next_year_minus_alpha_0'] = dataForTable4Reg['earnings_comp_decile_next_year'] - alpha_0_4B
#Run second regression for Table 4B
table4BReg2 = sm.ols(formula = "abn_ret_compounded_decile_next_year ~ earnings_comp_decile_next_year_minus_alpha_0 + earnings_comp_decile - 1", data = dataForTable4Reg).fit()
table4BReg2Summary = table4BReg2.summary()
print(table4BReg2Summary)

                                  OLS Regression Results                                  
Dep. Variable:     earnings_comp_decile_next_year   R-squared:                       0.703
Model:                                        OLS   Adj. R-squared:                  0.703
Method:                             Least Squares   F-statistic:                 1.541e+04
Date:                            Fri, 04 Aug 2023   Prob (F-statistic):               0.00
Time:                                    23:04:14   Log-Likelihood:                -12211.
No. Observations:                            6510   AIC:                         2.443e+04
Df Residuals:                                6508   BIC:                         2.444e+04
Df Model:                                       1                                         
Covariance Type:                        nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
-

In [23]:
### Replicate Table 5 ###
dataForTable5Reg = dataForTable4Reg

#Run first regression for Table 5A
table5AReg1 = sm.ols(formula = "earnings_comp_next_year ~ accruals_comp + cash_flows_comp", data = dataForTable5Reg).fit()
table5AReg1Summary = table5AReg1.summary()
print(table5AReg1Summary)

#Record data from previous regression
gamma_0_5A = table5AReg1.params[0]

dataForTable5Reg['earnings_comp_next_year_minus_gamma_0'] = dataForTable5Reg['earnings_comp_next_year'] - gamma_0_5A
#Run second regression for Table 5A
table5AReg2 = sm.ols(formula = "abn_ret_compounded_next_year ~ earnings_comp_next_year_minus_gamma_0 + accruals_comp + cash_flows_comp - 1", data = dataForTable5Reg).fit()
table5AReg2Summary = table5AReg2.summary()
print(table5AReg2Summary)

                               OLS Regression Results                              
Dep. Variable:     earnings_comp_next_year   R-squared:                       0.714
Model:                                 OLS   Adj. R-squared:                  0.714
Method:                      Least Squares   F-statistic:                     8117.
Date:                     Fri, 04 Aug 2023   Prob (F-statistic):               0.00
Time:                             23:04:14   Log-Likelihood:                 1724.2
No. Observations:                     6510   AIC:                            -3442.
Df Residuals:                         6507   BIC:                            -3422.
Df Model:                                2                                         
Covariance Type:                 nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------

In [24]:
#Run first regression for Table 5B
table5BReg1 = sm.ols(formula = "earnings_comp_decile_next_year ~ accruals_comp_decile + cash_flows_comp_decile", data = dataForTable5Reg).fit()
table5BReg1Summary = table5BReg1.summary()
print(table5BReg1Summary)

#Record data from previous regression
gamma_0_5B = table5BReg1.params[0]

dataForTable5Reg['earnings_comp_decile_next_year_minus_gamma_0'] = dataForTable5Reg['earnings_comp_decile_next_year'] - gamma_0_5A
#Run second regression for Table 5B
table5BReg2 = sm.ols(formula = "abn_ret_compounded_decile_next_year ~ earnings_comp_decile_next_year_minus_gamma_0 + accruals_comp_decile + cash_flows_comp_decile - 1", data = dataForTable5Reg).fit()
table5BReg2Summary = table5BReg2.summary()
print(table5BReg2Summary)

                                  OLS Regression Results                                  
Dep. Variable:     earnings_comp_decile_next_year   R-squared:                       0.638
Model:                                        OLS   Adj. R-squared:                  0.638
Method:                             Least Squares   F-statistic:                     5725.
Date:                            Fri, 04 Aug 2023   Prob (F-statistic):               0.00
Time:                                    23:04:14   Log-Likelihood:                -12860.
No. Observations:                            6510   AIC:                         2.573e+04
Df Residuals:                                6507   BIC:                         2.575e+04
Df Model:                                       2                                         
Covariance Type:                        nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]

In [31]:
### Replicate Table 6 ###

#Generate data (2-year abnormal return, ret-rf, and rm-rf data)
dataForTable6 = dataForTable5Reg.copy()
dataForTable6['abn_ret_compounded_next_2_years'] = np.nan
dataForTable6['ret_minus_rf_compounded_next_year'] = np.nan
dataForTable6['ret_minus_rf_compounded_next_2_years'] = np.nan
dataForTable6['rm_minus_rf_compounded_next_year'] = np.nan
dataForTable6['rm_minus_rf_compounded_next_2_years'] = np.nan

for i in dataForTable6.index:
    if((i + 4) in mainDF.index) and ((i + 27) in mainDF.index) and (mainDF.loc[i + 4, 'tic'] == dataForTable6.loc[i, 'tic']) and (mainDF.loc[i + 27, 'tic'] == dataForTable6.loc[i, 'tic']):
        #Add ret-rf, rm-rf, and abnormal return data
        tempDF = mainDF.loc[(i + 4) : (i + 27)].copy()
        #ret-rf
        tempDF['ret_minus_rf_compounded'] = (1 + tempDF['ret_minus_rf']).cumprod() - 1
        dataForTable6.loc[i, 'ret_minus_rf_compounded_next_year'] = tempDF.loc[i + 15, 'ret_minus_rf_compounded']
        dataForTable6.loc[i, 'ret_minus_rf_compounded_next_2_years'] = tempDF.loc[i + 27, 'ret_minus_rf_compounded']
        #rm-rf
        tempDF['rm_minus_rf_compounded'] = (1 + tempDF['rm_minus_rf']).cumprod() - 1
        dataForTable6.loc[i, 'rm_minus_rf_compounded_next_year'] = tempDF.loc[i + 15, 'rm_minus_rf_compounded']
        dataForTable6.loc[i, 'rm_minus_rf_compounded_next_2_years'] = tempDF.loc[i + 27, 'rm_minus_rf_compounded']
        #abnnormal returns
        tempDF['abn_ret_compounded'] = (1 + tempDF['abn_ret']).cumprod() - 1
        dataForTable6.loc[i, 'abn_ret_compounded_next_2_years'] = tempDF.loc[i + 27, 'abn_ret_compounded']

dataForTable6 = dataForTable6.dropna()

table6 = pd.DataFrame(index = range(1, 11), columns = ['sar_next_year_mean', 'sar_next_2_years_mean', 'sar_next_year_median', 'sar_next_2_years_median', 'alpha_next_year', 'alpha_next_2_years'])
for i in table6.index:
    tempDF = dataForTable6[dataForTable6['accruals_comp_decile'] == i]
    table6.loc[i, 'sar_next_year_mean'] = np.average(tempDF['abn_ret_compounded_next_year'])
    table6.loc[i, 'sar_next_2_years_mean'] = np.average(tempDF['abn_ret_compounded_next_2_years'])
    table6.loc[i, 'sar_next_year_median'] = np.median(tempDF['abn_ret_compounded_next_year'])
    table6.loc[i, 'sar_next_2_years_median'] = np.median(tempDF['abn_ret_compounded_next_2_years'])
    tempReg = sm.ols(formula = "ret_minus_rf_compounded_next_year ~ rm_minus_rf_compounded_next_year", data = tempDF).fit()
    tempAlpha = tempReg.params[0]
    tempReg2 = sm.ols(formula = "ret_minus_rf_compounded_next_2_years ~ rm_minus_rf_compounded_next_2_years", data = tempDF).fit()
    tempAlpha2 = tempReg2.params[0]
    table6.loc[i, 'alpha_next_year'] = tempAlpha
    table6.loc[i, 'alpha_next_2_years'] = tempAlpha2
    
print(table6)

   sar_next_year_mean sar_next_2_years_mean sar_next_year_median  \
1           -0.026791              0.223556            -0.283682   
2           -0.033469              0.123427            -0.203925   
3            -0.07142              0.091001             -0.16287   
4           -0.013536              0.038972            -0.071369   
5            0.035879              0.000329             -0.12099   
6            0.043491              0.058388            -0.071955   
7            0.035293             -0.044463            -0.105039   
8            0.189172              0.139877            -0.068227   
9            0.304495              0.110063            -0.131579   
10          -0.070307             -0.153351            -0.202839   

   sar_next_2_years_median alpha_next_year alpha_next_2_years  
1                -0.436385       -0.168421          -2.254907  
2                -0.228252       -0.145053           -1.07992  
3                -0.220246       -0.162773          -0.5211

In [32]:
### Replicate Table 7 ###
dataForTable7 = dataForTable6.copy()
table7AReg1 = sm.ols(formula = "abn_ret_compounded_next_year ~ accruals_comp", data = dataForTable7).fit()
table7AReg2 = sm.ols(formula = "abn_ret_compounded_next_2_years ~ accruals_comp", data = dataForTable7).fit()
table7BReg1 = sm.ols(formula = "abn_ret_compounded_next_year ~ accruals_subcomp_1 + accruals_subcomp_2 + accruals_subcomp_3", data = dataForTable7).fit()
table7BReg2 = sm.ols(formula = "abn_ret_compounded_next_2_years ~ accruals_subcomp_1 + accruals_subcomp_2 + accruals_subcomp_3", data = dataForTable7).fit()
table7AReg1Summary = table7AReg1.summary()
table7AReg2Summary = table7AReg2.summary()
table7BReg1Summary = table7BReg1.summary()
table7BReg2Summary = table7BReg2.summary()
print(table7AReg1Summary)
print(table7AReg2Summary)
print(table7BReg1Summary)
print(table7BReg2Summary)

                                 OLS Regression Results                                 
Dep. Variable:     abn_ret_compounded_next_year   R-squared:                       0.000
Model:                                      OLS   Adj. R-squared:                 -0.000
Method:                           Least Squares   F-statistic:                    0.8267
Date:                          Fri, 04 Aug 2023   Prob (F-statistic):              0.363
Time:                                  23:12:49   Log-Likelihood:                -7009.8
No. Observations:                          3918   AIC:                         1.402e+04
Df Residuals:                              3916   BIC:                         1.404e+04
Df Model:                                     1                                         
Covariance Type:                      nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
----------------------------