In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides pandas deprecation and
# chained-assignment warnings -- consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Reading Data (Audit Analytics Restatements)

In [2]:
# Load the raw restatement records (comma-separated, which is read_csv's default delimiter)
df = pd.read_csv('restatements.csv')

## Excluding Some Observations

In [3]:
# Keep material restatements only: rows with a blank DATE_OF_8K_402 are treated as immaterial
df = df.dropna(subset=['DATE_OF_8K_402'])
print(df.shape[0])

1029


In [4]:
# Restrict the sample to adverse restatements (RES_ADVERSE flag equal to 1)
is_adverse = df['RES_ADVERSE'].eq(1)
df = df.loc[is_adverse]
print(df.shape[0])

815


Maybe include clerical errors, as they can be used as an excuse to shift the blame away from executives.

In [5]:
# Drop restatements flagged as clerical errors (RES_CLER_ERR equal to 1);
# rows where the flag is missing are kept
is_clerical = df['RES_CLER_ERR'].eq(1)
df = df.loc[~is_clerical]
print(df.shape[0])

757


In [6]:
# Exclude Accounting Rule Changes FIN 48, SAB No. 101, SAB No. 108.
exclude_list = [70, 63, 64, 53, 57, 58, 65, 56, 60, 62, 61, 66, 59, 67, 68, 55, 49, 50, 51, 52]

# Match each key only as a complete number. A bare "70|63|..." alternation is a
# substring search, so it would also hit unrelated keys such as 170 or 163.
# The lookarounds require that no digit touches the key on either side, which
# works regardless of the delimiter used inside RES_ACC_RES_FKEY_LIST.
exclude_pattern = r'(?<!\d)(?:' + '|'.join(map(str, exclude_list)) + r')(?!\d)'

# df is the result of earlier boolean filters; take an explicit copy so the
# column assignment below writes to this frame rather than to a slice.
df = df.copy()
# Missing key lists become empty strings so they can never match the pattern
df['RES_ACC_RES_FKEY_LIST'] = df['RES_ACC_RES_FKEY_LIST'].fillna('').astype(str)

bool_idx = ~df['RES_ACC_RES_FKEY_LIST'].str.contains(exclude_pattern, regex=True)

df = df[bool_idx]
print(len(df))

757


## Creation of Filtered Dataframe

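I start by creating a dataframe that holds, for each restatement, the firm identifier (cik), the begin and end year of the restated period, and the fraud and accounting-restatement flags.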

In [7]:
# Assemble the restatement frame in one step: firm identifier, the last four
# characters of the begin/end date strings (the year), and the two flag columns
FR = pd.DataFrame({
    'cik': df['COMPANY_FKEY'],
    'begin_year': df['RES_BEGIN_DATE'].str.slice(-4),
    'end_year': df['RES_END_DATE'].str.slice(-4),
    'fraud': df['RES_FRAUD'],
    'res': df['RES_ACCOUNTING'],
})
print(FR.shape[0])

757


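From this dataframe I want to keep only one firm-year per restatement: the year before the end year when the restated period spans more than one year, and the end year itself otherwise. All other years of the restated period are excluded from the final data. I also exclude duplicate cik-year combinations.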

In [8]:
# Collapse every restatement to a single firm-year.
# This is computed column-wise instead of appending rows inside a loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration and turns the columns into object dtype.
FR['begin_year'] = FR['begin_year'].astype('int')
FR['end_year'] = FR['end_year'].astype('int')

# A restatement whose end year is later than its begin year is assigned the
# year before the end year; otherwise it is assigned the end year itself.
spans_multiple_years = FR['end_year'] > FR['begin_year']
FR2 = FR.assign(
    year=np.where(spans_multiple_years, FR['end_year'] - 1, FR['end_year'])
).reset_index(drop=True)

In [9]:
# Report how many repeated (cik, year) pairs exist, then keep only the first row of each pair
dup = FR2.duplicated(subset=['cik', 'year'])
print(dup.sum())
FR2 = FR2.loc[~dup]

32


In [10]:
# The begin/end years are no longer needed once `year` has been derived
FR2 = FR2.drop(['begin_year', 'end_year'], axis=1)

In [13]:
# Write the restatement cases to disk, plus a separate file holding only their cik values
res_ciks = FR2['cik']
FR2.to_csv('res_cases.csv', index=False)
res_ciks.to_csv('res_ciks.csv', index=False)

## Excluding Incomplete Observations

In [13]:
# Load the sample that the restatement cases are merged into; the following
# cells rely on its `year` and `cik` columns.
# NOTE(review): presumably one row per firm-year with complete data -- confirm.
fin_sample = pd.read_csv('year_cik_all.csv')

In [14]:
# Cast `year` to integer in both frames so the merge keys compare equal,
# and cast the fraud flag to integer as well
fin_sample = fin_sample.astype({'year': 'int'})
FR2 = FR2.astype({'year': 'int', 'fraud': 'int'})

In [15]:
# Outer-join fin_sample with FR2 on (year, cik), then discard the FR2 rows that
# have no counterpart in fin_sample, so that every fin_sample row is kept
merged = fin_sample.merge(FR2, how='outer', on=['year', 'cik'], indicator=True)
in_fin_sample = merged['_merge'] != 'right_only'
merged = merged.loc[in_fin_sample]
# Target is 1 for firm-years present in both frames and 0 otherwise
merged['Target'] = (merged['_merge'] == 'both').astype('int64')
merged = merged.drop('_merge', axis=1)
# Firm-years without a matching restatement have no fraud/res value; record them as 0
merged = merged.fillna({'fraud': 0, 'res': 0})

In [16]:
# Headline counts for the merged sample: one (label, value) pair per printed line
summary = [
    ("Total Number of Total Observations:", len(merged)),
    ("The Number of Fraud Cases:", merged['fraud'].sum()),
    ("The Number of Accounting Restatements:", merged['res'].sum()),
]
for label, value in summary:
    print(label, value)

Total Number of Total Observations: 5782
The Number of Fraud Cases: 4.0
The Number of Accounting Restatements: 80
