In [1]:
import os
import re
import pandas as pd

## Creation of a Pandas DataFrame

In this notebook a Pandas DataFrame is created from the previously generated .txt files (see the Jupyter notebook 'MD&A Scraper'). The DataFrame consists of 5 columns:
1. 'cik': The company's unique CIK.
2. 'year': The year of the report period end date.
3. 'month': The month of the report period end date.
4. 'day': The day of the report period end date.
5. 'mda': The (not yet cleaned) MD&A section of the respective 10-K report.

The `mda` column contains the whole MD&A section, including its header and any forward-looking disclosures.

In [33]:
# Specify the folder where the .txt files are stored
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere
folder = 'C:/Users/siebm/Documents/Thesis Data/MD&A Storage'

# Patterns for the header fields and the MD&A body. Raw strings keep `\d`
# a regex token instead of an invalid string escape sequence.
CIK_PATTERN = re.compile(r'CIK: (\d+)')
# The date is stored without separators (yyyymmdd)
DATE_PATTERN = re.compile(r'REPORT PERIOD END DATE: (\d{4})(\d{2})(\d{2})')
# DOTALL lets `.` match newlines so the section may span multiple lines
SECTION_PATTERN = re.compile(r'<SECTION>(.*?)</SECTION>', re.DOTALL)


def parse_mda_file(content, source='<unknown>'):
    """Extract the CIK, report-period date parts and MD&A text from one file.

    Parameters
    ----------
    content : str
        Full text of one scraped .txt file.
    source : str, optional
        Label (e.g. the filename) used in error messages.

    Returns
    -------
    tuple of str
        ``(cik, year, month, day, mda)``. ``cik`` has its leading zeros
        removed; ``mda`` is the stripped text between the first
        ``<SECTION>`` and ``</SECTION>`` tags, or ``''`` when the file
        contains no such section.

    Raises
    ------
    ValueError
        If the CIK or the report period end date header cannot be found.
    """
    cik_match = CIK_PATTERN.search(content)
    if cik_match is None:
        raise ValueError(f"{source}: no 'CIK: <digits>' header found")

    date_match = DATE_PATTERN.search(content)
    if date_match is None:
        raise ValueError(
            f"{source}: no 'REPORT PERIOD END DATE: <yyyymmdd>' header found"
        )
    year, month, day = date_match.groups()

    # Some .txt files contain no MD&A; those get an empty string so the
    # row is still recorded and can be filtered out later.
    section_match = SECTION_PATTERN.search(content)
    mda_text = section_match.group(1).strip() if section_match else ''

    return cik_match.group(1).lstrip('0'), year, month, day, mda_text


# Create empty lists to store the information
ciks = []
years = []
months = []
days = []
mda_texts = []

# Go through every .txt file in the folder and extract the information
for filename in os.listdir(folder):
    # Only consider .txt files
    if not filename.endswith('.txt'):
        continue
    # NOTE(review): no explicit encoding is given, so the platform default is
    # used; confirm it matches how the scraper wrote the files.
    with open(os.path.join(folder, filename), 'r') as f:
        content = f.read()
    # Parse after the file has been closed; nothing below needs the handle
    cik, year, month, day, mda_text = parse_mda_file(content, source=filename)
    ciks.append(cik)
    years.append(year)
    months.append(month)
    days.append(day)
    mda_texts.append(mda_text)

# After the loop is done, populate the Pandas DataFrame
df = pd.DataFrame({'cik': ciks, 'year': years, 'month': months, 'day': days, 'mda': mda_texts})

# Printing the df for inspection
print(df)

           cik  year month day  \
0         1750  2014    05  31   
1      1122304  2013    12  31   
2        19446  2014    07  31   
3      1451505  2013    12  31   
4       317771  2010    01  01   
...        ...   ...   ...  ..   
16117    96793  2010    06  30   
16118    96831  2009    12  31   
16119  1018963  2009    12  31   
16120    96943  2009    12  31   
16121  1051512  2009    12  31   

                                                     mda  
0      ITEM 7. MANAGEMENTS DISCUSSION AND ANALYSIS OF...  
1                                                         
2      Item 7. MANAGEMENTS DISCUSSION AND ANALYSIS OF...  
3      Item 7. Managements Discussion and Analysis of...  
4      ITEM 7. MANAGEMENTS DISCUSSION AND ANALYSIS OF...  
...                                                  ...  
16117  Item 7. Managements Discussion and Analysis of...  
16118  ITEM 7. MANAGEMENTS DISCUSSION AND ANALYSIS OF...  
16119  Item 7. Managements Discussion and Analysis of...  
1

The next cell counts the number of rows with and without an MD&A section.

In [34]:
# Split the rows on whether the mda column holds an empty string
is_empty = df['mda'] == ''
incomplete = df.loc[is_empty]
complete = df.loc[~is_empty]

# Report how many observations ended up in each group
print('Incomplete MD&A observations:', incomplete.shape[0])
print('Complete MD&A observations:', complete.shape[0])

Incomplete MD&A observations: 2432
Complete MD&A observations: 13690


In [42]:
# Flag every row whose (cik, year, month) combination occurs more than once;
# keep=False marks all members of a duplicate group, not only the repeats
key_columns = ['cik', 'year', 'month']
duplicates = complete.duplicated(subset=key_columns, keep=False)

# Show the first two flagged rows, ordered by cik, for manual inspection
complete.loc[duplicates].sort_values(by='cik').head(2)

Unnamed: 0,cik,year,month,day,mda
14207,1004989,2010,12,31,Item 7. Managements Discussion and Analysis of...
4923,1004989,2010,12,31,Item 7. Managements Discussion and Analysis of...


Note: The duplicate rows have only been inspected at this point, not removed, so the exported file still contains them.

In [45]:
# Write the rows that have an MD&A section to a csv file, without the index
output_file = 'mda_df.csv'
complete.to_csv(output_file, index=False)