In [1]:
import pandas as pd
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
#nltk.download('wordnet')
#nltk.download('omw-1.4')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Importing the MD&A Data

See 'MD&A Extraction' for details on how the data was collected.

In [2]:
# Read the extracted MD&A sections from disk and preview the resulting frame.
mda = pd.read_csv(filepath_or_buffer='mda_df.csv')
mda

Unnamed: 0,cik,year,month,day,mda
0,1750,2014,5,31,ITEM 7. MANAGEMENTS DISCUSSION AND ANALYSIS OF...
1,19446,2014,7,31,Item 7. MANAGEMENTS DISCUSSION AND ANALYSIS OF...
2,1451505,2013,12,31,Item 7. Managements Discussion and Analysis of...
3,317771,2010,1,1,ITEM 7. MANAGEMENTS DISCUSSION AND ANALYSIS OF...
4,97134,2009,12,31,ITEM 7 Managements Discussion and Analysis of ...
...,...,...,...,...,...
13685,96763,2009,12,25,Item 7 Managements Discussion and Analysis of ...
13686,96793,2010,6,30,Item 7. Managements Discussion and Analysis of...
13687,96831,2009,12,31,ITEM 7. MANAGEMENTS DISCUSSION AND ANALYSIS OF...
13688,1018963,2009,12,31,Item 7. Managements Discussion and Analysis of...


Keeping only the observations with a fiscal year end of December.

In [3]:
# Retain only the rows whose 'month' value is 12 (December).
mda = mda.loc[mda['month'].eq(12)]

In [4]:
# Keep the identifying columns plus the text, and renumber the rows from 0
# (the old index is discarded).
mda = mda.loc[:, ['year', 'cik', 'mda']].reset_index(drop=True)

## Checking the Length of the MD&A Sections

In [5]:
# Number of characters in each MD&A text.
lengths = mda.loc[:, 'mda'].str.len()
# Index label of the shortest text, and the complete row it belongs to.
min_index = lengths.idxmin()
min_row = mda.loc[min_index, :]

In [6]:
# Display the text of the shortest MD&A section.
mda.loc[min_index, 'mda']

'Item 7. Managements Discussion and Analysis of Financial Condition and Results of Operations. Our Relationship with El Paso Corporation El Paso is an energy company founded in 1928 in El Paso, Texas that primarily operates in the regulated natural gas transportation sector and the exploration and production sector of the energy industry. El Paso owns our two percent general partner interest, all of our incentive distribution rights, a 42 percent limited partner interest in us and the remaining 14 percent interest in CIG not owned by us. We have an omnibus agreement with El Paso and our general partner that governs our relationship with them regarding the provision of specified services to us, as well as certain reimbursement and indemnification matters. As a substantial owner in us, El Paso is motivated to promote and support the successful execution of our business strategies, including utilizing our partnership as a growth vehicle for its natural gas transportation, storage and othe

Although the text refers to a certain Part II, it still provides essential information about the company. I therefore decided not to filter based on length, as the above is the shortest MD&A in the sample.

## Deleting Headers

In [56]:
# Remove the "Item 7 Managements Discussion and Analysis ..." heading from every text.
# Caveat: the pattern is not anchored to the start of the text, so the same wording
# is also deleted wherever it occurs inside the section body, not only in the header.
# NOTE(review): every word after "item 7" is optional, so the "Item 7" prefix of a
# reference such as "Item 7A" is removed as well.
pattern = r"item\s*7[:.]?\s*(managements)?\s*(discussion)?\s*(and)?\s*(analysis)?\s*(of)?\s*(financial)?\s*(condition)?\s*(and)?\s*(results)?\s*(of)?\s*(operations)?\s*-?\s*"
header_re = re.compile(pattern, flags=re.IGNORECASE)
mda['mda'] = mda['mda'].map(lambda text: header_re.sub('', text))

In [57]:
# Sanity check: list every row whose text still contains an "Item 7 Management..." header.
# The search is case-insensitive and tolerates an optional ':' or '.' and any whitespace
# after the "7". A case-sensitive literal such as 'item 7 management' cannot match
# headers like 'ITEM 7. MANAGEMENTS ...' or 'Item 7. Managements ...', so it would
# report an empty frame even if the header removal had not worked.
leftover_header = r"item\s*7[:.]?\s*management"
rows_w_headers = mda[
    # na=False keeps the mask strictly boolean if a text value is missing.
    mda['mda'].str.contains(leftover_header, case=False, regex=True, na=False)
]
print(rows_w_headers)

Empty DataFrame
Columns: [year, cik, mda]
Index: []


## Length and FOG Index Data

In [63]:
# Drop any leading run of non-word characters so each text starts with a word character.
pattern = r'^\W+'
leading_non_word = re.compile(pattern)
mda['mda'] = mda['mda'].map(lambda text: leading_non_word.sub('', text))

In [64]:
# Spot-check one cleaned text (row label 4).
mda.loc[4, 'mda']

'The following discussion contains forward looking statements that are not limited to historical facts, but reflect our current beliefs, expectations or intentions regarding future events. All forward looking statements involve risks and uncertainties that could cause actual results to differ materially from those in the forward looking statements. For examples of those risks and uncertainties, see the cautionary statements contained in Item 1A. Risk Factors Risk Factors Relating to the Company and Risk Factors Risk Factors Relating to the Airline Industry. We undertake no obligation to publicly update or revise any forward looking statements to reflect events or circumstances that may arise after the date of this report, except as required by applicable law. Overview We recorded a net loss of $282 million for the year ended December 31, 2009, as compared to a net loss of $586 million for the year ended December 31, 2008. Our net loss in 2009 was primarily the result of the global rece

In [66]:
# Write the cleaned MD&A text to CSV, leaving out the index column.
mda.to_csv(path_or_buf='mda_read_data.csv', index=False)

## Sentiment and Topic Data

In [317]:
# Work on an independent copy. A plain `mda_sent = mda` only creates a second name for
# the same DataFrame, so assigning tokenized text to mda_sent['mda'] would also
# overwrite the cleaned text held in `mda`.
mda_sent = mda.copy()

In [318]:
# Build the English stop-word list; the set copy gives constant-time membership
# tests in the token filter below.
swords = stopwords.words('english')
stopword_set = set(swords)

# Lower-case the MD&A text, replace unwanted characters with spaces, collapse
# repeated spaces, then split on whitespace and drop stop words.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip every substitution below.
# Raw strings avoid invalid-escape warnings for \w and \S.
mda_sent['mda'] = (
    mda_sent['mda']
    .str.lower()
    .str.replace(r'(@[a-z0-9]+)\w+', ' ', regex=True)   # '@' followed by alphanumerics
    .str.replace(r'(http\S+)', ' ', regex=True)          # 'http' up to the next whitespace
    .str.replace(r'([^0-9a-z \t])', ' ', regex=True)     # anything but digits, a-z, space, tab
    .str.replace(r' +', ' ', regex=True)                 # runs of spaces -> single space
    .apply(lambda text: [tok for tok in text.split() if tok not in stopword_set])
)

In [319]:
# Lemmatizing: pass every non-empty token of each token list through the WordNet lemmatizer.
ltzr = WordNetLemmatizer()
mda_sent['mda'] = mda_sent['mda'].map(
    lambda tokens: [ltzr.lemmatize(token) for token in tokens if token != '']
)

In [322]:
# Write the tokenized, lemmatized MD&A data to CSV, leaving out the index column.
mda_sent.to_csv(path_or_buf='mda_sent_data.csv', index=False)

## Year and CIK of Complete MD&A Observations

In [323]:
# Year / CIK pairs of the observations that remain in `mda`.
year_cik_mda = mda.loc[:, ['year', 'cik']]

In [324]:
# Write the year / CIK pairs to CSV, leaving out the index column.
year_cik_mda.to_csv(path_or_buf='year_cik_mda.csv', index=False)