In [1]:
import pandas as pd
import numpy as np
import textstat as ts
import nltk
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

## Importing and merging Text Data

In [2]:
# Read the two input CSV files into DataFrames (first -> `tran`, second -> `mda`).
tran, mda = (
    pd.read_csv(csv_name)
    for csv_name in ('tran_read_data.csv', 'mda_read_data.csv')
)

In [3]:
# Remove the 'Unnamed: 0' column from `tran`.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
tran = tran.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Inner-join the two frames on the (year, cik) key pair. `how='inner'` is the
# pandas default and is only spelled out here for the reader.
df = tran.merge(mda, how='inner', on=['year', 'cik'])

In [5]:
# Row count before and after discarding every row that has a missing value
# in at least one column.
print(df.shape[0])
df = df.dropna(axis=0, how='any')
print(df.shape[0])

5794
5788


## Calculate FOG Index

In [6]:
def calculate_fog(text):
    """Return the Gunning FOG score that textstat computes for ``text``."""
    fog_score = ts.gunning_fog(text)
    return fog_score

In [7]:
# Score each text column with calculate_fog and store the result in the
# matching FOG_* column (same column order as before), then display the frame.
for fog_col, text_col in (
    ('FOG_MDA', 'mda'),
    ('FOG_Q1', 'transcript_qQ1'),
    ('FOG_Q2', 'transcript_qQ2'),
    ('FOG_Q3', 'transcript_qQ3'),
    ('FOG_Q4', 'transcript_qQ4'),
):
    df[fog_col] = df[text_col].apply(calculate_fog)
df

Unnamed: 0,cik,year,transcript_qQ1,transcript_qQ2,transcript_qQ3,transcript_qQ4,mda,FOG_MDA,FOG_Q1,FOG_Q2,FOG_Q3,FOG_Q4
0,1800.0,2008,"Well, it’s certainly a key strategic priority....",Yes. I’ll let John take that question Phil. Su...,"Good morning. Certainly, I think that's possib...","Yes, we went through a fairly rigorous self-ev...",Financial Review Abbotts revenues are derived ...,13.20,12.52,14.74,11.80,209.76
1,1800.0,2010,Thanks Tom. This morning I will review the per...,"No, in my remarks, I said that we would be res...","Yes, Mike you had basically the same question....",Okay. I think you said that Piramal was around...,Financial Review Abbotts revenues are derived ...,13.47,13.95,10.23,15.42,10.84
2,1800.0,2011,"No, they're shared R&D on that program. Rick, ...","I don't have that at my fingertips here, Jami....",You're right about overall by the internationa...,"Thanks, Miles. We're very pleased at how we en...",Financial Review Abbotts revenues are derived ...,13.63,10.46,9.91,10.16,10.48
3,1800.0,2012,"Yes, that's probably around right. That's prob...","Good morning, and thanks for joining us. Also ...","Thanks, Miles. Today, we reported ongoing dilu...","Thanks, Miles. Before I review our financial p...",Financial Review Abbotts revenues are derived ...,13.87,9.78,12.73,12.39,9.57
4,1800.0,2013,"Well, in the U.S., first of all, I would say i...","Well, thank you and thank you all for your que...","Okay. Thanks, Brian. Good morning. This mornin...","Okay. Thanks, Brian, good morning. This mornin...",Financial Review Abbotts revenues are derived ...,13.56,9.48,9.50,10.43,11.71
...,...,...,...,...,...,...,...,...,...,...,...,...
5789,1561627.0,2013,"Thank you, Melissa, and good morning, everyone...","Thank you, Manny, and good morning, everyone. ...",Yes. We are also focusing on some of the carbo...,"Thanks, David, and good morning, everyone. I'm...","dollars in thousands, except per share amounts...",14.06,9.96,10.17,11.32,9.60
5790,1564822.0,2013,"I think when we think about M&A, we think abo...","No, I think there's probably less potential fo...",We absolute do. We have been updating but eve...,Sure. Our long-term target is to grow in line ...,included elsewhere in this Form 10 K.($ in mil...,17.77,12.51,11.15,11.86,11.61
5791,1564902.0,2013,"Thanks, Gene, and thank you to everyone on the...","Thanks, Jim, and good afternoon, everyone. Bef...","Thanks, Kelsey. Good afternoon, everyone, and ...",So I'll ask Jim Heaney to comment a little bit...,The following discussion contains managements ...,16.18,10.50,10.37,10.22,10.98
5792,1569134.0,2013,"Okay. So by my count, we've now told you a cou...","Thank you, Jennifer. Good afternoon, everyone....","Thanks, and good afternoon. As David mentioned...",But having said that to the extend that we jus...,Liquidity and Capital Resources Overview Initi...,17.60,8.53,14.64,12.94,13.88


In [8]:
# Summary statistics of the five FOG columns.
df.loc[:, ['FOG_MDA', 'FOG_Q1', 'FOG_Q2', 'FOG_Q3', 'FOG_Q4']].describe()

Unnamed: 0,FOG_MDA,FOG_Q1,FOG_Q2,FOG_Q3,FOG_Q4
count,5788.0,5788.0,5788.0,5788.0,5788.0
mean,15.20373,15.487617,14.31966,14.115976,14.269286
std,8.917358,17.156485,15.081804,16.748772,15.331169
min,10.99,7.19,7.03,7.33,7.1
25%,13.67,10.5875,10.57,10.51,10.43
50%,14.58,11.81,11.65,11.53,11.41
75%,15.79,13.64,13.29,12.94,12.88
max,453.65,382.59,409.29,586.36,316.36


The FOG index estimates the grade level (years of formal education) needed to understand a text; for ordinary prose it typically falls between 0 and 20.

There seem to be a number of observations that score too high on the FOG index, which indicates some sort of fault in the data. How to address these faulty scores will be decided later on; one option is to set them to the mean. For now, the cells below cap them at 20.

In [9]:
# For each FOG column, print how many rows score above 20.
for fog_col in ('FOG_MDA', 'FOG_Q1', 'FOG_Q2', 'FOG_Q3', 'FOG_Q4'):
    print(int((df[fog_col] > 20).sum()))

129
363
243
216
238


In [10]:
# Flag, per FOG column, the rows whose score exceeds 20 ...
mask1, mask2, mask3, mask4, mask5 = (
    df[fog_col].gt(20)
    for fog_col in ('FOG_MDA', 'FOG_Q1', 'FOG_Q2', 'FOG_Q3', 'FOG_Q4')
)

# ... and overwrite exactly those scores with 20; every other value is kept.
df['FOG_MDA'] = df['FOG_MDA'].mask(mask1, 20)
df['FOG_Q1'] = df['FOG_Q1'].mask(mask2, 20)
df['FOG_Q2'] = df['FOG_Q2'].mask(mask3, 20)
df['FOG_Q3'] = df['FOG_Q3'].mask(mask4, 20)
df['FOG_Q4'] = df['FOG_Q4'].mask(mask5, 20)

In [11]:
# Summary statistics of the five FOG columns after the cap at 20.
df.loc[:, ['FOG_MDA', 'FOG_Q1', 'FOG_Q2', 'FOG_Q3', 'FOG_Q4']].describe()

Unnamed: 0,FOG_MDA,FOG_Q1,FOG_Q2,FOG_Q3,FOG_Q4
count,5788.0,5788.0,5788.0,5788.0,5788.0
mean,14.922538,12.538709,12.276501,12.077241,12.039986
std,1.734926,2.792045,2.545646,2.403117,2.499313
min,10.99,7.19,7.03,7.33,7.1
25%,13.67,10.5875,10.57,10.51,10.43
50%,14.58,11.81,11.65,11.53,11.41
75%,15.79,13.64,13.29,12.94,12.88
max,20.0,20.0,20.0,20.0,20.0


## Calculate Length (# Words)

In [12]:
def count_words(text, include_punctuation=True):
    """Count the tokens that NLTK's ``word_tokenize`` produces for ``text``.

    ``word_tokenize`` emits punctuation marks as separate tokens, so the
    default result is a token count and is larger than a strict word count.

    Args:
        text: The string to tokenize.
        include_punctuation: When True (the default, matching the original
            behaviour) every token is counted. When False, only tokens that
            contain at least one letter or digit are counted, which leaves out
            punctuation-only tokens such as "," or ".".

    Returns:
        The number of counted tokens as an int.
    """
    tokens = nltk.word_tokenize(text)
    if include_punctuation:
        return len(tokens)
    # A token is treated as a word when it has at least one alphanumeric character.
    return sum(1 for token in tokens if any(ch.isalnum() for ch in token))

In [13]:
# Count tokens in each text column with count_words and store the result in
# the matching Length_* column (same column order as before), then display
# the frame.
for length_col, text_col in (
    ('Length_MDA', 'mda'),
    ('Length_Q1', 'transcript_qQ1'),
    ('Length_Q2', 'transcript_qQ2'),
    ('Length_Q3', 'transcript_qQ3'),
    ('Length_Q4', 'transcript_qQ4'),
):
    df[length_col] = df[text_col].apply(count_words)
df

Unnamed: 0,cik,year,transcript_qQ1,transcript_qQ2,transcript_qQ3,transcript_qQ4,mda,FOG_MDA,FOG_Q1,FOG_Q2,FOG_Q3,FOG_Q4,Length_MDA,Length_Q1,Length_Q2,Length_Q3,Length_Q4
0,1800.0,2008,"Well, it’s certainly a key strategic priority....",Yes. I’ll let John take that question Phil. Su...,"Good morning. Certainly, I think that's possib...","Yes, we went through a fairly rigorous self-ev...",Financial Review Abbotts revenues are derived ...,13.20,12.52,14.74,11.80,20.00,10407,8971,9949,8877,7703
1,1800.0,2010,Thanks Tom. This morning I will review the per...,"No, in my remarks, I said that we would be res...","Yes, Mike you had basically the same question....",Okay. I think you said that Piramal was around...,Financial Review Abbotts revenues are derived ...,13.47,13.95,10.23,15.42,10.84,12934,10250,8910,9860,13648
2,1800.0,2011,"No, they're shared R&D on that program. Rick, ...","I don't have that at my fingertips here, Jami....",You're right about overall by the internationa...,"Thanks, Miles. We're very pleased at how we en...",Financial Review Abbotts revenues are derived ...,13.63,10.46,9.91,10.16,10.48,16226,9262,8435,12882,10732
3,1800.0,2012,"Yes, that's probably around right. That's prob...","Good morning, and thanks for joining us. Also ...","Thanks, Miles. Today, we reported ongoing dilu...","Thanks, Miles. Before I review our financial p...",Financial Review Abbotts revenues are derived ...,13.87,9.78,12.73,12.39,9.57,14833,8862,10139,16087,12548
4,1800.0,2013,"Well, in the U.S., first of all, I would say i...","Well, thank you and thank you all for your que...","Okay. Thanks, Brian. Good morning. This mornin...","Okay. Thanks, Brian, good morning. This mornin...",Financial Review Abbotts revenues are derived ...,13.56,9.48,9.50,10.43,11.71,12528,10853,12244,12293,11382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5789,1561627.0,2013,"Thank you, Melissa, and good morning, everyone...","Thank you, Manny, and good morning, everyone. ...",Yes. We are also focusing on some of the carbo...,"Thanks, David, and good morning, everyone. I'm...","dollars in thousands, except per share amounts...",14.06,9.96,10.17,11.32,9.60,11146,9461,8888,7620,11507
5790,1564822.0,2013,"I think when we think about M&A, we think abo...","No, I think there's probably less potential fo...",We absolute do. We have been updating but eve...,Sure. Our long-term target is to grow in line ...,included elsewhere in this Form 10 K.($ in mil...,17.77,12.51,11.15,11.86,11.61,21029,7761,13910,8249,20266
5791,1564902.0,2013,"Thanks, Gene, and thank you to everyone on the...","Thanks, Jim, and good afternoon, everyone. Bef...","Thanks, Kelsey. Good afternoon, everyone, and ...",So I'll ask Jim Heaney to comment a little bit...,The following discussion contains managements ...,16.18,10.50,10.37,10.22,10.98,11786,5867,7906,7274,6633
5792,1569134.0,2013,"Okay. So by my count, we've now told you a cou...","Thank you, Jennifer. Good afternoon, everyone....","Thanks, and good afternoon. As David mentioned...",But having said that to the extend that we jus...,Liquidity and Capital Resources Overview Initi...,17.60,8.53,14.64,12.94,13.88,4254,4869,3751,4693,4575


In [14]:
# Summary statistics of the five Length columns.
df.loc[:, ['Length_MDA', 'Length_Q1', 'Length_Q2', 'Length_Q3', 'Length_Q4']].describe()

Unnamed: 0,Length_MDA,Length_Q1,Length_Q2,Length_Q3,Length_Q4
count,5788.0,5788.0,5788.0,5788.0,5788.0
mean,15773.333967,6147.081721,6455.010021,6730.585867,6828.710781
std,9745.249309,2372.624522,2423.806186,2722.509753,2566.081584
min,277.0,962.0,998.0,1235.0,1013.0
25%,9888.75,4460.75,4767.5,4880.75,5085.5
50%,13467.0,6014.5,6329.5,6538.0,6687.5
75%,18860.25,7615.25,7912.0,8152.0,8237.0
max,107700.0,30903.0,26404.0,35326.0,30518.0


## Exporting the Features

In [15]:
# Keep only the keys and the derived features: remove the raw text columns,
# then write the remaining columns to CSV without the index.
df_fin = df.drop(
    labels=[
        'transcript_qQ1',
        'transcript_qQ2',
        'transcript_qQ3',
        'transcript_qQ4',
        'mda',
    ],
    axis='columns',
)
df_fin.to_csv('read_features.csv', index=False)

In [16]:
# Summary statistics of every numeric column in the exported feature frame.
df_fin.describe()

Unnamed: 0,cik,year,FOG_MDA,FOG_Q1,FOG_Q2,FOG_Q3,FOG_Q4,Length_MDA,Length_Q1,Length_Q2,Length_Q3,Length_Q4
count,5788.0,5788.0,5788.0,5788.0,5788.0,5788.0,5788.0,5788.0,5788.0,5788.0,5788.0,5788.0
mean,898104.3,2010.943849,14.922538,12.538709,12.276501,12.077241,12.039986,15773.333967,6147.081721,6455.010021,6730.585867,6828.710781
std,425161.5,1.609478,1.734926,2.792045,2.545646,2.403117,2.499313,9745.249309,2372.624522,2423.806186,2722.509753,2566.081584
min,1800.0,2008.0,10.99,7.19,7.03,7.33,7.1,277.0,962.0,998.0,1235.0,1013.0
25%,805264.0,2010.0,13.67,10.5875,10.57,10.51,10.43,9888.75,4460.75,4767.5,4880.75,5085.5
50%,1013857.0,2011.0,14.58,11.81,11.65,11.53,11.41,13467.0,6014.5,6329.5,6538.0,6687.5
75%,1158871.0,2012.0,15.79,13.64,13.29,12.94,12.88,18860.25,7615.25,7912.0,8152.0,8237.0
max,1575571.0,2013.0,20.0,20.0,20.0,20.0,20.0,107700.0,30903.0,26404.0,35326.0,30518.0
