In [2]:
import pandas as pd
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Importing one dataset at a time

Due to the large amount of data, one csv is imported and cleaned at a time. All files undergo the same preprocessing steps, and are concatenated at the end of this notebook to create one final file.

In [305]:
# trs = Transcript Full Texts, det = Transcript Details
trs = pd.read_csv("T1600001.csv", nrows=3268144)
det = pd.read_csv("D1600001.csv")

In [306]:
print(len(det))
print(len(trs))

117086
3268144


## Preprocessing steps - Details

In [307]:
# Filtering to keep only the earnings call transcripts (keydeveventtypeid == 48)
det = det[det["keydeveventtypeid"] == 48]

In [308]:
# Keep only the useful columns
det = det[['companyid','keydevid','transcriptid','headline','mostimportantdateutc']]

In [309]:
# Extract time related variables
det['year'] = det['headline'].str.extract(r'(\d{4})')
det['quarter'] = det['headline'].str.extract('(Q\d|H\d|Q1|Q2)')
det['other'] = det['headline'].str.extract(r'(\w+ months)', flags=re.IGNORECASE)

# Filter out rows where year is missing
det = det[det['year'].notnull()]

# Mutate quarter: H1->Q2, H2->Q4, NA->Q4
det['quarter'] = det['quarter'].map({'Q1': 'Q1', 'Q2': 'Q2', 'Q3': 'Q3', 'Q4': 'Q4'})
det.loc[det['quarter'] == ' H1 ', 'quarter'] = 'Q2'
det.loc[det['quarter'] == ' H2 ', 'quarter'] = 'Q4'
det.loc[(det['quarter'].isna()) & (det['other'].isna()), 'quarter'] = 'Q4'
det.loc[det['other'] == 'Nine Months', 'quarter'] = 'Q3'

# Filter out rows where quarter is missing and drop the other column
det = det[det['quarter'].notnull()].drop('other', axis=1)

# Convert year to numeric type
det['year'] = pd.to_numeric(det['year'])

# Convert quarter to categorical data type
det['quarter'] = det['quarter'].astype('category')

# Convert transcriptid to numeric type
det['transcriptid'] = pd.to_numeric(det['transcriptid'])

In [310]:
# Check companyid-year-quarter duplicate
counts = det.groupby(['companyid', 'year', 'quarter']).size().reset_index(name='count').sort_values('count', ascending=False)

# Keep only distinct rows
det = det.drop_duplicates(subset=['companyid', 'year', 'quarter'], keep='first')

# Convert companyid column to integer type
det['companyid'] = det['companyid'].astype(int)

In [311]:
# Keep only the years 2008 till 2013
det = det[(det['year'] >= 2008) & (det['year'] <= 2013)]

## Merging details and transcripts

In [312]:
df = pd.merge(trs[['transcriptid', 'transcriptcomponenttypeid', 'componenttext']], det[['transcriptid','companyid','mostimportantdateutc','year','quarter']], on='transcriptid')

In [313]:
len(df)

4086

## Preprocessing steps - Transcripts

In [314]:
df = df[df['transcriptcomponenttypeid'].isin([2, 4])]
df = df.drop('transcriptcomponenttypeid', axis=1)

In [315]:
df['year'].unique()

array([2013, 2012, 2011, 2010, 2008, 2009], dtype=int64)

In [316]:
df['quarter'].value_counts()

Q3    738
Q4    632
Q1    484
Q2    330
Name: quarter, dtype: int64

In [317]:
# Group the rows by transcriptid and concatenate the componenttext columns
df['componenttext'] = df['componenttext'].astype(str)
df_combined = df.groupby('transcriptid')['componenttext'].agg(' '.join).reset_index()

In [318]:
df = df.drop(columns=['componenttext'])
df = df.drop_duplicates()
print(len(df))

47


In [319]:
df_combined = pd.merge(df_combined[['transcriptid', 'componenttext']], df[['transcriptid','companyid','year','quarter']], on='transcriptid')

In [320]:
df_combined

Unnamed: 0,transcriptid,componenttext,companyid,year,quarter
0,1618204,"Some swaps, customers move from one aircraft t...",695204,2013,Q2
1,1618206,"So to get back to your point, depending on how...",695204,2012,Q3
2,1618247,"Well we expect to close Flexjet in Q4. Now, op...",695204,2013,Q3
3,1619956,"Sure. Well, there's a couple of different thin...",24568,2013,Q1
4,1619980,"Yes. We signaled that during our Q1 call, beca...",24568,2013,Q2
5,1620029,"Sure, Matt. Well the first thing I will refer ...",24568,2012,Q3
6,1620032,"Yes, now we can. Thanks. Better than having me...",24568,2012,Q4
7,1620050,"So, at this point, Glen, I mean we look at it ...",24568,2012,Q1
8,1620053,Good afternoon and thank you for joining us. I...,24568,2012,Q2
9,1620269,"Thanks, Tom. Before I get into the details, I'...",24568,2013,Q4


In [321]:
# Rename the column to reflect the concatenated text
df_combined = df_combined.rename(columns={'componenttext': 'transcript'})

## Combining the data into a final file

In [322]:
#full_text = df_combined
full_text = pd.concat([full_text, df_combined])

In [326]:
full_text = full_text.drop_duplicates()
len(full_text)

88996

In [323]:
full_text.to_csv('proc_transcripts.csv', index=False)

## Transforming the structure and replacing companykey

In [53]:
# Transforming the dataframe so that it reflects firm-year observations
final_df = pd.pivot_table(full_text, values='transcript', index=['companyid','year'],
                          columns=['quarter'], aggfunc=lambda x: x)
final_df.columns= ['transcript_q' + str(col) for col in final_df.columns]
final_df = final_df.reset_index()

NameError: name 'full_text' is not defined

In [345]:
print(len(final_df))

26927


In [361]:
# Replacing the companyid column with the cik
cid_cik = pd.read_csv('CompanyId to CIK.csv', encoding='utf-8')
cik_dict = dict(zip(cid_cik['companyid'], cid_cik['cik']))
final_df['cik'] = final_df['companyid'].map(cik_dict)
print(len(final_df))

16624


In [363]:
final_df = final_df.dropna()

In [362]:
print('Number of company-year observations with all quarterly transcripts:',(final_df.notna().all(axis=1)).sum())

Number of company-year observations with all quarterly transcripts: 16124


In [364]:
final_df.to_csv('proc_transcripts.csv', index=False)

# Cleaning the Transcripts

In [111]:
tran = pd.read_csv("proc_transcripts.csv")
# I decide to place all transcripts in one column again to be more efficient
tran = tran.drop(columns=['companyid'])
tran = tran.melt(id_vars=['cik','year'], value_name='transcript')
tran = tran.sort_values(['cik', 'year', 'variable'])
tran = tran.reset_index(drop=True)
tran

Unnamed: 0,cik,year,variable,transcript
0,1750.0,2010,transcript_qQ1,"All of the 737s, Jon, and the older generation..."
1,1750.0,2010,transcript_qQ2,"In what aspect? Thank you, Tom, and good morni..."
2,1750.0,2010,transcript_qQ3,"Or higher. Probably higher than that. I mean, ..."
3,1750.0,2010,transcript_qQ4,"As I said, it's a new program. We've only been..."
4,1750.0,2011,transcript_qQ1,"Say it one more time, please. Yes, well, Joe, ..."
...,...,...,...,...
64491,1921034.0,2011,transcript_qQ4,"Good morning, ladies and gentlemen. The year o..."
64492,1921034.0,2013,transcript_qQ1,Good morning. Before talking about the financi...
64493,1921034.0,2013,transcript_qQ2,Can you repeat your question. I didn't underst...
64494,1921034.0,2013,transcript_qQ3,"Good afternoon, everybody. In Slide 12, we wil..."


## Filtering based on Length 

Let's identify and examine the row with the smallest length

In [112]:
lengths = tran['transcript'].str.len()
min_index = lengths.idxmin()
min_row = tran.loc[min_index]

In [113]:
tran['transcript'][min_index]

'Hi Jason.\r\n Sure.\r\n'

This is surely not the whole transcript and something has likely gone wrong in parsing, I decide to delete all rows containing less than 100 characters and examine the smallest length once again. I repeat this proces untill I have found a smallest transcript that seems to be a full transcript with valuable information. 
The final cut-off value is set to a length of 5000. This deletes only 45 transcripts.

In [114]:
tran = tran[tran['transcript'].str.len() >= 5000]

In [115]:
lengths = tran['transcript'].str.len()
min_index = lengths.idxmin()
min_row = tran.loc[min_index]

In [116]:
tran['transcript'][min_index]

"Thank you, John.  We are encouraged by our results in the first quarter.  Comp sales in our two largest categories, music and DVD, has reversed the trends we've seen over the last several years.  The initiatives we implemented for these two categories have helped us stabilize our sales and increase our market share despite operating 23% fewer stores in last year.\r\nWe continue to leverage our SG&A expenses which helped us reduce our EBITDA loss in the first quarter versus last year's first quarter.\r\nWe ended the quarter with cash of $21 million and zero borrowings on our line of credit as compared to the borrowings of $29 million last year.  We amended our credit facility extending it to April 2013 providing us with the capital to refund our business for the next three years.\r\nAs I mentioned earlier, we also hired Mike Honeyman as our President and COO to strengthen our management team and help drive our operating results.  We made progress in the first quarter but we still have 

In [117]:
# As we can see, only 45 transcripts have been excluded
len(tran)

64451

## Adressing inaudible sections in transcripts

Some transcripts need to be altered due to issues (e.g., "inaudible")

In [118]:
print(len(tran[tran['transcript'].str.contains('inaud', na=False)]))

10224


In [119]:
print(len(tran[tran['transcript'].str.contains('(inaudible)', na=False)]))

  print(len(tran[tran['transcript'].str.contains('(inaudible)', na=False)]))


10223


In [132]:
print(len(tran[tran['transcript'].str.contains('indiscern', na=False)]))

4


In [121]:
print(len(tran[tran['transcript'].str.contains('indiscernable', na=False)]))

4


In [122]:
tran['transcript'][36]

"Asulam, yes. Asulam is used on sugar cane, it's not - I mentioned sugar cane before but the product we're talking about the sugar cane is another product.\r\n It's not going to affect anything.\r\n No, no, no -\r\n Hi, Gene.\r\n It's the mark - the harvest for potatoes. This is a post-harvest -\r\n No, not at all; completely different business.\r\n Next question.\r\n What I said was that one of the reasons that business continues to grow is because of the organic pigments. As you know we entered the organic pigment business about three years ago, we moved upstream from intermediates. Magazines - 50% of the organic pigments are used in printings for magazines because the magazines are color magazines and that business continues to go.\r\n No, we are moving international in colors and chemicals in Europe too, absolutely, and also in South Asia to our Singapore office.\r\n Not that I know of, maybe you know more than me, but there is a whole push on generic drugs. My opinion is, I don't 

After inspecting multiple cases that contain ('inaudible') it seems that only a small part of the transcript is inaudible. I therefore decide to delete only these part (e.g. ('inaudible')) instead of the entire transcript

In [123]:
# Deleting these parts of the transcripts
tran['transcript'] = tran['transcript'].str.replace('(inaudible)', '')

  tran['transcript'] = tran['transcript'].str.replace('(inaudible)', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript'] = tran['transcript'].str.replace('(inaudible)', '')


In [124]:
print(len(tran[tran['transcript'].str.contains('inaud', na=False)]))

3


In [125]:
tran[tran['transcript'].str.contains('inaud', na=False)]

Unnamed: 0,cik,year,variable,transcript
26373,911109.0,2011,transcript_qQ2,"Yes, we’ve been doing price increase. We’re vi..."
47866,1253986.0,2010,transcript_qQ3,"Yes, I mean - we have from time to time - what..."
58029,1414932.0,2012,transcript_qQ2,"That’s right.\r\n Sure, I mean number ones are..."


In [126]:
tran['transcript'][26373]
# this transcript contains: ('inaudbile')
tran['transcript'][47866]
# this transcript contains: ('inauduible')
tran['transcript'][58029]
# this transcript contains: ('inaudibe')

# Deleting these parts of the transcripts
tran['transcript'] = tran['transcript'].str.replace('(inaudbile)', '')
tran['transcript'] = tran['transcript'].str.replace('(inauduible)', '')
tran['transcript'] = tran['transcript'].str.replace('(inaudibe)', '')

  tran['transcript'] = tran['transcript'].str.replace('(inaudbile)', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript'] = tran['transcript'].str.replace('(inaudbile)', '')
  tran['transcript'] = tran['transcript'].str.replace('(inauduible)', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript'] = tran['transcript'].str.replace('(inauduible)', '')
  tran['transcript'] = tran['transcript'].str.replace('(inaudibe)', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] =

In [127]:
print(len(tran[tran['transcript'].str.contains('inaud', na=False)]))

0


In [97]:
tran[tran['transcript'].str.contains('indiscernable', na=False)]

Unnamed: 0,cik,year,variable,transcript
9592,110536.0,2011,transcript_qQ1,"Good morning, Jill, and thank you. And thank y..."
19893,849869.0,2010,transcript_qQ2,"Thank you Malcolm. Welcome, everyone, to our S..."
24926,896159.0,2011,transcript_qQ3,"['No. No, big moves from any particular catast..."
52272,1328650.0,2009,transcript_qQ1,"Yes. We had part of it was our, on average our..."


In [101]:
tran['transcript'][52272]

'Yes. We had part of it was our, on average our borrowings, our balance was higher and then we had some interest forgiveness in Q4 related to some sales and tax issues that actually made it a little lower for Q4. So I think in fact, Marshall I think asked a question on interest expense that I never got to, but I think that the balance, where it is right now, is probably not likely to change much going forward. And our credit facility is LIBOR plus 400. So, if that\'s helpful. We had two terminate during the quarter. That had the effect of, as I mentioned in the prepared remarks, about $1300 a day on our revenue per day and our margins per day. It\'s a little over 3 million in gross proceeds. In terms of our contract situation going forward, it\'s explicit in our monthly operational releases for the remainder of \'09, I think we\'ve got about 1760 roughly days on the term contracts. And so, that\'s where we sit in terms of term contracts for the rest of this year. Marshall, I think that

In [129]:
# [indiscernible], [indiscernable] 
# Deleting these parts of the transcripts
tran['transcript'] = tran['transcript'].str.replace('indiscernible', '')
tran['transcript'] = tran['transcript'].str.replace('indiscernable', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript'] = tran['transcript'].str.replace('indiscernible', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript'] = tran['transcript'].str.replace('indiscernable', '')


In [130]:
print(len(tran[tran['transcript'].str.contains('indiscern', na=False)]))

4


In [134]:
tran[tran['transcript'].str.contains('indiscern', na=False)]

Unnamed: 0,cik,year,variable,transcript
6890,77281.0,2011,transcript_qQ3,"[""Yeah. Obviously not as high. It's unusual to..."
20292,855654.0,2012,transcript_qQ1,"[""It is the basis - well, ALL can be a very ag..."
60442,1487326.0,2012,transcript_qQ3,"Thanks, Mick. Looking at Page 27, our revenue ..."
63689,1725526.0,2012,transcript_qQ2,"Well, again, if I alluded to -- we have not fi..."


In [139]:
tran['transcript'][6890]
# this transcript contains: ('indiscernbile')
tran['transcript'][20292]
# this transcript contains: ('indiscerniable')
tran['transcript'][60442]
# this transcript contains: ('indiscernbible')
tran['transcript'][63689]
# this transcript contains: ('indiscernibly')

# Deleting these parts of the transcripts
tran['transcript'] = tran['transcript'].str.replace('(indiscernbile)', '')
tran['transcript'] = tran['transcript'].str.replace('(indiscerniable)', '')
tran['transcript'] = tran['transcript'].str.replace('(indiscernbible)', '')
tran['transcript'] = tran['transcript'].str.replace('(indiscernibly)', '')

  tran['transcript'] = tran['transcript'].str.replace('(indiscernbile)', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript'] = tran['transcript'].str.replace('(indiscernbile)', '')
  tran['transcript'] = tran['transcript'].str.replace('(indiscerniable)', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript'] = tran['transcript'].str.replace('(indiscerniable)', '')
  tran['transcript'] = tran['transcript'].str.replace('(indiscernbible)', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

The content of the transcripts seem alright for us to continue to the next phase.

## Length, FOG Index, and Topic Clustering (?) data

As the length and the FOG Index can, or must, be calculated usting the pre-lemmatized data (must be able to detect sentences) I use less pre processing on the data for these features.

I will however delete or replace the following signs in order to create written sentences:
1. "
2. --
3. \
4. '
5. [
6. ]
7. \r
8. \n

In [156]:
tran['transcript'] = tran['transcript'].str.replace('\n', '')
tran['transcript'] = tran['transcript'].str.replace('\r', '')
tran['transcript'] = tran['transcript'].str.replace('[', '')
tran['transcript'] = tran['transcript'].str.replace(']', '')
tran['transcript'] = tran['transcript'].str.replace("''", '')
tran['transcript'] = tran['transcript'].str.replace('"', '')
tran['transcript'] = tran['transcript'].str.replace('--', '')
tran['transcript'] = tran['transcript'].str.replace("\\", '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript'] = tran['transcript'].str.replace('\n', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript'] = tran['transcript'].str.replace('\r', '')
  tran['transcript'] = tran['transcript'].str.replace('[', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript']

Now transform and export the data again

In [158]:
# Transforming the dataframe so that it reflects firm-year observations
read_df = pd.pivot_table(tran, values='transcript', index=['cik','year'],
                          columns=['variable'], aggfunc=lambda x: x)
read_df = read_df.reset_index()

In [180]:
read_df.to_csv('tran_read_data.csv', index='False')

## Sentiment and Topic Clustering(?)

To calculate the sentiment scores and cluster the topics, I will remove stopwords and signs and lemmatize the words of the transcripts (I use lemmatizing as it holds more information on the context when it is compared to stemming).

In [166]:
# import the module and create a list of stopwords
swords = stopwords.words('english')

# Replace all signs and stopwords in transcripts and tokenize transcripts
tran['transcript'] = tran['transcript'].str.lower()\
          .str.replace('(@[a-z0-9]+)\w+',' ')\
          .str.replace('(http\S+)', ' ')\
          .str.replace('([^0-9a-z \t])',' ')\
          .str.replace(' +',' ')\
          .apply(lambda x: [i for i in x.split() if not i in swords])

  tran['transcript'] = tran['transcript'].str.lower()\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript'] = tran['transcript'].str.lower()\


In [173]:
# Lemmatizing
ltzr = WordNetLemmatizer()
tran['transcript'] = tran['transcript'].apply(lambda x: [ltzr.lemmatize(i) for i in x if i != ''])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tran['transcript'] = tran['transcript'].apply(lambda x: [ltzr.lemmatize(i) for i in x if i != ''])


In [176]:
# Transforming the dataframe so that it reflects firm-year observations
sent_df = pd.pivot_table(tran, values='transcript', index=['cik','year'],
                          columns=['variable'], aggfunc=lambda x: x)
sent_df = sent_df.reset_index()

In [177]:
sent_df.to_csv('tran_sent_data.csv', index='False')

## Extracting the unique combinations of year-firm observations

In [181]:
year_cik = read_df[['cik','year']]
year_cik = year_cik.drop_duplicates()

In [182]:
year_cik.to_csv('year_cik_transcripts.csv')