In [1]:
import pandas as pd
import time
import re
import nltk
import gensim
import gensim.corpora as corpora
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from collections import Counter

## Reading the Data

In [2]:
# Load the two input tables (transcript data and MD&A data) from the working directory.
tran, mda = (pd.read_csv(csv_name) for csv_name in ("tran_sent_data.csv", 'mda_sent_data.csv'))

In [3]:
# 'Unnamed: 0' is the unnamed leading column carried in both CSV files; discard it.
tran = tran.drop('Unnamed: 0', axis=1)
mda = mda.drop('Unnamed: 0', axis=1)

In [4]:
# Join the two tables on (year, cik) and renumber the rows from 0.
df = tran.merge(mda, on=['year','cik']).reset_index(drop=True)
# Report the total row count, then the count of rows without any missing value.
print(len(df), len(df.dropna()), sep='\n')

5794
5788


In [5]:
# Keep only the rows in which every column is populated, printing the
# row count before and after the filter.
print(df.shape[0])
df = df[df.notna().all(axis=1)]
print(df.shape[0])

5794
5788


## Formatting the Text

In [6]:
# Helper that strips list-literal punctuation so a value becomes plain text
def remove_chars(s):
    """Return ``s`` with every '[', ']', "'" and ',' character removed.

    All other characters, including whitespace, are left untouched.
    """
    # A single translate pass deletes the same four characters that the
    # chained replace calls would remove one after another.
    return s.translate(str.maketrans('', '', "[]',"))

# Clean every text column with remove_chars, one column at a time.
for text_col in ('transcript_qQ1', 'transcript_qQ2', 'transcript_qQ3', 'transcript_qQ4', 'mda'):
    df[text_col] = df[text_col].apply(remove_chars)

I will remove words that occur only once, add bigrams (those that appear at least 50 times) to the documents, and filter out words that occur in fewer than 50 documents or in more than 50% of the documents.

In [7]:
# Vocabulary filter for the 'mda' column.
# Split each document into whitespace-separated tokens.
df['mda'] = df['mda'].apply(lambda x: x.split())

# count the frequency of each word across the whole column
word_freq = Counter(word for words in df['mda'] for word in words)

# filter out words that appear only once
word_freq = {word: freq for word, freq in word_freq.items() if freq > 1}

# Document frequency: the number of rows each surviving, non-numeric word
# appears in. set(words) makes a word count at most once per row.
# The column is iterated directly because Series.iteritems() was removed in
# pandas 2.0, and only the per-word row count is needed, not the row labels.
word_rows = Counter(
    word
    for words in df['mda']
    for word in set(words)
    if word in word_freq and not word.isdigit()
)

# keep words that appear in at least 50 rows and in at most 50% of the rows
min_rows = 50
max_rows = len(df) / 2
remaining_words = {word for word, n_rows in word_rows.items() if min_rows <= n_rows <= max_rows}

# join the remaining words into a single string for each row
df['mda'] = df['mda'].apply(lambda x: ' '.join(word for word in x if word in remaining_words))

In [8]:
# Vocabulary filter for the 'transcript_qQ1' column.
# Split each document into whitespace-separated tokens.
df['transcript_qQ1'] = df['transcript_qQ1'].apply(lambda x: x.split())

# count the frequency of each word across the whole column
word_freq = Counter(word for words in df['transcript_qQ1'] for word in words)

# filter out words that appear only once
word_freq = {word: freq for word, freq in word_freq.items() if freq > 1}

# Document frequency: the number of rows each surviving, non-numeric word
# appears in. set(words) makes a word count at most once per row.
# The column is iterated directly because Series.iteritems() was removed in
# pandas 2.0, and only the per-word row count is needed, not the row labels.
word_rows = Counter(
    word
    for words in df['transcript_qQ1']
    for word in set(words)
    if word in word_freq and not word.isdigit()
)

# keep words that appear in at least 50 rows and in at most 50% of the rows
min_rows = 50
max_rows = len(df) / 2
remaining_words = {word for word, n_rows in word_rows.items() if min_rows <= n_rows <= max_rows}

# join the remaining words into a single string for each row
df['transcript_qQ1'] = df['transcript_qQ1'].apply(lambda x: ' '.join(word for word in x if word in remaining_words))

In [9]:
# Vocabulary filter for the 'transcript_qQ2' column.
# Split each document into whitespace-separated tokens.
df['transcript_qQ2'] = df['transcript_qQ2'].apply(lambda x: x.split())

# count the frequency of each word across the whole column
word_freq = Counter(word for words in df['transcript_qQ2'] for word in words)

# filter out words that appear only once
word_freq = {word: freq for word, freq in word_freq.items() if freq > 1}

# Document frequency: the number of rows each surviving, non-numeric word
# appears in. set(words) makes a word count at most once per row.
# The column is iterated directly because Series.iteritems() was removed in
# pandas 2.0, and only the per-word row count is needed, not the row labels.
word_rows = Counter(
    word
    for words in df['transcript_qQ2']
    for word in set(words)
    if word in word_freq and not word.isdigit()
)

# keep words that appear in at least 50 rows and in at most 50% of the rows
min_rows = 50
max_rows = len(df) / 2
remaining_words = {word for word, n_rows in word_rows.items() if min_rows <= n_rows <= max_rows}

# join the remaining words into a single string for each row
df['transcript_qQ2'] = df['transcript_qQ2'].apply(lambda x: ' '.join(word for word in x if word in remaining_words))

In [10]:
# Vocabulary filter for the 'transcript_qQ3' column.
# Split each document into whitespace-separated tokens.
df['transcript_qQ3'] = df['transcript_qQ3'].apply(lambda x: x.split())

# count the frequency of each word across the whole column
word_freq = Counter(word for words in df['transcript_qQ3'] for word in words)

# filter out words that appear only once
word_freq = {word: freq for word, freq in word_freq.items() if freq > 1}

# Document frequency: the number of rows each surviving, non-numeric word
# appears in. set(words) makes a word count at most once per row.
# The column is iterated directly because Series.iteritems() was removed in
# pandas 2.0, and only the per-word row count is needed, not the row labels.
word_rows = Counter(
    word
    for words in df['transcript_qQ3']
    for word in set(words)
    if word in word_freq and not word.isdigit()
)

# keep words that appear in at least 50 rows and in at most 50% of the rows
min_rows = 50
max_rows = len(df) / 2
remaining_words = {word for word, n_rows in word_rows.items() if min_rows <= n_rows <= max_rows}

# join the remaining words into a single string for each row
df['transcript_qQ3'] = df['transcript_qQ3'].apply(lambda x: ' '.join(word for word in x if word in remaining_words))

In [11]:
# Vocabulary filter for the 'transcript_qQ4' column.
# Split each document into whitespace-separated tokens.
df['transcript_qQ4'] = df['transcript_qQ4'].apply(lambda x: x.split())

# count the frequency of each word across the whole column
word_freq = Counter(word for words in df['transcript_qQ4'] for word in words)

# filter out words that appear only once
word_freq = {word: freq for word, freq in word_freq.items() if freq > 1}

# Document frequency: the number of rows each surviving, non-numeric word
# appears in. set(words) makes a word count at most once per row.
# The column is iterated directly because Series.iteritems() was removed in
# pandas 2.0, and only the per-word row count is needed, not the row labels.
word_rows = Counter(
    word
    for words in df['transcript_qQ4']
    for word in set(words)
    if word in word_freq and not word.isdigit()
)

# keep words that appear in at least 50 rows and in at most 50% of the rows
min_rows = 50
max_rows = len(df) / 2
remaining_words = {word for word, n_rows in word_rows.items() if min_rows <= n_rows <= max_rows}

# join the remaining words into a single string for each row
df['transcript_qQ4'] = df['transcript_qQ4'].apply(lambda x: ' '.join(word for word in x if word in remaining_words))

In [14]:
# Renumber the rows 0..n-1; the earlier removal of incomplete rows left gaps
# in the index labels.
df = df.reset_index(drop=True)
# Preview the cleaned frame (identifier columns plus the five filtered text columns).
df

Unnamed: 0,cik,year,transcript_qQ1,transcript_qQ2,transcript_qQ3,transcript_qQ4,mda
0,1800.0,2008,priority received arrangement co develop abbot...,john phil wholesaler buying affected noted str...,possible division profitability bp pleasantly ...,went fairly rigorous self evaluation outside t...,broad health care patent protection license te...
1,1800.0,2010,tom pharmaceutical medical global pharmaceutic...,responding fda shortly meet fda okay europe pr...,mike basically raise fx lift somewhat settled ...,okay india healthy fit western tom track forec...,broad health care patent protection license te...
2,1800.0,2011,shared r rick went phase space rapidly watch h...,fingertip smaller mention mike went generic de...,international piece doubled size five double d...,mile delivering double digit eps original eps ...,broad health care patent protection license te...
3,1800.0,2012,closer john news delivering double digit ongoi...,tom vice finance larry divisional vice relatio...,mile operational unfavorable emerging remained...,mile explain handle transition pre separation ...,broad health care patent protection license te...
4,1800.0,2013,rare circumstance glad rely realize sound rati...,concludes replay central relation www afternoo...,okay brian enabled deliver double digit despit...,okay brian achieved representing double digit ...,broad health care patent protection license te...
...,...,...,...,...,...,...,...
5783,1561627.0,2013,appreciate chairman ceo dave burn john reviewi...,appreciate chairman david burn john reviewing ...,focusing carbon carbon play feature proving pr...,david happy slide absorption recurring stop re...,thereto constitute meaning 27a 21e cautionary ...
5784,1564822.0,2013,proactive initiate conversation reactive wait ...,remember piece specialty portfolio private lab...,absolute updating everything integration gone ...,category accelerated translated absolute chang...,week week week week week basic declared shareh...
5785,1564902.0,2013,gene entertainment excited public completion i...,jim afternoon onetime initial public offering ...,afternoon webcast live jim jim released closed...,ask jim attendance reference jamie afternoon r...,thereto supplementary numerous carefully speci...
5786,1569134.0,2013,okay count told urge annualize proxy sound eve...,jennifer afternoon appreciate afternoon david ...,afternoon david ebitda dcf compare distributio...,extend west afternoon everybody jump public ty...,midstream detail strategically hydrocarbon bas...


I set the maximum number of topics to 40, based on Li's paper. However, if the algorithm cannot find 40 suitable topics, it will keep fewer.

## Topics Q1

In [15]:
# Applying BERTopic with its default values (transcripts Q1), capped at 40 topics.
# The UMAP step is stochastic, so an unseeded run yields different topic ids on
# every execution. The UMAP model below mirrors BERTopic's built-in default
# configuration and only adds a fixed random_state so the run is reproducible.
start = time.time()
topic_model1 = BERTopic(
    nr_topics=40,
    verbose=True,
    calculate_probabilities=False,
    umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine',
                    low_memory=False, random_state=42),
)
# fit_transform returns a (topics, probabilities) pair; the per-document topic
# ids are therefore read downstream as topics1[0].
topics1 = topic_model1.fit_transform(df['transcript_qQ1'])
end = time.time()
# Elapsed wall-clock time of the fit, in seconds.
print(end-start)

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

2023-03-28 06:28:23,833 - BERTopic - Transformed documents to Embeddings
2023-03-28 06:29:19,246 - BERTopic - Reduced dimensionality
2023-03-28 06:29:19,914 - BERTopic - Clustered reduced embeddings
2023-03-28 06:31:06,785 - BERTopic - Reduced number of topics from 61 to 40


31786.523812770844


In [16]:
# Summary table of the fitted Q1 topics (columns: Topic, Count, Name).
topic_model1.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2082,-1_production_america_north_client
1,0,1000,0_patient_study_clinical_trial
2,1,389,1_network_wireless_design_mobile
3,2,243,2_client_solution_cloud_enterprise
4,3,203,3_store_brand_retail_consumer
5,4,192,4_europe_america_north_currency
6,5,186,5_energy_utility_power_gas
7,6,186,6_oil_drilling_barrel_production
8,7,112,7_advertising_digital_station_game
9,8,88,8_fuel_truck_intermodal_coal


In [17]:
# Pair every Q1 transcript with the topic id assigned to it.
df_q1 = df['transcript_qQ1'].to_frame(name="Document").assign(Topic_Q1=topics1[0])

In [18]:
# Preview the Q1 document/topic pairs.
df_q1

Unnamed: 0,Document,Topic_Q1
0,priority received arrangement co develop abbot...,0
1,tom pharmaceutical medical global pharmaceutic...,0
2,shared r rick went phase space rapidly watch h...,0
3,closer john news delivering double digit ongoi...,0
4,rare circumstance glad rely realize sound rati...,0
...,...,...
5783,appreciate chairman ceo dave burn john reviewi...,10
5784,proactive initiate conversation reactive wait ...,3
5785,gene entertainment excited public completion i...,-1
5786,okay count told urge annualize proxy sound eve...,17


In [19]:
# Write the Q1 document/topic pairs to disk; index=False leaves the row index out of the file.
df_q1.to_csv('topics_q1.csv', index=False)

## Topics Q2

In [20]:
# Applying BERTopic with its default values (transcripts Q2), capped at 40 topics.
# The UMAP step is stochastic, so an unseeded run yields different topic ids on
# every execution. The UMAP model below mirrors BERTopic's built-in default
# configuration and only adds a fixed random_state so the run is reproducible.
start = time.time()
topic_model2 = BERTopic(
    nr_topics=40,
    verbose=True,
    calculate_probabilities=False,
    umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine',
                    low_memory=False, random_state=42),
)
# fit_transform returns a (topics, probabilities) pair; the per-document topic
# ids are therefore read downstream as topics2[0].
topics2 = topic_model2.fit_transform(df['transcript_qQ2'])
end = time.time()
# Elapsed wall-clock time of the fit, in seconds.
print(end-start)

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

2023-03-28 15:19:34,614 - BERTopic - Transformed documents to Embeddings
2023-03-28 15:20:02,001 - BERTopic - Reduced dimensionality
2023-03-28 15:20:02,588 - BERTopic - Clustered reduced embeddings
2023-03-28 15:21:52,692 - BERTopic - Reduced number of topics from 70 to 40


31843.76603770256


In [21]:
# Summary table of the fitted Q2 topics (columns: Topic, Count, Name).
topic_model2.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2048,-1_brand_consumer_europe_client
1,0,959,0_patient_study_clinical_trial
2,1,496,1_rig_drilling_oil_production
3,2,257,2_network_cloud_solution_mobile
4,3,183,3_europe_america_currency_asia
5,4,179,4_energy_utility_power_transmission
6,5,152,5_design_semiconductor_wafer_q3
7,6,117,6_store_brand_consumer_retail
8,7,108,7_vehicle_car_store_truck
9,8,100,8_steel_mill_ton_paper


In [22]:
# Pair every Q2 transcript with the topic id assigned to it.
df_q2 = df['transcript_qQ2'].to_frame(name="Document").assign(Topic_Q2=topics2[0])

In [23]:
# Preview the Q2 document/topic pairs.
df_q2

Unnamed: 0,Document,Topic_Q2
0,john phil wholesaler buying affected noted str...,0
1,responding fda shortly meet fda okay europe pr...,0
2,fingertip smaller mention mike went generic de...,0
3,tom vice finance larry divisional vice relatio...,0
4,concludes replay central relation www afternoo...,0
...,...,...
5783,appreciate chairman david burn john reviewing ...,13
5784,remember piece specialty portfolio private lab...,17
5785,jim afternoon onetime initial public offering ...,14
5786,jennifer afternoon appreciate afternoon david ...,-1


In [24]:
# Write the Q2 document/topic pairs to disk; index=False leaves the row index out of the file.
df_q2.to_csv('topics_q2.csv', index=False)

## Topics Q3

In [25]:
# Applying BERTopic with its default values (transcripts Q3), capped at 40 topics.
# The UMAP step is stochastic, so an unseeded run yields different topic ids on
# every execution. The UMAP model below mirrors BERTopic's built-in default
# configuration and only adds a fixed random_state so the run is reproducible.
start = time.time()
topic_model3 = BERTopic(
    nr_topics=40,
    verbose=True,
    calculate_probabilities=False,
    umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine',
                    low_memory=False, random_state=42),
)
# fit_transform returns a (topics, probabilities) pair; the per-document topic
# ids are therefore read downstream as topics3[0].
topics3 = topic_model3.fit_transform(df['transcript_qQ3'])
end = time.time()
# Elapsed wall-clock time of the fit, in seconds.
print(end-start)

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

2023-03-29 00:28:51,420 - BERTopic - Transformed documents to Embeddings
2023-03-29 00:29:08,189 - BERTopic - Reduced dimensionality
2023-03-29 00:29:08,895 - BERTopic - Clustered reduced embeddings
2023-03-29 00:31:09,666 - BERTopic - Reduced number of topics from 66 to 40


32954.18380069733


In [26]:
# Summary table of the fitted Q3 topics (columns: Topic, Count, Name).
topic_model3.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2193,-1_production_ebitda_north_europe
1,0,959,0_patient_study_clinical_trial
2,1,443,1_network_mobile_q4_solution
3,2,385,2_store_brand_consumer_retail
4,3,209,3_barrel_oil_gas_production
5,4,145,4_utility_energy_power_electric
6,5,139,5_vehicle_truck_car_fleet
7,6,122,6_slide_billion_backlog_john
8,7,95,7_aircraft_airline_fuel_airplane
9,8,92,8_rig_drilling_equipment_fleet


In [27]:
# Pair every Q3 transcript with the topic id assigned to it.
df_q3 = df['transcript_qQ3'].to_frame(name="Document").assign(Topic_Q3=topics3[0])

In [28]:
# Preview the Q3 document/topic pairs.
df_q3

Unnamed: 0,Document,Topic_Q3
0,possible division profitability bp pleasantly ...,0
1,mike basically raise fx lift somewhat settled ...,0
2,international piece doubled size five double d...,0
3,mile operational unfavorable emerging remained...,-1
4,okay brian enabled deliver double digit despit...,0
...,...,...
5783,focusing carbon carbon play feature proving pr...,-1
5784,absolute updating everything integration gone ...,-1
5785,afternoon webcast live jim jim released closed...,10
5786,afternoon david ebitda dcf compare distributio...,-1


In [29]:
# Write the Q3 document/topic pairs to disk; index=False leaves the row index out of the file.
df_q3.to_csv('topics_q3.csv', index=False)

## Topics Q4

In [30]:
# Applying BERTopic with its default values (transcripts Q4), capped at 40 topics.
# The UMAP step is stochastic, so an unseeded run yields different topic ids on
# every execution. The UMAP model below mirrors BERTopic's built-in default
# configuration and only adds a fixed random_state so the run is reproducible.
start = time.time()
topic_model4 = BERTopic(
    nr_topics=40,
    verbose=True,
    calculate_probabilities=False,
    umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine',
                    low_memory=False, random_state=42),
)
# fit_transform returns a (topics, probabilities) pair; the per-document topic
# ids are therefore read downstream as topics4[0].
topics4 = topic_model4.fit_transform(df['transcript_qQ4'])
end = time.time()
# Elapsed wall-clock time of the fit, in seconds.
print(end-start)

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

2023-03-29 10:21:01,110 - BERTopic - Transformed documents to Embeddings
2023-03-29 10:21:14,626 - BERTopic - Reduced dimensionality
2023-03-29 10:21:15,191 - BERTopic - Clustered reduced embeddings
2023-03-29 10:22:57,934 - BERTopic - Reduced number of topics from 63 to 40


35505.43538880348


In [31]:
# Summary table of the fitted Q4 topics (columns: Topic, Count, Name).
topic_model4.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1990,-1_america_europe_ebitda_slide
1,0,1005,0_patient_study_clinical_trial
2,1,544,1_oil_rig_drilling_gas
3,2,331,2_network_cloud_mobile_solution
4,3,286,3_brand_store_consumer_retail
5,4,166,4_utility_energy_power_gas
6,5,143,5_america_currency_europe_brazil
7,6,110,6_client_fund_solution_firm
8,7,97,7_coal_ton_mine_production
9,8,95,8_aircraft_airplane_airline_fuel


In [32]:
# Pair every Q4 transcript with the topic id assigned to it.
df_q4 = df['transcript_qQ4'].to_frame(name="Document").assign(Topic_Q4=topics4[0])

In [33]:
# Preview the Q4 document/topic pairs.
df_q4

Unnamed: 0,Document,Topic_Q4
0,went fairly rigorous self evaluation outside t...,0
1,okay india healthy fit western tom track forec...,0
2,mile delivering double digit eps original eps ...,0
3,mile explain handle transition pre separation ...,0
4,okay brian achieved representing double digit ...,0
...,...,...
5783,david happy slide absorption recurring stop re...,-1
5784,category accelerated translated absolute chang...,-1
5785,ask jim attendance reference jamie afternoon r...,20
5786,extend west afternoon everybody jump public ty...,1


In [34]:
# Write the Q4 document/topic pairs to disk; index=False leaves the row index out of the file.
df_q4.to_csv('topics_q4.csv', index=False)

## Topics MDA

In [35]:
# Applying BERTopic with its default values (mda), capped at 40 topics.
# The UMAP step is stochastic, so an unseeded run yields different topic ids on
# every execution. The UMAP model below mirrors BERTopic's built-in default
# configuration and only adds a fixed random_state so the run is reproducible.
start = time.time()
topic_model5 = BERTopic(
    nr_topics=40,
    verbose=True,
    calculate_probabilities=False,
    umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine',
                    low_memory=False, random_state=42),
)
# fit_transform returns a (topics, probabilities) pair; the per-document topic
# ids are therefore read downstream as topics5[0].
topics5 = topic_model5.fit_transform(df['mda'])
end = time.time()
# Elapsed wall-clock time of the fit, in seconds.
print(end-start)

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

2023-03-29 19:19:57,131 - BERTopic - Transformed documents to Embeddings
2023-03-29 19:20:12,132 - BERTopic - Reduced dimensionality
2023-03-29 19:20:12,824 - BERTopic - Clustered reduced embeddings
2023-03-29 19:23:12,195 - BERTopic - Reduced number of topics from 143 to 40


32416.916571378708


In [36]:
# Summary table of the fitted MD&A topics (columns: Topic, Count, Name).
topic_model5.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1404,-1_software_client_restructuring_network
1,0,525,0_oil_gas_natural_drilling
2,1,440,1_clinical_trial_drug_milestone
3,2,434,2_client_software_license_consulting
4,3,283,3_semiconductor_wafer_design_restructuring
5,4,268,4_subscriber_network_wireless_station
6,5,235,5_coal_mine_ton_mining
7,6,215,6_food_paper_mill_pulp
8,7,205,7_store_retail_brand_wholesale
9,8,201,8_power_energy_electric_utility


In [37]:
# Pair every MD&A document with the topic id assigned to it.
df_mda = df['mda'].to_frame(name="Document").assign(Topic_MDA=topics5[0])

In [38]:
# Preview the MD&A document/topic pairs.
df_mda

Unnamed: 0,Document,Topic_MDA
0,broad health care patent protection license te...,-1
1,broad health care patent protection license te...,-1
2,broad health care patent protection license te...,-1
3,broad health care patent protection license te...,-1
4,broad health care patent protection license te...,-1
...,...,...
5783,thereto constitute meaning 27a 21e cautionary ...,22
5784,week week week week week basic declared shareh...,6
5785,thereto supplementary numerous carefully speci...,13
5786,midstream detail strategically hydrocarbon bas...,0


In [39]:
# Write the MD&A document/topic pairs to disk; index=False leaves the row index out of the file.
df_mda.to_csv('topics_mda.csv', index=False)

## Merging all and extracting

In [40]:
# One row per (cik, year) carrying the topic id assigned by each of the five models.
df_all = df[['cik', 'year']].assign(
    Topic_Q1=topics1[0],
    Topic_Q2=topics2[0],
    Topic_Q3=topics3[0],
    Topic_Q4=topics4[0],
    Topic_MDA=topics5[0],
)

In [43]:
# Write the combined topic features to disk.
# NOTE(review): unlike the per-source exports above, this call does not pass
# index=False, so the row index is written as an extra unnamed first column.
# Confirm that downstream readers expect that column before changing it.
df_all.to_csv('topic_features.csv')

In [42]:
# Preview the combined topic-feature table.
df_all

Unnamed: 0,cik,year,Topic_Q1,Topic_Q2,Topic_Q3,Topic_Q4,Topic_MDA
0,1800.0,2008,0,0,0,0,-1
1,1800.0,2010,0,0,0,0,-1
2,1800.0,2011,0,0,0,0,-1
3,1800.0,2012,0,0,-1,0,-1
4,1800.0,2013,0,0,0,0,-1
...,...,...,...,...,...,...,...
5783,1561627.0,2013,10,13,-1,-1,22
5784,1564822.0,2013,3,17,-1,-1,6
5785,1564902.0,2013,-1,14,10,20,13
5786,1569134.0,2013,17,-1,-1,1,0
