In [9]:
import pandas as pd
import time
import re
import nltk
import gensim
import gensim.corpora as corpora
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel

## Reading the Data

In [10]:
tran = pd.read_csv("tran_sent_data.csv")
mda = pd.read_csv('mda_sent_data.csv')

In [11]:
tran = tran.drop(columns=['Unnamed: 0'])
mda = mda.drop(columns=['Unnamed: 0'])

In [12]:
df = pd.merge(tran,mda, on=['year','cik'])
df = df.reset_index(drop=True)
print(len(df))
print(len(df.dropna()))

5794
5788


In [13]:
print(len(df))
df = df[~df.isnull().any(axis=1)]
print(len(df))

5794
5788


## Formatting the Text

In [14]:
# Creation of a function to put the values in the right format
def remove_chars(s):
    s = s.replace('[', '').replace(']','')
    s = s.replace("'",'')
    s = s.replace(',','')
    return s

df['transcript_qQ1'] = df['transcript_qQ1'].apply(remove_chars)
df['transcript_qQ2'] = df['transcript_qQ2'].apply(remove_chars)
df['transcript_qQ3'] = df['transcript_qQ3'].apply(remove_chars)
df['transcript_qQ4'] = df['transcript_qQ4'].apply(remove_chars)
df['mda'] = df['mda'].apply(remove_chars)

Earlier tests have shown that all topics have the words: 'quarter', 'year', and 'million' in common. for the actual topic modelling, I decide to exclude these words

In [15]:
df["transcript_qQ1"] = df["transcript_qQ1"].str.replace("quarter", "")
df["transcript_qQ1"] = df["transcript_qQ1"].str.replace("million", "")
df["transcript_qQ1"] = df["transcript_qQ1"].str.replace("year", "")
df["transcript_qQ2"] = df["transcript_qQ2"].str.replace("quarter", "")
df["transcript_qQ2"] = df["transcript_qQ2"].str.replace("million", "")
df["transcript_qQ2"] = df["transcript_qQ2"].str.replace("year", "")
df["transcript_qQ3"] = df["transcript_qQ3"].str.replace("quarter", "")
df["transcript_qQ3"] = df["transcript_qQ3"].str.replace("million", "")
df["transcript_qQ3"] = df["transcript_qQ3"].str.replace("year", "")
df["transcript_qQ4"] = df["transcript_qQ4"].str.replace("quarter", "")
df["transcript_qQ4"] = df["transcript_qQ4"].str.replace("million", "")
df["transcript_qQ4"] = df["transcript_qQ4"].str.replace("year", "")

Further tests still find some similarities and I decide to remove the following words: 'business', 'first', 'second', 'third', 'fourth' (for each quarter respectively. 'think' is kept in as it represents uncertainty.

In [16]:
df["transcript_qQ1"] = df["transcript_qQ1"].str.replace("first", "")
df["transcript_qQ1"] = df["transcript_qQ1"].str.replace("business", "")
df["transcript_qQ2"] = df["transcript_qQ2"].str.replace("second", "")
df["transcript_qQ2"] = df["transcript_qQ2"].str.replace("business", "")
df["transcript_qQ3"] = df["transcript_qQ3"].str.replace("third", "")
df["transcript_qQ3"] = df["transcript_qQ3"].str.replace("business", "")
df["transcript_qQ4"] = df["transcript_qQ4"].str.replace("fourth", "")
df["transcript_qQ4"] = df["transcript_qQ4"].str.replace("business", "")

In [44]:
df = df.reset_index(drop=True)

## Topics Q1

In [45]:
# Applying BERTopic with its default values (transcripts Q1)
start = time.time()
topic_model1 = BERTopic(nr_topics=40,verbose=True, calculate_probabilities=False)
topics1 = topic_model1.fit_transform(df['transcript_qQ1'])
end = time.time()
print(end-start)

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

2023-03-11 02:21:16,400 - BERTopic - Transformed documents to Embeddings
2023-03-11 02:21:58,171 - BERTopic - Reduced dimensionality
2023-03-11 02:21:58,870 - BERTopic - Clustered reduced embeddings
2023-03-11 02:22:52,485 - BERTopic - Reduced number of topics from 4 to 4


32602.647617816925


In [46]:
topic_model1.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,14,0_event_race_derby_nascar
1,1,29,1_student_program_enrollment_new
2,2,731,2_patient_think_product_study
3,3,5014,3_think_revenue_market_well


In [47]:
df_q1 = pd.DataFrame({"Document": df['transcript_qQ1'], "Topic_Q1": topics1[0]})

In [48]:
df_q1

Unnamed: 0,Document,Topic_Q1
0,well certainly key strategic priority think we...,2
1,thanks tom morning review performance major s...,2
2,shared r program rick number pretty much numbe...,2
3,yes probably around right probably closer 2 th...,3
4,well u would say one rare circumstance glad s...,2
...,...,...
5783,thank melissa good morning everyone certainly ...,3
5784,think think think 2 way proactive go initiate ...,3
5785,thanks gene thank everyone call interest seawo...,3
5786,okay count told couple time resist urge annual...,3


In [49]:
df_q1.to_csv('topics_q1.csv', index=False)

## Topics Q2

In [50]:
# Applying BERTopic with its default values (transcripts Q2)
start = time.time()
topic_model2 = BERTopic(nr_topics=40,verbose=True, calculate_probabilities=False)
topics2 = topic_model2.fit_transform(df['transcript_qQ2'])
end = time.time()
print(end-start)

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

2023-03-11 11:24:09,544 - BERTopic - Transformed documents to Embeddings
2023-03-11 11:24:24,716 - BERTopic - Reduced dimensionality
2023-03-11 11:24:25,554 - BERTopic - Clustered reduced embeddings
2023-03-11 11:25:39,582 - BERTopic - Reduced number of topics from 36 to 36


32565.527448415756


In [51]:
topic_model2.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1385,-1_think_well_market_going
1,0,11,0_nascar_race_event_speedway
2,1,701,1_patient_study_trial_think
3,2,27,2_student_enrollment_program_learner
4,3,1086,3_revenue_think_market_growth
5,4,23,4_hotel_room_revpar_property
6,5,11,5_2009_project_would_cost
7,6,15,6_market_europe_think_growth
8,7,19,7_market_growth_revenue_increase
9,8,11,8_market_titanium_product_material


In [52]:
df_q2 = pd.DataFrame({"Document": df['transcript_qQ2'], "Topic_Q2": topics2[0]})

In [53]:
df_q2

Unnamed: 0,Document,Topic_Q2
0,yes let john take question phil sure see reduc...,1
1,remark said would responding fda shortly expec...,1
2,fingertip jami smaller product mention mike qu...,1
3,good morning thanks joining u also today call ...,1
4,well thank thank question concludes abbott con...,9
...,...,...
5783,thank manny good morning everyone certainly ap...,3
5784,think probably le potential mix improvement h...,9
5785,thanks jim good afternoon everyone going finan...,15
5786,thank jennifer good afternoon everyone appreci...,3


In [54]:
df_q2.to_csv('topics_q2.csv', index=False)

## Topics Q3

In [55]:
# Applying BERTopic with its default values (transcripts Q3)
start = time.time()
topic_model3 = BERTopic(nr_topics=40,verbose=True, calculate_probabilities=False)
topics3 = topic_model3.fit_transform(df['transcript_qQ3'])
end = time.time()
print(end-start)

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

2023-03-11 20:28:49,039 - BERTopic - Transformed documents to Embeddings
2023-03-11 20:29:10,168 - BERTopic - Reduced dimensionality
2023-03-11 20:29:10,732 - BERTopic - Clustered reduced embeddings
2023-03-11 20:32:08,893 - BERTopic - Reduced number of topics from 50 to 40


32784.86909484863


In [56]:
topic_model3.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2822,-1_think_revenue_market_growth
1,0,729,0_think_revenue_going_growth
2,1,416,1_revenue_product_market_customer
3,2,381,2_patient_study_trial_clinical
4,3,192,3_hospital_care_think_patient
5,4,191,4_well_drilling_rig_production
6,5,123,5_think_market_growth_well
7,6,104,6_barrel_well_oil_crude
8,7,85,7_think_market_cash_going
9,8,72,8_market_think_cost_margin


In [57]:
df_q3 = pd.DataFrame({"Document": df['transcript_qQ3'], "Topic_Q3": topics3[0]})

In [58]:
df_q3

Unnamed: 0,Document,Topic_Q3
0,good morning certainly think possible talked l...,3
1,yes mike basically question say couple thing s...,1
2,right overall international piece doubled size...,2
3,thanks mile today reported ongoing diluted ear...,-1
4,okay thanks brian good morning morning reporte...,2
...,...,...
5783,yes also focusing carbon material think carbon...,-1
5784,absolute updating everything integration right...,1
5785,thanks kelsey good afternoon everyone welcome ...,-1
5786,thanks good afternoon david mentioned net inco...,7


In [59]:
df_q3.to_csv('topics_q3.csv', index=False)

## Topics Q4

In [60]:
# Applying BERTopic with its default values (transcripts Q4)
start = time.time()
topic_model4 = BERTopic(nr_topics=40,verbose=True, calculate_probabilities=False)
topics4 = topic_model4.fit_transform(df['transcript_qQ4'])
end = time.time()
print(end-start)

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

2023-03-12 05:32:21,201 - BERTopic - Transformed documents to Embeddings
2023-03-12 05:32:37,537 - BERTopic - Reduced dimensionality
2023-03-12 05:32:38,390 - BERTopic - Clustered reduced embeddings
2023-03-12 05:35:35,181 - BERTopic - Reduced number of topics from 55 to 40


32598.427647829056


In [61]:
topic_model4.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2192,-1_think_market_revenue_growth
1,0,1777,0_revenue_think_market_growth
2,1,624,1_patient_study_trial_clinical
3,2,211,2_well_drilling_oil_production
4,3,99,3_brand_store_sale_consumer
5,4,64,4_utility_gas_energy_rate
6,5,63,5_think_revenue_cash_cost
7,6,52,6_market_sale_result_growth
8,7,47,7_customer_network_revenue_wireless
9,8,45,8_laser_customer_product_revenue


In [62]:
df_q4 = pd.DataFrame({"Document": df['transcript_qQ4'], "Topic_Q4": topics4[0]})

In [63]:
df_q4

Unnamed: 0,Document,Topic_Q4
0,yes went fairly rigorous self evaluation outsi...,1
1,okay think said piramal around 400 let give e...,1
2,thanks mile pleased ended 2011 strong delive...,-1
3,thanks mile review financial position like exp...,-1
4,okay thanks brian good morning morning review ...,1
...,...,...
5783,thanks david good morning everyone happy first...,-1
5784,sure long term target grow line category done ...,0
5785,ask jim heaney comment little bit expense numb...,0
5786,said extend use existing capacity moving back ...,0


In [64]:
df_q4.to_csv('topics_q4.csv', index=False)

## Topics MDA

In [65]:
# Applying BERTopic with its default values (mda)
start = time.time()
topic_model5 = BERTopic(nr_topics=40,verbose=True, calculate_probabilities=False)
topics5 = topic_model5.fit_transform(df['mda'])
end = time.time()
print(end-start)

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

2023-03-12 14:51:29,388 - BERTopic - Transformed documents to Embeddings
2023-03-12 14:51:51,828 - BERTopic - Reduced dimensionality
2023-03-12 14:51:52,408 - BERTopic - Clustered reduced embeddings
2023-03-12 14:59:43,884 - BERTopic - Reduced number of topics from 131 to 40


33875.25102329254


In [66]:
topic_model5.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1744,-1_million_sale_revenue_cost
1,0,1772,0_million_revenue_year_december
2,1,361,1_development_product_clinical_research
3,2,322,2_gas_oil_natural_production
4,3,125,3_energy_power_gas_utility
5,4,123,4_health_care_medicare_service
6,5,122,5_vehicle_million_store_sale
7,6,98,6_product_semiconductor_customer_million
8,7,86,7_sale_million_cost_net
9,8,81,8_restaurant_franchise_fiscal_company


In [67]:
df_mda = pd.DataFrame({"Document": df['mda'], "Topic_MDA": topics5[0]})

In [68]:
df_mda

Unnamed: 0,Document,Topic_MDA
0,financial review abbotts revenue derived prima...,1
1,financial review abbotts revenue derived prima...,1
2,financial review abbotts revenue derived prima...,1
3,financial review abbotts revenue derived prima...,1
4,financial review abbotts revenue derived prima...,1
...,...,...
5783,dollar thousand except per share amount follow...,0
5784,included elsewhere form 10 k million per share...,0
5785,following discussion contains management discu...,0
5786,liquidity capital resource overview initial pu...,2


## Merging all and extracting

In [69]:
df_all = pd.DataFrame({"cik": df['cik'], "year": df['year'], 'Topic_Q1': topics1[0],'Topic_Q2': topics2[0],
                       'Topic_Q3': topics3[0],'Topic_Q4': topics4[0],'Topic_MDA': topics5[0]})

In [70]:
df_all.to_csv('topic_features.csv')

In [71]:
df_all

Unnamed: 0,cik,year,Topic_Q1,Topic_Q2,Topic_Q3,Topic_Q4,Topic_MDA
0,1800.0,2008,2,1,3,1,1
1,1800.0,2010,2,1,1,1,1
2,1800.0,2011,2,1,2,-1,1
3,1800.0,2012,3,1,-1,-1,1
4,1800.0,2013,2,9,2,1,1
...,...,...,...,...,...,...,...
5783,1561627.0,2013,3,3,-1,-1,0
5784,1564822.0,2013,3,9,1,0,0
5785,1564902.0,2013,3,15,-1,0,0
5786,1569134.0,2013,3,3,7,0,2
