In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.2 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Downloading gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.1
    Uninstalling scipy-1.15.1:
      Successfully uninstalled scipy-1.15.1
Successfully installed gensim-4.3.3 scipy-1.13.1

[1m[[0m[34;49

In [7]:
import pandas as pd
import nltk
import gensim
import gensim.corpora as corpora
from nltk.corpus import stopwords
from gensim.models.ldamodel import LdaModel
import re
from google.cloud import storage
import io

In [15]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
BUCKET_NAME = "my-bert-topic-model"
FILE_NAME = "call_transcripts.xlsx"
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
blob = bucket.blob(FILE_NAME)
data = blob.download_as_bytes()
df = pd.read_excel(io.BytesIO(data))
print("Loaded dataset from GCS")
df = df.dropna(subset=["transcript"])

Loaded dataset from GCS


In [17]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return tokens 

In [18]:
df["cleaned_transcript"] = df["transcript"].apply(clean_text)

In [19]:
print(df["cleaned_transcript"].head())

0    [operator, good, afternoon, name, david, ill, ...
1    [operator, good, afternoon, name, david, ill, ...
2    [operator, good, afternoon, name, jl, conferen...
3    [operator, good, afternoon, name, rob, ill, co...
4    [operator, good, afternoon, name, david, ill, ...
Name: cleaned_transcript, dtype: object


In [20]:
dictionary = corpora.Dictionary(df["cleaned_transcript"].tolist())
corpus = [dictionary.doc2bow(text) for text in df["cleaned_transcript"].tolist()]

In [21]:
NUM_TOPICS = 10
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, passes=10, random_state=42)

In [22]:
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.016*"quarter" + 0.009*"revenue" + 0.008*"think" + 0.008*"year" + 0.007*"question"')
(1, '0.015*"quarter" + 0.012*"growth" + 0.010*"year" + 0.008*"new" + 0.008*"business"')
(2, '0.013*"fiscal" + 0.012*"quarter" + 0.010*"dram" + 0.010*"nand" + 0.010*"growth"')
(3, '0.010*"quarter" + 0.008*"year" + 0.008*"think" + 0.007*"billion" + 0.006*"growth"')
(4, '0.014*"revenue" + 0.012*"growth" + 0.009*"quarter" + 0.008*"cloud" + 0.008*"business"')
(5, '0.012*"year" + 0.011*"growth" + 0.011*"customers" + 0.011*"revenue" + 0.009*"business"')
(6, '0.016*"ai" + 0.009*"data" + 0.006*"quarter" + 0.006*"nvidia" + 0.006*"year"')
(7, '0.013*"billion" + 0.011*"year" + 0.010*"revenue" + 0.009*"quarter" + 0.008*"iphone"')
(8, '0.010*"quarter" + 0.009*"customers" + 0.008*"year" + 0.008*"million" + 0.007*"cloud"')
(9, '0.017*"cloud" + 0.011*"revenue" + 0.009*"q" + 0.008*"growth" + 0.008*"quarter"')
