In [1]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from google.cloud import storage
import io

In [12]:
import re
import nltk
from nltk.corpus import stopwords

In [26]:
# Set Google Cloud Storage bucket and file path
BUCKET_NAME = "my-bert-topic-model"
FILE_NAME = "call_transcripts.xlsx"
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
blob = bucket.blob(FILE_NAME)
print("Downloading dataset from GCS...")
data = blob.download_as_bytes()
df = pd.read_excel(io.BytesIO(data))

Downloading dataset from GCS...


In [27]:
# Drop empty transcripts
df = df.dropna(subset=["transcript"])

In [28]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(texts):
    text = texts.lower()
    text = re.sub(r'\b(?:' + '|'.join(stop_words) + r')\b', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

df["cleaned_transcript"] = df["transcript"].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
texts = df["cleaned_transcript"].tolist()

In [30]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

In [31]:
print("Fitting BERTopic model...")
topic_model = BERTopic(embedding_model=embedding_model, language="english")

Fitting BERTopic model...


In [32]:
topics, probs = topic_model.fit_transform(texts)
print('done transforming texts')

topic_info = topic_model.get_topic_info()
print(topic_info.head(10))  # Print the top 10 topics
print('done getting topic info')

fig = topic_model.visualize_barchart(top_n_topics=10)
print('finished visualization')

fig_path = "bertopic_visualization.png"
fig.write_image(fig_path)
print('saved to storage')

blob = bucket.blob(fig_path)
blob.upload_from_filename(fig_path)
print(f"Visualization uploaded to: gs://{BUCKET_NAME}/{fig_path}")

starting transforming texts
done transforming texts
getting topic info
   Topic  Count                                    Name  \
0     -1     22        -1_growth_david_pierre_accenture   
1      0     44          0_adi_quarter_industrial_think   
2      1     40               1_dram_nand_fiscal_micron   
3      2     34       2_constant_currency_azure_windows   
4      3     31                 3_nvidia_ai_data_gaming   
5      4     31             4_oracle_cloud_database_erp   
6      5     30                5_clients_client_peo_adp   
7      6     27  6_salesforce_marc_incredible_customers   
8      7     26       7_hock_yearyear_semiconductor_tan   
9      8     26            8_cisco_kelly_revenue_growth   

                                      Representation  \
0  [growth, david, pierre, accenture, quarter, ye...   
1  [adi, quarter, industrial, think, vince, prash...   
2  [dram, nand, fiscal, micron, demand, bit, indu...   
3  [constant, currency, azure, windows, microsoft...   

## TRYING A STRONGER MODEL

In [33]:
embedding_model = SentenceTransformer("all-mpnet-base-v2", device="cpu")
print("Fitting BERTopic model...")
topic_model = BERTopic(embedding_model=embedding_model, language="english")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Fitting BERTopic model...


In [35]:
topics, probs = topic_model.fit_transform(texts)
print('done transforming texts')

topic_info = topic_model.get_topic_info()
print(topic_info.head(10))  # Print the top 10 topics
print('done getting topic info')

fig = topic_model.visualize_barchart(top_n_topics=10)
print('finished visualization')

fig_path = "bertopic_visualization_mpnet.png"
fig.write_image(fig_path)
print('saved to storage')

blob = bucket.blob(fig_path)
blob.upload_from_filename(fig_path)
print(f"Visualization uploaded to: gs://{BUCKET_NAME}/{fig_path}")

done transforming texts
   Topic  Count                                  Name  \
0     -1     14             -1_automatic_adp_year_inc   
1      0     57   0_salesforce_marc_customers_quarter   
2      1     40        1_adi_quarter_industrial_think   
3      2     40             2_dram_nand_fiscal_micron   
4      3     40  3_constant_currency_azure_commercial   
5      4     40            4_dave_pahl_rafael_quarter   
6      5     29              5_client_clients_peo_adp   
7      6     26       6_chuck_cisco_product_customers   
8      7     25           7_oracle_database_cloud_erp   
9      8     24   8_adobe_digital_creative_experience   

                                      Representation  \
0  [automatic, adp, year, inc, processing, carlos...   
1  [salesforce, marc, customers, quarter, incredi...   
2  [adi, quarter, industrial, think, vince, roche...   
3  [dram, nand, fiscal, micron, demand, bit, indu...   
4  [constant, currency, azure, commercial, revenu...   
5  [dave, pa