# BERTopic for Topic Modeling

## Setup and Installation for BERTopic

In [1]:
# Install BERTopic into the kernel's environment (quiet mode; version is not pinned).
# NOTE(review): consider pinning bertopic to the version this notebook was run with
# so that a fresh "Restart & Run All" reproduces the same environment.
%pip install -q bertopic
# Pin numpy and scipy to exact versions. This runs after the BERTopic install, so
# these pins replace whatever versions that install pulled in.
# NOTE(review): presumably required for compatibility with BERTopic's compiled
# dependencies — confirm the reason and record it here.
%pip install numpy==1.24.4 scipy==1.10.1 --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.11 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.11 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path

from bertopic import BERTopic
from hdbscan import HDBSCAN

  from .autonotebook import tqdm as notebook_tqdm


## Loading the Transcript Data

In [3]:
# Load the cleaned transcript and split it into fixed-size word chunks.
# Each chunk becomes one "document" that is passed to the topic model.
with open("../../data_preprocessed/transcript_cleaned.txt", "r", encoding="utf-8") as f:
    text_data = f.read()

# str.split() with no argument splits on any whitespace run, so newlines and
# repeated spaces in the transcript do not produce empty tokens.
words = text_data.split()
if not words:
    # Fail here with a clear message instead of producing zero documents and
    # hitting an unrelated error later when the model is fitted.
    raise ValueError("Transcript is empty: no words to chunk into documents.")

chunk_size = 80  # words per document; the final chunk may be shorter
documents = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

print(f"Generated {len(documents)} documents for BERTopic (chunked by {chunk_size} words).")

Generated 16 documents for BERTopic (chunked by 80 words).


## Initialize and Run the BERTopic Model

In [4]:
# Custom HDBSCAN model with a lower min_cluster_size, so that a cluster of as few
# as two chunks can form a topic.
# prediction_data=True makes HDBSCAN keep the extra data it needs to assign
# clusters to unseen points. Without it, the fitted (and later saved) topic model
# cannot be used to transform() new documents. It does not change how the
# training documents themselves are clustered.
hdbscan_model = HDBSCAN(min_cluster_size=2, min_samples=1, prediction_data=True)

# NOTE(review): no random seed is set anywhere in this cell. If the default
# dimensionality-reduction step is stochastic (as BERTopic's docs describe),
# topic assignments can differ between runs — consider passing a seeded
# umap_model to make the results reproducible.
topic_model = BERTopic(hdbscan_model=hdbscan_model, nr_topics=None, low_memory=True)

# Fit on the chunked transcript; `topics` holds one topic id per document.
topics, probs = topic_model.fit_transform(documents)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


## Retrieve and Display the Discovered Topics

In [5]:
topic_info = topic_model.get_topic_info()  # DataFrame with Topic ID, Count, and Name (top words)

# Show the first 10 rows of the summary, including topic -1 if present
print("Topic summary:")
print(topic_info.head(10))

print("\nDetailed topic breakdown:")
for _, row in topic_info.iterrows():
    # Topic -1 is the outlier topic; leave it out of the per-topic breakdown
    if row.Topic == -1:
        continue
    # get_topic() yields (word, score) pairs; only the words are printed
    topic_words = [word for word, _ in topic_model.get_topic(row.Topic)]
    print(f"Topic {row.Topic} (Count: {row.Count} docs) – Top words: {', '.join(topic_words[:10])}")

# Save the BERTopic model and the topic summary output
model_path = "bertopic_model.pkl"
summary_path = Path("TM_res/topic_summary.csv")
# to_csv() does not create missing directories, so make sure the output folder
# exists; exist_ok=True keeps this cell safe to re-run.
summary_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): pickle files must only ever be loaded from trusted sources;
# consider a non-pickle serialization if this model is going to be shared.
topic_model.save(model_path, serialization="pickle")
topic_info.to_csv(summary_path, index=False)
# Report the paths actually written, so the message cannot drift from the code.
print(f"\nModel saved to '{model_path}' and topic summary saved to '{summary_path.as_posix()}'.")

# Visualize the topics (last expression, so the figure is rendered as the cell output)
topic_model.visualize_barchart(top_n_topics=10)



Topic summary:
   Topic  Count                       Name  \
0      0     14     0_cave_like_look_water   
1      1      2  1_feature_tab_image_opera   

                                      Representation  \
0  [cave, like, look, water, form, limestone, for...   
1  [feature, tab, image, opera, creature, fossil,...   

                                 Representative_Docs  
0  [sandon large cave planet cave massive complet...  
1  [possible visual animal illustration live tetr...  

Detailed topic breakdown:
Topic 0 (Count: 14 docs) – Top words: cave, like, look, water, form, limestone, formation, large, reach, inside
Topic 1 (Count: 2 docs) – Top words: feature, tab, image, opera, creature, fossil, visit, identify, tentacle, extinct

Model saved to 'bertopic_model.pkl' and topic summary saved to 'topic_summary.csv'.
