# BERTopic for Topic Modeling

## Setup and Installation for BERTopic

In [1]:
# Install BERTopic into the kernel's environment (%pip targets the running kernel,
# unlike !pip). NOTE(review): bertopic is not version-pinned, so a fresh run may pull
# a different release than the one these outputs were produced with — consider pinning.
%pip install -q bertopic
# Pin numpy/scipy after the BERTopic install so these exact versions win over whatever
# the first command resolved. Presumably needed for binary compatibility with the
# compiled dependencies (e.g. hdbscan) — TODO confirm. Restart the kernel afterwards.
%pip install numpy==1.24.4 scipy==1.10.1 --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.11 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.11 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path

from bertopic import BERTopic
from hdbscan import HDBSCAN

  from .autonotebook import tqdm as notebook_tqdm


## Loading the Transcript Data

In [3]:
# Read the whole cleaned transcript into memory as a single string
with open("../../data_preprocessed/transcript_cleaned.txt", "r", encoding="utf-8") as transcript_file:
    text_data = transcript_file.read()

# Tokenize on whitespace, then regroup the words into fixed-size pseudo-documents.
# The final chunk receives whatever words are left over, so it may be shorter.
words = text_data.split()
chunk_size = 50
documents = [
    " ".join(words[start:start + chunk_size])
    for start in range(0, len(words), chunk_size)
]

print(f"Generated {len(documents)} documents for BERTopic (chunked by {chunk_size} words).")

Generated 26 documents for BERTopic (chunked by 50 words).


## Initialize and Run the BERTopic Model

In [4]:
# Custom HDBSCAN model with a lower min_cluster_size: the corpus is only a few dozen
# 50-word chunks, so a cluster of just two chunks is allowed to become a topic.
# prediction_data=True keeps the extra structures HDBSCAN needs to assign unseen
# points to clusters. BERTopic's own default clusterer enables it; without it the
# fitted model (which is saved later) cannot assign topics to new documents via
# topic_model.transform().
hdbscan_model = HDBSCAN(min_cluster_size=2, min_samples=1, prediction_data=True)

# nr_topics=None keeps every discovered topic (no topic reduction after clustering).
# NOTE(review): no random seed is fixed here and BERTopic's default dimensionality
# reduction is stochastic, so topic IDs and counts may differ between runs.
topic_model = BERTopic(hdbscan_model=hdbscan_model, nr_topics=None, low_memory=True)

# topics: one topic ID per document (-1 marks outliers); probs: the per-document
# cluster membership strength reported by the clusterer.
topics, probs = topic_model.fit_transform(documents)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


## Retrieve and Display the Discovered Topics

In [5]:
# Summarize the fitted model: a DataFrame with Topic ID, Count, and Name (top words)
topic_info = topic_model.get_topic_info()

# Show the first 10 topics, including topic -1 if present
print("Topic summary:")
print(topic_info.head(10))

print("\nDetailed topic breakdown:")
for _, row in topic_info.iterrows():
    # Topic -1 collects the documents that were not assigned to any cluster; skip it
    if row.Topic == -1:
        continue
    # get_topic returns (word, score) pairs; only the words are shown here
    topic_words = [word for word, _ in topic_model.get_topic(row.Topic)]
    print(f"Topic {row.Topic} (Count: {row.Count} docs) – Top words: {', '.join(topic_words[:10])}")

# Save the BERTopic model and the topic summary output
model_path = "bertopic_model.pkl"
summary_path = Path("TM_res") / "topic_summary.csv"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# results folder exists before writing (otherwise a fresh checkout fails here).
summary_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): pickle files can execute arbitrary code when loaded — only load this
# model from a trusted location.
topic_model.save(model_path, serialization="pickle")
topic_info.to_csv(summary_path, index=False)
# Report the paths that were actually written (the summary lives under TM_res/)
print(f"\nModel saved to '{model_path}' and topic summary saved to '{summary_path.as_posix()}'.")

# Visualize the topics (last expression, so the figure renders as the cell output)
topic_model.visualize_barchart(top_n_topics=10)



Topic summary:
   Topic  Count                                Name  \
0      0      5     0_limestone_cave_look_formation   
1      1      4              1_roof_grow_large_cave   
2      2      3             2_level_tree_come_sound   
3      3      2  3_image_article_underground_fossil   
4      4      2         4_pearl_like_year_formation   
5      5      2         5_cave_large_entrance_fully   
6      6      2             6_jump_ground_sleep_bat   
7      7      2      7_insect_live_specie_limestone   
8      8      2       8_wall_passage_navigate_great   
9      9      2         9_tab_feature_opera_extinct   

                                      Representation  \
0  [limestone, cave, look, formation, form, calci...   
1  [roof, grow, large, cave, soil, sunlight, rain...   
2  [level, tree, come, sound, thin, cave, bird, s...   
3  [image, article, underground, fossil, slash, a...   
4  [pearl, like, year, formation, rare, tiny, roc...   
5  [cave, large, entrance, fully, explore, 