# BERTopic for Topic Modeling

## Setup and Installation for BERTopic

In [1]:
%pip install -q bertopic
%pip install numpy==1.24.4 scipy==1.10.1 --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.11 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.11 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path

from bertopic import BERTopic
from hdbscan import HDBSCAN

  from .autonotebook import tqdm as notebook_tqdm


## Loading the Transcript Data

In [3]:
# Read the whole cleaned transcript into memory as one string.
with open("../../data_preprocessed/transcript_cleaned.txt", "r", encoding="utf-8") as transcript_file:
    text_data = transcript_file.read()

# The transcript is a single text, but topic modeling needs several documents:
# split on whitespace and regroup the words into consecutive fixed-size
# windows. The last window simply holds whatever words remain.
chunk_size = 100
words = text_data.split()

documents = []
for start in range(0, len(words), chunk_size):
    window = words[start:start + chunk_size]
    documents.append(" ".join(window))

print(f"Generated {len(documents)} documents for BERTopic (chunked by {chunk_size} words).")

Generated 13 documents for BERTopic (chunked by 100 words).


## Initialize and Run the BERTopic Model

In [4]:
# Custom clustering step for BERTopic. The corpus is small, so the smallest
# allowed cluster is two documents and min_samples=1 keeps the clustering
# permissive (fewer documents pushed into the -1 outlier topic).
# prediction_data=True makes HDBSCAN keep the extra structures that
# approximate prediction needs; without it, calling topic_model.transform()
# on new documents (e.g. after reloading the saved model) raises
# "Clusterer does not have prediction data!".
hdbscan_model = HDBSCAN(min_cluster_size=2, min_samples=1, prediction_data=True)

# nr_topics=None keeps every cluster found (no topic reduction afterwards);
# low_memory=True is forwarded to UMAP to reduce memory use.
# NOTE(review): no random seed is set here. BERTopic's default UMAP step is
# stochastic, so topic assignments can differ between runs unless a seeded
# umap_model is supplied.
topic_model = BERTopic(hdbscan_model=hdbscan_model, nr_topics=None, low_memory=True)

# topics: one topic id per entry in `documents` (-1 marks outliers);
# probs: the per-document values returned alongside them by fit_transform.
topics, probs = topic_model.fit_transform(documents)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


## Retrieve and Display the Discovered Topics

In [5]:
# Summary DataFrame with one row per topic: Topic ID, Count, and Name (top words).
topic_info = topic_model.get_topic_info()

print("Topic summary:")
print(topic_info.head(10))

print("\nDetailed topic breakdown:")
for _, row in topic_info.iterrows():
    # Topic -1 collects the outlier documents rather than a real topic; skip it.
    if row.Topic == -1:
        continue
    # get_topic yields (word, score) pairs; only the words are reported here.
    topic_words = [word for word, _ in topic_model.get_topic(row.Topic)]
    print(f"Topic {row.Topic} (Count: {row.Count} docs) – Top words: {', '.join(topic_words[:10])}")

# Save the BERTopic model and the topic summary output.
model_path = "bertopic_model.pkl"
summary_path = Path("TM_res") / "topic_summary.csv"

# to_csv does not create missing directories, so make sure the output folder
# exists; otherwise a fresh checkout fails on the save step.
summary_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): pickle files can execute arbitrary code when loaded and are
# sensitive to library versions; only load this file from a trusted source.
topic_model.save(model_path, serialization="pickle")
topic_info.to_csv(summary_path, index=False)

# Report the paths that were actually written (the summary lives under TM_res/).
print(f"\nModel saved to '{model_path}' and topic summary saved to '{summary_path}'.")

# Visualize the topics; this must stay the last expression in the cell so the
# returned figure is rendered as the cell output.
topic_model.visualize_barchart()



Topic summary:
   Topic  Count                               Name  \
0     -1      1  -1_underground_cave_inside_sandon   
1      0      6            0_cave_water_reach_time   
2      1      2           1_cave_look_stand_thread   
3      2      2              2_roof_hole_cave_live   
4      3      2         3_feature_image_tab_fossil   

                                      Representation  \
0  [underground, cave, inside, sandon, unique, ro...   
1  [cave, water, reach, time, come, look, large, ...   
2  [cave, look, stand, thread, insect, like, grou...   
3  [roof, hole, cave, live, shape, formation, soi...   
4  [feature, image, tab, fossil, article, opera, ...   

                                 Representative_Docs  
0  [sandon large cave planet cave massive complet...  
1  [movement like fish cricket completely blind s...  
2  [hope vision passage large cave stand tall sta...  
3  [collapse result intersection main fault form ...  
4  [possible visual animal illustration live tet