In [55]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import hdbscan
import umap
import pandas as pd
import numpy as np

# Load the cleaned transcripts; the "cleaned_text" column supplies the documents to model.
df = pd.read_csv("cleaned_transcripts.csv")  # Replace with actual cleaned text file

# Empty CSV cells are read back by pandas as NaN (a float), which is not a valid
# document for the text pipeline. Drop those rows from `df` itself (not only from
# the list) so that `df` and `documents` stay aligned row-for-row.
df = df.dropna(subset=["cleaned_text"]).reset_index(drop=True)

# Coerce to str so that numeric-looking transcripts are passed on as text.
documents = df["cleaned_text"].astype(str).tolist()


In [56]:
# NOTE(review): `topic_keywords` is not assigned in any cell visible here, so this
# cell appears to rely on leftover kernel state and would raise NameError on a
# fresh kernel — TODO define it in an earlier cell or remove this cell.
topic_keywords

{}

In [57]:

# --- Safeguard for extremely small datasets ---
# Fail fast: check the corpus size before loading the model and encoding, so an
# unusable dataset is rejected without doing any of that work first.
if len(documents) < 2:
    raise ValueError("Not enough samples to perform UMAP or HDBSCAN. Need at least 2 documents.")

# Create embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(documents, show_progress_bar=True)

# Number of embedded documents; the HDBSCAN and UMAP settings in later cells are
# derived from this value.
n_samples = len(embeddings)

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.05it/s]


In [58]:
# --- HDBSCAN Settings ---
# Both parameters are derived from the corpus size so that tiny datasets still
# produce a valid configuration.

# min_cluster_size: a quarter of the documents, capped at 10 and never below 2.
safe_min_cluster_size = min(10, n_samples // 4)
if safe_min_cluster_size < 2:
    safe_min_cluster_size = 2

# min_samples: one less than min_cluster_size, capped at 5 and never below 1,
# which keeps it strictly smaller than min_cluster_size.
safe_min_samples = min(5, safe_min_cluster_size - 1)
if safe_min_samples < 1:
    safe_min_samples = 1

print(
    f"HDBSCAN: n_samples={n_samples}, "
    f"safe_min_cluster_size={safe_min_cluster_size}, "
    f"safe_min_samples={safe_min_samples}"
)

# Clustering model handed to BERTopic in a later cell.
hdbscan_model = hdbscan.HDBSCAN(
    metric='euclidean',
    min_samples=safe_min_samples,
    min_cluster_size=safe_min_cluster_size,
    prediction_data=True,
)

HDBSCAN: n_samples=5, safe_min_cluster_size=2, safe_min_samples=1


In [59]:
# 1. n_neighbors: n_samples - 1, clamped to the range [2, 15].
#    The floor of 2 means the value can never be 0 or 1, so no extra guard is
#    needed afterwards. Note that when n_samples == 2 the floor wins and
#    n_neighbors equals n_samples rather than being smaller than it.
safe_n_neighbors = max(2, min(15, n_samples - 1))

# 2. n_components: one less than n_neighbors, capped at 5.
#    Since safe_n_neighbors >= 2, the result is always at least 1.
safe_n_components = min(5, safe_n_neighbors - 1)

# 3. Initialization method: crucial for avoiding the `eigsh` error.
#    'spectral' is the default and uses `eigsh`; 'random' avoids it, and was
#    chosen as the most likely fix for the TypeError seen with the default.
umap_init_method = 'random'

print(f"UMAP: n_samples={n_samples}, safe_n_neighbors={safe_n_neighbors}, safe_n_components={safe_n_components}, init='{umap_init_method}'")

UMAP: n_samples=5, safe_n_neighbors=4, safe_n_components=3, init='random'


In [65]:
# Dimensionality reduction, sized by the small-dataset settings computed earlier.
umap_model = umap.UMAP(
    metric='cosine',                 # distance measure applied to the sentence embeddings
    min_dist=0.0,                    # favours tightly packed local structure
    n_components=safe_n_components,
    n_neighbors=safe_n_neighbors,
    init=umap_init_method,           # 'random' rather than the default initialization
    random_state=42,                 # fixed seed so repeated runs match
)

# Assemble BERTopic from the embedding, UMAP and HDBSCAN components built in
# the preceding cells.
topic_model = BERTopic(
    language="english",
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True,
)






In [66]:

# Fit the model on the documents, reusing the embeddings computed earlier
# instead of encoding the corpus a second time.
topics, probs = topic_model.fit_transform(documents, embeddings)
topic_model.save("my_topic_model")  # saved after fitting, so it includes the fitted topics

# BERTopic files outlier documents under topic -1. That bucket is not a real
# topic, so it is left out of the reported count.
n_topics_found = sum(1 for topic_id in topic_model.get_topics() if topic_id != -1)

print("\nBERTopic model fitted successfully with UMAP (adjusted for small dataset and random initialization).")
print(f"Number of topics found: {n_topics_found}")

# Optional: Further exploration
# print("Topic-document probabilities sample:")
# print(probs[:5]) # Print first 5 probability arrays
# print("Document topics sample:")
# print(topics[:10]) # Print topics for first 10 documents
# print("Top 5 topics:")
# print(topic_model.get_topics(n=5))

2025-07-20 11:37:25,795 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-20 11:37:25,810 - BERTopic - Dimensionality - Completed ✓
2025-07-20 11:37:25,810 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-20 11:37:25,824 - BERTopic - Cluster - Completed ✓
2025-07-20 11:37:25,828 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-20 11:37:25,858 - BERTopic - Representation - Completed ✓



BERTopic model fitted successfully with UMAP (adjusted for small dataset and random initialization).
Number of topics found: 2


In [67]:
# Get topic representations (one row per topic, including its top words)
topics_info = topic_model.get_topic_info()

# Create a dictionary mapping topic ID → human-readable name
topic_id_to_name = {}
for _, row in topics_info.iterrows():
    topic_id = row["Topic"]
    if "Name" in row:
        descriptor = row["Name"]
    else:
        # Fallback when there is no "Name" column: "Representation" holds a list
        # of keywords, which cannot be concatenated to a string directly, so
        # join it into one comma-separated string.
        keywords = row["Representation"]
        descriptor = keywords if isinstance(keywords, str) else ", ".join(map(str, keywords))
    topic_id_to_name[topic_id] = f"Topic {topic_id}: {descriptor}"

# Now replace numerical topic labels with names (one label per document)
named_topics = [topic_id_to_name[tid] for tid in topics]

# Add back to DataFrame for convenience
df["Named_Topic"] = named_topics

# (Optional) Save to CSV for review
df.to_csv("documents_with_named_topics.csv", index=False)

print("Saved topic-labeled documents to documents_with_named_topics.csv")


Saved topic-labeled documents to documents_with_named_topics.csv
