In [5]:
import os
# Fix tokenizers warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
import pandas as pd
import numpy as np

In [None]:
# First, let's load and prepare the data
# Load your training data
train_data = pd.read_csv('../Dataset/train.csv')

# Get the texts
# Filter for any type of toxic comment (toxic, severe_toxic, obscene, threat, insult, identity_hate)
toxic_mask = (
    (train_data['toxic'] == 1) |
    (train_data['severe_toxic'] == 1) |
    (train_data['obscene'] == 1) |
    (train_data['threat'] == 1) |
    (train_data['insult'] == 1) |
    (train_data['identity_hate'] == 1)
)
train_data = train_data[toxic_mask]
texts = train_data['comment_text'].tolist()

print(f"Processing {len(texts)} texts for topic modeling...")

# Check if we have device defined
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Processing 16225 texts for topic modeling...
Using device: cuda


In [None]:
# Configure BERTopic for interpretable topics with max 30 clusters
print(f"Processing {len(texts)} texts for topic modeling...")

# Step 1: Configure embedding model (already done)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Step 2: Configure UMAP for dimensionality reduction
# Step 2: Configure UMAP for better clustering
umap_model = UMAP(
    n_components=15,        # Increased dimensions
    n_neighbors=10,         # Reduced for more local structure
    min_dist=0.0,          # Keep tight clusters
    metric='cosine',       # Good for text embeddings
    random_state=42
)

# Step 3: Configure HDBSCAN to limit clusters
# Step 3: Configure HDBSCAN with more permissive parameters
hdbscan_model = HDBSCAN(
    min_cluster_size=50,    # Reduced from 200 - smaller clusters allowed
    min_samples=10,         # Reduced from 50 - less strict clustering
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True 
)

# Step 4: Configure vectorizer for better topic representation
# Step 4: Configure vectorizer for better topic representation
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),     # Include bigrams for better context
    stop_words="english",   # Remove common words
    max_features=5000,      # Limit vocabulary
    min_df=2,               # Word must appear in at least 2 documents (reduced from 10)
    max_df=0.95            # Remove words in >95% of documents
)

Processing 16225 texts for topic modeling...


In [8]:
# Create and fit BERTopic model (fixed parameters)
print("Creating BERTopic model with interpretable topics...")

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    nr_topics="auto",                  # Let it determine optimal number
    calculate_probabilities=True,       # Get topic probabilities
    verbose=True
)

# Fit the model
topics, probs = topic_model.fit_transform(texts)

print(f"✅ Model created!")
print(f"Number of topics found: {len(set(topics)) - (1 if -1 in topics else 0)}")
print(f"Number of outliers: {sum(1 for t in topics if t == -1)}")

2025-06-03 15:31:48,362 - BERTopic - Embedding - Transforming documents to embeddings.


Creating BERTopic model with interpretable topics...


Batches:   0%|          | 0/508 [00:00<?, ?it/s]

2025-06-03 15:32:09,824 - BERTopic - Embedding - Completed ✓
2025-06-03 15:32:09,825 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-03 15:32:20,158 - BERTopic - Dimensionality - Completed ✓
2025-06-03 15:32:20,159 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-03 15:32:21,886 - BERTopic - Cluster - Completed ✓
2025-06-03 15:32:21,886 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-06-03 15:32:23,116 - BERTopic - Representation - Completed ✓
2025-06-03 15:32:23,120 - BERTopic - Topic reduction - Reducing number of topics
2025-06-03 15:32:23,127 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-03 15:32:24,400 - BERTopic - Representation - Completed ✓
2025-06-03 15:32:24,401 - BERTopic - Topic reduction - Reduced number of topics from 3 to 3


✅ Model created!
Number of topics found: 2
Number of outliers: 272


In [9]:
# Get topic information and reduce if needed
topic_info = topic_model.get_topic_info()
current_topics = len(set(topics)) - (1 if -1 in topics else 0)

print(f"Current number of topics: {current_topics}")

if current_topics > 30:
    print(f"Reducing from {current_topics} to 25 topics...")
    topic_model.reduce_topics(texts, nr_topics=25)
    topics = topic_model.topics_
    topic_info = topic_model.get_topic_info()
    print(f"✅ Reduced to {len(set(topics)) - (1 if -1 in topics else 0)} topics")

print("Topic Information:")
print(topic_info.head(10))

Current number of topics: 2
Topic Information:
   Topic  Count                                         Name  \
1      0  15477                      0_gay_fat_hate hate_jew   
2      1    476                 1_71_122_jackass jackass_156   

                                      Representation  \
1  [gay, fat, hate hate, jew, pig, pig pig, bulls...   
2  [71, 122, jackass jackass, 156, 180, 76, 179, ...   

                                 Representative_Docs  
0  [suxk my dick \n\nYOU FAILED AGAIN TO BLOCK ME...  
1  [im gay\nim gay\nim gay\nim gay\nim gay\nim ga...  
2  [Fuck up. 122.57.32.65, Amadeus!\nsings that A...  


In [10]:
# Get topic information and reduce if needed
topic_info = topic_model.get_topic_info()
current_topics = len(set(topics)) - (1 if -1 in topics else 0)

print(f"Current number of topics: {current_topics}")

if current_topics > 30:
    print(f"Reducing from {current_topics} to 25 topics...")
    topic_model.reduce_topics(texts, nr_topics=25)
    topics = topic_model.topics_
    topic_info = topic_model.get_topic_info()
    print(f"✅ Reduced to {len(set(topics)) - (1 if -1 in topics else 0)} topics")

print("Topic Information:")
print(topic_info.head(10))

Current number of topics: 2
Topic Information:
   Topic  Count                                         Name  \
1      0  15477                      0_gay_fat_hate hate_jew   
2      1    476                 1_71_122_jackass jackass_156   

                                      Representation  \
1  [gay, fat, hate hate, jew, pig, pig pig, bulls...   
2  [71, 122, jackass jackass, 156, 180, 76, 179, ...   

                                 Representative_Docs  
0  [suxk my dick \n\nYOU FAILED AGAIN TO BLOCK ME...  
1  [im gay\nim gay\nim gay\nim gay\nim gay\nim ga...  
2  [Fuck up. 122.57.32.65, Amadeus!\nsings that A...  


In [11]:
# Get topic information and reduce if needed
topic_info = topic_model.get_topic_info()
current_topics = len(set(topics)) - (1 if -1 in topics else 0)

print(f"Current number of topics: {current_topics}")

if current_topics > 30:
    print(f"Reducing from {current_topics} to 25 topics...")
    topic_model.reduce_topics(texts, nr_topics=25)
    topics = topic_model.topics_
    topic_info = topic_model.get_topic_info()
    print(f"✅ Reduced to {len(set(topics)) - (1 if -1 in topics else 0)} topics")

print("Topic Information:")
print(topic_info.head(10))

Current number of topics: 2
Topic Information:
   Topic  Count                                         Name  \
1      0  15477                      0_gay_fat_hate hate_jew   
2      1    476                 1_71_122_jackass jackass_156   

                                      Representation  \
1  [gay, fat, hate hate, jew, pig, pig pig, bulls...   
2  [71, 122, jackass jackass, 156, 180, 76, 179, ...   

                                 Representative_Docs  
0  [suxk my dick \n\nYOU FAILED AGAIN TO BLOCK ME...  
1  [im gay\nim gay\nim gay\nim gay\nim gay\nim ga...  
2  [Fuck up. 122.57.32.65, Amadeus!\nsings that A...  


In [12]:
# Create interpretable topic names using top words
topic_names = {}
for topic_id in topic_info['Topic'].unique():
    if topic_id == -1:
        topic_names[topic_id] = "Outliers"
        continue
    
    # Get top 3 words for this topic
    topic_words = topic_model.get_topic(topic_id)
    if topic_words:
        # Extract top 3 words
        top_words = [word for word, score in topic_words[:3]]
        topic_names[topic_id] = " | ".join(top_words)
    else:
        topic_names[topic_id] = f"Topic_{topic_id}"

# Show topic names
print("\nTopic Names:")
for topic_id, name in topic_names.items():
    if topic_id != -1:
        count = topic_info[topic_info['Topic'] == topic_id]['Count'].iloc[0]
        print(f"Topic {topic_id}: {name} (Count: {count})")


Topic Names:
Topic 0: gay | fat | hate hate (Count: 15477)
Topic 1: 71 | 122 | jackass jackass (Count: 476)


In [13]:
# Apply topic names to your data
interpretable_topic_labels = [topic_names.get(topic, "Unknown") for topic in topics]

# Add to training data
train_data['topic_id'] = topics
train_data['topic_name'] = interpretable_topic_labels

# Show distribution
from collections import Counter
topic_distribution = Counter(interpretable_topic_labels)

print(f"\nFinal Topic Distribution:")
for topic, count in topic_distribution.most_common(20):
    print(f"  {topic}: {count} ({count/len(texts)*100:.1f}%)")

ValueError: Length of values (16225) does not match length of index (159571)

In [None]:
# Show sample texts for top topics
print(f"\n📝 Sample texts for top 5 topics:")
for topic_name, count in topic_distribution.most_common(5):
    print(f"\n--- {topic_name} ({count} texts) ---")
    sample_indices = train_data[train_data['topic_name'] == topic_name].index[:3]
    for i, idx in enumerate(sample_indices, 1):
        text = train_data.loc[idx, 'comment_text']
        display_text = text[:120] + "..." if len(text) > 120 else text
        print(f"{i}. {display_text}")

# Save results
output_path = '../Dataset/train_with_bertopic_final.csv'
train_data.to_csv(output_path, index=False)

print(f"\n✅ COMPLETED!")
print(f"📁 Dataset saved: {output_path}")
print(f"📊 Total topics: {len(set(topics)) - (1 if -1 in topics else 0)}")

In [None]:
from collections import defaultdict
import random

# Group texts by cluster label (excluding noise, label == -1)
cluster_examples = defaultdict(list)
for text, label in zip(texts, cluster_labels):
    if label != -1:
        cluster_examples[label].append(text)

# Show 5 random examples from each cluster (up to 5 if less)
for label, examples in cluster_examples.items():
    print(f"\nCluster {label} (keywords: {cluster_keywords.get(label, '')}):")
    for example in random.sample(examples, min(5, len(examples))):
        print(f"- {example[:200]}{'...' if len(example) > 200 else ''}")

In [None]:


def get_top_keywords(texts, top_n=3):
    tfidf = TfidfVectorizer(stop_words='english', max_features=10)
    tfidf_matrix = tfidf.fit_transform(texts)
    summed = tfidf_matrix.sum(axis=0).A1
    keywords = np.array(tfidf.get_feature_names_out())[np.argsort(summed)[::-1][:top_n]]
    return ", ".join(keywords)

# Map clusters to keywords
cluster_keywords = {}
for label in set(cluster_labels):
    if label == -1:
        continue  # Skip noise
    cluster_texts = [t for t, l in zip(texts, cluster_labels) if l == label]
    cluster_keywords[label] = get_top_keywords(cluster_texts)

# Assign keywords as labels
interpretable_labels = [
    cluster_keywords.get(lbl, "Noise") for lbl in cluster_labels
]
