In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer

  from .autonotebook import tqdm as notebook_tqdm


# Preprocessing

**Preprocessing:**

Emoji removal – strips out emoticons, symbols, pictographs, and flags.

Missing/invalid values handling – replaces NaN, None, 'nan', 'null', etc. with empty text.

Normalization – converts text to lowercase and trims whitespace.

Noise removal – deletes URLs, reduces repeated punctuation, and collapses extra spaces.

Token filtering – removes very short or likely-typo words while keeping important short words (e.g., "ok", "hi").

Dataset-level cleaning – ensures the Comments column exists, preprocesses each entry, and removes empty or too-short results.

Final output – provides a validated list of sufficiently long, clean comments ready for topic modeling or further analysis.

In [2]:
def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


# Enhanced preprocessing function
def preprocess_text(text):
    """Enhanced preprocessing for social media comments"""
    # Handle various data types and missing values
    if pd.isna(text) or text is None:
        return ""
    
    # Convert to string and handle numpy types
    text = str(text).strip()
    
    if text == "" or text.lower() in ['nan', 'none', 'null']:
        return ""
    
    # Convert to lowercase
    text = text.lower()

    #remove emojis
    text = remove_emojis(text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    # Remove excessive punctuation but keep some for context
    text = re.sub(r'[.]{2,}', '.', text)
    text = re.sub(r'[!]{2,}', '!', text)
    text = re.sub(r'[?]{2,}', '?', text)
    
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove very short words (likely typos) but keep important short words
    important_short = {'ok', 'no', 'go', 'hi', 'me', 'we', 'he', 'so', 'up', 'my', 'is', 'at', 'it', 'on', 'or'}
    words = text.split()
    words = [word for word in words if len(word) >= 2 or word in important_short]
    
    result = ' '.join(words).strip()
    return result if result else ""

# Load and preprocess data
print("Loading and preprocessing data...")
document = pd.read_csv('../../DATA MINING - DATASET - Consolidated_Dataset (1).csv')

# Ensure Comments column exists and handle missing values
if 'Comments' not in document.columns:
    print("Available columns:", document.columns.tolist())
    raise ValueError("'Comments' column not found in the dataset")

# Handle missing values and data type issues
document['Comments'] = document['Comments'].fillna('')  # Fill NaN with empty string
document['Comments'] = document['Comments'].astype(str)  # Ensure all are strings

print(f"Original number of comments: {len(document)}")

# Clean the comments
document['Comments_Clean'] = document['Comments'].apply(preprocess_text)

# Remove empty comments after preprocessing and ensure we have strings
document = document[
    (document['Comments_Clean'].str.len() > 5) & 
    (document['Comments_Clean'] != '') & 
    (document['Comments_Clean'].notna())
]

comments = document['Comments_Clean'].tolist()

# Final validation - ensure all comments are non-empty strings
comments = [str(comment).strip() for comment in comments if comment and str(comment).strip()]
comments = [comment for comment in comments if len(comment) > 5]

print(f"Number of valid comments after cleaning: {len(comments)}")

if len(comments) < 10:
    raise ValueError(f"Too few valid comments ({len(comments)}). Need at least 10 for topic modeling.")

# Debug: Show sample of cleaned comments
print("\nSample cleaned comments:")
for i, comment in enumerate(comments[:3]):
    print(f"  {i+1}: {comment[:100]}...")
print()

Loading and preprocessing data...
Original number of comments: 978
Number of valid comments after cleaning: 969

Sample cleaned comments:
  1: bat walang gumagalaw kay romualdez eh siya nga pinaka suspicious dyan...
  2: kunyari hindi alam....
  3: question before mag release nang full payment wla man lang inspection?...



## Stopwords

NLTK's English Stopwords + Social Media Stopwords + Custom Tagalog Stopwords

In [3]:
# Enhanced stopwords
eng_stopwords = set(stopwords.words('english'))
with open("tagalog_stopwords.txt", "r", encoding="utf-8") as f:
    tagalog_stopwords = set([line.strip().lower() for line in f if line.strip()])

# Add social media specific stopwords
social_media_stopwords = {
    'lol', 'lmao', 'haha', 'hehe', 'omg', 'wtf', 'tbh', 'imo', 'imho',
    'rt', 'dm', 'pm', 'fb', 'ig', 'twitter', 'facebook', 'instagram',
    'like', 'share', 'comment', 'follow', 'retweet', 'post', 'tagged'
}

all_stopwords = list(eng_stopwords | tagalog_stopwords | social_media_stopwords)
print(f"Total merged stopwords: {len(all_stopwords)}")

Total merged stopwords: 511


# BERTopic Config

TF-IDF Vectorizer – builds vocab (1–2 n-grams), removes stopwords, filters rare/common terms.

UMAP – reduces dimensionality with adaptive neighbors for small datasets.

HDBSCAN – flexible clustering with small min_cluster_size.

Sentence Transformer – multilingual embeddings.

Representation Models – KeyBERT, MMR, and custom c-TF-IDF for keyword extraction.

In [4]:
# Improved vectorizer with TF-IDF
n_docs = len(comments)

# Calculate min_df and max_df properly to avoid conflicts
min_df_value = 2
max_df_value = 0.8

vectorizer_model = TfidfVectorizer(
    stop_words=all_stopwords,
    ngram_range=(1, 2),
    min_df=min_df_value,
    max_df=max_df_value,
    max_features=5000,
    token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z]+\b',  # Only alphabetic tokens with 2+ chars
    lowercase=True,
    strip_accents='unicode'
)

# Optimized UMAP parameters for small dataset
umap_model = UMAP(
    n_neighbors=min(10, max(2, int(n_docs * 0.015))),  # Adaptive to dataset size
    n_components=5,  # Reduced dimensions for small dataset
    min_dist=0.1,  # Slightly higher to prevent over-clustering
    metric='cosine',
    random_state=42
)

# Optimized HDBSCAN parameters
vec = vectorizer_model.fit(comments)
X = vectorizer_model.transform(comments)
min_cluster_size = 2
print("Vocab size:", len(vec.vocabulary_))
print("Document-term matrix shape:", X.shape)
print(f"Using min_cluster_size: {min_cluster_size}")

hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=1,  # More flexible clustering
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
    cluster_selection_epsilon=0.1  # Allow more flexible cluster selection
)

# Better sentence transformer for multilingual content
sentence_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Enhanced representation models
keybert_model = KeyBERTInspired()
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Custom c-TF-IDF model
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=False)

Vocab size: 1440
Document-term matrix shape: (969, 1440)
Using min_cluster_size: 2


## Create the model and train

In [5]:
topic_model = BERTopic(
    embedding_model=sentence_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    language=None,
    calculate_probabilities=True,
    verbose=True,
    nr_topics="auto"
)
print("Training BERTopic model...")
topics, probs = topic_model.fit_transform(comments)

2025-09-20 21:49:25,527 - BERTopic - Embedding - Transforming documents to embeddings.


Training BERTopic model...


Batches: 100%|██████████| 31/31 [00:34<00:00,  1.12s/it]
2025-09-20 21:50:00,159 - BERTopic - Embedding - Completed ✓
2025-09-20 21:50:00,160 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-20 21:50:15,933 - BERTopic - Dimensionality - Completed ✓
2025-09-20 21:50:15,936 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-20 21:50:16,041 - BERTopic - Cluster - Completed ✓
2025-09-20 21:50:16,043 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-09-20 21:50:16,145 - BERTopic - Representation - Completed ✓
2025-09-20 21:50:16,146 - BERTopic - Topic reduction - Reducing number of topics
2025-09-20 21:50:16,152 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-20 21:50:16,200 - BERTopic - Representation - Completed ✓
2025-09-20 21:50:16,202 - BERTopic - Topic reduction - Reduced number of topics from 11 to 9


### Save topic info

In [6]:
# Save full topic info DataFrame
topic_model.get_topic_info().to_csv("topic_info_BERT_v2.csv", index=False)

# Save top words for each topic
import pandas as pd
rows = []
for topic_num in topic_model.get_topic_info()['Topic']:
    if topic_num == -1:  # skip outliers
        continue
    words = topic_model.get_topic(topic_num)
    rows.append({
        "Topic": topic_num,
        "Keywords": ", ".join([w for w, _ in words])
    })

pd.DataFrame(rows).to_csv("topic_keywords_BERT_v2.csv", index=False)

In [7]:
# Get topic information
topic_info = topic_model.get_topic_info()
print(f"\nNumber of topics found: {len(topic_info) - 1}")  # -1 to exclude outlier topic
print(f"Number of outliers (topic -1): {sum(1 for t in topics if t == -1)}")

# Display topic information
print("\nTopic Information:")
for i, row in topic_info.head(10).iterrows():
    print(f"Topic {row['Topic']}: {row['Count']} documents")
    if row['Topic'] != -1:  # Skip outlier topic
        words = topic_model.get_topic(row['Topic'])
        print(f"  Top words: {', '.join([word for word, _ in words[:10]])}")
    print()

# Calculate and display coherence scores (if available)
try:
    from gensim.corpora import Dictionary
    from gensim.models import CoherenceModel
    from gensim.utils import simple_preprocess
    
    # Prepare documents for coherence calculation
    processed_docs = [simple_preprocess(doc) for doc in comments]
    dictionary = Dictionary(processed_docs)
    
    # Get topics for coherence calculation
    topics_for_coherence = []
    for topic_id in range(len(topic_info) - 1):  # Exclude outlier topic
        if topic_id != -1:
            topic_words = [word for word, _ in topic_model.get_topic(topic_id)]
            topics_for_coherence.append(topic_words[:10])  # Top 10 words per topic
    
    if topics_for_coherence:
        # Calculate coherence
        coherence_model_cv = CoherenceModel(
            topics=topics_for_coherence, 
            texts=processed_docs, 
            dictionary=dictionary, 
            coherence='c_v'
        )
        coherence_cv = coherence_model_cv.get_coherence()
        
        coherence_model_umass = CoherenceModel(
            topics=topics_for_coherence, 
            texts=processed_docs, 
            dictionary=dictionary, 
            coherence='u_mass'
        )
        coherence_umass = coherence_model_umass.get_coherence()
        
        print(f"Overall Coherence Scores:")
        print(f"C_v: {coherence_cv:.4f}")
        print(f"UMass: {coherence_umass:.4f}")
        
except ImportError:
    print("Gensim not available for coherence calculation. Install with: pip install gensim")
except Exception as e:
    print(f"Could not calculate coherence scores: {e}")

# Additional model analysis
print(f"\nModel Statistics:")
print(f"Total documents: {len(comments)}")
print(f"Average documents per topic: {len(comments) / max(1, len(set(topics)) - (1 if -1 in topics else 0)):.1f}")



Number of topics found: 8
Number of outliers (topic -1): 124

Topic Information:
Topic -1: 124 documents

Topic 0: 394 documents
  Top words: alam, pilipinas, tapos, galing, pilipino, makukulong, kawawa, senado, bayan, inspection

Topic 1: 183 documents
  Top words: talk, alcantara, good, bangag, coming, thank, romualdez, sabay, tiba, properties

Topic 2: 103 documents
  Top words: state, witness, coa, congressman, senador, senate, presidente, mayor, magalong, hearing

Topic 3: 69 documents
  Top words: corruption, take, government, people, corrupt, even, money, billions, filipinos, philippine

Topic 4: 53 documents
  Top words: flood control, flood, control, control project, control projects, projects, project, budget, sapat, admin

Topic 5: 19 documents
  Top words: death penalty, penalty, death, jail, prison, come, put, execution, bitayin, money

Topic 6: 13 documents
  Top words: freeze, assets, freeze assets, seize, seize assets, escape, list, came, tax, lumabas

Topic 7: 11 docu

In [8]:
topic_model.save("models/BERTv2_model_1")



In [20]:
# Save full topic info DataFrame
topic_model.get_topic_info().to_csv("topic_info_BERTv2_merged.csv", index=False)

# Save top words for each topic
import pandas as pd
rows = []
for topic_num in topic_model.get_topic_info()['Topic']:
    if topic_num == -1:  # skip outliers
        continue
    words = topic_model.get_topic(topic_num)
    rows.append({
        "Topic": topic_num,
        "Keywords": ", ".join([w for w, _ in words])
    })

pd.DataFrame(rows).to_csv("topic_keywords_BERTv2_merged.csv", index=False)

In [None]:
def topic_diversity(topic_model):
    """
    Calculate topic diversity: proportion of unique words across all topic keywords.
    """
    topics = topic_model.get_topics()
    all_words = []
    for topic in topics.values():
        words = [word for word, _ in topic]
        all_words.extend(words)
    unique_words = set(all_words)
    return len(unique_words) / len(all_words) if all_words else 0.0


def topic_silhouette(embeddings, topics):
    """
    Compute silhouette score for topic assignments, excluding outliers (-1).
    Handles list-based topic labels.
    """
    topics = np.array(topics)  # convert list to array
    
    # Exclude outliers
    mask = topics != -1
    if np.sum(mask) < 2 or len(np.unique(topics[mask])) < 2:
        return None
    
    return silhouette_score(embeddings[mask], topics[mask])


def intra_topic_distance(topic_model, embeddings, documents):
    """
    Average pairwise distance within each topic.
    """
    from sklearn.metrics.pairwise import cosine_distances
    topics, _ = topic_model.transform(documents)
    unique_topics = set(topics) - {-1}

    intra_distances = []
    for t in unique_topics:
        idx = np.where(topics == t)[0]
        if len(idx) > 1:
            dists = cosine_distances(embeddings[idx])
            intra_distances.append(np.mean(dists))
    return np.mean(intra_distances) if intra_distances else None


def inter_topic_distance(topic_model):
    """
    Distance between topic centroids (based on c-TF-IDF representations).
    """
    import numpy as np
    from sklearn.metrics.pairwise import cosine_distances

    topic_embeddings = topic_model.c_tf_idf_
    if topic_embeddings is None:
        return None
    dists = cosine_distances(topic_embeddings)
    return np.mean(dists)


In [14]:
# Get embeddings
embeddings = sentence_model.encode(comments, show_progress_bar=True)

# Run metrics
div_score = topic_diversity(topic_model)
sil_score = topic_silhouette(embeddings, topics)
intra_dist = intra_topic_distance(topic_model, embeddings, comments)
inter_dist = inter_topic_distance(topic_model)

print("Topic Diversity:", div_score)
print("Silhouette Score:", sil_score)
print("Intra-topic Distance:", intra_dist)
print("Inter-topic Distance:", inter_dist)

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Batches: 100%|██████████| 31/31 [00:33<00:00,  1.07s/it]
Batches: 100%|██████████| 31/31 [00:32<00:00,  1.04s/it]
2025-09-20 21:58:07,317 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-09-20 21:58:07,326 - BERTopic - Dimensionality - Completed ✓
2025-09-20 21:58:07,329 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-09-20 21:58:07,369 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-09-20 21:58:07,483 - BERTopic - Probabilities - Completed ✓
2025-09-20 21:58:07,484 - BERTopic - Cluster - Completed ✓


Topic Diversity: 0.9888888888888889
Silhouette Score: 0.0683543011546135
Intra-topic Distance: 0.4795085
Inter-topic Distance: 0.7339283702415055
