In [1]:
import os
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.metrics import silhouette_score
import random
from itertools import product
from sentence_transformers import SentenceTransformer, models

def topic_modeling(platform, label, original_cols, stopwords, min_df=1, max_df=0.9,
                   n_neighbors=15, min_cluster_size=30, min_samples=10, nr_topics='auto',
                   use_pretrained_embed=True, grid_search=False, n_samples=100):
    
    # Set paths
    cur_dir = os.getcwd()
    target_path = os.path.join(cur_dir, 'data', f'{platform}_data', 'berttopic_label', f'{label}')
    label_df = pd.read_csv(os.path.join(target_path, "label_df.csv"))

    # Load documents
    with open(os.path.join(target_path, 'unlemma_dc'), 'r', encoding='utf-8') as f:
        unlemma_dc = [line.strip() for line in f if line.strip()]
    if not unlemma_dc:
        print(f"No valid documents remain after preprocessing for label '{label}'.")

    with open(os.path.join(target_path, 'lemma_dc'), 'r', encoding='utf-8') as f:
        lemma_dc = [line.strip() for line in f if line.strip()]
    if not lemma_dc:
        print(f"No lemmatized documents found for label '{label}'.")

    # Load embeddings
    embedding = np.load(os.path.join(target_path, 'embedding.npy'))
    if embedding.size == 0 or len(embedding) != len(lemma_dc):
        print(f"Embedding generation failed or mismatched for label '{label}': {len(embedding)} embeddings, {len(lemma_dc)} documents.")
    reduced_embedding = np.load(os.path.join(target_path, 'reduce_embedding.npy'))
    if reduced_embedding.size == 0:
        print(f"Embedding generation failed for label '{label}'.")

    # Custom function to filter n-grams from vocabulary
    def filter_ngrams_vocabulary(vectorizer, documents, unwanted_ngrams):
        vectorizer.fit(documents)
        vocab = vectorizer.get_feature_names_out()
        filtered_vocab = [term for term in vocab if term not in unwanted_ngrams]
        new_vectorizer = CountVectorizer(
            ngram_range=vectorizer.ngram_range,
            stop_words=vectorizer.stop_words,
            min_df=vectorizer.min_df,
            max_df=vectorizer.max_df,
            vocabulary=filtered_vocab
        )
        return new_vectorizer

    # Evaluation functions
    def extract_topic_words(topics_dict, topk: int = 10):
        topics_clean = []
        for tid, pairs in topics_dict.items():
            if tid == -1:
                continue
            topic_words = []
            for word, _ in pairs[:topk]:
                split_words = word.strip().split()
                topic_words.extend(split_words)
            if topic_words:
                topics_clean.append(list(dict.fromkeys(topic_words)))
        return topics_clean

    def topic_coherence(topics_list, docs, topk: int = 10):
        dictionary = Dictionary(doc.split() for doc in docs)
        coherence_model = CoherenceModel(
            topics=topics_list,
            texts=[doc.split() for doc in docs],
            dictionary=dictionary,
            coherence="c_v",
            topn=topk,
        )
        return coherence_model.get_coherence()

    def topic_diversity(topics_dict, topk: int = 10):
        all_words = [w for _, pairs in topics_dict.items() if _ != -1
                     for w, _ in pairs[:topk]]
        return len(set(all_words)) / (len(topics_dict) * topk)

    def topic_silhouette(embeddings, topics_labels):
        valid_idx = [i for i, t in enumerate(topics_labels) if t != -1]
        X_valid = embeddings[valid_idx]
        y_valid = np.array(topics_labels)[valid_idx]
        if len(np.unique(y_valid)) < 2:
            return 0.0
        return silhouette_score(X_valid, y_valid, metric="cosine")

    # Initialize SentenceTransformer model
    sentence_transformer_model = models.Transformer(
        model_name_or_path="mental/mental-bert-base-uncased",
        tokenizer_name_or_path="mental/mental-bert-base-uncased",
        max_seq_length=512
    )
    pooling_model = models.Pooling(
        sentence_transformer_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True
    )
    mentalbert_sentence_model = SentenceTransformer(device="cuda", modules=[sentence_transformer_model, pooling_model])

    # Grid search logic
    if grid_search:
        # Define hyperparameter ranges
        max_df_range = np.linspace(0.4, 0.95, num=12)  # 12 values between 0.4 and 0.95
        n_neighbors_range = np.arange(5, 36, 5)  # 5 to 35, step 5
        min_samples_range = np.arange(5, 41, 5)  # 5 to 40, step 5

        # Generate random sample of parameter combinations
        param_combinations = list(product(max_df_range, n_neighbors_range, min_samples_range))
        if n_samples > len(param_combinations):
            n_samples = len(param_combinations)
        sampled_combinations = random.sample(param_combinations, n_samples)

        results = []
        for idx, (max_df_val, n_neighbors_val, min_samples_val) in enumerate(sampled_combinations):
            print(f"Testing combination {idx+1}/{n_samples}: max_df={max_df_val:.3f}, n_neighbors={n_neighbors_val}, min_samples={min_samples_val}")

            # Initialize models with current parameters
            vectorizer_model = CountVectorizer(
                ngram_range=(1, 3),
                stop_words='english',
                min_df=min_df,
                max_df=max_df_val
            )
            vectorizer_model = filter_ngrams_vocabulary(vectorizer_model, lemma_dc, stopwords)

            umap_model = UMAP(
                n_neighbors=n_neighbors_val,
                n_components=5,
                min_dist=0.0,
                metric='cosine',
                random_state=42
            )

            hdbscan_model = HDBSCAN(
                min_cluster_size=min_cluster_size,
                min_samples=min_samples_val,
                metric='euclidean',
                prediction_data=True
            )

            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            representation_model = MaximalMarginalRelevance(diversity=0.3)

            # Initialize and fit BERTopic
            topic_model = BERTopic(
                embedding_model=mentalbert_sentence_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                vectorizer_model=vectorizer_model,
                ctfidf_model=ctfidf_model,
                representation_model=representation_model,
                top_n_words=10,
                nr_topics=nr_topics,
                calculate_probabilities=True
            )

            # Fit model
            if use_pretrained_embed:
                topics, probs = topic_model.fit_transform(documents=lemma_dc, embeddings=embedding)
            else:
                topics, probs = topic_model.fit_transform(documents=lemma_dc)

            # Reduce outliers
            num_outliers = np.sum(np.array(topics) == -1)
            if num_outliers > 0:
                new_topics = topic_model.reduce_outliers(
                    documents=lemma_dc,
                    topics=topics,
                    probabilities=probs,
                    strategy="probabilities",
                    threshold=0.6
                )
                if num_outliers != new_topics.count(-1):
                    topic_model.update_topics(lemma_dc, topics=new_topics, vectorizer_model=vectorizer_model,
                                              ctfidf_model=ctfidf_model, representation_model=representation_model)

            # Evaluate
            topics_dict = topic_model.get_topics()
            topics_labels = topics
            topics_list = extract_topic_words(topics_dict, topk=10)

            coh = topic_coherence(topics_list, lemma_dc, topk=10)
            div = topic_diversity(topics_dict, topk=10)
            sil = topic_silhouette(embedding, topics_labels)

            # Store results
            results.append({
                'max_df': max_df_val,
                'n_neighbors': n_neighbors_val,
                'min_samples': min_samples_val,
                'coherence': coh,
                'diversity': div,
                'silhouette': sil,
                'combined_score': (coh + div + sil) / 3  # Simple average for combined score
            })

        # Sort results by combined score and select top 5
        results = sorted(results, key=lambda x: x['combined_score'], reverse=True)
        top_5_results = results[:5]

        print("\nTop 5 Parameter Combinations:")
        for i, res in enumerate(top_5_results, 1):
            print(f"\nRank {i}:")
            print(f"max_df: {res['max_df']:.3f}, n_neighbors: {res['n_neighbors']}, min_samples: {res['min_samples']}")
            print(f"Coherence: {res['coherence']:.4f}, Diversity: {res['diversity']:.4f}, Silhouette: {res['silhouette']:.4f}")
            print(f"Combined Score: {res['combined_score']:.4f}")

        # Save results to CSV
        results_df = pd.DataFrame(results)
        output_path = os.path.join(cur_dir, f'topic_modeling_results_{label.replace(" ", "_").lower()}.csv')
        results_df.to_csv(output_path, index=False)
        print(f"\nGrid search results saved to: {output_path}")

        return top_5_results

    else:
        # Original code for single run
        vectorizer_model = CountVectorizer(
            ngram_range=(1, 3),
            stop_words='english',
            min_df=min_df,
            max_df=max_df
        )
        vectorizer_model = filter_ngrams_vocabulary(vectorizer_model, lemma_dc, stopwords)

        umap_model = UMAP(
            n_neighbors=n_neighbors,
            n_components=5,
            min_dist=0.0,
            metric='cosine',
            random_state=42
        )

        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean',
            prediction_data=True
        )

        ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
        representation_model = MaximalMarginalRelevance(diversity=0.3)

        topic_model = BERTopic(
            embedding_model=mentalbert_sentence_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            representation_model=representation_model,
            top_n_words=10,
            nr_topics=nr_topics,
            calculate_probabilities=True
        )

        if use_pretrained_embed:
            topics, probs = topic_model.fit_transform(documents=lemma_dc, embeddings=embedding)
        else:
            topics, probs = topic_model.fit_transform(documents=lemma_dc)

        num_outliers = np.sum(np.array(topics) == -1)
        if num_outliers > 0:
            new_topics = topic_model.reduce_outliers(
                documents=lemma_dc,
                topics=topics,
                probabilities=probs,
                strategy="probabilities",
                threshold=0.6
            )
            print(f"Before Number of outliers: {num_outliers}")
            print(f"After Number of outliers: {new_topics.count(-1)}")
            if num_outliers != new_topics.count(-1):
                topic_model.update_topics(lemma_dc, topics=new_topics, vectorizer_model=vectorizer_model,
                                          ctfidf_model=ctfidf_model, representation_model=representation_model)
        else:
            print("No outliers found — skipping reduction.")

        topics_dict = topic_model.get_topics()
        topics_labels = topics
        topics_list = extract_topic_words(topics_dict, topk=10)

        coh = topic_coherence(topics_list, lemma_dc, topk=10)
        div = topic_diversity(topics_dict, topk=10)
        sil = topic_silhouette(embedding, topics_labels)

        print(f"Coherence (c_v): {coh:.4f}")
        print(f"Diversity: {div:.4f}")
        print(f"Silhouette (cos): {sil:.4f}")

        return topic_model, topics, probs

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
platform = 'beyondblue'
use_pretrained_embed = True
# # Example usage for Anxiety
# stopwords = ['feel', 'week', 'tng', 'need', 'say', 'hard', 'good', 'sometng',
#              'work', 'help', 'talk', 'ask', 'end', 'start', 'people', 'month',
#              'thought', 'way', 'anytng', 'day', 'make', 'year', 'everytng', 'fly',
#              'experience', 'health', 'drive', 'feeling', 'kind', 'manage', 'mental',
#              'understand', 'mind', 'new', 'hear', 'right', 'lm', 'tell', 'hello', 'body',
#              'meet', 'past', 'self', 'follow', 'try', 'walk', 'wiht', 'use', 'act', 'wle', 'welcome']
# label = "Anxiety"





# # Depression
# stopwords=['sleep', 'kind', 'good', 'love', 'need', 'sometng', 'feel', 'talk', 'home',
#            'tng', 'everytng', 'anytng', 'notng', 'self','people', 'work', 'way',
#             'tell',  'experience', 'wle', 'health','wle','make','help', 'say',
#             'day','right', 'hear', 'lead','thought', 've','end','week','use',
#             'word','ask','come', 'sure','mean','lot' ]
# label = "Depression"



# PTSD and trauma
# stopwords=['tng', 'love', 'good', 'need', 'way',  'feel', 'talk', 'sometng','help',
#            'people', 'work', 'tell',  'hope',  'sorry','hard', 'right','say','end',
#            'week','everytng', 'anytng','gh', 'make','mh','feeling', 'thought', 'situation',
#            'hear', 'long','past', 'like','person', 'mind','ask', 'womb', 'welcome',
#            'use','sort', 'result', 'write','day','mean','friendsp','wle', 'self' ]
# label = "PTSD and trauma"




# Suicidal thoughts and self-harm
stopwords=['tng', 'need', 'good', 'feel', 'talk', 'friend', 'love', 'hear', 'family', 'tell',
           'way', 'make', 'sometng', 'self', 'say',  'end','support', 'post', 'understand',
           'live', 'service', 'experience', 'leave', 'sound', 'welcome', 'long','wle', 'let',
            'mental health', 'anytng', 'everytng','use', 'wonder','thought','anymore','work',
            'hard', 'hope', 'day', 'feeling', 'mental', 'right', 'person','sope', 'reacng',
            'sorry hear','start', 'write','sh thought', 'skill', 'look', 'mean','people', 
            'try', 'care', 'health',  'kind',  'moment','year', 'week', 'ask','mind','lucys',
            'variable','kalice', 'nice story','okpitch', 'sit', 'dear okpitch','notng', 'calli',
            'tiah', 'stay', 'jessksch','tony', 'old','mum', 'beekay','tnke','place', ]
label = "Suicidal thoughts and self-harm"



original_cols = ["Post Title", "Post Content", "Comments"]
min_df = 1
max_df = 0.95
n_neighbors = 15
min_cluster_size = 30
min_samples = 10
nr_topics = 'auto'


# Run grid search
top_5_results = topic_modeling(
    platform=platform,
    label=label,
    original_cols=original_cols,
    stopwords=stopwords,
    min_df=min_df,
    max_df=max_df,
    n_neighbors=n_neighbors,
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    nr_topics=nr_topics,
    use_pretrained_embed=use_pretrained_embed,
    grid_search=True,
    n_samples=30
)

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Testing combination 1/30: max_df=0.900, n_neighbors=5, min_samples=25
Testing combination 2/30: max_df=0.950, n_neighbors=20, min_samples=15




Testing combination 3/30: max_df=0.850, n_neighbors=20, min_samples=10




Testing combination 4/30: max_df=0.450, n_neighbors=15, min_samples=10
Testing combination 5/30: max_df=0.700, n_neighbors=20, min_samples=5




Testing combination 6/30: max_df=0.400, n_neighbors=15, min_samples=20




Testing combination 7/30: max_df=0.450, n_neighbors=25, min_samples=35




Testing combination 8/30: max_df=0.400, n_neighbors=10, min_samples=15
Testing combination 9/30: max_df=0.700, n_neighbors=20, min_samples=10




Testing combination 10/30: max_df=0.650, n_neighbors=5, min_samples=30




Testing combination 11/30: max_df=0.750, n_neighbors=35, min_samples=15
Testing combination 12/30: max_df=0.900, n_neighbors=35, min_samples=20
Testing combination 13/30: max_df=0.750, n_neighbors=10, min_samples=40




Testing combination 14/30: max_df=0.400, n_neighbors=35, min_samples=20
Testing combination 15/30: max_df=0.800, n_neighbors=20, min_samples=20




Testing combination 16/30: max_df=0.900, n_neighbors=15, min_samples=25




Testing combination 17/30: max_df=0.500, n_neighbors=15, min_samples=30
Testing combination 18/30: max_df=0.800, n_neighbors=20, min_samples=35




Testing combination 19/30: max_df=0.400, n_neighbors=30, min_samples=35
Testing combination 20/30: max_df=0.650, n_neighbors=15, min_samples=5
Testing combination 21/30: max_df=0.850, n_neighbors=15, min_samples=30
Testing combination 22/30: max_df=0.450, n_neighbors=15, min_samples=5
Testing combination 23/30: max_df=0.400, n_neighbors=5, min_samples=40
Testing combination 24/30: max_df=0.750, n_neighbors=20, min_samples=10




Testing combination 25/30: max_df=0.400, n_neighbors=30, min_samples=20
Testing combination 26/30: max_df=0.800, n_neighbors=25, min_samples=40




Testing combination 27/30: max_df=0.700, n_neighbors=35, min_samples=15
Testing combination 28/30: max_df=0.400, n_neighbors=30, min_samples=10




Testing combination 29/30: max_df=0.650, n_neighbors=35, min_samples=35




Testing combination 30/30: max_df=0.700, n_neighbors=10, min_samples=30

Top 5 Parameter Combinations:

Rank 1:
max_df: 0.750, n_neighbors: 10, min_samples: 40
Coherence: 0.4437, Diversity: 0.7714, Silhouette: 0.1712
Combined Score: 0.4621

Rank 2:
max_df: 0.450, n_neighbors: 25, min_samples: 35
Coherence: 0.4709, Diversity: 0.7625, Silhouette: 0.0918
Combined Score: 0.4417

Rank 3:
max_df: 0.800, n_neighbors: 20, min_samples: 20
Coherence: 0.4485, Diversity: 0.7750, Silhouette: 0.0988
Combined Score: 0.4408

Rank 4:
max_df: 0.400, n_neighbors: 30, min_samples: 10
Coherence: 0.4620, Diversity: 0.7375, Silhouette: 0.1203
Combined Score: 0.4399

Rank 5:
max_df: 0.700, n_neighbors: 10, min_samples: 30
Coherence: 0.3855, Diversity: 0.8071, Silhouette: 0.0345
Combined Score: 0.4090

Grid search results saved to: e:\Studying in Adelaide\2_Trimester-2\project_A_ML-Mental Health (MDS)\LLM_NetworkMoel_MentalHealth_coding\topic_modeling_results_suicidal_thoughts_and_self-harm.csv
