In [1]:
import os
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.metrics import silhouette_score
import random
from itertools import product
from sentence_transformers import SentenceTransformer, models

def topic_modeling(platform, label, original_cols, stopwords, min_df=1, max_df=0.9,
                   n_neighbors=15, min_cluster_size=30, min_samples=10, nr_topics='auto',
                   use_pretrained_embed=True, n_samples=100):
    
    # Set paths
    cur_dir = os.getcwd()
    target_path = os.path.join(cur_dir, 'data', f'{platform}_data', 'berttopic_label', f'{label}')
    label_df = pd.read_csv(os.path.join(target_path, "label_df.csv"))

    # Load documents
    with open(os.path.join(target_path, 'unlemma_dc'), 'r', encoding='utf-8') as f:
        unlemma_dc = [line.strip() for line in f if line.strip()]
    if not unlemma_dc:
        print(f"No valid documents remain after preprocessing for label '{label}'.")

    with open(os.path.join(target_path, 'lemma_dc'), 'r', encoding='utf-8') as f:
        lemma_dc = [line.strip() for line in f if line.strip()]
    if not lemma_dc:
        print(f"No lemmatized documents found for label '{label}'.")

    # Load embeddings
    embedding = np.load(os.path.join(target_path, 'embedding.npy'))
    if embedding.size == 0 or len(embedding) != len(lemma_dc):
        print(f"Embedding generation failed or mismatched for label '{label}': {len(embedding)} embeddings, {len(lemma_dc)} documents.")
    reduced_embedding = np.load(os.path.join(target_path, 'reduce_embedding.npy'))
    if reduced_embedding.size == 0:
        print(f"Embedding generation failed for label '{label}'.")

    # Custom function to filter n-grams from vocabulary
    def filter_ngrams_vocabulary(vectorizer, documents, unwanted_ngrams):
        vectorizer.fit(documents)
        vocab = vectorizer.get_feature_names_out()
        filtered_vocab = [term for term in vocab if term not in unwanted_ngrams]
        new_vectorizer = CountVectorizer(
            ngram_range=vectorizer.ngram_range,
            stop_words=vectorizer.stop_words,
            min_df=vectorizer.min_df,
            max_df=vectorizer.max_df,
            vocabulary=filtered_vocab
        )
        return new_vectorizer

    # Evaluation functions
    def extract_topic_words(topics_dict, topk: int = 10):
        topics_clean = []
        for tid, pairs in topics_dict.items():
            if tid == -1:
                continue
            topic_words = []
            for word, _ in pairs[:topk]:
                split_words = word.strip().split()
                topic_words.extend(split_words)
            if topic_words:
                topics_clean.append(list(dict.fromkeys(topic_words)))
        return topics_clean

    def topic_coherence(topics_list, docs, topk: int = 10):
        dictionary = Dictionary(doc.split() for doc in docs)
        coherence_model = CoherenceModel(
            topics=topics_list,
            texts=[doc.split() for doc in docs],
            dictionary=dictionary,
            coherence="c_v",
            topn=topk,
        )
        return coherence_model.get_coherence()

    def topic_diversity(topics_dict, topk: int = 10):
        all_words = [w for _, pairs in topics_dict.items() if _ != -1
                     for w, _ in pairs[:topk]]
        return len(set(all_words)) / (len(topics_dict) * topk)

    def topic_silhouette(embeddings, topics_labels):
        valid_idx = [i for i, t in enumerate(topics_labels) if t != -1]
        X_valid = embeddings[valid_idx]
        y_valid = np.array(topics_labels)[valid_idx]
        if len(np.unique(y_valid)) < 2:
            return 0.0
        return silhouette_score(X_valid, y_valid, metric="cosine")

    # Initialize SentenceTransformer model
    sentence_transformer_model = models.Transformer(
        model_name_or_path="mental/mental-bert-base-uncased",
        tokenizer_name_or_path="mental/mental-bert-base-uncased",
        max_seq_length=512
    )
    pooling_model = models.Pooling(
        sentence_transformer_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True
    )
    mentalbert_sentence_model = SentenceTransformer(device="cuda", modules=[sentence_transformer_model, pooling_model])

    # Grid search logic

    # Define hyperparameter ranges
    max_df_range = np.linspace(0.4, 0.95, num=12)  # 12 values between 0.4 and 0.95
    min_cluster_size_range = np.arange(30, 61, 5)  # 30 to 60, step 5
    min_samples_range = np.arange(15, 36, 5)  # 15 to 35, step 5

    # Generate random sample of parameter combinations
    param_combinations = list(product(max_df_range, min_cluster_size_range, min_samples_range))
    if n_samples > len(param_combinations):
        n_samples = len(param_combinations)
    sampled_combinations = random.sample(param_combinations, n_samples)

    results = []
    for idx, (max_df_val, min_cluster_size_val, min_samples_val) in enumerate(sampled_combinations):
        print(f"Testing combination {idx+1}/{n_samples}: max_df={max_df_val:.3f}, min_cluster_size={min_cluster_size_val}, min_samples={min_samples_val}")

        # Initialize models with current parameters
        vectorizer_model = CountVectorizer(
            ngram_range=(1, 3),
            stop_words='english',
            min_df=min_df,
            max_df=max_df_val
        )
        vectorizer_model = filter_ngrams_vocabulary(vectorizer_model, lemma_dc, stopwords)

        umap_model = UMAP(
            n_neighbors=n_neighbors,
            n_components=5,
            min_dist=0.0,
            metric='cosine',
            random_state=42
        )

        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size_val,
            min_samples=min_samples_val,
            metric='euclidean',
            prediction_data=True
        )

        ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
        representation_model = MaximalMarginalRelevance(diversity=0.3)

        # Initialize and fit BERTopic
        topic_model = BERTopic(
            embedding_model=mentalbert_sentence_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            representation_model=representation_model,
            top_n_words=10,
            nr_topics=nr_topics,
            calculate_probabilities=True
        )

        # Fit model
        if use_pretrained_embed:
            topics, probs = topic_model.fit_transform(documents=lemma_dc, embeddings=embedding)
        else:
            topics, probs = topic_model.fit_transform(documents=lemma_dc)

        # Reduce outliers
        num_outliers = np.sum(np.array(topics) == -1)
        if num_outliers > 0:
            new_topics = topic_model.reduce_outliers(
                documents=lemma_dc,
                topics=topics,
                probabilities=probs,
                strategy="probabilities",
                threshold=0.6
            )
            if num_outliers != new_topics.count(-1):
                topic_model.update_topics(lemma_dc, topics=new_topics, vectorizer_model=vectorizer_model,
                                            ctfidf_model=ctfidf_model, representation_model=representation_model)

        # Evaluate
        topics_dict = topic_model.get_topics()
        topics_labels = topics
        topics_list = extract_topic_words(topics_dict, topk=10)

        coh = topic_coherence(topics_list, lemma_dc, topk=10)
        div = topic_diversity(topics_dict, topk=10)
        sil = topic_silhouette(embedding, topics_labels)

        # Store results
        results.append({
            'max_df': max_df_val,
            'min_cluster_size': min_cluster_size_val,
            'min_samples': min_samples_val,
            'coherence': coh,
            'diversity': div,
            'silhouette': sil,
            'combined_score': (coh + div + sil) / 3  # Simple average for combined score
        })

    # Sort results by combined score and select top 5
    results = sorted(results, key=lambda x: x['combined_score'], reverse=True)
    top_5_results = results[:5]

    print("\nTop 5 Parameter Combinations:")
    for i, res in enumerate(top_5_results, 1):
        print(f"\nRank {i}:")
        print(f"max_df: {res['max_df']:.3f}, min_cluster_size: {res['min_cluster_size']}, min_samples: {res['min_samples']}")
        print(f"Coherence: {res['coherence']:.4f}, Diversity: {res['diversity']:.4f}, Silhouette: {res['silhouette']:.4f}")
        print(f"Combined Score: {res['combined_score']:.4f}")

    # Save results to CSV
    results_df = pd.DataFrame(results)
    output_path = os.path.join(cur_dir, f'topic_modeling_results_{label.replace(" ", "_").lower()}.csv')
    results_df.to_csv(output_path, index=False)
    print(f"\nGrid search results saved to: {output_path}")

    return top_5_results


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
platform = 'beyondblue'
use_pretrained_embed = True
original_cols = ["Post Title", "Post Content", "Comments"]
min_df = 1
max_df = 0.95
n_neighbors = 15
min_cluster_size = 30
min_samples = 10
nr_topics = 'auto'

categories = [
    {
        "label": "Anxiety",
        "stopwords": [ 'feel', 'week', 'tng', 'need', 'say', 'hard', 'good', 'sometng',
             'work',  'help',  'talk', 'ask', 'end','start', 'people', 'month',
             'thought', 'way', 'anytng','day','make','year','everytng','fly',
             'experience', 'health', 'drive', 'feeling', 'kind', 'manage', 'mental',
             'understand', 'mind', 'new', 'hear', 'right','lm','tell', 'hello','body',
             'meet','past','self','follow','try','walk','wiht','use','act','welcome',
             'support', 'hope',  'post', 'sound', 'long', 'let', 'worry', 'stay', 'situation',
             'lot','mean', 'learn','person', 'friend', 'fear', 'family', 'love', 'share',
             'psychologist', 'mental health', 'change', 'speak', 'read', 'lead','leave',
             'anymore', 'notng', 'remember', 'believe', 'moment', 'step','reacng','sorry hear',
             'kind regard', 'regard sope', 'counsellor', 'kind regard sope', 'openness', 'warm',
             'wasng','lol', 'simply', 'sope', 'dear','soon', 'happy','stop', 'easy', 'happen',
             'point','advice','safe', 'seek', 'idont','tnke', 'listen', 'ksimp', 'thank reacng',
             'psyccs','rx','tl','wle','gh','emij','jt','tonywk','turn','plan','normal','welcome forum',
             
]
    },
    {
        "label": "Depression",
        "stopwords": ['sleep', 'kind', 'good', 'love', 'need', 'sometng', 'feel', 'talk', 'home',
           'tng', 'everytng', 'anytng', 'notng', 'self','people', 'work', 'way',
            'tell',  'experience', 'health','make','help', 'say',
            'day','right', 'hear', 'lead','thought', 've','end','week','use',
            'word','ask','come', 'sure','mean','lot', 'believe','gh' , 'month','situation',
            'psychologist','happy', 'old', 'emotion', 'moment', 'sope','sorry hear', 
            'service', 'regard sope', 'listen', 'welcome forum', 'reacng','lm','ime',
            'think', 'similar', 'step', 'write', 'learn','form', 'ability', 'begin',
            'claim', 'anymore','head','belief','wle','hard', 'story', 'turn','close',
             'wonder','mind','change', 'text', 'mself','kind regard','place', 'point',
            'stay', 'idea','year', 'idk','mental health','sound','know', 'problem','post', 
            'mum','little']
    },
    {
        "label": "PTSD and trauma",
        "stopwords": ['tng', 'love', 'good', 'need', 'way',  'feel', 'talk', 'sometng','help',
           'people', 'work', 'tell',  'hope',  'sorry','hard', 'right','say','end',
           'week','everytng', 'anytng','gh', 'make','mh','feeling', 'thought', 'situation',
           'hear', 'long','past', 'like','person', 'mind','ask', 'womb', 'welcome',
           'use','sort', 'result', 'write','day','mean','friendsp','self','start',
           'let', 'kind', 'mental', 'spl', 'noella','lifea','mental health', 'change', 'believe',
           'happy', 'old', 'reacng','symptom','process', 'moment','dear kate', 'act', 'psychologist',
           'similar', 'cldren','reply', 'stay',  'sorry hear','sope','accept', 'word',
           'matthew', 'sorry hear', 'face', 'maxnotkat','lead', 'service','relationsp','ptsd',
           'share', 'story', 'wle', 'read','know','timtam','pinklightne','hello pinklightne',
           'wiht', 'psycatrist', 'cope', 'place','mum', 'learn','dear', 'dad','month','point',
           'stop', 'question','near', 'practice practice', 'issue','idea', 'reason',
           'jupiter', 'parent','welcome forum','comp','sarah xx','forward', 'method','wonder', 
           'step', 'notng', 'ppl','blue','myprofile', 'speak', 'emotion', 'seek','black cloud', 
            'real','jemma', 'frank', 'year', 'sound', 'think','early'    ]
    },
    {
        "label": "Suicidal thoughts and self-harm",
        "stopwords": ['tng', 'need', 'good', 'feel', 'talk', 'friend', 'love', 'hear', 'family', 'tell',
           'way', 'make', 'sometng', 'self', 'say',  'end','support', 'post', 'understand',
           'live', 'service', 'experience', 'leave', 'sound', 'welcome', 'long', 'let',
            'mental health', 'anytng', 'everytng','use', 'wonder','thought','anymore','work',
            'hard', 'hope', 'day', 'feeling', 'mental', 'right', 'person','sope', 'reacng',
            'sorry hear','start', 'write','sh thought', 'skill', 'look', 'mean','people', 
            'try', 'care', 'health',  'kind',  'moment','year', 'week', 'ask','mind','lucys',
            'variable','kalice', 'nice story','okpitch', 'sit', 'dear okpitch','notng', 'calli',
            'tiah', 'stay', 'jessksch','tony', 'old','mum', 'beekay','tnke','place', 'tonywk','mh',
            'step','feellike','idont','qld','past','month','speak','situation', 'problem',
            'change', 'relationsp', 'happy','believe','listen', 'strength','learn','seek',
             'change','sair','roadsend', 'regard sope','lead', 'plan', 'water','truly','matter',
            'question', 'reason', 'issue','tom','issue', 'read', 'little', 'face', 'idea', 'therapy',
            'world', 'psychologist','undecided uneasy hesitant']
    }
]

# Run grid search for all categories
all_top_5 = {}
for cat in categories:
    print(f"\n\n=== Processing category: {cat['label']} ===\n")
    top_5_results = topic_modeling(
        platform=platform,
        label=cat['label'],
        original_cols=original_cols,
        stopwords=cat['stopwords'],
        min_df=min_df,
        max_df=max_df,
        n_neighbors=n_neighbors,
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        nr_topics=nr_topics,
        use_pretrained_embed=use_pretrained_embed,
        n_samples=100
    )
    all_top_5[cat['label']] = top_5_results



=== Processing category: Anxiety ===



Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Testing combination 1/100: max_df=0.800, min_cluster_size=55, min_samples=30




Testing combination 2/100: max_df=0.950, min_cluster_size=40, min_samples=15




Testing combination 3/100: max_df=0.450, min_cluster_size=30, min_samples=35
Testing combination 4/100: max_df=0.500, min_cluster_size=35, min_samples=20
Testing combination 5/100: max_df=0.750, min_cluster_size=35, min_samples=25
Testing combination 6/100: max_df=0.700, min_cluster_size=40, min_samples=20
Testing combination 7/100: max_df=0.650, min_cluster_size=60, min_samples=35
Testing combination 8/100: max_df=0.900, min_cluster_size=50, min_samples=35




Testing combination 9/100: max_df=0.450, min_cluster_size=60, min_samples=35
Testing combination 10/100: max_df=0.600, min_cluster_size=60, min_samples=25




Testing combination 11/100: max_df=0.500, min_cluster_size=45, min_samples=20




Testing combination 12/100: max_df=0.400, min_cluster_size=30, min_samples=30
Testing combination 13/100: max_df=0.700, min_cluster_size=40, min_samples=30
Testing combination 14/100: max_df=0.550, min_cluster_size=40, min_samples=20
Testing combination 15/100: max_df=0.400, min_cluster_size=50, min_samples=20
Testing combination 16/100: max_df=0.550, min_cluster_size=60, min_samples=15
Testing combination 17/100: max_df=0.550, min_cluster_size=45, min_samples=25




Testing combination 18/100: max_df=0.900, min_cluster_size=45, min_samples=15




Testing combination 19/100: max_df=0.800, min_cluster_size=45, min_samples=30




Testing combination 20/100: max_df=0.750, min_cluster_size=60, min_samples=35
Testing combination 21/100: max_df=0.500, min_cluster_size=35, min_samples=30
Testing combination 22/100: max_df=0.650, min_cluster_size=50, min_samples=25




Testing combination 23/100: max_df=0.650, min_cluster_size=40, min_samples=20
Testing combination 24/100: max_df=0.550, min_cluster_size=55, min_samples=25




Testing combination 25/100: max_df=0.950, min_cluster_size=55, min_samples=20
Testing combination 26/100: max_df=0.450, min_cluster_size=60, min_samples=30
Testing combination 27/100: max_df=0.500, min_cluster_size=50, min_samples=25




Testing combination 28/100: max_df=0.600, min_cluster_size=60, min_samples=30
Testing combination 29/100: max_df=0.600, min_cluster_size=30, min_samples=20
Testing combination 30/100: max_df=0.650, min_cluster_size=55, min_samples=20
Testing combination 31/100: max_df=0.750, min_cluster_size=40, min_samples=35
Testing combination 32/100: max_df=0.450, min_cluster_size=35, min_samples=25
Testing combination 33/100: max_df=0.450, min_cluster_size=45, min_samples=35




Testing combination 34/100: max_df=0.850, min_cluster_size=45, min_samples=30




Testing combination 35/100: max_df=0.900, min_cluster_size=50, min_samples=30




Testing combination 36/100: max_df=0.850, min_cluster_size=55, min_samples=35
Testing combination 37/100: max_df=0.750, min_cluster_size=55, min_samples=30




Testing combination 38/100: max_df=0.800, min_cluster_size=35, min_samples=15




Testing combination 39/100: max_df=0.650, min_cluster_size=50, min_samples=15
Testing combination 40/100: max_df=0.800, min_cluster_size=45, min_samples=35




Testing combination 41/100: max_df=0.750, min_cluster_size=55, min_samples=15
Testing combination 42/100: max_df=0.450, min_cluster_size=45, min_samples=15




Testing combination 43/100: max_df=0.800, min_cluster_size=40, min_samples=35
Testing combination 44/100: max_df=0.550, min_cluster_size=50, min_samples=20
Testing combination 45/100: max_df=0.650, min_cluster_size=60, min_samples=30
Testing combination 46/100: max_df=0.600, min_cluster_size=40, min_samples=15




Testing combination 47/100: max_df=0.450, min_cluster_size=40, min_samples=20
Testing combination 48/100: max_df=0.700, min_cluster_size=55, min_samples=15
Testing combination 49/100: max_df=0.400, min_cluster_size=35, min_samples=15




Testing combination 50/100: max_df=0.850, min_cluster_size=40, min_samples=15




Testing combination 51/100: max_df=0.750, min_cluster_size=35, min_samples=35
Testing combination 52/100: max_df=0.850, min_cluster_size=30, min_samples=25
Testing combination 53/100: max_df=0.400, min_cluster_size=35, min_samples=30
Testing combination 54/100: max_df=0.450, min_cluster_size=40, min_samples=30
Testing combination 55/100: max_df=0.900, min_cluster_size=35, min_samples=30
Testing combination 56/100: max_df=0.750, min_cluster_size=60, min_samples=15
Testing combination 57/100: max_df=0.650, min_cluster_size=35, min_samples=20
Testing combination 58/100: max_df=0.900, min_cluster_size=45, min_samples=20




Testing combination 59/100: max_df=0.600, min_cluster_size=35, min_samples=25
Testing combination 60/100: max_df=0.700, min_cluster_size=35, min_samples=15




Testing combination 61/100: max_df=0.950, min_cluster_size=45, min_samples=15




Testing combination 62/100: max_df=0.650, min_cluster_size=45, min_samples=35




Testing combination 63/100: max_df=0.900, min_cluster_size=40, min_samples=30
Testing combination 64/100: max_df=0.400, min_cluster_size=30, min_samples=20
Testing combination 65/100: max_df=0.900, min_cluster_size=50, min_samples=15
Testing combination 66/100: max_df=0.900, min_cluster_size=35, min_samples=35
Testing combination 67/100: max_df=0.750, min_cluster_size=55, min_samples=25




Testing combination 68/100: max_df=0.950, min_cluster_size=50, min_samples=15
Testing combination 69/100: max_df=0.950, min_cluster_size=30, min_samples=25
Testing combination 70/100: max_df=0.800, min_cluster_size=50, min_samples=15
Testing combination 71/100: max_df=0.850, min_cluster_size=60, min_samples=15
Testing combination 72/100: max_df=0.700, min_cluster_size=50, min_samples=35




Testing combination 73/100: max_df=0.600, min_cluster_size=55, min_samples=35
Testing combination 74/100: max_df=0.450, min_cluster_size=35, min_samples=35
Testing combination 75/100: max_df=0.450, min_cluster_size=40, min_samples=25
Testing combination 76/100: max_df=0.900, min_cluster_size=50, min_samples=20
Testing combination 77/100: max_df=0.800, min_cluster_size=50, min_samples=25




Testing combination 78/100: max_df=0.600, min_cluster_size=30, min_samples=15
Testing combination 79/100: max_df=0.550, min_cluster_size=50, min_samples=30




Testing combination 80/100: max_df=0.800, min_cluster_size=60, min_samples=30
Testing combination 81/100: max_df=0.700, min_cluster_size=55, min_samples=20
Testing combination 82/100: max_df=0.500, min_cluster_size=55, min_samples=30




Testing combination 83/100: max_df=0.600, min_cluster_size=35, min_samples=30
Testing combination 84/100: max_df=0.550, min_cluster_size=45, min_samples=20




Testing combination 85/100: max_df=0.800, min_cluster_size=30, min_samples=20
Testing combination 86/100: max_df=0.900, min_cluster_size=45, min_samples=25




Testing combination 87/100: max_df=0.550, min_cluster_size=60, min_samples=35
Testing combination 88/100: max_df=0.850, min_cluster_size=40, min_samples=20
Testing combination 89/100: max_df=0.850, min_cluster_size=55, min_samples=30




Testing combination 90/100: max_df=0.850, min_cluster_size=55, min_samples=20
Testing combination 91/100: max_df=0.800, min_cluster_size=55, min_samples=20
Testing combination 92/100: max_df=0.500, min_cluster_size=45, min_samples=30




Testing combination 93/100: max_df=0.750, min_cluster_size=50, min_samples=15
Testing combination 94/100: max_df=0.450, min_cluster_size=55, min_samples=30




Testing combination 95/100: max_df=0.850, min_cluster_size=40, min_samples=25
Testing combination 96/100: max_df=0.850, min_cluster_size=45, min_samples=25




Testing combination 97/100: max_df=0.550, min_cluster_size=30, min_samples=35
Testing combination 98/100: max_df=0.950, min_cluster_size=30, min_samples=35
Testing combination 99/100: max_df=0.900, min_cluster_size=40, min_samples=35
Testing combination 100/100: max_df=0.700, min_cluster_size=60, min_samples=30

Top 5 Parameter Combinations:

Rank 1:
max_df: 0.400, min_cluster_size: 30, min_samples: 30
Coherence: 0.4499, Diversity: 1.0000, Silhouette: 0.2913
Combined Score: 0.5804

Rank 2:
max_df: 0.400, min_cluster_size: 35, min_samples: 30
Coherence: 0.4499, Diversity: 1.0000, Silhouette: 0.2913
Combined Score: 0.5804

Rank 3:
max_df: 0.400, min_cluster_size: 30, min_samples: 20
Coherence: 0.4499, Diversity: 1.0000, Silhouette: 0.2913
Combined Score: 0.5804

Rank 4:
max_df: 0.500, min_cluster_size: 35, min_samples: 20
Coherence: 0.4274, Diversity: 1.0000, Silhouette: 0.2913
Combined Score: 0.5729

Rank 5:
max_df: 0.500, min_cluster_size: 35, min_samples: 30
Coherence: 0.4274, Diversi

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Testing combination 1/100: max_df=0.650, min_cluster_size=40, min_samples=30
Testing combination 2/100: max_df=0.550, min_cluster_size=35, min_samples=35
Testing combination 3/100: max_df=0.700, min_cluster_size=60, min_samples=20




Testing combination 4/100: max_df=0.800, min_cluster_size=55, min_samples=30




Testing combination 5/100: max_df=0.850, min_cluster_size=40, min_samples=30
Testing combination 6/100: max_df=0.800, min_cluster_size=50, min_samples=15




Testing combination 7/100: max_df=0.800, min_cluster_size=30, min_samples=15




Testing combination 8/100: max_df=0.500, min_cluster_size=45, min_samples=25




Testing combination 9/100: max_df=0.700, min_cluster_size=35, min_samples=20




Testing combination 10/100: max_df=0.750, min_cluster_size=55, min_samples=25
Testing combination 11/100: max_df=0.850, min_cluster_size=30, min_samples=35




Testing combination 12/100: max_df=0.950, min_cluster_size=55, min_samples=20




Testing combination 13/100: max_df=0.650, min_cluster_size=30, min_samples=35




Testing combination 14/100: max_df=0.600, min_cluster_size=45, min_samples=15




Testing combination 15/100: max_df=0.450, min_cluster_size=35, min_samples=30




Testing combination 16/100: max_df=0.700, min_cluster_size=30, min_samples=35




Testing combination 17/100: max_df=0.500, min_cluster_size=35, min_samples=20




Testing combination 18/100: max_df=0.600, min_cluster_size=45, min_samples=20




Testing combination 19/100: max_df=0.550, min_cluster_size=35, min_samples=15




Testing combination 20/100: max_df=0.650, min_cluster_size=30, min_samples=25




Testing combination 21/100: max_df=0.850, min_cluster_size=35, min_samples=35
Testing combination 22/100: max_df=0.500, min_cluster_size=40, min_samples=35
Testing combination 23/100: max_df=0.600, min_cluster_size=30, min_samples=30
Testing combination 24/100: max_df=0.500, min_cluster_size=40, min_samples=25




Testing combination 25/100: max_df=0.450, min_cluster_size=50, min_samples=30
Testing combination 26/100: max_df=0.850, min_cluster_size=30, min_samples=25




Testing combination 27/100: max_df=0.700, min_cluster_size=30, min_samples=30
Testing combination 28/100: max_df=0.400, min_cluster_size=40, min_samples=25




Testing combination 29/100: max_df=0.700, min_cluster_size=35, min_samples=15




Testing combination 30/100: max_df=0.650, min_cluster_size=50, min_samples=15




Testing combination 31/100: max_df=0.800, min_cluster_size=45, min_samples=35
Testing combination 32/100: max_df=0.600, min_cluster_size=30, min_samples=25




Testing combination 33/100: max_df=0.600, min_cluster_size=40, min_samples=35
Testing combination 34/100: max_df=0.650, min_cluster_size=45, min_samples=30
Testing combination 35/100: max_df=0.450, min_cluster_size=50, min_samples=25
Testing combination 36/100: max_df=0.800, min_cluster_size=50, min_samples=30
Testing combination 37/100: max_df=0.850, min_cluster_size=45, min_samples=20




Testing combination 38/100: max_df=0.400, min_cluster_size=40, min_samples=35
Testing combination 39/100: max_df=0.900, min_cluster_size=35, min_samples=20




Testing combination 40/100: max_df=0.850, min_cluster_size=40, min_samples=20




Testing combination 41/100: max_df=0.800, min_cluster_size=55, min_samples=35




Testing combination 42/100: max_df=0.800, min_cluster_size=30, min_samples=30
Testing combination 43/100: max_df=0.900, min_cluster_size=45, min_samples=30
Testing combination 44/100: max_df=0.900, min_cluster_size=30, min_samples=25




Testing combination 45/100: max_df=0.950, min_cluster_size=40, min_samples=35
Testing combination 46/100: max_df=0.950, min_cluster_size=40, min_samples=25




Testing combination 47/100: max_df=0.750, min_cluster_size=50, min_samples=15




Testing combination 48/100: max_df=0.400, min_cluster_size=55, min_samples=20




Testing combination 49/100: max_df=0.700, min_cluster_size=55, min_samples=15




Testing combination 50/100: max_df=0.550, min_cluster_size=40, min_samples=20




Testing combination 51/100: max_df=0.750, min_cluster_size=40, min_samples=35
Testing combination 52/100: max_df=0.550, min_cluster_size=30, min_samples=30
Testing combination 53/100: max_df=0.500, min_cluster_size=60, min_samples=20




Testing combination 54/100: max_df=0.450, min_cluster_size=40, min_samples=25




Testing combination 55/100: max_df=0.850, min_cluster_size=50, min_samples=35
Testing combination 56/100: max_df=0.800, min_cluster_size=60, min_samples=20




Testing combination 57/100: max_df=0.600, min_cluster_size=45, min_samples=35
Testing combination 58/100: max_df=0.900, min_cluster_size=40, min_samples=20




Testing combination 59/100: max_df=0.400, min_cluster_size=40, min_samples=20




Testing combination 60/100: max_df=0.550, min_cluster_size=35, min_samples=25




Testing combination 61/100: max_df=0.900, min_cluster_size=45, min_samples=20




Testing combination 62/100: max_df=0.850, min_cluster_size=40, min_samples=15




Testing combination 63/100: max_df=0.900, min_cluster_size=45, min_samples=25




Testing combination 64/100: max_df=0.650, min_cluster_size=60, min_samples=30




Testing combination 65/100: max_df=0.850, min_cluster_size=40, min_samples=25




Testing combination 66/100: max_df=0.850, min_cluster_size=60, min_samples=15




Testing combination 67/100: max_df=0.800, min_cluster_size=55, min_samples=15




Testing combination 68/100: max_df=0.700, min_cluster_size=60, min_samples=25




Testing combination 69/100: max_df=0.750, min_cluster_size=40, min_samples=30
Testing combination 70/100: max_df=0.450, min_cluster_size=60, min_samples=25




Testing combination 71/100: max_df=0.700, min_cluster_size=50, min_samples=20




Testing combination 72/100: max_df=0.650, min_cluster_size=45, min_samples=35
Testing combination 73/100: max_df=0.500, min_cluster_size=30, min_samples=15




Testing combination 74/100: max_df=0.900, min_cluster_size=35, min_samples=35
Testing combination 75/100: max_df=0.650, min_cluster_size=50, min_samples=20




Testing combination 76/100: max_df=0.950, min_cluster_size=30, min_samples=30
Testing combination 77/100: max_df=0.800, min_cluster_size=50, min_samples=20




Testing combination 78/100: max_df=0.900, min_cluster_size=45, min_samples=15




Testing combination 79/100: max_df=0.750, min_cluster_size=30, min_samples=25




Testing combination 80/100: max_df=0.900, min_cluster_size=50, min_samples=25
Testing combination 81/100: max_df=0.700, min_cluster_size=45, min_samples=20




Testing combination 82/100: max_df=0.950, min_cluster_size=30, min_samples=15




Testing combination 83/100: max_df=0.750, min_cluster_size=55, min_samples=30




Testing combination 84/100: max_df=0.500, min_cluster_size=50, min_samples=15




Testing combination 85/100: max_df=0.450, min_cluster_size=45, min_samples=30
Testing combination 86/100: max_df=0.450, min_cluster_size=30, min_samples=30
Testing combination 87/100: max_df=0.600, min_cluster_size=60, min_samples=30




Testing combination 88/100: max_df=0.550, min_cluster_size=60, min_samples=25




Testing combination 89/100: max_df=0.650, min_cluster_size=30, min_samples=20




Testing combination 90/100: max_df=0.700, min_cluster_size=40, min_samples=30
Testing combination 91/100: max_df=0.750, min_cluster_size=35, min_samples=35
Testing combination 92/100: max_df=0.950, min_cluster_size=55, min_samples=15




Testing combination 93/100: max_df=0.400, min_cluster_size=50, min_samples=20




Testing combination 94/100: max_df=0.550, min_cluster_size=50, min_samples=25
Testing combination 95/100: max_df=0.800, min_cluster_size=40, min_samples=35
Testing combination 96/100: max_df=0.800, min_cluster_size=40, min_samples=20




Testing combination 97/100: max_df=0.700, min_cluster_size=50, min_samples=25
Testing combination 98/100: max_df=0.550, min_cluster_size=45, min_samples=15




Testing combination 99/100: max_df=0.850, min_cluster_size=30, min_samples=30
Testing combination 100/100: max_df=0.550, min_cluster_size=60, min_samples=30





Top 5 Parameter Combinations:

Rank 1:
max_df: 0.450, min_cluster_size: 35, min_samples: 30
Coherence: 0.5018, Diversity: 0.7714, Silhouette: 0.0297
Combined Score: 0.4343

Rank 2:
max_df: 0.500, min_cluster_size: 40, min_samples: 35
Coherence: 0.4813, Diversity: 0.7267, Silhouette: 0.0075
Combined Score: 0.4052

Rank 3:
max_df: 0.500, min_cluster_size: 30, min_samples: 15
Coherence: 0.4395, Diversity: 0.8091, Silhouette: -0.0355
Combined Score: 0.4044

Rank 4:
max_df: 0.450, min_cluster_size: 60, min_samples: 25
Coherence: 0.4691, Diversity: 0.7000, Silhouette: 0.0200
Combined Score: 0.3964

Rank 5:
max_df: 0.450, min_cluster_size: 50, min_samples: 25
Coherence: 0.4574, Diversity: 0.7000, Silhouette: 0.0200
Combined Score: 0.3925

Grid search results saved to: e:\Studying in Adelaide\2_Trimester-2\project_A_ML-Mental Health (MDS)\LLM_NetworkMoel_MentalHealth_coding\topic_modeling_results_depression.csv


=== Processing category: PTSD and trauma ===



Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Testing combination 1/100: max_df=0.800, min_cluster_size=45, min_samples=15




Testing combination 2/100: max_df=0.500, min_cluster_size=55, min_samples=25
Testing combination 3/100: max_df=0.950, min_cluster_size=55, min_samples=35
Testing combination 4/100: max_df=0.950, min_cluster_size=35, min_samples=35
Testing combination 5/100: max_df=0.850, min_cluster_size=45, min_samples=25
Testing combination 6/100: max_df=0.800, min_cluster_size=55, min_samples=20
Testing combination 7/100: max_df=0.800, min_cluster_size=60, min_samples=35
Testing combination 8/100: max_df=0.750, min_cluster_size=40, min_samples=20




Testing combination 9/100: max_df=0.950, min_cluster_size=40, min_samples=30
Testing combination 10/100: max_df=0.950, min_cluster_size=45, min_samples=20




Testing combination 11/100: max_df=0.700, min_cluster_size=55, min_samples=15
Testing combination 12/100: max_df=0.750, min_cluster_size=45, min_samples=20




Testing combination 13/100: max_df=0.400, min_cluster_size=30, min_samples=25




Testing combination 14/100: max_df=0.450, min_cluster_size=60, min_samples=25
Testing combination 15/100: max_df=0.400, min_cluster_size=45, min_samples=25
Testing combination 16/100: max_df=0.850, min_cluster_size=40, min_samples=20




Testing combination 17/100: max_df=0.950, min_cluster_size=50, min_samples=25
Testing combination 18/100: max_df=0.450, min_cluster_size=55, min_samples=20
Testing combination 19/100: max_df=0.550, min_cluster_size=45, min_samples=25
Testing combination 20/100: max_df=0.500, min_cluster_size=45, min_samples=30
Testing combination 21/100: max_df=0.450, min_cluster_size=30, min_samples=30




Testing combination 22/100: max_df=0.450, min_cluster_size=35, min_samples=15
Testing combination 23/100: max_df=0.600, min_cluster_size=35, min_samples=20




Testing combination 24/100: max_df=0.800, min_cluster_size=50, min_samples=20




Testing combination 25/100: max_df=0.450, min_cluster_size=60, min_samples=30
Testing combination 26/100: max_df=0.950, min_cluster_size=30, min_samples=15




Testing combination 27/100: max_df=0.400, min_cluster_size=40, min_samples=20




Testing combination 28/100: max_df=0.550, min_cluster_size=35, min_samples=15
Testing combination 29/100: max_df=0.600, min_cluster_size=45, min_samples=15




Testing combination 30/100: max_df=0.650, min_cluster_size=55, min_samples=25
Testing combination 31/100: max_df=0.800, min_cluster_size=35, min_samples=35
Testing combination 32/100: max_df=0.400, min_cluster_size=60, min_samples=20
Testing combination 33/100: max_df=0.600, min_cluster_size=45, min_samples=20




Testing combination 34/100: max_df=0.400, min_cluster_size=55, min_samples=35
Testing combination 35/100: max_df=0.650, min_cluster_size=50, min_samples=35
Testing combination 36/100: max_df=0.700, min_cluster_size=45, min_samples=25
Testing combination 37/100: max_df=0.900, min_cluster_size=50, min_samples=15




Testing combination 38/100: max_df=0.800, min_cluster_size=50, min_samples=35
Testing combination 39/100: max_df=0.600, min_cluster_size=30, min_samples=30




Testing combination 40/100: max_df=0.500, min_cluster_size=45, min_samples=15




Testing combination 41/100: max_df=0.500, min_cluster_size=60, min_samples=25
Testing combination 42/100: max_df=0.850, min_cluster_size=45, min_samples=20




Testing combination 43/100: max_df=0.700, min_cluster_size=35, min_samples=15
Testing combination 44/100: max_df=0.650, min_cluster_size=35, min_samples=30




Testing combination 45/100: max_df=0.500, min_cluster_size=60, min_samples=30
Testing combination 46/100: max_df=0.550, min_cluster_size=50, min_samples=35
Testing combination 47/100: max_df=0.500, min_cluster_size=45, min_samples=25
Testing combination 48/100: max_df=0.750, min_cluster_size=60, min_samples=35
Testing combination 49/100: max_df=0.900, min_cluster_size=40, min_samples=20




Testing combination 50/100: max_df=0.450, min_cluster_size=40, min_samples=30
Testing combination 51/100: max_df=0.650, min_cluster_size=35, min_samples=20




Testing combination 52/100: max_df=0.600, min_cluster_size=45, min_samples=25
Testing combination 53/100: max_df=0.650, min_cluster_size=50, min_samples=15




Testing combination 54/100: max_df=0.900, min_cluster_size=30, min_samples=35
Testing combination 55/100: max_df=0.400, min_cluster_size=50, min_samples=15




Testing combination 56/100: max_df=0.900, min_cluster_size=55, min_samples=15
Testing combination 57/100: max_df=0.850, min_cluster_size=55, min_samples=25
Testing combination 58/100: max_df=0.700, min_cluster_size=60, min_samples=35
Testing combination 59/100: max_df=0.500, min_cluster_size=40, min_samples=30
Testing combination 60/100: max_df=0.700, min_cluster_size=40, min_samples=20




Testing combination 61/100: max_df=0.500, min_cluster_size=55, min_samples=30




Testing combination 62/100: max_df=0.800, min_cluster_size=40, min_samples=25




Testing combination 63/100: max_df=0.750, min_cluster_size=35, min_samples=15
Testing combination 64/100: max_df=0.650, min_cluster_size=40, min_samples=30
Testing combination 65/100: max_df=0.650, min_cluster_size=45, min_samples=30
Testing combination 66/100: max_df=0.600, min_cluster_size=30, min_samples=20




Testing combination 67/100: max_df=0.400, min_cluster_size=60, min_samples=30
Testing combination 68/100: max_df=0.900, min_cluster_size=45, min_samples=15




Testing combination 69/100: max_df=0.950, min_cluster_size=60, min_samples=20
Testing combination 70/100: max_df=0.750, min_cluster_size=35, min_samples=25




Testing combination 71/100: max_df=0.850, min_cluster_size=30, min_samples=35
Testing combination 72/100: max_df=0.900, min_cluster_size=35, min_samples=20




Testing combination 73/100: max_df=0.550, min_cluster_size=40, min_samples=35
Testing combination 74/100: max_df=0.600, min_cluster_size=55, min_samples=30




Testing combination 75/100: max_df=0.700, min_cluster_size=40, min_samples=30
Testing combination 76/100: max_df=0.900, min_cluster_size=35, min_samples=35
Testing combination 77/100: max_df=0.800, min_cluster_size=60, min_samples=25
Testing combination 78/100: max_df=0.750, min_cluster_size=40, min_samples=30
Testing combination 79/100: max_df=0.900, min_cluster_size=35, min_samples=25




Testing combination 80/100: max_df=0.550, min_cluster_size=45, min_samples=20




Testing combination 81/100: max_df=0.800, min_cluster_size=55, min_samples=35
Testing combination 82/100: max_df=0.850, min_cluster_size=50, min_samples=35
Testing combination 83/100: max_df=0.600, min_cluster_size=35, min_samples=15
Testing combination 84/100: max_df=0.800, min_cluster_size=50, min_samples=15




Testing combination 85/100: max_df=0.700, min_cluster_size=45, min_samples=30
Testing combination 86/100: max_df=0.800, min_cluster_size=55, min_samples=30




Testing combination 87/100: max_df=0.800, min_cluster_size=45, min_samples=25
Testing combination 88/100: max_df=0.650, min_cluster_size=55, min_samples=20
Testing combination 89/100: max_df=0.550, min_cluster_size=30, min_samples=25




Testing combination 90/100: max_df=0.550, min_cluster_size=60, min_samples=15
Testing combination 91/100: max_df=0.850, min_cluster_size=55, min_samples=35
Testing combination 92/100: max_df=0.550, min_cluster_size=55, min_samples=30




Testing combination 93/100: max_df=0.600, min_cluster_size=50, min_samples=20




Testing combination 94/100: max_df=0.600, min_cluster_size=60, min_samples=25
Testing combination 95/100: max_df=0.900, min_cluster_size=30, min_samples=20




Testing combination 96/100: max_df=0.750, min_cluster_size=40, min_samples=15




Testing combination 97/100: max_df=0.550, min_cluster_size=60, min_samples=20
Testing combination 98/100: max_df=0.650, min_cluster_size=60, min_samples=35
Testing combination 99/100: max_df=0.950, min_cluster_size=50, min_samples=35
Testing combination 100/100: max_df=0.800, min_cluster_size=30, min_samples=15





Top 5 Parameter Combinations:

Rank 1:
max_df: 0.950, min_cluster_size: 30, min_samples: 15
Coherence: 0.3936, Diversity: 0.8000, Silhouette: -0.0882
Combined Score: 0.3685

Rank 2:
max_df: 0.800, min_cluster_size: 30, min_samples: 15
Coherence: 0.3936, Diversity: 0.8000, Silhouette: -0.0882
Combined Score: 0.3685

Rank 3:
max_df: 0.550, min_cluster_size: 30, min_samples: 25
Coherence: 0.4119, Diversity: 0.7600, Silhouette: -0.0877
Combined Score: 0.3614

Rank 4:
max_df: 0.400, min_cluster_size: 30, min_samples: 25
Coherence: 0.3995, Diversity: 0.7700, Silhouette: -0.0877
Combined Score: 0.3606

Rank 5:
max_df: 0.450, min_cluster_size: 30, min_samples: 30
Coherence: 0.4060, Diversity: 0.7188, Silhouette: -0.0471
Combined Score: 0.3592

Grid search results saved to: e:\Studying in Adelaide\2_Trimester-2\project_A_ML-Mental Health (MDS)\LLM_NetworkMoel_MentalHealth_coding\topic_modeling_results_ptsd_and_trauma.csv


=== Processing category: Suicidal thoughts and self-harm ===



Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Testing combination 1/100: max_df=0.450, min_cluster_size=30, min_samples=20




Testing combination 2/100: max_df=0.750, min_cluster_size=40, min_samples=30




Testing combination 3/100: max_df=0.800, min_cluster_size=45, min_samples=25




Testing combination 4/100: max_df=0.900, min_cluster_size=60, min_samples=20




Testing combination 5/100: max_df=0.400, min_cluster_size=30, min_samples=35




Testing combination 6/100: max_df=0.450, min_cluster_size=35, min_samples=30




Testing combination 7/100: max_df=0.850, min_cluster_size=50, min_samples=35
Testing combination 8/100: max_df=0.800, min_cluster_size=60, min_samples=15




Testing combination 9/100: max_df=0.950, min_cluster_size=40, min_samples=25
Testing combination 10/100: max_df=0.600, min_cluster_size=50, min_samples=20




Testing combination 11/100: max_df=0.800, min_cluster_size=50, min_samples=30




Testing combination 12/100: max_df=0.900, min_cluster_size=55, min_samples=35
Testing combination 13/100: max_df=0.550, min_cluster_size=45, min_samples=35
Testing combination 14/100: max_df=0.900, min_cluster_size=50, min_samples=30




Testing combination 15/100: max_df=0.950, min_cluster_size=55, min_samples=20




Testing combination 16/100: max_df=0.450, min_cluster_size=60, min_samples=30
Testing combination 17/100: max_df=0.800, min_cluster_size=45, min_samples=35
Testing combination 18/100: max_df=0.950, min_cluster_size=40, min_samples=30




Testing combination 19/100: max_df=0.750, min_cluster_size=40, min_samples=25
Testing combination 20/100: max_df=0.500, min_cluster_size=35, min_samples=30




Testing combination 21/100: max_df=0.950, min_cluster_size=40, min_samples=15




Testing combination 22/100: max_df=0.900, min_cluster_size=45, min_samples=35
Testing combination 23/100: max_df=0.850, min_cluster_size=45, min_samples=30
Testing combination 24/100: max_df=0.600, min_cluster_size=30, min_samples=20




Testing combination 25/100: max_df=0.400, min_cluster_size=50, min_samples=35
Testing combination 26/100: max_df=0.950, min_cluster_size=35, min_samples=15




Testing combination 27/100: max_df=0.700, min_cluster_size=35, min_samples=20




Testing combination 28/100: max_df=0.650, min_cluster_size=55, min_samples=25




Testing combination 29/100: max_df=0.550, min_cluster_size=30, min_samples=35




Testing combination 30/100: max_df=0.550, min_cluster_size=30, min_samples=20




Testing combination 31/100: max_df=0.800, min_cluster_size=35, min_samples=25
Testing combination 32/100: max_df=0.500, min_cluster_size=40, min_samples=20




Testing combination 33/100: max_df=0.550, min_cluster_size=30, min_samples=15
Testing combination 34/100: max_df=0.850, min_cluster_size=35, min_samples=30




Testing combination 35/100: max_df=0.500, min_cluster_size=35, min_samples=25
Testing combination 36/100: max_df=0.600, min_cluster_size=40, min_samples=35
Testing combination 37/100: max_df=0.650, min_cluster_size=30, min_samples=35




Testing combination 38/100: max_df=0.800, min_cluster_size=60, min_samples=25




Testing combination 39/100: max_df=0.650, min_cluster_size=45, min_samples=35
Testing combination 40/100: max_df=0.800, min_cluster_size=35, min_samples=15




Testing combination 41/100: max_df=0.500, min_cluster_size=30, min_samples=15
Testing combination 42/100: max_df=0.900, min_cluster_size=60, min_samples=30
Testing combination 43/100: max_df=0.600, min_cluster_size=60, min_samples=30
Testing combination 44/100: max_df=0.450, min_cluster_size=40, min_samples=35
Testing combination 45/100: max_df=0.550, min_cluster_size=40, min_samples=15




Testing combination 46/100: max_df=0.750, min_cluster_size=50, min_samples=20




Testing combination 47/100: max_df=0.650, min_cluster_size=40, min_samples=20




Testing combination 48/100: max_df=0.450, min_cluster_size=50, min_samples=15




Testing combination 49/100: max_df=0.400, min_cluster_size=60, min_samples=30
Testing combination 50/100: max_df=0.850, min_cluster_size=55, min_samples=20




Testing combination 51/100: max_df=0.550, min_cluster_size=35, min_samples=20




Testing combination 52/100: max_df=0.450, min_cluster_size=30, min_samples=15
Testing combination 53/100: max_df=0.600, min_cluster_size=45, min_samples=25




Testing combination 54/100: max_df=0.900, min_cluster_size=30, min_samples=30




Testing combination 55/100: max_df=0.950, min_cluster_size=30, min_samples=20




Testing combination 56/100: max_df=0.850, min_cluster_size=40, min_samples=35
Testing combination 57/100: max_df=0.700, min_cluster_size=60, min_samples=30
Testing combination 58/100: max_df=0.450, min_cluster_size=40, min_samples=30




Testing combination 59/100: max_df=0.550, min_cluster_size=45, min_samples=30
Testing combination 60/100: max_df=0.950, min_cluster_size=55, min_samples=35
Testing combination 61/100: max_df=0.900, min_cluster_size=40, min_samples=20




Testing combination 62/100: max_df=0.400, min_cluster_size=55, min_samples=35
Testing combination 63/100: max_df=0.850, min_cluster_size=30, min_samples=15
Testing combination 64/100: max_df=0.950, min_cluster_size=40, min_samples=35
Testing combination 65/100: max_df=0.850, min_cluster_size=35, min_samples=15




Testing combination 66/100: max_df=0.850, min_cluster_size=40, min_samples=25
Testing combination 67/100: max_df=0.550, min_cluster_size=40, min_samples=20




Testing combination 68/100: max_df=0.600, min_cluster_size=60, min_samples=20




Testing combination 69/100: max_df=0.700, min_cluster_size=30, min_samples=15
Testing combination 70/100: max_df=0.650, min_cluster_size=55, min_samples=35
Testing combination 71/100: max_df=0.500, min_cluster_size=60, min_samples=20




Testing combination 72/100: max_df=0.450, min_cluster_size=30, min_samples=30




Testing combination 73/100: max_df=0.800, min_cluster_size=30, min_samples=25
Testing combination 74/100: max_df=0.500, min_cluster_size=60, min_samples=25




Testing combination 75/100: max_df=0.800, min_cluster_size=55, min_samples=20




Testing combination 76/100: max_df=0.900, min_cluster_size=50, min_samples=20




Testing combination 77/100: max_df=0.750, min_cluster_size=45, min_samples=35
Testing combination 78/100: max_df=0.700, min_cluster_size=35, min_samples=35




Testing combination 79/100: max_df=0.550, min_cluster_size=60, min_samples=20




Testing combination 80/100: max_df=0.850, min_cluster_size=45, min_samples=15




Testing combination 81/100: max_df=0.750, min_cluster_size=35, min_samples=20




Testing combination 82/100: max_df=0.800, min_cluster_size=50, min_samples=35
Testing combination 83/100: max_df=0.450, min_cluster_size=35, min_samples=15




Testing combination 84/100: max_df=0.550, min_cluster_size=40, min_samples=25
Testing combination 85/100: max_df=0.400, min_cluster_size=35, min_samples=15




Testing combination 86/100: max_df=0.400, min_cluster_size=30, min_samples=25
Testing combination 87/100: max_df=0.550, min_cluster_size=55, min_samples=25




Testing combination 88/100: max_df=0.800, min_cluster_size=50, min_samples=20




Testing combination 89/100: max_df=0.450, min_cluster_size=35, min_samples=25
Testing combination 90/100: max_df=0.400, min_cluster_size=35, min_samples=20




Testing combination 91/100: max_df=0.500, min_cluster_size=45, min_samples=35
Testing combination 92/100: max_df=0.500, min_cluster_size=40, min_samples=35
Testing combination 93/100: max_df=0.650, min_cluster_size=45, min_samples=20




Testing combination 94/100: max_df=0.600, min_cluster_size=35, min_samples=20




Testing combination 95/100: max_df=0.700, min_cluster_size=55, min_samples=15




Testing combination 96/100: max_df=0.750, min_cluster_size=55, min_samples=20




Testing combination 97/100: max_df=0.700, min_cluster_size=40, min_samples=35
Testing combination 98/100: max_df=0.600, min_cluster_size=40, min_samples=30




Testing combination 99/100: max_df=0.700, min_cluster_size=55, min_samples=30
Testing combination 100/100: max_df=0.650, min_cluster_size=45, min_samples=30

Top 5 Parameter Combinations:

Rank 1:
max_df: 0.450, min_cluster_size: 35, min_samples: 30
Coherence: 0.4812, Diversity: 0.7600, Silhouette: 0.0583
Combined Score: 0.4332

Rank 2:
max_df: 0.450, min_cluster_size: 30, min_samples: 30
Coherence: 0.4812, Diversity: 0.7600, Silhouette: 0.0583
Combined Score: 0.4332

Rank 3:
max_df: 0.450, min_cluster_size: 40, min_samples: 30
Coherence: 0.4812, Diversity: 0.7600, Silhouette: 0.0505
Combined Score: 0.4305

Rank 4:
max_df: 0.900, min_cluster_size: 30, min_samples: 30
Coherence: 0.5067, Diversity: 0.7200, Silhouette: 0.0583
Combined Score: 0.4284

Rank 5:
max_df: 0.850, min_cluster_size: 35, min_samples: 30
Coherence: 0.4977, Diversity: 0.7200, Silhouette: 0.0583
Combined Score: 0.4253

Grid search results saved to: e:\Studying in Adelaide\2_Trimester-2\project_A_ML-Mental Health (MDS)\

In [1]:

import pandas as pd

#read topic_modeling_results_anxiety.csv file

anxiety_df = pd.read_csv('topic_modeling_results_anxiety.csv')
depression_df = pd.read_csv('topic_modeling_results_depression.csv')
ptsd_df = pd.read_csv('topic_modeling_results_ptsd_and_trauma.csv')
suicidal_df = pd.read_csv('topic_modeling_results_suicidal_thoughts_and_self-harm.csv')



# print the rows with the highest coherence score for each dataframe
print("Anxiety - Highest Coherence Score:")
print(anxiety_df.loc[anxiety_df['coherence'].idxmax()])
print("\nDepression - Highest Coherence Score:")
print(depression_df.loc[depression_df['coherence'].idxmax()])
print("\nPTSD and Trauma - Highest Coherence Score:")
print(ptsd_df.loc[ptsd_df['coherence'].idxmax()])
print("\nSuicidal Thoughts and Self-Harm - Highest Coherence Score:")
print(suicidal_df.loc[suicidal_df['coherence'].idxmax()])


Anxiety - Highest Coherence Score:
max_df               0.800000
min_cluster_size    55.000000
min_samples         30.000000
coherence            0.543862
diversity            0.714286
silhouette          -0.003057
combined_score       0.418364
Name: 38, dtype: float64

Depression - Highest Coherence Score:
max_df               0.450000
min_cluster_size    35.000000
min_samples         30.000000
coherence            0.501826
diversity            0.771429
silhouette           0.029738
combined_score       0.434331
Name: 0, dtype: float64

PTSD and Trauma - Highest Coherence Score:
max_df               0.400000
min_cluster_size    55.000000
min_samples         35.000000
coherence            0.482274
diversity            0.600000
silhouette          -0.023162
combined_score       0.353037
Name: 6, dtype: float64

Suicidal Thoughts and Self-Harm - Highest Coherence Score:
max_df               0.550000
min_cluster_size    55.000000
min_samples         25.000000
coherence            0.565117