In [1]:


import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
import emoji
import emot
import json
from transformers import BertTokenizer

class NLP_OPERATORS:
    def __init__(self):
        self.stop_words = set(stopwords.words('english')) | {
            'https', 'http', 'www', 'helplinehttps', 'nan', 'deleted', 'rselfhelp', 'rbpd', 'rptsd',
            'please click list', 'findahelplinecomiiasp', 'sharing story bot', 'please remember subreddit',
            'mods keep positive', 'frown', 'andry', 'pouting', 'lapping',
            'face smiley', 'happy face', 'upsidedown', 'upsidedown face', 'hi', 'anyone', 'etc',
            'nonenglish', 'beyondblue', 'im', 'dont', 'like', 'know', 'ive',
            'ts', 'wch', 'get', 'got', 'na', 'r', 'feel', 'time', 'life', 'want', 'years', 'really', 'tnk'
        }
        self.nlp = spacy.load('en_core_web_sm', disable=["parser", "ner", "textcat"])
        self.nlp.max_length = 5_000_000
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.kaomoji_to_text = json.load(open('./kaomoji_to_text.json', 'r', encoding='utf-8'))

    def convert_emojis_emoticons(self, text):
        # Replace kaomojis with their text equivalents
        for kaomoji, text_representation in self.kaomoji_to_text.items():
            if kaomoji in text:
                text = text.replace(kaomoji, f" {text_representation} ")

        # Replace emoticons
        e = emot.core.emot()
        emoticon_results = e.emoticons(text)
        for original, meaning in zip(emoticon_results['value'], emoticon_results['mean']):
            text = text.replace(original, f" {meaning} ")

        # Convert emojis to text
        text = emoji.demojize(text)

        # Make BERT-compatible
        text = re.sub(r'(:[^:]+:|_|\b(frown|andry|pouting|lapping|smiley)\b)', ' ', text, flags=re.IGNORECASE)
        text = text.replace(":", " ").replace("_", " ")
        return text.strip().lower()

    def basic_cleaning(self, text):
        # Combined regex pattern for efficiency
        pattern = (
            r'https?://\S+|www\.\S+|helplinehttps\S*|\S*https\S*|' +
            r'r/[a-zA-Z0-9_]+|rptsd|rbpd|rselfhelp|please click list|findahelplinecomiiasp|' +
            r'sharing story bot|beyondblue|hi|anyone|etc|nonenglish|subreddit\S*|\breddit\w*|' +
            r'<.*?>|"|</?.*?>|[^a-zA-Z0-9 ]|\d+'
        )
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)
        text = self.convert_emojis_emoticons(text)
        text = text.replace('\n', ' ').replace('\r', ' ')
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def text_preprocessing(self, text, regex=False, remove_stop_word=False, lemmatisation=False, lower_case=False,
                          return_tokens=True, use_bert_tokenizer=False):
        """
        Preprocess text with options for regex cleaning, stop word removal, lemmatization, and tokenization.

        Args:
            text (str): Input text to preprocess.
            regex (bool): Apply regex-based cleaning (URLs, special characters, etc.).
            remove_stop_word (bool): Remove stop words from the text.
            lemmatisation (bool): Apply lemmatization using spaCy.
            lower_case (bool): Convert text to lowercase.
            return_tokens (bool): Return list of tokens if True, else return joined string.
            use_bert_tokenizer (bool): Use BERT tokenizer instead of NLTK.

        Returns:
            list or str: Processed tokens (if return_tokens=True) or joined string.
        """
        if use_bert_tokenizer:
            text = self.basic_cleaning(text)
            tokens = self.bert_tokenizer.tokenize(text)
            tokens = [token for token in tokens if not any(term in token.lower() for term in ['https', 'http', 'www', 'helplinehttps'])]
            return tokens if return_tokens else " ".join(tokens)

        if regex:
            text = self.basic_cleaning(text)

        if lower_case:
            text = text.lower()

        # Skip tokenization if not needed
        if return_tokens or remove_stop_word or lemmatisation:
            tokens = word_tokenize(text)
            if remove_stop_word:
                tokens = [word for word in tokens if word not in self.stop_words]
            if lemmatisation:
                doc = self.nlp(' '.join(tokens))
                tokens = [token.lemma_ for token in doc]
            return tokens if return_tokens else " ".join(tokens)

        return text

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
# from transformers import AutoTokenizer, AutoModelForMaskedLM
from sentence_transformers import SentenceTransformer, models
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
import numpy as np
from transformers import pipeline
from datasets import Dataset
import spacy

# from IPython.display import display
# import plotly.io as pio
from dotenv import load_dotenv
load_dotenv()

current_dir = os.getcwd()
clean_operator = NLP_OPERATORS()




# Wrap into a sentence-transformer model
sentence_transformer_model = models.Transformer(
    model_name_or_path="mental/mental-bert-base-uncased",
    tokenizer_name_or_path="mental/mental-bert-base-uncased",
    max_seq_length=512
)

# Apply pooling to get sentence-level embeddings
pooling_model = models.Pooling(
    sentence_transformer_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True

)

# Combine into SentenceTransformer model
mentalbert_sentence_model = SentenceTransformer(device = "cuda", modules=[sentence_transformer_model, pooling_model])
# Sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
Sbert_model = SentenceTransformer("all-mpnet-base-v2", device="cuda")

#=======================================================================================================

# Load the sentiment analysis pipeline
sentiment = pipeline("sentiment-analysis",
                      model="siebert/sentiment-roberta-large-english",
                      device="cuda",
                    truncation=True,  # Enable truncation
                    max_length=512,   # Set max length to 512 tokens
                    padding=True      # Enable padding for consistent tensor sizes
                      )

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda


In [3]:



def bert_topic_process(platform, df, unlemma_col, lemma_col, category, label, post_content):


    current_dir = os.getcwd()

    target_path = os.path.join(current_dir, 'data', f'{platform}_data', 'berttopic_label',f'{label}')
    #create a label folder
    if not os.path.exists(target_path):
        os.makedirs(target_path, exist_ok=True)
    
  

# ================================================================================================



    label_df = df[df[category] == label].copy()
    # Remove rows where post_content column is empty
    label_df = label_df[label_df[post_content].notna() & (label_df[post_content].str.strip() != '')]
    
    # Convert to Dataset
    dataset = Dataset.from_pandas(label_df)
    # Batch sentiment analysis
    def batch_sentiment(examples):
        # Process a batch of texts
        results = sentiment(examples[post_content], batch_size=32)
        return {
            "sentiment_label": [res["label"] for res in results],
            "sentiment_score": [res["score"] for res in results]
        }

    # Apply sentiment analysis in batches
    dataset = dataset.map(batch_sentiment, batched=True, batch_size=32)

    # Filter out positive sentiment with score > 0.55
    dataset = dataset.filter(
        lambda x: not (x["sentiment_label"] == "POSITIVE" and x["sentiment_score"] > 0.55)
    )

    # Convert back to DataFrame for consistency with your original code
    label_df = dataset.to_pandas().reset_index(drop=True)
    label_df.to_csv(os.path.join(target_path, "label_df.csv"))

# ================================================================================================
    


    unlemma_dc = label_df[unlemma_col].copy()
    unlemma_dc = unlemma_dc.apply(lambda x: clean_operator.text_preprocessing(x, regex=True, lower_case=True, remove_stop_word=False, return_tokens = False)).tolist()
    unlemma_dc = [doc for doc in unlemma_dc if doc.strip()]
    # Check if any documents remain after preprocessing
    if not unlemma_dc:
        print(f"No valid documents remain after preprocessing for label '{label}'.")
        return
    with open(os.path.join(target_path, 'unlemma_dc'), 'w', encoding='utf-8') as f:
        for doc in unlemma_dc:
            f.write(doc + '\n')


    

    lemma_dc = label_df[lemma_col].copy().tolist()
    lemma_dc = [doc for doc in lemma_dc if doc.strip()]
    # Check if lemma_dc is empty
    if not lemma_dc:
        print(f"No lemmatized documents found for label '{label}' in column '{lemma_col}'.")
        return
    with open(os.path.join(target_path, 'lemma_dc'), 'w', encoding='utf-8') as f:
        for doc in lemma_dc:
            f.write(doc + '\n')




    # Load spaCy English model
    nlp = spacy.load("en_core_web_sm")

    # Extract named entities from each document in lemma_dc
    ner_lemma_dc = []
    for doc in lemma_dc:
        spacy_doc = nlp(doc)
        entities = [ent.text for ent in spacy_doc.ents if ent.label_ in {"PERSON", "ORG", "GPE", "EVENT", "NORP"}]
        ner_lemma_dc.append(" ".join(entities))

    # Save NER-extracted texts to file
    with open(os.path.join(target_path, 'ner_lemma_dc'), 'w', encoding='utf-8') as f:
        for doc in ner_lemma_dc:
            f.write(doc + '\n')


# ================================================================================================
    



    # # Load documents
    # with open(os.path.join(target_path, 'unlemma_dc'), 'r', encoding='utf-8') as f:
    #     unlemma_dc = [line.strip() for line in f if line.strip()]
    # if not unlemma_dc:
    #     print(f"No valid documents remain after preprocessing for label '{label}'.")

    # with open(os.path.join(target_path, 'lemma_dc'), 'r', encoding='utf-8') as f:
    #     lemma_dc = [line.strip() for line in f if line.strip()]
    # if not lemma_dc:
    #     print(f"No lemmatized documents found for label '{label}'.")

    # with open(os.path.join(target_path, 'ner_lemma_dc'), 'r', encoding='utf-8') as f:
    #     ner_lemma_dc = [line.strip() for line in f if line.strip()]
    # if not ner_lemma_dc:
    #     print(f"No ner_lemma_dc found for label '{label}'.")
        



    embedding = mentalbert_sentence_model.encode(
        unlemma_dc,
        batch_size=32,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True
    )
    # Check if embeddings are valid
    if embedding.size == 0 or len(embedding) != len(unlemma_dc):
        print(f"Embedding generation failed or mismatched for label '{label}': {len(embedding)} embeddings, {len(unlemma_dc)} documents.")
        return

    # # Save embeddings to a .npy file
    np.save(os.path.join(target_path,'embedding.npy')  , embedding)
    reduced_embedding = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embedding)
    np.save(os.path.join(target_path,'reduce_embedding.npy')  , reduced_embedding)



read_the model

In [None]:


cur_dir = os.getcwd()

platform = "reddit"
df = pd.read_csv(os.path.join(cur_dir,'data','reddit_data',"clean","cleaned_mental_all_text_2015up.csv"))
unlemma_col = "title_selftext_topcomments_text"
lemma_col = "clean_title_selftext_topcomments_text"




labels = ['anxiety', 'ptsd', 'therapy', 'depression', 'mentalhealth', 'selfhelp', 'self_suicide_harm']

for label in labels:
    bert_topic_process(platform, df, unlemma_col, lemma_col, "subreddit",label, "selftext")



Map:   1%|          | 32/4218 [00:07<16:48,  4.15 examples/s]

In [None]:
cur_dir = os.getcwd()

platform = "beyondblue"
df = pd.read_csv(os.path.join(cur_dir,'data','beyondblue_data',"clean","cleaned_beyondblue_all_text_2015up.csv"))
unlemma_col = "title_content_comments"
lemma_col = "clean_title_content_comments"

# labels = ["Anxiety", "Depression","PTSD and trauma",'Sexuality and gender identity','Suicidal thoughts and self-harm','Young people']
labels = ['Multicultural experiences','Relationship and family issues','Treatments, health professionals and therapies']
for label in labels:
    bert_topic_process(platform, df,unlemma_col,lemma_col,"Post Category",label, "Post Content")
    

Map:  11%|█         | 320/2978 [01:08<09:28,  4.68 examples/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Map: 100%|██████████| 2978/2978 [10:35<00:00,  4.68 examples/s]
Filter: 100%|██████████| 2978/2978 [00:00<00:00, 17390.71 examples/s]
Batches: 100%|██████████| 76/76 [02:41<00:00,  2.13s/it]
Map: 100%|██████████| 2982/2982 [10:46<00:00,  4.62 examples/s]
Filter: 100%|██████████| 2982/2982 [00:00<00:00, 17567.80 examples/s]
Batches: 100%|██████████| 80/80 [02:41<00:00,  2.02s/it]
Map: 100%|██████████| 2996/2996 [10:38<00:00,  4.69 examples/s]
Filter: 100%|██████████| 2996/2996 [00:00<00:00, 21608.45 examples/s]
Batches: 100%|██████████| 70/70 [02:28<00:00,  2.13s/it]
Map: 100%|██████████| 1389/1389 [05:03<00:00,  4.58 examples/s]
Filter: 100%|██████████| 1389/1389 [00:00<00:00, 16963.14 examples/s]
Batches: 100%|██████████| 25/25 [00:52<00:00,  2.09s/it]
Map: 100%|██████████| 2617/2617 [09:27<00:00,  4.61 examples/s]
F

No valid documents remain after preprocessing for label 'Treatments'.


Map: 100%|██████████| 2990/2990 [11:01<00:00,  4.52 examples/s]
Filter: 100%|██████████| 2990/2990 [00:00<00:00, 15020.67 examples/s]
Batches: 100%|██████████| 81/81 [02:55<00:00,  2.17s/it]


In [6]:
import os
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.metrics import silhouette_score
# import plotly.io as pio

def topic_modeling(platform, label, original_cols, 
                  stopwords, min_df=1, max_df=0.9,
                    n_neighbors=15, min_cluster_size=30,
                    min_samples=10, nr_topics='auto',
                    use_pretrained_embed = True):
    
    # Set paths
    cur_dir = os.getcwd()
    target_path = os.path.join(cur_dir, 'data', f'{platform}_data', 'berttopic_label', f'{label}')
    label_df = pd.read_csv(os.path.join(target_path,"label_df.csv"))





    # ===========================================================================


    # Load documents
    with open(os.path.join(target_path, 'unlemma_dc'), 'r', encoding='utf-8') as f:
        unlemma_dc = [line.strip() for line in f if line.strip()]
    if not unlemma_dc:
        print(f"No valid documents remain after preprocessing for label '{label}'.")

    with open(os.path.join(target_path, 'lemma_dc'), 'r', encoding='utf-8') as f:
        lemma_dc = [line.strip() for line in f if line.strip()]
    if not lemma_dc:
        print(f"No lemmatized documents found for label '{label}'.")

    # with open(os.path.join(target_path, 'ner_lemma_dc'), 'r', encoding='utf-8') as f:
    #     ner_lemma_dc = [line.strip() for line in f if line.strip()]
    # if not ner_lemma_dc:
    #     print(f"No ner_lemma_dc found for label '{label}'.")

    

    # Load embeddings
    embedding = np.load(os.path.join(target_path, 'embedding.npy'))
    if embedding.size == 0 or len(embedding) != len(lemma_dc):
        print(f"Embedding generation failed or mismatched for label '{label}': {len(embedding)} embeddings, {len(lemma_dc)} documents.")
    reduced_embedding = np.load(os.path.join(target_path, 'reduce_embedding.npy'))
    if reduced_embedding.size == 0:
        print(f"Embedding generation failed for label '{label}'.")



    # ===========================================================================

    # Custom function to filter n-grams from vocabulary
    def filter_ngrams_vocabulary(vectorizer, documents, unwanted_ngrams):
        # Fit vectorizer to get initial vocabulary
        vectorizer.fit(documents)
        vocab = vectorizer.get_feature_names_out()
        # Filter out unwanted n-grams
        filtered_vocab = [term for term in vocab if term not in unwanted_ngrams]
        # Create a new vectorizer with the filtered vocabulary
        new_vectorizer = CountVectorizer(
            ngram_range=vectorizer.ngram_range,
            stop_words=vectorizer.stop_words,
            min_df=vectorizer.min_df,
            max_df=vectorizer.max_df,
            vocabulary=filtered_vocab
        )
        return new_vectorizer

    # Initialize CountVectorizer
    vectorizer_model = CountVectorizer(
        ngram_range=(1, 3),
        stop_words='english',  # Use scikit-learn's English stopwords
        min_df=min_df,
        max_df=max_df
    )

    # Filter unwanted n-grams from vocabulary
    vectorizer_model = filter_ngrams_vocabulary(vectorizer_model, lemma_dc, stopwords)


    # # Initialize models
    # vectorizer_model = CountVectorizer(
    #     ngram_range=(1, 2),
    #     stop_words=stopwords,
    #     min_df=min_df,
    #     max_df=max_df
    # )





    

    umap_model = UMAP(
        n_neighbors=n_neighbors,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42
    )

    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        prediction_data=True
    )

    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)





# ===========================================================================


    representation_model = MaximalMarginalRelevance(diversity=0.3)
    # Initialize and fit BERTopic
    topic_model = BERTopic(
        embedding_model=mentalbert_sentence_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model,
        top_n_words=10,
        nr_topics=nr_topics,
        calculate_probabilities=True
    )

    if use_pretrained_embed:
        topics, probs = topic_model.fit_transform(documents=lemma_dc, embeddings=embedding)
    else:
        topics, probs = topic_model.fit_transform(documents=lemma_dc)
        
    
    # Count how many documents were marked as outliers
    num_outliers = np.sum(np.array(topics) == -1)
    # Reduce outliers
    if num_outliers > 0:
        new_topics = topic_model.reduce_outliers(
            documents=lemma_dc,
            topics=topics,
            probabilities=probs,
            strategy="probabilities",
            threshold=0.6
        )
        print(f"Before Number of outliers: {num_outliers}")
        print(f"After Number of outliers: {new_topics.count(-1)}")
        
        if num_outliers != new_topics.count(-1):
            topic_model.update_topics(lemma_dc, topics=new_topics,vectorizer_model = vectorizer_model, ctfidf_model = ctfidf_model, representation_model = representation_model)
        
    else:
        print("No outliers found — skipping reduction.")

    



#==================================================================================================
    # saving outcome  


    # Save model and results
    topic_model.save(os.path.join(target_path, f"{label}_berttopic"), serialization="pickle")
    # Save topics (make sure it's a NumPy array)
    np.save(os.path.join(target_path, 'topics.npy'), np.array(topics))
    # Save topics (make sure it's a NumPy array)
    np.save(os.path.join(target_path, 'probs.npy'), np.array(probs))


    # Combine clustering outcome with original text
    topic_docs = topic_model.get_document_info(lemma_dc).reset_index(drop=True)
    #add original title, post_content, and comment to compare the clustering 
    ori_top_post = label_df[original_cols].reset_index(drop=True)
    combined_docs = pd.concat([ori_top_post, topic_docs], axis=1)
    combined_docs.to_csv(os.path.join(target_path, f'{label}_topic_docs.csv'), index=False)






    # Save topic info
    topic_info = topic_model.get_topic_info()
    with open(os.path.join(target_path, f'{label}_topic_info.csv'), 'w', encoding='utf-8') as f:
        for _, row in topic_info.iterrows():
            f.write(f"Topic {row['Topic']}: {row['Name']} (Count: {row['Count']})\n")
            f.write(f"Words: {row['Representation']}\n\n")

    





# ===================================================================================

    # visualization

    num_topics = len(topic_info[topic_info['Topic'] != -1])  # Exclude outlier topic (-1)
    print(f"There are {num_topics} for the bert_topic clustering")

    if num_topics > 3:
        fig = topic_model.visualize_topics()
        fig.show()
    else:
        import plotly.express as px
        def simple_visualize_topics(topics, reduced_embedding, label):
            # Create DataFrame for plotting
            plot_data = pd.DataFrame({
                'x': reduced_embedding[:, 0],
                'y': reduced_embedding[:, 1],
                'Topic': [f"Topic {t}" if t != -1 else "Outlier" for t in topics]
            })
            
            # Filter out outliers
            plot_data = plot_data[plot_data['Topic'] != "Outlier"]
            
            # Create scatter plot
            fig = px.scatter(
                plot_data,
                x='x',
                y='y',
                color='Topic',
                title=f"Topics for {label}",
                labels={'x': 'Dimension 1', 'y': 'Dimension 2'},
                width=600,
                height=400
            )
            fig.show()
            

        # Call simplified visualization
        simple_visualize_topics(topics, reduced_embedding, label)


    fig = topic_model.visualize_documents(docs = lemma_dc,reduced_embeddings=reduced_embedding)
    fig.show()


    # top_n_topics=len(topic_model.get_topic_info())-1
    fig = topic_model.visualize_barchart( title = f'{label} Topic Word Scores', width=300)
    fig.show()


    fig = topic_model.visualize_heatmap(title = f'<b>{label} Similarity Matrix</b>')
    fig.show()


    
    # ===================================================================
    # Evaluation


    def extract_topic_words(topics_dict, topk: int = 10):
        topics_clean = []
        for tid, pairs in topics_dict.items():
            if tid == -1:
                continue
            topic_words = []
            for word, _ in pairs[:topk]:
                split_words = word.strip().split()
                topic_words.extend(split_words)
            if topic_words:
                topics_clean.append(list(dict.fromkeys(topic_words)))
        return topics_clean


    # ------------------------------------------------------------------
    # 1  Topic Coherence  (c_v, higher == better,  ≳ 0.50 is “good”)
    # ------------------------------------------------------------------
    def topic_coherence(topics_list, docs, topk: int = 10):
        dictionary = Dictionary(doc.split() for doc in docs)
        coherence_model = CoherenceModel(
            topics=topics_list,
            texts=[doc.split() for doc in docs],
            dictionary=dictionary,
            coherence="c_v",
            topn=topk,
        )
        return coherence_model.get_coherence()
    


    # ------------------------------------------------------------------
    # 2  Topic Diversity  (unique words ratio, higher == better,  ≳ 0.80)
    # ------------------------------------------------------------------
    def topic_diversity(topics_dict, topk: int = 10):
        all_words = [w for _, pairs in topics_dict.items() if _ != -1
                     for w, _ in pairs[:topk]]
        return len(set(all_words)) / (len(topics_dict) * topk)


    # ------------------------------------------------------------------
    # 3  Silhouette  (embedding separation, higher == better,  ≳ 0.25)
    # ------------------------------------------------------------------
    def topic_silhouette(embeddings, topics_labels):
        valid_idx = [i for i, t in enumerate(topics_labels) if t != -1]
        X_valid = embeddings[valid_idx]
        y_valid = np.array(topics_labels)[valid_idx]
        return silhouette_score(X_valid, y_valid, metric="cosine")




    topics_dict = topic_model.get_topics()
    topics_labels = topics
    topics_list = extract_topic_words(topics_dict, topk=10)

    coh = topic_coherence(topics_list, lemma_dc, topk=10)
    div = topic_diversity(topics_dict, topk=10)
    sil = topic_silhouette(embedding, topics_labels)

    print(f"Coherence  (c_v) : {coh:.4f}")
    print(f"Diversity        : {div:.4f}")
    print(f"Silhouette (cos) : {sil:.4f}")

    # ===================================================================

reddit topic model

In [None]:
#anxiety
stopwords =  ['tng', 'feel', 'sometng', 'need', 'sleep', 'good', 'say', 'thought', 'anytng',
              'help', 'way', 'fuck', 'attack', 'eat', 'symptom', 'talk', 'heart', 'experience',
              'gh','make', 'bad', 'people', 'week', 'fuck fuck', 'right', 'mind', 'fuck fuck fuck',
              'new', 'ask','right', 'end', 'everytng','start','long','wake','read','tell', 'stay',
              'month', 'day']

platform = 'reddit'
label = "anxiety"
original_cols = ["title","selftext","top_comments"]
min_df=1
max_df=0.85
n_neighbors=15 
min_cluster_size=25
min_samples=10
nr_topics='auto'



# ================================================================================

# #depression
# stopwords = ['tng', 'feel', 'help', 'love', 'anytng', 'sometng', 'notng', 'talk', 'need',
#              'friend', 'good', 'say', 'happy', 'anymore', 'hate', 'end', 'everytng', 'thought',
#               'wake','fuck','make', 'people', 'day', 'way', 'bad', 'work', 'right', 'hope', 'lose',
#                'idk','tell', 'hard', 'mean', 'st', 'change', 'month','depressed', 'person',  'long',
#                 'use', 'wish', 'year', 'sad', 'wake repeat wake','repeat wake repeat', 'fuck fuck',
#                 'fuck fuck fuck','gh','ask','tnke' ]



# platform = 'reddit'
# label = "anxiety"
# original_cols = ["title","selftext","top_comments"]
# min_df=1
# max_df=0.85
# n_neighbors=15 
# min_cluster_size=25
# min_samples=10
# nr_topics='auto'


# ================================================================================




#ptsd
# stopwords = []


# platform = 'reddit'
# label = "anxiety"
# original_cols = ["title","selftext","top_comments"]
# min_df=1
# max_df=0.85
# n_neighbors=15 
# min_cluster_size=25
# min_samples=10
# nr_topics='auto'





topic_modeling(platform = platform,label =label,original_cols =original_cols, 
               stopwords=stopwords, min_df = min_df,max_df = max_df, n_neighbors=n_neighbors, min_cluster_size=min_cluster_size,
               min_samples = min_samples, nr_topics =nr_topics )


beyondblue topic model

In [7]:
#Anxiety
stopwords = [ 'feel', 'week', 'tng', 'need', 'say', 'hard', 'good', 'sometng',
             'work',  'help',  'talk', 'ask', 'end','start', 'people', 'month',
             'thought', 'way', 'anytng','day','make','year','everytng','fly',
             'experience', 'health', 'drive', 'feeling', 'kind', 'manage', 'mental',
             'understand', 'mind', 'new', 'hear', 'right','lm','tell', 'hello','body',
             'meet','past','self','follow','try','walk','wiht','use','act','wle','welcome',
             'support', 'hope',  'post', 'sound', 'long', 'let', 'worry', 'stay', 'situation',
             'lot','mean', 'learn'
]



platform = 'beyondblue'
label = "Anxiety"
original_cols = ["Post Title","Post Content","Comments"]
min_df=1
max_df=0.95
n_neighbors=15 
min_cluster_size=30
min_samples=10
nr_topics='auto'


#test
# platform = 'beyondblue'
# label = "Anxiety"
# original_cols = ["Post Title","Post Content","Comments"]
# min_df=1
# max_df=0.7
# n_neighbors=35
# min_cluster_size=30
# min_samples=30
# nr_topics='auto'



# # ================================================================================



# # Depression
# stopwords=['sleep', 'kind', 'good', 'love', 'need', 'sometng', 'feel', 'talk', 'home',
#            'tng', 'everytng', 'anytng', 'notng', 'self','people', 'work', 'way',
#             'tell',  'experience', 'wle', 'health','wle','make','help', 'say',
#             'day','right', 'hear', 'lead','thought', 've','end','week','use',
#             'word','ask','come', 'sure','mean','lot' ]

# platform = 'beyondblue'
# label = "Depression"
# original_cols = ["Post Title","Post Content","Comments"]
# min_df=1
# max_df=0.9
# n_neighbors = 15
# min_cluster_size=30
# min_samples=10
# nr_topics='auto'




# # ================================================================================



# # PTSD and trauma
# stopwords=['tng', 'love', 'good', 'need', 'way',  'feel', 'talk', 'sometng','help',
#            'people', 'work', 'tell',  'hope',  'sorry','hard', 'right','say','end',
#            'week','everytng', 'anytng','gh', 'make','mh','feeling', 'thought', 'situation',
#            'hear', 'long','past', 'like','person', 'mind','ask', 'womb', 'welcome',
#            'use','sort', 'result', 'write','day','mean','friendsp','wle', 'self' ]


# platform = 'beyondblue'
# label = "PTSD and trauma"
# original_cols = ["Post Title","Post Content","Comments"]
# min_df=1
# max_df=0.9
# n_neighbors=15 
# min_cluster_size=30
# min_samples=10
# nr_topics='auto'


# # ================================================================================



# # Suicidal thoughts and self-harm
# stopwords=['tng', 'need', 'good', 'feel', 'talk', 'friend', 'love', 'hear', 'family', 'tell',
#            'way', 'make', 'sometng', 'self', 'say',  'end','support', 'post', 'understand',
#            'live', 'service', 'experience', 'leave', 'sound', 'welcome', 'long','wle', 'let',
#             'mental health', 'anytng', 'everytng','use', 'wonder','thought','anymore','work',
#             'hard', 'hope', 'day', 'feeling', 'mental', 'right', 'person','sope', 'reacng',
#             'sorry hear','start', 'write','sh thought', 'skill', 'look', 'mean','people', 
#             'try', 'care', 'health',  'kind',  'moment','year', 'week', 'ask','mind','lucys',
#             'variable','kalice', 'nice story','okpitch', 'sit', 'dear okpitch','notng', 'calli',
#             'tiah', 'stay', 'jessksch','tony', 'old','mum', 'beekay','tnke','place', ]



# platform = 'beyondblue'
# label = "Suicidal thoughts and self-harm"
# original_cols = ["Post Title","Post Content","Comments"]
# min_df=1
# max_df=0.8
# n_neighbors=15 
# min_cluster_size=40
# min_samples=10
# nr_topics='auto'


# ================================================================================




use_pretrained_embed = True




topic_modeling(platform = platform,label =label,original_cols =original_cols, stopwords=stopwords,
                min_df = min_df,max_df = max_df, n_neighbors=n_neighbors, min_cluster_size=min_cluster_size,
                min_samples = min_samples, nr_topics =nr_topics,
                use_pretrained_embed = use_pretrained_embed  )



Before Number of outliers: 494
After Number of outliers: 494
There are 12 for the bert_topic clustering


Coherence  (c_v) : 0.4856
Diversity        : 0.7769
Silhouette (cos) : -0.0113


merge topic

In [None]:

import os
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.metrics.pairwise import cosine_similarity


platform = 'beyondblue'
label = "Anxiety"


cur_dir = os.getcwd()
target_path = os.path.join(cur_dir, 'data', f'{platform}_data', 'berttopic_label', f'{label}')
label_df = pd.read_csv(os.path.join(target_path,"label_df.csv"))


# Load lemmatized documents
with open(os.path.join(target_path, 'lemma_dc'), 'r', encoding='utf-8') as f:
    lemma_dc = [line.strip() for line in f if line.strip()]
if not lemma_dc:
    print(f"No lemmatized documents found for label '{label}'.")
    exit()

# Load the pre-trained BERTopic model
try:
    topic_model = BERTopic.load(os.path.join(target_path, f"{label}_berttopic"))
    print("BERTopic model loaded successfully.")
except FileNotFoundError:
    print("Error: BERTopic model file not found" )
    exit()

# Load topics
try:
    topics = np.load(os.path.join(target_path, "topics.npy"))
    print(f"Loaded {len(topics)} topic assignments.")
except FileNotFoundError:
    print(f"Error: Topics file not found at {os.path.join(target_path, 'topics.npy')}")
    exit()

# Load probabilities
try:
    probs = np.load(os.path.join(target_path, "probs.npy"))
    print(f"Loaded probabilities for {len(probs)} documents.")
except FileNotFoundError:
    print(f"Error: Probabilities file not found at {os.path.join(target_path, 'probs.npy')}")
    exit()

# Validate loaded data
if len(topics) != len(lemma_dc) or len(probs) != len(lemma_dc):
    print(f"Error: Mismatch in lengths - Documents: {len(lemma_dc)}, Topics: {len(topics)}, Probabilities: {len(probs)}")
    exit()





def merge_topics_by_similarity(topic_model, documents, topics, similarity_threshold=0.97):
    """
    Merge topics with similarity score above the given threshold.
    
    Args:
        topic_model: Fitted BERTopic model
        documents: List of documents used for topic modeling
        topics: Current topic assignments
        similarity_threshold: Threshold for merging topics
    
    Returns:
        Updated topic assignments
    """
    # Get topic representations (c-TF-IDF)
    topic_info = topic_model.get_topic_info()
    topic_ids = topic_info[topic_info['Topic'] != -1]['Topic'].tolist()  # Exclude outliers

    if len(topic_ids) < 2:
        print("Fewer than 2 topics found — skipping merging.")
        return topics

    # Compute similarity between topics
    ctfidf_matrix = topic_model.c_tf_idf_.toarray()
    topic_indices = [tid + 1 for tid in topic_ids]  # Correct if needed based on matrix alignment
    ctfidf_matrix = ctfidf_matrix[topic_indices]
    similarity_matrix = cosine_similarity(ctfidf_matrix)

    # Find pairs to merge
    topics_to_merge = []
    n_topics = len(topic_ids)
    for i in range(n_topics):
        for j in range(i + 1, n_topics):
            if similarity_matrix[i, j] > similarity_threshold:
                topics_to_merge.append([topic_ids[i], topic_ids[j]])

    print(topics_to_merge)



    if topics_to_merge:
        print(f"Merging {len(topics_to_merge)} topic pairs with similarity > {similarity_threshold}")
        topic_model.merge_topics(docs=documents, topics_to_merge=topics_to_merge)

        # Update topics after merging
        updated_topics, _ = topic_model.transform(documents)
        print(f"Number of topics after merging: {len(set(updated_topics)) - 1}")  # Exclude -1
        return updated_topics
    else:
        print("No topics to merge based on similarity threshold.")
        return topics


# Perform topic merging
print(f"Number of topics before merging: {len(set(topics)) - 1}")  # Exclude -1
topic_model = merge_topics_by_similarity(topic_model, lemma_dc, topics, similarity_threshold=0.15)

fig = topic_model.visualize_topics()
fig.show()

fig = topic_model.visualize_heatmap(use_ctfidf = True)
fig.show()








BERTopic model loaded successfully.
Loaded 2424 topic assignments.
Loaded probabilities for 2424 documents.
Number of topics before merging: 21
[[0, 1]]
Merging 1 topic pairs with similarity > 0.15


LDA test

In [None]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import LdaModel

# Assuming label_df is your DataFrame
# Replace with your actual DataFrame loading if needed


def lda_topicmodeling(platform, label, lemma_dc):

    cur_dir = os.getcwd()
    target_path = os.path.join(current_dir, 'data', f'{platform}_data', 'berttopic_label',f'{label}')
    label_df = pd.read_csv( os.path.join(target_path, 'label_df.csv'))


    # Convert preprocessed text to list of words (if stored as space-separated strings)
    def prepare_text(text):
        if isinstance(text, str):
            return text.split()  # Split space-separated string into list
        return text  # Assume it's already a list

    processed_docs = label_df[lemma_dc].apply(prepare_text)

    # Create dictionary and document-term matrix
    dictionary = corpora.Dictionary(processed_docs)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    # Build LDA model
    num_topics = 5  # Adjust based on your needs
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42, passes=10)

    # Assign each document to the dominant topic (clustering)
    def get_dominant_topic(ldamodel, corpus):
        dominant_topics = []
        for bow in corpus:
            topics = ldamodel[bow]
            # Get the dominant topic (highest probability)
            if topics:  # Check if topics list is not empty
                dominant_topic = sorted(topics, key=lambda x: x[1], reverse=True)[0][0]
            else:
                dominant_topic = -1  # Handle empty topic assignments
            dominant_topics.append(dominant_topic)
        return dominant_topics

    label_df['dominant_topic'] = get_dominant_topic(lda_model, corpus)

    # Print the topics
    print("Topics found by LDA:")
    for idx, topic in lda_model.print_topics(-1):
        print(f'Topic {idx}: {topic}')

    # Display the DataFrame with cluster assignments
    print("\nDataFrame with Cluster Assignments:")
    print(label_df[[lemma_dc, 'dominant_topic']].head())

    from gensim.models import CoherenceModel
    coherence_model = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    print("Coherence Score:", coherence_model.get_coherence())



    import pyLDAvis.gensim_models
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
    pyLDAvis.display(vis)


    # Optional: Save the results
    # label_df[[lemma_dc, 'dominant_topic']].to_csv('LDA_clustered_data.csv', index=False)

In [None]:
# platform = 'reddit'
# label = "anxiety"
# lemma_dc = "clean_title_selftext_topcomments_text"



platform = 'beyondblue'
label = "Anxiety"
lemma_dc = "clean_title_content_comments"
lda_topicmodeling(platform, label, lemma_dc)

Topics found by LDA:
Topic 0: 0.019*"not" + 0.013*"go" + 0.010*"anxiety" + 0.010*"job" + 0.009*"work" + 0.009*"say" + 0.008*"can" + 0.007*"day" + 0.007*"bad" + 0.007*"make"
Topic 1: 0.017*"not" + 0.012*"go" + 0.010*"anxiety" + 0.009*"feel" + 0.007*"tng" + 0.007*"would" + 0.007*"get" + 0.006*"need" + 0.006*"start" + 0.006*"bad"
Topic 2: 0.018*"anxiety" + 0.018*"not" + 0.016*"work" + 0.014*"go" + 0.011*"day" + 0.010*"make" + 0.009*"would" + 0.007*"can" + 0.006*"try" + 0.006*"one"
Topic 3: 0.018*"not" + 0.014*"go" + 0.014*"work" + 0.010*"make" + 0.009*"job" + 0.008*"tng" + 0.008*"anxiety" + 0.008*"can" + 0.007*"try" + 0.007*"week"
Topic 4: 0.025*"anxiety" + 0.014*"go" + 0.009*"help" + 0.007*"not" + 0.007*"heart" + 0.006*"tng" + 0.006*"make" + 0.006*"feel" + 0.006*"still" + 0.006*"bad"

DataFrame with Cluster Assignments:
                        clean_title_content_comments  dominant_topic
0  feel bit little scared hey have not nearly wee...               3
1  need help emetophobia emetoph