# Libraries

In [1]:
import os
import re
import string

import pandas as pd

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
from sentence_transformers import SentenceTransformer, util
from tqdm.notebook import tqdm


# Supporting Functions

## load_data()

In [2]:
def load_data():
    '''
    Load ./data/conditions_data_post_stitched.csv and return pd.DataFrame
    '''
    df = pd.read_csv('./data/BeyondBlue/conditions_data_post_stitched.csv')
    return df


## sample_data()

In [3]:
def sample_data(df, n=100):
    '''
    Randomly sample n rows from the DataFrame with fixed random state.
    '''
    return df.sample(n, random_state=42)


## clean_text()

In [4]:
def clean_text(text):
    """  
    Cleans and standardizes text by removing leading/trailing whitespace,
    replacing newlines, tabs, non-breaking spaces with a single space, multiple spaces,

    Args:
        text (str): The text to be formatted.
    Returns:
        str: The cleaned and standardized text.
    """
    text = text.strip()
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.replace('\xa0', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text

## concat()

In [5]:
def concat(row):
    """
    Concatenates Post Title and Post Content with a period if the Post Title does not end with a punctuation mark.

    Args:
        row (pd.Series): A row from the DataFrame containing 'Post_Title' and 'Post_Content'.
    Returns:
        str: Concatenated string of Post Title and Post Content.
    """
    end_punctuations = {'!', '?'}
    if row['Post_Title'] and row['Post_Title'][-1] in end_punctuations:
        return row['Post_Title'] + ' ' + row['Post_Content']
    else:
        return row['Post_Title'].rstrip() + '. ' + row['Post_Content']

## clean_data()

In [6]:
def clean_data(df):
    '''
    Clean the DataFrame by concatenating Post Title and Content, clean the text, and return only 'Post_ID' and 'Post_Title_Content' columns
    Args:
        df (pd.DataFrame): DataFrame containing 'Post_ID', 'Post_Title', and 'Post_Content' columns.
    Returns:
        pd.DataFrame: DataFrame with 'Post_ID' and 'Post_Title_Content' columns.
    '''
    df['Post_Title_Content'] = df.apply(concat, axis=1)
    df['Post_Title_Content'] = df['Post_Title_Content'].apply(clean_text)
    return df[['Post_ID', 'Post_Title_Content']]


## embed_data()

In [7]:
def embed_data(
        df, model_name='multi-qa-mpnet-base-dot-v1', 
        mode = "3-gram", output_csv='./data/sample/data_embeddings.csv',
        load = True, save = False
    ):
    '''
    Embeds the DataFrame using a model_name. 
    If load is True, it will load existing embeddings from the CSV, else it will compute new embeddings.
    If save is True, it will save the embeddings to the CSV, else it will not save the embeddings.

    Args:
        df (pd.DataFrame): DataFrame with 'Post_ID' and 'Post_Title_Content'.
        model_name (str): Name of the SBERT model to use for embeddings.
        mode (str): Mode for text splitting (e.g., "3-gram").
        output_csv (str): Path to the output CSV file.
        load (bool): Whether to load existing embeddings from the CSV.
        save (bool): Whether to save new embeddings to the CSV.
    Returns:
        pd.DataFrame: DataFrame with 'Post_ID' and 'Embedding' columns.
    '''
    
    # Check if embeddings already exist
    if load and os.path.exists(output_csv):
        df = pd.read_csv(output_csv, index_col=0)
        meta_df = df.iloc[:,:3]
        emb_df = df.iloc[:, 3:]
        return meta_df, emb_df
    
    # Load SBERT model
    model = SentenceTransformer(model_name)

    all_records = []
    
    for _ , row in tqdm(df.iterrows(), total=len(df), desc="Embedding data"):

        # sentence tokenize text
        sentence_list = sent_tokenize(row['Post_Title_Content'])

        # Create n-grams based on mode
        if mode == '3-gram':
            # Create n-grams from sentences
            n_grams_list = []
            clause_list = []
            # Tokenize and create n-grams for each sentence
            for sentence in sentence_list:
                # Strip whitespace
                sentence = sentence.strip()
                # Split sentence into raw clauses (based on commas)
                if "," in sentence:
                    # Create 3-grams from clauses
                    clauses = sentence.split(",")
                    # Tokenize and create n-grams for each clause
                    for clause in clauses:
                        # strip whitespace
                        clause = clause.strip()
                        # Word tokenize clause
                        tokens = word_tokenize(clause)
                        # Remove punctuation
                        tokens = [token.strip() for token in tokens if token not in string.punctuation]
                        # If clause is 1 or 2-gram
                        if len(tokens) < 3:
                            # append as is
                            n_grams_list.append(' '.join(tokens))
                            clause_list.append(clause)
                        # else if clause is 3-gram
                        elif len(tokens) == 3:
                            # create 2-grams
                            clause_bigrams = list(ngrams(tokens, 2))
                            # join the words in the bigrams
                            clause_bigrams = [' '.join(bigram) for bigram in clause_bigrams]
                            # append the bigrams and the original clause to the n-grams list
                            n_grams_list += [clause_bigrams[0], ' '.join(tokens), clause_bigrams[1]]
                            # append the original clause to the clause list equal to the number of n-grams
                            clause_list += [clause]*(len(clause_bigrams) + 1)
                        # else create n-grams but also with 2-gram for the start and end
                        else:                            
                            # create 2-grams
                            clause_bigrams = list(ngrams(tokens, 2))
                            # join the words in the bigrams
                            clause_bigrams = [' '.join(bigram) for bigram in clause_bigrams]
                            # create 3-grams
                            clause_ngrams = list(ngrams(tokens, 3))
                            # join the words in the 3-grams
                            clause_ngrams = [' '.join(trigram) for trigram in clause_ngrams]
                            # append the first bigram to the n-grams list
                            n_grams_list.append(clause_bigrams[0])
                            # append the trigrams to the n-grams list
                            n_grams_list += clause_ngrams
                            # append the last bigram to the n-grams list
                            n_grams_list.append(clause_bigrams[-1])
                            # append the original clause to the clause list equal to the number of n-grams
                            clause_list += [clause]*(len(clause_ngrams) + 2)
                else:
                    # If no commas, treat the whole sentence as a single clause
                    # Word tokenize sentence
                    tokens = word_tokenize(sentence)
                    tokens = [token.strip() for token in tokens if token not in string.punctuation]
                    # If clause is 1 or 2-gram
                    if len(tokens) < 3:
                        # append as is
                        n_grams_list.append(' '.join(tokens))
                        clause_list.append(sentence)
                    # else if clause is 3-gram
                    elif len(tokens) == 3:
                        # create 2-grams
                        clause_bigrams = list(ngrams(tokens, 2))
                        # join the words in the bigrams
                        clause_bigrams = [' '.join(bigram) for bigram in clause_bigrams]
                        # append the bigrams and the original clause to the n-grams list
                        n_grams_list += [clause_bigrams[0], ' '.join(tokens), clause_bigrams[1]]
                        # append the original clause to the clause list equal to the number of n-grams
                        clause_list += [sentence]*(len(clause_bigrams) + 1)
                    # else create n-grams but also with 2-gram for the start and end
                    else:                            
                        # create 2-grams
                        clause_bigrams = list(ngrams(tokens, 2))
                        # join the words in the bigrams
                        clause_bigrams = [' '.join(bigram) for bigram in clause_bigrams]
                        # create 3-grams
                        clause_ngrams = list(ngrams(tokens, 3))
                        # join the words in the 3-grams
                        clause_ngrams = [' '.join(trigram) for trigram in clause_ngrams]
                        # append the first bigram to the n-grams list
                        n_grams_list.append(clause_bigrams[0])
                        # append the trigrams to the n-grams list
                        n_grams_list += clause_ngrams
                        # append the last bigram to the n-grams list
                        n_grams_list.append(clause_bigrams[-1])
                        # append the original clause to the clause list equal to the number of n-grams
                        clause_list += [sentence]*(len(clause_ngrams) + 2)
            # Create a copy of the n-grams list
            n_grams_list_copy = n_grams_list.copy()
            # Create a copy of the clause list
            clause_list_copy = clause_list.copy()

        # SBERT embeddings
        embeddings = model.encode(n_grams_list_copy, convert_to_numpy=True, normalize_embeddings=True)

        # Collect data
        for text_idx, (clause, n_gram, embedding) in enumerate(zip(clause_list_copy, n_grams_list_copy, embeddings)):
            all_records.append({
                'Post_ID': row['Post_ID'],
                'clause': clause,
                'n_gram': n_gram,
                'embedding': embedding.tolist()  # store as list for CSV-friendly format
            })

    # Create DataFrame
    df = pd.DataFrame(all_records)

    # Split embedding into separate columns and create embedding DataFrame
    emb_dim = len(df['embedding'][0])
    emb_cols = [f'emb_{i}' for i in range(emb_dim)]
    emb_df = pd.DataFrame(df['embedding'].tolist(), columns=emb_cols)

    # Meta data DataFrame
    meta_df = df[['Post_ID', 'clause', 'n_gram']]

    if save:
        # Save embeddings to CSV
        final_df = pd.concat([meta_df, emb_df], axis=1)
        final_df.to_csv(output_csv, index=False)


    # Export to CSV
    final_df.to_csv(output_csv, index=True)

    return meta_df, emb_df

## load_symptom_repr_words()

In [8]:
def load_symptom_repr_words():
    '''
    Loads symptom representative words from ./data/symptom_list_v3.csv.
    Args:
        None
    Returns:
        A list of symptom representative words.
    '''
    df = pd.read_csv('./data/symptoms_list_v3.csv', sep=';')
    repr_words = df['representative_words']
    # Turn each comma‑separated string into a list of trimmed items
    repr_words = repr_words.apply(
        lambda x: [p.strip() for p in x.split(',')]
    )
    # Flatten the list of lists
    repr_words = [item for sublist in repr_words for item in sublist]
    # Remove duplicates
    repr_words = list(set(repr_words))
    return repr_words

## load_risk_factor_repr_words()

In [9]:
def load_risk_factor_repr_words():
    '''
    Loads risk representative words from ./data/risk_factors.csv.
    Args:
        None
    Returns:
        A list of risk representative words.
    '''
    df = pd.read_csv('./data/risk_factors.csv', sep=',')
    repr_words = df['representative_words']
    # Turn each comma‑separated string into a list of trimmed items
    repr_words = repr_words.apply(
        lambda x: [p.strip() for p in x.split(';')]
    )
    # Flatten the list of lists
    repr_words = [item for sublist in repr_words for item in sublist]
    # Remove duplicates
    repr_words = list(set(repr_words))
    return repr_words

## embed_rep_words()

In [10]:
def embed_rep_words(
       symptoms_rw_list, risk_factors_rw_list,
       model_name='multi-qa-mpnet-base-dot-v1',
       output_csv='./data/sample/representative_word_embeddings.csv',
       load = True, save = False
):
    '''
    Embeds risk factor and symptom representative words from the DataFrame.
    Args:
        symptoms_wr_list: List of symptom representative words.
        risk_factors_wr_list: List of risk factor representative words.
        model_name: Name of the SBERT model to use.
        output_csv: Path to the output CSV file.
        load: Whether to load existing embeddings from the CSV file.
        save: Whether to save new embeddings to the CSV file.
    '''

    # Check if load is True and embeddings already exist
    if load and os.path.exists(output_csv):
        df = pd.read_csv(output_csv)
        meta_df = df.iloc[:,:1]
        emb_df = df.iloc[:, 1:]
        return meta_df, emb_df
    
    # Combine representative words
    repr_words = symptoms_rw_list + risk_factors_rw_list

    # Load SBERT model
    model = SentenceTransformer(model_name)

    # Initialize list to hold all records
    all_records = []

    # Get embeddings
    embeddings = model.encode(repr_words, convert_to_numpy=True, normalize_embeddings=True)

    # Collect data
    for word, embedding in zip(repr_words, embeddings):
        all_records.append({
            'repr_word': word,
            'embedding': embedding.tolist()  # store as list for CSV-friendly format
        })

    # Create DataFrame
    df = pd.DataFrame(all_records)

    # Split embedding into separate columns and create embedding DataFrame
    emb_dim = len(df['embedding'][0])
    emb_cols = [f'emb_{i}' for i in range(emb_dim)]
    emb_df = pd.DataFrame(df['embedding'].tolist(), columns=emb_cols)

    # Meta data DataFrame
    meta_df = df[['repr_word']]

    if save:
        # Save embeddings to CSV
        final_df = pd.concat([meta_df, emb_df], axis=1)
        final_df.to_csv(output_csv, index=False)

    return meta_df, emb_df

## detect_variables()

In [None]:
def detect_variables(
        data_meta_df, data_emb_df,
        repr_words_meta_df, repr_words_emb_df,
        threshold = 0.75,
        load = True, save = False
    ):
    '''
    Detects variables (representative words) in the posts by comparing embeddings using cosine similiarity.
    Args:
        data_meta_df: Metadata DataFrame for the posts.
        data_emb_df: Embeddings DataFrame for the posts.
        repr_words_meta_df: Metadata DataFrame for the representative words.
        repr_words_emb_df: Embeddings DataFrame for the representative words.
        threshold: Cosine similarity threshold for considering a match.
        load: Whether to load existing detected variables from the CSV file.
        save: Whether to save the detected variables to a CSV file.
    Returns:
        DataFrame containing the detected variables for each post.
        DataFrame containing the detected variables for each ngrams.
    '''
    # define output CSV path
    symptom_per_post_output_csv = './data/sample/detected_symptoms_per_post.csv'
    symptom_per_ngram_output_csv = './data/sample/detected_symptoms_per_ngram.csv'
    repr_words_per_post_output_csv = './data/sample/detected_repr_words_per_post.csv'
    repr_words_per_ngram_output_csv = './data/sample/detected_repr_words_per_ngram.csv'

    # Check if load is True and embeddings already exist
    if load and os.path.exists(symptom_per_post_output_csv) and os.path.exists(symptom_per_ngram_output_csv) and os.path.exists(repr_words_per_post_output_csv) and os.path.exists(repr_words_per_ngram_output_csv):
        symptoms_post_df = pd.read_csv(symptom_per_post_output_csv)
        symptoms_ngram_df = pd.read_csv(symptom_per_ngram_output_csv)
        repr_words_post_df = pd.read_csv(repr_words_per_post_output_csv)
        repr_words_ngram_df = pd.read_csv(repr_words_per_ngram_output_csv)
        return symptoms_post_df, symptoms_ngram_df, repr_words_post_df, repr_words_ngram_df

    # Get representative words, post IDs, clauses, and ngrams
    repr_words = repr_words_meta_df['repr_word'].tolist()
    post_ids_list = data_meta_df['Post_ID'].tolist()
    clause_list = data_meta_df['clause'].tolist()
    ngram_list = data_meta_df['n_gram'].tolist()

    # Initialize lists to store variable data
    detected_variables_per_post = []
    detected_variables_per_ngram = []

    # Initialize post row
    post_row = {'Post_ID': '', **{word:0 for word in repr_words}}

    # Iterate through each ngram and its embedding (converted to numpy)
    # while using tqdm for progress tracking
    for post_id, clause, ngram, ngram_emb in tqdm(zip(post_ids_list, clause_list, ngram_list, data_emb_df.values), total=len(post_ids_list), desc="Detecting variables"):
        # Initialize variable for the current post
        ngram_row = {'Post_ID': post_id, 'clause': clause, 'n_gram': ngram, **{word:0 for word in repr_words}}
        # Initialise controller variable to hold the highest cosine similarity repr_word w.r.t the current ngram
        highest = {'repr_word': '', 'value':0}

        # Making sure the Post_ID is correct with the current ngram
        # Check if the post_row is empty
        if post_row['Post_ID'] == '':
            # Assign post_row with the current post_id
            post_row['Post_ID'] = post_id
        # Else if the post_row is not empty and the Post_ID is different from the current post_id,
        elif post_row['Post_ID'] != post_id:
            # Append the existing post_row to detected_variables_per_post
            detected_variables_per_post.append(post_row)
            # Reset post_row for the new post
            post_row = {'Post_ID': '', **{word:0 for word in repr_words}}

        # Iterate through each representative word and its embedding (converted to numpy)
        for repr_word_emb, repr_word in zip(repr_words_emb_df.values, repr_words):
            # Calculate cosine similarity
            cos_sim = util.cos_sim(ngram_emb, repr_word_emb).item()
            # Check if the cosine similarity is above the threshold and above the current highest cosine similarity for the ngram
            if (cos_sim >= threshold) and (cos_sim > highest['value']):
                # If so, update the highest variable
                highest['repr_word'] = repr_word
                highest['value'] = cos_sim

        # Update the ngram_row and post_row if a representative word was found
        if highest['repr_word'] != '':
            ngram_row[highest['repr_word']] = 1
            post_row[highest['repr_word']] = 1

        # Append the ngram_row to detected_variables_per_ngram
        detected_variables_per_ngram.append(ngram_row)
    # Append the the last post_row to detected_variables_per_post
    detected_variables_per_post.append(post_row)

    # Create DataFrames for the detected variables
    variables_per_post = pd.DataFrame(detected_variables_per_post.copy())
    variables_per_ngram = pd.DataFrame(detected_variables_per_ngram.copy())

    if save:
        # Save detected variables to CSV
        variables_per_post.to_csv(per_post_output_csv, index=False)
        variables_per_ngram.to_csv(per_ngram_output_csv, index=True)

    return variables_per_post, variables_per_ngram

# Main

## Prepare Posts

In [19]:
data = load_data()
sampled_data = sample_data(data, n=10)
cleaned_sampled_data = clean_data(sampled_data)
data_meta_df, data_emb_df = embed_data(
    cleaned_sampled_data,
    load=True,
    save=False
)
data_meta_df

Unnamed: 0,Post_ID,clause,n_gram
0,Anxi-1412,EXISTING WITH ANXIETY.,EXISTING WITH
1,Anxi-1412,EXISTING WITH ANXIETY.,EXISTING WITH ANXIETY
2,Anxi-1412,EXISTING WITH ANXIETY.,WITH ANXIETY
3,Anxi-1412,My name is Dennis,My name
4,Anxi-1412,My name is Dennis,My name is
...,...,...,...
2031,Anxi-6474,and I don't know if I can handle the stress.,the stress
2032,Anxi-6474,Thanks for listening xx,Thanks for
2033,Anxi-6474,Thanks for listening xx,Thanks for listening
2034,Anxi-6474,Thanks for listening xx,for listening xx


## Prepare Representative Words

In [20]:
symptom_repr_words_list = load_symptom_repr_words()
risk_factor_repr_words_list = load_risk_factor_repr_words()
repr_words_meta_df, repr_words_emb_df = embed_rep_words(
    symptoms_rw_list=symptom_repr_words_list,
    risk_factors_rw_list=risk_factor_repr_words_list,
    load=True
)
repr_words_meta_df

Unnamed: 0,repr_word
0,trouble remembering
1,bored
2,flashback
3,numb
4,hopeless
...,...
134,cyclone
135,land degradation
136,night shift
137,job responsibility


## Detect Variables

In [21]:
data_meta_df

Unnamed: 0,Post_ID,clause,n_gram
0,Anxi-1412,EXISTING WITH ANXIETY.,EXISTING WITH
1,Anxi-1412,EXISTING WITH ANXIETY.,EXISTING WITH ANXIETY
2,Anxi-1412,EXISTING WITH ANXIETY.,WITH ANXIETY
3,Anxi-1412,My name is Dennis,My name
4,Anxi-1412,My name is Dennis,My name is
...,...,...,...
2031,Anxi-6474,and I don't know if I can handle the stress.,the stress
2032,Anxi-6474,Thanks for listening xx,Thanks for
2033,Anxi-6474,Thanks for listening xx,Thanks for listening
2034,Anxi-6474,Thanks for listening xx,for listening xx


In [24]:
post_variables, ngram_variables = detect_variables(
    data_meta_df=data_meta_df,
    data_emb_df=data_emb_df,
    repr_words_meta_df=repr_words_meta_df,
    repr_words_emb_df=repr_words_emb_df,
    threshold=0.7,
    load=True,
    save=False
)

## Sampling Test

In [25]:
post_num = 7

In [26]:
cols = post_variables.iloc[post_num-1:post_num].columns[(post_variables.iloc[post_num-1:post_num] == 1).any()]
new_cols = cols.append(pd.Index(['Post_ID']))
post_id = post_variables.iloc[post_num-1:post_num]['Post_ID'].values[0]
post_variables.iloc[post_num-1:post_num][new_cols]

Unnamed: 0,can't stop worrying,lack motivation,exhausted,guilt,afraid,failure,sleeping too much,heatwave,Post_ID
6,1,1,1,1,1,1,1,1,Depr-5053


In [30]:
print(sampled_data[sampled_data["Post_ID"] == post_id]['Post_Title'].item())
print(sampled_data[sampled_data["Post_ID"] == post_id]['Post_Content'].item())
print(sampled_data[sampled_data["Post_ID"] == post_id]['Post_Date'].item())

Lost motivation and direction
I feel guilty because I am wasting time and not doing anything useful. I feel like I have no purpose. I feel guilty because I have no excuse for complaining Other people have much worse situations. I normally am very goal orientated and feel bad if I am not achieving things. I am marking time waiting for summer to finish. I hate the heat. I am scared in case fires start and I don't like going out in case I get sunburnt. I had a good job which I left in October. I had been struggling because my husband kept getting sick. He spent 6 weeks in hospital Sept-Oct '14 just after I started work and came home so weak he could barely walk from one room to the next. (we previously enjoyed bushwalking) He then had lots of doctors appointments and tests to plan for further surgery to prevent him having the same problem. I was very stressed and always worried about him. I couldn't afford to take off much time for carers leave because I hadn't worked long enough. My plan

In [28]:
detail_col = cols.append(pd.Index(['clause', 'n_gram']))
ngram_variables[(ngram_variables["Post_ID"] == post_id) & (ngram_variables.sum(axis=1, numeric_only=True) == 1)][detail_col]

Unnamed: 0,can't stop worrying,lack motivation,exhausted,guilt,afraid,failure,sleeping too much,heatwave,clause,n_gram
