# Libraries

In [28]:
import contractions
import language_tool_python
import os
import re
import pandas as pd
from language_tool_python.utils import correct
from spellchecker import SpellChecker
from tqdm.notebook import tqdm
tqdm.pandas()  # enable tqdm for pandas

# Read Posts and Comments Data

In [88]:
# Load the BeyondBlue posts and comments tables; both paths are relative to the
# notebook's working directory.
posts_df = pd.read_csv('./data/BeyondBlue/conditions_data_post_stitched.csv')
comments_df = pd.read_csv('./data/BeyondBlue/data_comments_stitched.csv')
# Preview the first rows of the posts table
posts_df.head()

Unnamed: 0,Post_ID,Post_Title,Post_Content,Post_Author,Post_Author_Rank,Post_Date,Post_Category,Number_of_Comments,Post_URL
0,Anxi-1,I’m stuck!,I’ve never written on a forum like this before...,Guest_39557583,Community Member,20-06-2025,Anxiety,6,/t5/anxiety/i-m-stuck/td-p/611578
1,Anxi-2,Back injury anxiety,Hi. I'm new here. I am extremely anxious when ...,Guest42,Community Member,10-07-2025,Anxiety,0,/t5/anxiety/back-injury-anxiety/td-p/612114
2,Anxi-3,"Anxiety, Injustice, and Fear: Workplace Exploi...","Hi everyone,I’m going through one of the most ...",Joker_J,Community Member,09-07-2025,Anxiety,1,/t5/anxiety/anxiety-injustice-and-fear-workpla...
3,Anxi-4,Im lost and wasn't sure what should I do next.,I am international student to Tasmania in 2021...,tevont,Community Member,20-06-2025,Anxiety,2,/t5/anxiety/im-lost-and-wasn-t-sure-what-shoul...
4,Anxi-5,Just broke free from rude friend and I still f...,"Recently, I have left my old friend who would ...",waffle_puppy,Community Member,27-06-2025,Anxiety,3,/t5/anxiety/just-broke-free-from-rude-friend-a...


In [5]:
comments_df.head()

Unnamed: 0,Post_ID,Comment_ID,Comment_Content,Comment_Author,Comment_Author_Rank,Comment_Date,Comment_Datetime,Comment_Category
0,Suic-1,Suic-1_comment-1,Hello and welcome. You matter. That you have w...,smallwolf,Community Champion,04-05-2025,2025-05-04 15:55:00,Suicidal thoughts and self-harm
1,Suic-1,Suic-1_comment-2,The warmest of welcomes to you at what sounds ...,therising,Valued Contributor,05-05-2025,2025-05-05 04:44:00,Suicidal thoughts and self-harm
2,Suic-1,Suic-1_comment-3,That sounds really tough and I can’t imagine w...,Ben5,Community Member,05-05-2025,2025-05-05 23:26:00,Suicidal thoughts and self-harm
3,Suic-1,Suic-1_comment-4,Thank you i appreciate your response. Professi...,Done_,Community Member,06-05-2025,2025-05-06 20:35:00,Suicidal thoughts and self-harm
4,Suic-1,Suic-1_comment-5,I didn’t expect anyone to respond or even ackn...,Done_,Community Member,06-05-2025,2025-05-06 20:53:00,Suicidal thoughts and self-harm


In [6]:
len(comments_df)

100585

In [7]:
len(posts_df)

17499

## Sample 10 posts

In [8]:
# Draw 10 posts at random; the fixed random_state makes the draw repeatable
sampled_posts = posts_df.sample(n=10, random_state=42)
sampled_posts

Unnamed: 0,Post_ID,Post_Title,Post_Content,Post_Author,Post_Author_Rank,Post_Date,Post_Category,Number_of_Comments,Post_URL
1411,Anxi-1412,EXISTING WITH ANXIETY,"My name is Dennis, I am 77, married to an amaz...",DocP,Community Member,05-07-2022,Anxiety,3,/t5/anxiety/existing-with-anxiety/td-p/538634
7555,Depr-50,Help Please 🙏🏻,To anyone who is reading I really need your he...,Gracie_PY4,Community Member,09-05-2025,Depression,1,/t5/depression/help-please/td-p/610278
4930,Anxi-4931,Drug Trigger,"Hi all, I’ve recently started having anxiety, ...",Scatterbrain,Community Member,14-06-2018,Anxiety,4,/t5/anxiety/drug-trigger/td-p/396705
10054,Depr-2549,Alone,Hi my name is Lee. I have recently split up wi...,Lee1234,Community Member,20-04-2020,Depression,1,/t5/depression/alone/td-p/492160
2714,Anxi-2715,Situational anxiety,In 2016 I felt the creep of what was later dia...,Nervous_Nell,Community Member,30-09-2020,Anxiety,1,/t5/anxiety/situational-anxiety/td-p/519977
101,Anxi-102,Undiagnosed Anxiety,Hi. This discussion is for anyone who either c...,Eurovision_Fan,Community Member,17-03-2025,Anxiety,3,/t5/anxiety/undiagnosed-anxiety/td-p/608342
12558,Depr-5053,Lost motivation and direction,I feel guilty because I am wasting time and no...,Elizabeth CP,Blue Voices Member,02-01-2016,Depression,8,/t5/depression/lost-motivation-and-direction/t...
1097,Anxi-1098,Hard start to the year,"hey everyone, This year hasn’t gone the way I ...",Heidi_1,Community Member,18-01-2023,Anxiety,2,/t5/anxiety/hard-start-to-the-year/td-p/557173
10226,Depr-2721,Depressed and sadness,"Hello, This is the first time I'm posting a ne...",SilvaLady,Community Member,19-09-2019,Depression,40,/t5/depression/depressed-and-sadness/td-p/468224
6473,Anxi-6474,Anxiety + Uni nursing placement,I start a 2 week nursing placement in the surg...,Elea,Community Member,07-01-2016,Anxiety,5,/t5/anxiety/anxiety-uni-nursing-placement/td-p...


# List of all Comment_Author_Rank

In [9]:
# Distinct comment-author ranks in sorted order; missing ranks are dropped first
unique_ranks = sorted(comments_df['Comment_Author_Rank'].dropna().unique())
unique_ranks

['Beyond Blue Staff',
 'Blue Voices Member',
 'Champion Alumni',
 'Community Champion',
 'Community Member',
 'Moderator',
 'Valued Contributor']

# Helper Functions

## `punct_aware_concat()`

In [10]:
def punct_aware_concat(post_title, post_content):
    '''Concatenate a post title and its content into a single string.

    A value counts as missing when it is None, NaN (a float that is not equal
    to itself) or empty after stripping surrounding whitespace.
    If either part is missing, the other one is returned. If both are missing,
    a default message is returned.
    If both are present they are joined with '. ', unless the title already ends
    with '.', '!' or '?', in which case a single space is used so the
    punctuation is not doubled.
    Parameters:
        post_title (str): The title of the post.
        post_content (str): The content of the post.
    Returns:
        str: The concatenated string, with surrounding whitespace removed from
            each part.
    '''
    def _as_text(value):
        # `value != value` is only true for NaN, so this catches missing cells
        # without needing pandas here.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value).strip()

    title = _as_text(post_title)
    content = _as_text(post_content)

    if not title and not content:
        return "No content available"
    if not title:
        return content
    if not content:
        return title

    # re.search is required: re.match anchors at the START of the string, so
    # combined with '$' it could only ever match a one-character title.
    separator = ' ' if re.search(r'[.!?]$', title) else '. '
    return f"{title}{separator}{content}"

## `check_text()`

Using the libraries: Contractions + PySpellChecker + LanguageTool

In [11]:
# set up Java environment for LanguageTool
# The JDK location below is specific to one Windows machine. It is applied only
# when that directory exists, so the environment is left untouched elsewhere.
java_home = r"C:\Program Files\Eclipse Adoptium\jdk-21.0.8.9-hotspot"
if os.path.isdir(java_home):
    os.environ["JAVA_HOME"] = java_home
    java_bin = os.path.join(java_home, "bin")
    current_path = os.environ.get("PATH", "")
    # Re-running the cell must not keep prepending the same entry to PATH
    if java_bin not in current_path.split(os.pathsep):
        os.environ["PATH"] = java_bin + os.pathsep + current_path

# Initialize LanguageTool for english
tool = language_tool_python.LanguageTool('en')  # English

_SPELL_CHECKER = None  # created on the first check_text() call, then reused


def check_text(text):
    '''Check and correct grammar and spelling in the given text.

    Steps, in order: strip surrounding whitespace, expand contractions, apply
    the corrections LanguageTool suggests, then run every purely alphabetic
    word through the spell checker. Words are replaced in place, so the
    spacing, punctuation and line breaks of the text are left as they are.
    Args:
        text (str): The input text to be checked and corrected.
    Returns:
        str: The corrected text. A non-string input (for example NaN) is
            returned unchanged.
    '''
    global _SPELL_CHECKER

    # Missing values arrive as non-strings; there is nothing to correct
    if not isinstance(text, str):
        return text

    # strip leading/trailing whitespace
    text = text.strip()
    if not text:
        return text

    # Expand contractions
    text = contractions.fix(text)

    # Apply LanguageTool's suggested corrections
    matches = tool.check(text)
    text = language_tool_python.utils.correct(text, matches)

    # Build the spell checker once instead of on every call
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker()
    spell = _SPELL_CHECKER

    def _fix_word(found):
        token = found.group(0)
        # Tokens containing digits or underscores are left alone
        if not token.isalpha():
            return token
        # correction() may return nothing; keep the original word then.
        # NOTE(review): suggestions may come back lower-cased for unknown
        # capitalised words (names, places) - confirm against pyspellchecker.
        return spell.correction(token) or token

    # Substituting in place avoids rebuilding the text from tokens, which would
    # discard the original spacing around quotes, brackets and decimals.
    return re.sub(r"\w+", _fix_word, text)

## `filter_comments_by_post()`

In [12]:
def filter_comments_by_post(df_comments, post_id):
    '''Return the comments that belong to one post.
    Args:
        df_comments (pd.DataFrame): Comments table with a 'Post_ID' column.
        post_id (str): The post ID whose comments are wanted.
    Returns:
        pd.DataFrame: A copy of the rows of df_comments whose 'Post_ID' equals
            post_id; the original index labels are kept.
    '''
    belongs_to_post = df_comments['Post_ID'] == post_id
    return df_comments.loc[belongs_to_post].copy()

## `get_post_title_content()`

In [13]:
def get_post_title_content(sample=None, post_id=None):
    '''Return the title and content of a post joined into one string.

    If a sample is given, its title and content are used (the sample takes
    precedence over post_id). Otherwise the post is looked up by post_id in the
    module-level posts_df, and the first matching row is used.
    Args:
        sample (pd.Series, optional): A post with 'Post_Title' and 'Post_Content' entries.
        post_id (str, optional): The post ID to look up. Defaults to None.
    Returns:
        str or None: The result of punct_aware_concat(title, content), or None
            (after printing a message) when neither argument is given or the
            post ID is not found.
    '''
    if sample is not None:
        return punct_aware_concat(sample['Post_Title'], sample['Post_Content'])

    if post_id is None:
        print("No sample or post_id provided.")
        return None

    rows = posts_df.loc[posts_df['Post_ID'] == post_id, ['Post_Title', 'Post_Content']]
    if rows.empty:
        # Return here: falling through would reference an unassigned title/content
        print("Post ID not found in posts_df")
        return None

    first = rows.iloc[0]
    return punct_aware_concat(first['Post_Title'], first['Post_Content'])

## `get_comment_content()`

In [14]:
def get_comment_content(sample=None, comment_id=None):
    '''Return the content of a comment.

    If a sample is given, its content is used (the sample takes precedence over
    comment_id). Otherwise the comment is looked up by comment_id in the
    module-level comments_df, and the first matching row is used.
    Args:
        sample (pd.Series, optional): A comment with a 'Comment_Content' entry.
        comment_id (str, optional): The comment ID to look up. Defaults to None.
    Returns:
        The comment content, or None (after printing a message) when neither
        argument is given or the comment ID is not found.
    '''
    if sample is not None:
        return sample['Comment_Content']

    if comment_id is None:
        print("No sample or comment_id provided.")
        return None

    match = comments_df.loc[comments_df['Comment_ID'] == comment_id, 'Comment_Content']
    if match.empty:
        # Return here: falling through would reference an unassigned content
        print("Comment ID not found in comments_df")
        return None

    return match.values[0]

## `is_post_author_commented()`

In [15]:
def is_post_author_commented(author_id, df_comments):
    '''Tell whether a given author appears among the comment authors.
    Args:
        author_id (str): The author's name to look for.
        df_comments (pd.DataFrame): Comments table with a 'Comment_Author' column.
    Returns:
        bool: True if author_id occurs in the 'Comment_Author' column, False otherwise.
    '''
    comment_authors = df_comments['Comment_Author'].values
    return author_id in comment_authors

## `get_post_url()`

In [21]:
def get_post_url(post_id, posts_df):
    '''Return the full forum URL of a post.
    Args:
        post_id (str): The post ID to look up.
        posts_df (pd.DataFrame): Posts table with 'Post_ID' and 'Post_URL' columns.
    Returns:
        str or None: The forum base address joined to the stored 'Post_URL' of
            the first matching row, or None when the post ID is not found or
            its URL is missing.
    '''
    base_url = 'https://forums.beyondblue.org.au'
    post = posts_df[posts_df['Post_ID'] == post_id]
    if post.empty:
        return None

    path = post['Post_URL'].values[0]
    # A missing URL is not a string and cannot be joined to the base address
    if not isinstance(path, str):
        return None

    # Stored paths start with '/', so drop it to avoid 'org.au//t5/...'
    return f"{base_url}/{path.lstrip('/')}"

## `merge_table()`

In [99]:
def merge_table(posts_df, comments_df, ranks=None):
    '''Build one row per post whose author also commented on that post.

    Each row holds the corrected post text, the author's last comment with
    string content (corrected), and one count column per rank: the number of
    comments by other users of that rank that come before that last comment.
    Comments are processed in the row order of comments_df.
    NOTE(review): "last comment" therefore assumes the rows of each post are
    already in chronological order - confirm, or sort by 'Comment_Datetime'.
    Args:
        posts_df (pd.DataFrame): Posts with 'Post_ID', 'Post_Author',
            'Post_Title' and 'Post_Content' columns.
        comments_df (pd.DataFrame): Comments with 'Post_ID', 'Comment_ID',
            'Comment_Content', 'Comment_Author' and 'Comment_Author_Rank' columns.
        ranks (list, optional): Rank names that become count columns. Defaults
            to the sorted distinct non-missing ranks found in comments_df.
    Returns:
        pd.DataFrame: The merged table, or None if either input is None.
    '''
    # check if posts_df and comments_df are not None
    if posts_df is None or comments_df is None:
        return None

    # Derive the rank columns from the data that was passed in rather than
    # from a notebook-level variable.
    if ranks is None:
        ranks = sorted(comments_df['Comment_Author_Rank'].dropna().unique())

    # Split the comments by post once, instead of scanning the whole comments
    # table again for every post. Row order inside each group is preserved.
    comments_by_post = {
        post_id: group for post_id, group in comments_df.groupby('Post_ID', sort=False)
    }

    merged_list = []

    for row in tqdm(posts_df.itertuples(index=False, name="Post"), total=len(posts_df)):
        post_id = row.Post_ID
        post_author = row.Post_Author

        post_comments = comments_by_post.get(post_id)
        if post_comments is None:
            continue  # the post has no comments at all

        # Walk the author's own comments backwards to find the last one whose
        # content is a string (missing content is not a string).
        own_comments = post_comments[post_comments['Comment_Author'] == post_author]
        author_last_comment = next(
            (
                comment
                for comment in own_comments.iloc[::-1].itertuples(index=False)
                if isinstance(comment.Comment_Content, str)
            ),
            None,
        )
        if author_last_comment is None:
            continue  # author never commented, or none of the comments is usable

        # Count other users' comments per rank up to the author's last comment
        rank_counts = dict.fromkeys(ranks, 0)
        for comment in post_comments.itertuples(index=False, name="Comment"):
            if comment.Comment_ID == author_last_comment.Comment_ID:
                break  # stop counting when reaching author's last comment
            if comment.Comment_Author == post_author:
                continue  # skip author's own comments
            rank = comment.Comment_Author_Rank
            # A missing or unlisted rank has no column; skip it instead of
            # raising KeyError.
            if rank in rank_counts:
                rank_counts[rank] += 1

        # The text correction is the expensive step, so it runs only for rows
        # that are actually kept.
        new_row = {
            "Post_ID": post_id,
            "Author": post_author,
            "Author_Post": check_text(punct_aware_concat(row.Post_Title, row.Post_Content)),
            "Author_Last_Comment": check_text(author_last_comment.Comment_Content),
        }
        new_row.update(rank_counts)
        merged_list.append(new_row)

    return pd.DataFrame(merged_list)

In [None]:
# Build the merged table from all posts and comments; check_text() is applied
# to each kept post and to its author's last comment.
merged_table = merge_table(posts_df=posts_df, comments_df=comments_df)
merged_table.head()

In [None]:
# Write the merged table to CSV without the row index
merged_table.to_csv('./data/BeyondBlue/commented_post_authors.csv', index=False)

# Testing

## Sample Post

In [13]:
# Sample one observation from posts_df (fixed random_state for a repeatable draw);
# .iloc[0] turns the one-row frame into a Series
post_sample = posts_df.sample(n=1, random_state=42).iloc[0]
post_sample

Post_ID                                                       Anxi-1412
Post_Title                                        EXISTING WITH ANXIETY
Post_Content          My name is Dennis, I am 77, married to an amaz...
Post_Author                                                        DocP
Post_Author_Rank                                       Community Member
Post_Date                                                    05-07-2022
Post_Category                                                   Anxiety
Number_of_Comments                                                    3
Post_URL                  /t5/anxiety/existing-with-anxiety/td-p/538634
Name: 1411, dtype: object

## Print post title and content

In [14]:
get_post_title_content(post_sample)

"EXISTING WITH ANXIETY. My name is Dennis, I am 77, married to an amazing lady for 55years who has been with me through thick and thin. I have had anxiety even before I knew the word. I try not to blame my condition on my early childhood but it is so hard. My last relapse was triggered when we moved to a rural area and I couldn't handle it so we returned to Adelaide and rented the house.i am at present being treated by The Older Persons Health Team who I cannot speak to highly about. Up to now I have not been one to conform to medication but I relize if I don't I won' t get better. Can anyone suggest more up to date texts on anxiety?"

In [15]:
# Sample one observation from comments_df (fixed random_state for a repeatable draw);
# .iloc[0] turns the one-row frame into a Series
comment_sample = comments_df.sample(n=1, random_state=42).iloc[0]
comment_sample

Post_ID                                                          Anxi-92
Comment_ID                                           Anxi-92_comment-147
Comment_Content        For me, symptoms of anxiety can be: Sweating a...
Comment_Author                                                Guest_4643
Comment_Author_Rank                                     Community Member
Comment_Date                                                  02-03-2020
Comment_Datetime                                     2020-03-02 19:43:00
Comment_Category                                                 Anxiety
Name: 63259, dtype: object

## Print comment

In [16]:
get_comment_content(comment_sample)

'For me, symptoms of anxiety can be: Sweating a lot. Heart palpitations, sometimes chest pain (but it\'s bearable). Dry mouth and sore throat, or a "lump in my throat" if you will. Nausea, sometimes vomiting and stomach problems like gastro. Excessive thoughts of course, bad ones. And probably more, I can\'t think of any more currently. Sorry to hear that so many others deal with anxiety and the symptoms everyone here has too. Thinking of you all here. Tayla'

## Testing the `check_text()` function

In [17]:
# Choose the column to run through check_text(); fall back to the first column
# when 'Post_Content' is absent
col = posts_df.columns[0] if 'Post_Content' not in posts_df.columns else 'Post_Content'
original = post_sample[col]
print("Original:\n", original, "\n")
# Only a non-blank string is corrected; any other value is shown as it is
corrected = check_text(original) if isinstance(original, str) and original.strip() else original
print("Corrected:\n", corrected)

Original:
 My name is Dennis, I am 77, married to an amazing lady for 55years who has been with me through thick and thin. I have had anxiety even before I knew the word. I try not to blame my condition on my early childhood but it is so hard. My last relapse was triggered when we moved to a rural area and I couldn't handle it so we returned to Adelaide and rented the house.i am at present being treated by The Older Persons Health Team who I cannot speak to highly about. Up to now I have not been one to conform to medication but I relize if I don't I won' t get better. Can anyone suggest more up to date texts on anxiety? 

Corrected:
 My name is Dennis, I am 77, married to an amazing lady for 55 years who has been with me through thick and thin. I have had anxiety even before I knew the word. I try not to blame my condition on my early childhood but it is so hard. My last relapse was triggered when we moved to a rural area and I could not handle it so we returned to Adelaide and rented

## Testing the `filter_comments_by_post()` function

In [18]:
# All comments attached to the post with Post_ID "Anxi-1412"
filtered_comments = filter_comments_by_post(comments_df, "Anxi-1412")
filtered_comments

Unnamed: 0,Post_ID,Comment_ID,Comment_Content,Comment_Author,Comment_Author_Rank,Comment_Date,Comment_Datetime,Comment_Category
69703,Anxi-1412,Anxi-1412_comment-1,Hi DocPI don’t have experience with meds but I...,rhombusslope,Community Member,17-07-2022,2022-07-17 23:29:00,Anxiety
69704,Anxi-1412,Anxi-1412_comment-2,Dear DocP I'm really sorry that you haven't ha...,ecomama,Valued Contributor,17-07-2022,2022-07-17 23:50:00,Anxiety
69705,Anxi-1412,Anxi-1412_comment-3,"Hello Dennis, and a warm welcome to the forums...",geoff,Champion Alumni,18-07-2022,2022-07-18 02:41:00,Anxiety


In [19]:

# Print the full content of every post that carries this Post_ID
post_id = 'Depr-6474'

if 'Post_ID' in posts_df.columns:
    match = posts_df.loc[posts_df['Post_ID'] == post_id, 'Post_Content']
    if match.empty:
        print(f"No post found with Post_ID {post_id}")
    # With an empty match the loop below simply does not run
    for i, content in enumerate(match.tolist(), 1):
        print(f"\n--- Post {i} (Post_ID={post_id}) ---\n")
        # Non-string content (e.g. a missing value) is shown via repr
        print(content if isinstance(content, str) else repr(content))
else:
    print("Column 'Post_ID' not found in posts_df")



--- Post 1 (Post_ID=Depr-6474) ---

I broke down again yesterday and wanted to end it all. I would never do it, but its the only thought that I have atm. I think about how bad I feel every day and I just want these feelings to go away. I dont know where to go from here. I feel like I just exist with no purpose what so ever. I have lost the will and motivation for life and have completely lost myself in the process and it scares me that I will never feel happy again. I just feel empty inside. I go to work as a distraction and try to fill up my weekend with things to do, but I dont get any enjoyment out of it. I feel like I have hit rock bottom and I am trying to get out of it, but I dont know where to go from here. I feel so alone, even though I have good friends, I just don't feel they understand. I feel guilty because my family know that I am not being myself but I dont know how to find myself again. I am on anti-depressants and am seeing a psychologist, but I feel like its not enoug