# Libraries

In [1]:
import re
import pickle
import os
import pandas as pd
import time
from tqdm.notebook import tqdm
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from datetime import datetime as Datetime
import dateparser



In [2]:
# upgrade ipywidgets if progress bar does not work
# !pip install --upgrade ipywidgets 

# Support Functions

## get_post_urls()

In [3]:
def get_post_urls(df):
    """
    Returns the contents of the 'Post_URL' column as a plain Python list.

    Parameters:
    df (DataFrame): Post data; must contain a 'Post_URL' column.

    Returns:
    list: The 'Post_URL' values, in row order.
    """
    url_column = df['Post_URL']
    return url_column.tolist()

## filter_posts()

In [4]:
def filter_posts(df):
    """
    Keeps only posts written by a 'Community Member' that have more than one comment.

    Parameters:
    df (DataFrame): Post data with 'Post_Author_Rank' and 'Number_of_Comments' columns.

    Returns:
    DataFrame: The matching rows, with their original index labels.
    """
    # Apply the rank filter first so the comment-count comparison only runs
    # on rows that already passed it.
    by_members = df.loc[df['Post_Author_Rank'] == 'Community Member']
    return by_members.loc[by_members['Number_of_Comments'] > 1]

## get_category_code()

In [5]:
def get_category_code(df):
    """
    Returns the four-character category code of the first row.

    Parameters:
    df (DataFrame): Post data with a 'Post_Category' column.

    Returns:
    str: The first four characters of the first row's 'Post_Category',
         or None when the DataFrame is empty.
    """
    if df.empty:
        return None
    first_category = df['Post_Category'].iloc[0]
    return first_category[:4]

## get_category()

In [6]:
def get_category(df):
    """
    Returns the full category name of the first row.

    Parameters:
    df (DataFrame): Post data with a 'Post_Category' column.

    Returns:
    str: The first row's 'Post_Category' value, or None when the DataFrame is empty.
    """
    if df.empty:
        return None
    return df['Post_Category'].iloc[0]

## get_post_ids()

In [7]:
def get_post_ids(df):
    """
    Returns the contents of the 'Post_ID' column as a plain Python list.

    Parameters:
    df (DataFrame): Post data; must contain a 'Post_ID' column.

    Returns:
    list: The 'Post_ID' values, in row order.
    """
    id_column = df['Post_ID']
    return id_column.tolist()

## find_last_page()

In [8]:
def find_last_page(soup):
    """ Finds the highest page number referenced by the page's paging script.
    Args:
        soup (BeautifulSoup): Parsed HTML of the web page.
    Returns:
        int: The largest value of any `"page":<n>` entry inside the script tag
            whose text matches 'LITHIUM.Cache.CustomEvent.set'; 1 when no such
            script tag exists or it contains no page entries.
    """
    marker = re.compile(r'LITHIUM.Cache.CustomEvent.set')
    paging_script = soup.find('script', string=marker)
    if not paging_script:
        return 1

    # Every `"page":<digits>` occurrence in the script is a candidate page number.
    pages = [int(number) for number in re.findall(r'"page":(\d+)', paging_script.string)]
    if not pages:
        return 1
    return max(pages)

## format_text()

In [9]:
def format_text(text):
    """ Normalizes whitespace in a piece of scraped text.

    Leading/trailing whitespace is stripped; newlines, tabs, non-breaking
    spaces (U+00A0) and narrow no-break spaces (U+202F) become ordinary
    spaces; any run of whitespace is collapsed to a single space.
    Args:
        text (str): The text to be formatted.
    Returns:
        str: The cleaned and standardized text.
    """
    to_plain_space = str.maketrans({'\n': ' ', '\t': ' ', '\xa0': ' ', '\u202f': ' '})
    trimmed = text.strip()
    # Collapse whatever whitespace runs remain after the character mapping.
    return re.sub(r'\s+', ' ', trimmed.translate(to_plain_space))

## extract_datetime()

In [10]:
def extract_datetime(datetime_str):
    """
    Pulls a 'DD-DD-DDDD' date and an 'HH:MM AM/PM' time out of a string.

    Parameters:
    datetime_str (str): String containing the date and time.

    Returns:
    tuple: (date str, time str) for the first match, or None when the
           string contains no date followed by a 12-hour time.
    """
    found = re.search(r'(\d{2}-\d{2}-\d{4})\s+(\d{2}:\d{2}\s+[AP]M)', datetime_str)
    if found is None:
        return None
    return found.group(1), found.group(2)


## format_date()

In [11]:
def format_date(date_str):
    """
    Formats a date string to the standard 'DD-MM-YYYY' style used by the scraper.

    If the text already contains a 'DD-DD-DDDD' date, that substring is
    returned unchanged. Otherwise the text is handed to `dateparser`, with the
    current time as the base for relative expressions, and the parsed date is
    rendered as '%d-%m-%Y'.

    Parameters:
    date_str (str): The date string to be formatted.

    Returns:
    str: The formatted date string.

    Raises:
    ValueError: If the text contains no explicit date and dateparser cannot
                interpret it.
    """
    explicit = re.search(r'\b\d{2}-\d{2}-\d{4}\b', date_str)
    if explicit:
        new_date_str = explicit.group()
    else:
        parsed = dateparser.parse(date_str, settings={'RELATIVE_BASE': Datetime.now()})
        # dateparser returns None for text it cannot interpret; fail with a
        # message that names the offending input instead of an AttributeError.
        if parsed is None:
            raise ValueError(f"Could not parse a date from {date_str!r}")
        new_date_str = parsed.strftime("%d-%m-%Y")
    # Drop any left-to-right marks (U+200E) and surrounding whitespace.
    return new_date_str.replace('\u200e', '').strip()
    

## format_datetime()

In [12]:
def format_datetime(date_str, time_str):
    """
    Combines a date string and a 12-hour time string into a single timestamp.

    Parameters:
    date_str (str): The date string; normalized with format_date() first.
    time_str (str): The time string, expected in '%I:%M %p' form.

    Returns:
    datetime: Result of pd.to_datetime on '<date> <time>' parsed with the
              format '%d-%m-%Y %I:%M %p'.
    """
    day_part = format_date(date_str)
    combined = f"{day_part} {time_str}"
    return pd.to_datetime(combined, format="%d-%m-%Y %I:%M %p")
    

## check_comment_lengths()

In [13]:
def check_comment_lengths(*args):
    """
    Tells whether every given list has the same number of elements.

    Parameters:
    *args: Any number of sized collections to compare.

    Returns:
    bool: True if all lengths are equal (also when nothing is passed),
          False otherwise.
    """
    distinct_sizes = {len(candidate) for candidate in args}
    return len(distinct_sizes) <= 1

## save_progress()

In [14]:
def save_progress(comment_dict, last_completed):
    """ Saves the progress of the scraping operation to a status file and the scraped data to a pickle file.

    Both files are written under './data/BeyondBlue/' and are prefixed with the
    first four characters of the first entry in comment_dict['Post_ID'].
    Args:
        comment_dict (dict): The scraped comment data; comment_dict['Post_ID']
            must contain at least one ID string.
        last_completed (list): Progress markers, written to the status file as
            their string forms joined by '|'.
    Returns:
        None
    Raises:
        IndexError: If comment_dict['Post_ID'] is empty.
        PermissionError: If the files still cannot be written on the second attempt.
    """
    # The first four characters of a Post_ID identify the category (e.g. 'Suic').
    category_code = comment_dict["Post_ID"][0][:4]
    status_string = "|".join(str(num) for num in last_completed)
    status_path = f'./data/BeyondBlue/{category_code}_comments_progress_status'
    data_path = f'./data/BeyondBlue/{category_code}_comments_progress_data.pkl'

    # Try twice: a PermissionError on the first attempt is retried once after
    # a 3-second pause; a second failure is propagated to the caller.
    for attempt in range(2):
        try:
            with open(status_path, 'w') as f:
                f.write(status_string)
            with open(data_path, 'wb') as f:
                pickle.dump(comment_dict, f)
            return
        except PermissionError:
            if attempt == 1:
                raise
            time.sleep(3)

## load_progress()

In [15]:
def load_progress(progress_status_file, progress_data_file):
    """ Loads the last completed indices and the scraped comment data from files.
    Args:
        progress_status_file (str): Path to the text file holding the progress
            markers separated by '|'.
        progress_data_file (str): Path to the pickle file holding the scraped data.
    Returns:
        tuple: (last_completed, comments_dict), where last_completed is the
            list of marker strings obtained by splitting the status file on
            '|'. Returns (0, 0) when either file does not exist.
    Raises:
        PermissionError: If a file still cannot be read on the second attempt.
    """
    def _read_with_retry(path, mode, reader):
        # Read `path` via `reader`; on PermissionError wait 3 s and try once more.
        try:
            with open(path, mode) as f:
                return reader(f)
        except PermissionError:
            time.sleep(3)
            with open(path, mode) as f:
                return reader(f)

    if not os.path.exists(progress_status_file):
        return 0, 0
    last_completed = _read_with_retry(
        progress_status_file, 'r', lambda f: f.read().split("|")
    )

    if not os.path.exists(progress_data_file):
        return 0, 0
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at progress files written by this notebook.
    comments_dict = _read_with_retry(progress_data_file, 'rb', pickle.load)

    return last_completed, comments_dict


# Main Functions

## scrape_comments()

In [16]:
def scrape_comments(post_df):
    """ 
    Scrapes the comments of every eligible post in the DataFrame.

    Posts are first narrowed down with filter_posts(). For each remaining post
    the function opens the post in Chrome (via Selenium), determines the number
    of comment pages, and scrapes every page. Progress is saved after each
    comment page, so an interrupted run resumes from the saved progress files
    of the same category. When all posts are done the result is written to
    './data/BeyondBlue/<category code>_data_comments.csv'.

    Parameters:
    post_df (DataFrame): DataFrame containing post data.

    Returns:
    comment_df (DataFrame): DataFrame containing all scraped comment data
        (the full table, not a preview).

    Raises:
    ValueError: If the saved progress markers are inconsistent (completed page
        or post index beyond the recorded maximum).
    AssertionError: If the per-page metadata lists differ in length.
    """

    # Filter posts by community members with more than one comment, there should be no empty comments
    df = filter_posts(post_df)

    # Extract the category code (four characters)
    category_code = get_category_code(df) 

    # Check if progress file exists
    progress_status_file = f'./data/BeyondBlue/{category_code}_comments_progress_status'
    progress_data_file = f'./data/BeyondBlue/{category_code}_comments_progress_data.pkl'
    last_completed, comment_dict = load_progress(progress_status_file, progress_data_file)
    if last_completed and comment_dict:
        print(f"Resuming from last completed post: {last_completed[2]} out of {last_completed[3]} and comment page {last_completed[0]} out of {last_completed[1]} pages.")
        last_comment_page = int(last_completed[0])  # Last comment page that was fully scraped
        comment_pages = int(last_completed[1])  # Number of comment pages of that post
        last_post_index = int(last_completed[2])  # Index of the post being scraped when progress was saved
        max_post_index = int(last_completed[3])  # Index of the final post to scrape

        # Extract the running comment number from the last saved comment ID
        # ('<post id>_comment-<n>') so numbering continues within the same post
        comment_id_count = int(comment_dict['Comment_ID'][-1].split('_')[-1][8:])

        # Decide where to continue, based on the saved page and post markers
        if last_comment_page == comment_pages and last_post_index == max_post_index:
            print("No new comments to scrape, exiting.")
            return pd.DataFrame(comment_dict.copy())
        elif last_comment_page < comment_pages and last_post_index == max_post_index:
            print("Resuming from the last completed comment page.")
            comment_start_page = last_comment_page + 1 # Start from the next comment page
            post_start_index = last_post_index  # Continue from the last post index
        elif last_comment_page < comment_pages and last_post_index < max_post_index:
            print("Resuming from the last completed comment page and post index.")
            comment_start_page = last_comment_page + 1 # Start from the next comment page
            post_start_index = last_post_index  # Continue from the last post index
        elif last_comment_page == comment_pages and last_post_index < max_post_index:
            print("Resuming from the last completed post index.")
            comment_start_page = 1  # Start from the first comment page
            comment_id_count = 0 # Reset comment ID count
            post_start_index = last_post_index + 1  # Continue from the next post index
        else:
            # Fail before a browser is launched rather than hitting unbound
            # start variables further down.
            raise ValueError(
                f"Inconsistent progress status in {progress_status_file}: {last_completed}"
            )

    else:
        print("Starting from the beginning.")
        comment_start_page = 1 # Start from the first comment page by default
        comment_pages = 1 # End at the first comment page by default
        post_start_index = 0 # Start from the first post index by default
        max_post_index = len(df) - 1 # End at the last post index
        comment_id_count = 0 # Initialize comment ID count
        # Initialize the comment dictionary to store scraped data
        comment_dict = {
            'Post_ID':[],
            'Comment_ID':[],
            'Comment_Content':[],
            'Comment_Author':[],
            'Comment_Author_Rank':[],
            'Comment_Date':[],
            'Comment_Datetime':[],
            'Comment_Category':[],
        }    
    
    urls = get_post_urls(df) # Extract the post URLs
    post_ids = get_post_ids(df) # Extract the post IDs
    category = get_category(df) # Extract the category

    # CSS selector matching each individual comment container on a page
    comment_selector = "div.lia-component-message-list-detail-with-inline-editors > div.linear-message-list > div.lia-linear-display-message-view > div > div"

    # Initialize the Selenium WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # The try/finally guarantees the browser is closed even when a page load
    # times out or parsing fails part-way through.
    try:
        # Initialize progress bars
        post_pbar = tqdm(total=len(urls), desc="Posts Progress", dynamic_ncols=True, colour='blue')
        comment_pbar = tqdm(desc="Post Comments Progress", dynamic_ncols=True, colour='green')

        # Update the progress bar to start from the last completed post index
        if post_start_index > 0:
            post_pbar.update(post_start_index)

        # Loop through each post URL and scrape comments
        for post_current_index in range(post_start_index, max_post_index + 1):
            # Get the post URL and ID
            post_url = urls[post_current_index]
            post_id = post_ids[post_current_index]
            post_url = 'https://forums.beyondblue.org.au' + post_url

            # Update the progress bar with the current post index
            post_pbar.set_postfix({'Current Post_ID': post_id})

            # Navigate to the post URL and wait for the comments section to be
            # visible (waiting for document.readyState == "complete" is much slower)
            driver.get(post_url)
            WebDriverWait(driver, 10).until(
                expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, comment_selector))
            )

            # Parse the page source with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Always find the last page number of comments
            # This is to ensure that we scrape all comments, even if the post has been updated
            comment_pages = find_last_page(soup)

            # Update the progress bar with the total number of comment pages
            comment_pbar.reset(total=comment_pages)

            # When resuming mid-post, mark the already scraped pages as done
            if comment_start_page > 1:
                comment_pbar.update(comment_start_page - 1)

            # Loop through each comment page and scrape comments
            for comment_current_page in range(comment_start_page, comment_pages + 1):
                # Construct the comment page URL and navigate to it
                comment_page_url = f'{post_url}/page/{str(comment_current_page)}'
                driver.get(comment_page_url)
                WebDriverWait(driver, 10).until(
                    expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, comment_selector))
                )

                # Parse the page source with BeautifulSoup
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Find all comment elements
                comment_list = soup.select(comment_selector)

                # Scrape comment metadata
                # post id
                post_id_list = [post_id] * len(comment_list)

                # comment ids (numbered consecutively within the post)
                comment_id_list = [f"{post_id}_comment-{comment_id_count + i}" for i in range(1, len(comment_list)+1)]
                comment_id_count += len(comment_list)

                # comment contents
                comment_content_list = [format_text(comment.select_one('div.lia-message-body-content').get_text()) for comment in comment_list]

                # comment authors
                comment_author_list = [format_text(comment.select_one('span.UserName').get_text()) for comment in comment_list]

                # comment author ranks
                comment_author_rank_list = [format_text(comment.select_one('div.lia-message-author-rank').get_text()) for comment in comment_list]

                # comment dates and datetimes
                comment_date_list = []
                comment_datetime_list = []
                for comment in comment_list:
                    date_element = comment.select_one('span.local-date')
                    if date_element:
                        date_text = date_element.get_text().strip()
                        time_text = comment.select_one('span.local-time').get_text()
                    else:
                        # No explicit date element: fall back to the 'title'
                        # attribute of the friendly-date element
                        datetime_text = comment.select_one('span.local-friendly-date').get('title')
                        date_text, time_text = extract_datetime(datetime_text)
                    comment_date_list.append(format_date(date_text))
                    comment_datetime_list.append(format_datetime(date_text, time_text))

                # comment category
                comment_category_list = [category] * len(comment_content_list)

                # Check if all lists (including the datetime list) have the same length
                assert check_comment_lengths(
                    post_id_list, comment_id_list, 
                    comment_content_list, comment_author_list,
                    comment_author_rank_list, comment_date_list,
                    comment_datetime_list, comment_category_list, 
                    ) is True, "Comment metadata fields dont have same length."

                # Remember the total before appending so the rate-limit check
                # below can tell when a multiple of 50 has been crossed
                comments_before = len(comment_dict['Comment_ID'])

                # Append the scraped data to the dictionary
                comment_dict['Post_ID'].extend(post_id_list)
                comment_dict['Comment_ID'].extend(comment_id_list)
                comment_dict['Comment_Content'].extend(comment_content_list)
                comment_dict['Comment_Author'].extend(comment_author_list)
                comment_dict['Comment_Author_Rank'].extend(comment_author_rank_list)
                comment_dict['Comment_Date'].extend(comment_date_list)
                comment_dict['Comment_Datetime'].extend(comment_datetime_list)
                comment_dict['Comment_Category'].extend(comment_category_list)

                # save progress after each comment page
                confirmed_comment_dict = comment_dict.copy()
                save_progress(comment_dict=confirmed_comment_dict, last_completed=[comment_current_page, comment_pages, post_current_index, max_post_index])

                # Update the progress bar
                comment_pbar.update(1)

                # Sleep for 30 seconds every 50 comments to avoid rate limiting.
                # Comparing the multiples of 50 before and after the page
                # catches every crossing, not only totals that land exactly
                # on a multiple of 50.
                comments_after = len(comment_dict['Comment_ID'])
                if comments_after // 50 > comments_before // 50:
                    time.sleep(30)

            # Update the post progress bar
            post_pbar.update(1)

            # Reset per-post state: numbering restarts at 1 and, after a
            # mid-post resume, later posts must start from their first page
            comment_id_count = 0
            comment_start_page = 1

        # Update the post progress bar to indicate completion
        post_pbar.set_postfix({'Current Post_ID': "finished"})
    finally:
        # Close the WebDriver whether scraping finished or failed
        driver.quit()

    # Save the scraped data to a CSV file
    comment_df = pd.DataFrame(comment_dict.copy())
    comment_df.to_csv(f'./data/BeyondBlue/{category_code}_data_comments.csv', index=False)
    
    return comment_df  # Return the full DataFrame of scraped comments

# Testing

## Read Data and Sample

In [33]:
# Load the post table of the 'Suic' category; it is sampled below to test scrape_comments().
df = pd.read_csv('./data/BeyondBlue/Suic_data_post.csv')

In [181]:
sampled_df = df.sample(6)  # For testing purposes, sample 6 random posts
sampled_df

Unnamed: 0,Post_ID,Post_Title,Post_Content,Post_Author,Post_Author_Rank,Post_Date,Post_Category,Number_of_Comments,Post_URL
316,Suic-317,It’s all falling down,I’ve been suffering with all kinds of symptoms...,Leone39,Community Member,08-07-2023,Suicidal thoughts and self-harm,2,/t5/suicidal-thoughts-and-self-harm/it-s-all-f...
760,Suic-761,my mother,"my mother and i have quite a ""rough"" relations...",dopamine,Community Member,23-12-2021,Suicidal thoughts and self-harm,2,/t5/suicidal-thoughts-and-self-harm/my-mother/...
857,Suic-858,Hi,"Hi, this is my first time here. I don't really...",hanbanana,Community Member,03-09-2021,Suicidal thoughts and self-harm,7,/t5/suicidal-thoughts-and-self-harm/hi/td-p/9482
1159,Suic-1160,Newbie.,Im not sure im even posting this on the right ...,Di76,Community Member,27-12-2020,Suicidal thoughts and self-harm,2,/t5/suicidal-thoughts-and-self-harm/newbie/td-...
551,Suic-552,??? Whats wrong,"Feeling nothing, im stepping out my comfort zo...",Guest_4593,Community Member,11-08-2020,Suicidal thoughts and self-harm,158,/t5/suicidal-thoughts-and-self-harm/whats-wron...
564,Suic-565,Limit to support,First of all I’d like to thank beyondblue for ...,redtornado,Community Member,31-08-2022,Suicidal thoughts and self-harm,2,/t5/suicidal-thoughts-and-self-harm/limit-to-s...


## Scrape Test

In [198]:
# Test run on the sampled posts; continues from saved progress files of this category if they exist.
scrape_comments(sampled_df)

Resuming from last completed post: 2 out of 5 and comment page 1 out of 1 pages.
Resuming from the last completed post index.


Posts Progress:   0%|          | 0/6 [00:00<?, ?it/s]

Post Comments Progress: 0it [00:00, ?it/s]

['Suic-1160_comment-1', 'Suic-1160_comment-2']
['Suic-552_comment-1', 'Suic-552_comment-2', 'Suic-552_comment-3', 'Suic-552_comment-4', 'Suic-552_comment-5', 'Suic-552_comment-6', 'Suic-552_comment-7', 'Suic-552_comment-8', 'Suic-552_comment-9']
['Suic-552_comment-10', 'Suic-552_comment-11', 'Suic-552_comment-12', 'Suic-552_comment-13', 'Suic-552_comment-14', 'Suic-552_comment-15', 'Suic-552_comment-16', 'Suic-552_comment-17', 'Suic-552_comment-18', 'Suic-552_comment-19']
['Suic-552_comment-20', 'Suic-552_comment-21', 'Suic-552_comment-22', 'Suic-552_comment-23', 'Suic-552_comment-24', 'Suic-552_comment-25', 'Suic-552_comment-26', 'Suic-552_comment-27', 'Suic-552_comment-28', 'Suic-552_comment-29']
['Suic-552_comment-30', 'Suic-552_comment-31', 'Suic-552_comment-32', 'Suic-552_comment-33', 'Suic-552_comment-34', 'Suic-552_comment-35', 'Suic-552_comment-36', 'Suic-552_comment-37', 'Suic-552_comment-38', 'Suic-552_comment-39']
['Suic-552_comment-40', 'Suic-552_comment-41', 'Suic-552_comm

Unnamed: 0,Post_ID,Comment_ID,Comment_Content,Comment_Author,Comment_Author_Rank,Comment_Date,Comment_Datetime,Comment_Category
0,Suic-317,Suic-317_comment-1,Hi Leone39 When I read your post I wished it p...,therising,Valued Contributor,08-07-2023,2023-07-08 12:08:00,Suicidal thoughts and self-harm
1,Suic-317,Suic-317_comment-2,welcome back? (sounds odd or off in the circum...,smallwolf,Community Champion,08-07-2023,2023-07-08 21:58:00,Suicidal thoughts and self-harm
2,Suic-761,Suic-761_comment-1,"Hi Dopamine, Thanks for reaching out tonight, ...",Sophie_M,Moderator,23-12-2021,2021-12-23 22:11:00,Suicidal thoughts and self-harm
3,Suic-761,Suic-761_comment-2,Hi dopamine I feel for you so much as you face...,therising,Valued Contributor,24-12-2021,2021-12-24 07:33:00,Suicidal thoughts and self-harm
4,Suic-858,Suic-858_comment-1,"Hi, welcome You've done well for your introduc...",white knight,Community Champion,03-09-2021,2021-09-03 13:08:00,Suicidal thoughts and self-harm
...,...,...,...,...,...,...,...,...
168,Suic-552,Suic-552_comment-156,Hi Guest_4593 We're sorry to hear how much you...,Sophie_M,Moderator,23-08-2022,2022-08-23 00:35:00,Suicidal thoughts and self-harm
169,Suic-552,Suic-552_comment-157,I hurt. I hurt myself. I feel nothing. I feel ...,Guest_4593,Community Member,15-09-2022,2022-09-15 23:25:00,Suicidal thoughts and self-harm
170,Suic-552,Suic-552_comment-158,"Hi there, Thanks for sharing this update with ...",Sophie_M,Moderator,16-09-2022,2022-09-16 13:58:00,Suicidal thoughts and self-harm
171,Suic-565,Suic-565_comment-1,Hi redtornado Thank you for confirming your sa...,Sophie_M,Moderator,31-08-2022,2022-08-31 20:01:00,Suicidal thoughts and self-harm


# Scraping

## Scrape 'Suicidal Thoughts and Self-Harm' Comments

In [199]:
# Load the full post table of the 'Suic' category.
suic_df = pd.read_csv('./data/BeyondBlue/Suic_data_post.csv')

In [201]:
# Show the first row to check the expected columns are present.
suic_df.head(1)

Unnamed: 0,Post_ID,Post_Title,Post_Content,Post_Author,Post_Author_Rank,Post_Date,Post_Category,Number_of_Comments,Post_URL
0,Suic-1,I give up,I don’t want to be here anymore. I’m tired of ...,Done_,Community Member,02-05-2025,Suicidal thoughts and self-harm,18,/t5/suicidal-thoughts-and-self-harm/i-give-up/...


In [210]:
# Scrape the comments of all eligible posts; the result is also written to ./data/BeyondBlue/Suic_data_comments.csv.
scrape_comments(suic_df)

Starting from the beginning.


Posts Progress:   0%|          | 0/1112 [00:00<?, ?it/s]

Post Comments Progress: 0it [00:00, ?it/s]

Unnamed: 0,Post_ID,Comment_ID,Comment_Content,Comment_Author,Comment_Author_Rank,Comment_Date,Comment_Datetime,Comment_Category
0,Suic-1,Suic-1_comment-1,Hello and welcome. You matter. That you have w...,smallwolf,Community Champion,04-05-2025,2025-05-04 15:55:00,Suicidal thoughts and self-harm
1,Suic-1,Suic-1_comment-2,The warmest of welcomes to you at what sounds ...,therising,Valued Contributor,05-05-2025,2025-05-05 04:44:00,Suicidal thoughts and self-harm
2,Suic-1,Suic-1_comment-3,That sounds really tough and I can’t imagine w...,Ben5,Community Member,05-05-2025,2025-05-05 23:26:00,Suicidal thoughts and self-harm
3,Suic-1,Suic-1_comment-4,Thank you i appreciate your response. Professi...,Done_,Community Member,06-05-2025,2025-05-06 20:35:00,Suicidal thoughts and self-harm
4,Suic-1,Suic-1_comment-5,I didn’t expect anyone to respond or even ackn...,Done_,Community Member,06-05-2025,2025-05-06 20:53:00,Suicidal thoughts and self-harm
...,...,...,...,...,...,...,...,...
12330,Suic-1316,Suic-1316_comment-2,"Shay, Hi again. As the song in The Sound of Mu...",The_Real_David_Charles,Community Member,21-06-2013,2013-06-21 16:59:00,Suicidal thoughts and self-harm
12331,Suic-1316,Suic-1316_comment-3,"dear Shay, good points by David Charles and Da...",geoff,Champion Alumni,22-06-2013,2013-06-22 06:11:00,Suicidal thoughts and self-harm
12332,Suic-1316,Suic-1316_comment-4,"No-one goes unnoticed David, which is why I tr...",Damien,Blue Voices Member,22-06-2013,2013-06-22 10:33:00,Suicidal thoughts and self-harm
12333,Suic-1316,Suic-1316_comment-5,"Hey Shay, Im 25 and growing up I spent ALL my ...",Couch_Dracula,Community Member,01-07-2013,2013-07-01 17:55:00,Suicidal thoughts and self-harm


## Scrape 'PTSD and trauma' comments

In [17]:
# Load the full post table of the 'PTSD' category and show its first row.
ptsd_post_df = pd.read_csv('./data/BeyondBlue/PTSD_data_post.csv')
ptsd_post_df.head(1)

Unnamed: 0,Post_ID,Post_Title,Post_Content,Post_Author,Post_Author_Rank,Post_Date,Post_Category,Number_of_Comments,Post_URL
0,PTSD-1,cPTSD and anxiety,"Hi,I'm struggling with cPTSD and am having pan...",Chriss,Community Member,08-07-2025,PTSD and trauma,4,/t5/ptsd-and-trauma/cptsd-and-anxiety/td-p/612042


In [18]:
# Scrape the comments of all eligible posts; the result is also written to ./data/BeyondBlue/PTSD_data_comments.csv.
scrape_comments(ptsd_post_df)

Starting from the beginning.


Posts Progress:   0%|          | 0/1650 [00:00<?, ?it/s]

Post Comments Progress: 0it [00:00, ?it/s]

Unnamed: 0,Post_ID,Comment_ID,Comment_Content,Comment_Author,Comment_Author_Rank,Comment_Date,Comment_Datetime,Comment_Category
0,PTSD-1,PTSD-1_comment-1,"Dear Chriss~Welcome here to the Forum, a place...",Croix,Community Champion,08-07-2025,2025-07-08 22:44:00,PTSD and trauma
1,PTSD-1,PTSD-1_comment-2,Hello Chriss I'm so sorry that you feel this w...,TrueSeeker,Community Member,09-07-2025,2025-07-09 10:29:00,PTSD and trauma
2,PTSD-1,PTSD-1_comment-3,"Hi Croix, Thank you for your words of support....",Chriss,Community Member,09-07-2025,2025-07-09 15:30:00,PTSD and trauma
3,PTSD-1,PTSD-1_comment-4,"Dear Chriss~I'm glad you are getting support, ...",Croix,Community Champion,09-07-2025,2025-07-09 22:55:00,PTSD and trauma
4,PTSD-1,PTSD-1_comment-5,there's a lot of trust to build there my frien...,Algernon,Community Member,16-07-2025,2025-07-16 01:51:00,PTSD and trauma
...,...,...,...,...,...,...,...,...
14775,PTSD-2059,PTSD-2059_comment-10,"Hi Bingee Girl, I saw your post and I know how...",ilovetoread73_,Community Member,15-05-2015,2015-05-15 00:59:00,PTSD and trauma
14776,PTSD-2059,PTSD-2059_comment-11,"Hello out therr Hello BKYTH, Neil 1 snd i love...",Bingee_Girl,Community Member,21-06-2015,2015-06-21 22:42:00,PTSD and trauma
14777,PTSD-2059,PTSD-2059_comment-12,I think the real tragedy of past traumas is wh...,BKYTH,Community Member,23-06-2015,2015-06-23 22:13:00,PTSD and trauma
14778,PTSD-2060,PTSD-2060_comment-1,"Hi BelleAnthony2014, This is a tough situation...",Pixie15,Community Member,01-02-2015,2015-02-01 17:53:00,PTSD and trauma


## Scrape 'Depression' comments

In [19]:
# Load the full post table of the 'Depr' category and show its first row.
depr_post_df = pd.read_csv('./data/BeyondBlue/Depr_data_post.csv')
depr_post_df.head(1)

Unnamed: 0,Post_ID,Post_Title,Post_Content,Post_Author,Post_Author_Rank,Post_Date,Post_Category,Number_of_Comments,Post_URL
0,Depr-1,Following the breadcrumbs to improve mental he...,"Hi everyone, The last few months have been som...",indigo22,Community Champion,01-08-2024,Depression,154,/t5/depression/following-the-breadcrumbs-to-im...


In [20]:
# Scrape the comments of all eligible posts; the result is also written to ./data/BeyondBlue/Depr_data_comments.csv.
scrape_comments(depr_post_df)

Starting from the beginning.


Posts Progress:   0%|          | 0/5310 [00:00<?, ?it/s]

Post Comments Progress: 0it [00:00, ?it/s]

Unnamed: 0,Post_ID,Comment_ID,Comment_Content,Comment_Author,Comment_Author_Rank,Comment_Date,Comment_Datetime,Comment_Category
0,Depr-2,Depr-2_comment-1,I truly feel for you max. I feel like I’m in a...,Unholy_Idiot,Community Member,10-07-2025,2025-07-10 18:38:00,Depression
1,Depr-2,Depr-2_comment-2,"Hi 44Max44 and wave to Unholy_Idiot, I am wond...",Eagle Ray,Valued Contributor,10-07-2025,2025-07-10 23:43:00,Depression
2,Depr-3,Depr-3_comment-1,Hello I can understand how hard it can be to f...,TrueSeeker,Community Member,08-07-2025,2025-07-08 10:08:00,Depression
3,Depr-3,Depr-3_comment-2,"Hi Oshinxx1 and wave to TrueSeeker, I can rela...",Eagle Ray,Valued Contributor,09-07-2025,2025-07-09 23:36:00,Depression
4,Depr-5,Depr-5_comment-1,HiI'm down & sad & couldn't understand my ange...,JacintaMarie,Community Member,18-02-2025,2025-02-18 22:31:00,Depression
...,...,...,...,...,...,...,...,...
35663,Depr-6616,Depr-6616_comment-1,"""No One Belongs Here More Than You"". This is a...",Vegetarian Marshmallow,Community Member,02-05-2013,2013-05-02 12:27:00,Depression
35664,Depr-6616,Depr-6616_comment-2,Deep intake of breath! I understand what you a...,Jemima,Community Member,02-05-2013,2013-05-02 13:13:00,Depression
35665,Depr-6616,Depr-6616_comment-3,"hello, you must have heard about Beyond Blue w...",geoff,Champion Alumni,03-05-2013,2013-05-03 05:33:00,Depression
35666,Depr-6617,Depr-6617_comment-1,"Dear Chloe, We're really sorry to hear you've ...",Ruth_M,Community Member,28-04-2013,2013-04-28 19:22:00,Depression
