## Groupings

In [2]:
# Grouping levels
# Top-level forum sections, in the order used as keys of `other_grouping` below.
level_1_grouping = [
    "Introduce yourself",
    "Mental health conditions",
    "Caring for myself and others",
    "People like me"
]

# Scrape targets: group key -> forum key -> metadata.
#   "id"   short prefix used in the per-forum file names (e.g. Anxi_data_post.pkl)
#   "name" human-readable board name
#   "url"  base URL of the board listing
level_2_grouping = {
    "conditions": {
        "anxiety": {
            "id": "Anxi",
            "name": "Anxiety",
            "url": "https://forums.beyondblue.org.au/t5/anxiety/bd-p/c1-sc2-b1"
        },
        "depression": {
            "id": "Depr",
            "name": "Depression",
            "url": "https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2"
        },
        "ptsd": {
            "id": "PTSD",
            "name": "PTSD and trauma",
            "url": "https://forums.beyondblue.org.au/t5/ptsd-and-trauma/bd-p/c1-sc2-b3"
        },
        "suicidal": {
            "id": "Suic",
            "name": "Suicidal thoughts and self-harm",
            "url": "https://forums.beyondblue.org.au/t5/suicidal-thoughts-and-self-harm/bd-p/c1-sc2-b4"
        }
    },
}

# Board names listed under each top-level section.
# Every value is a list so ordering is stable and all sections share one type
# (the first entry used to be a set literal, which is unordered).
other_grouping = {
    "Introduce yourself": [
        "Welcome and orientation",
        "BB Social Zone",
        "Forums feedback and updates"
    ],
    "Mental health conditions": [
        "Anxiety",
        "Depression",
        "PTSD and trauma",
        "Suicidal thoughts and self-harm"
    ],
    "Caring for myself and others": [
        "Staying well",
        "Treatments, health professionals and therapies",
        "Relationship and family issues",
        "Supporting family and friends",
        "Long-term support over the journey"
    ],
    "People like me": [
        "Young people",
        "Sexuality and gender identity",
        "Multicultural experiences",
        "Grief and loss"
    ]
}

## Libraries

In [None]:
import pandas as pd
import os
import pickle
import re
import copy
import time
import dateparser

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from pathlib import Path
from datetime import date as Date
from datetime import datetime as Datetime


In [4]:
# Upgrade ipywidgets if the tqdm progress bar does not render (uncomment to run):
# %pip install --upgrade ipywidgets

## Helper Functions

### format_text()

In [None]:
def format_text(text):
    """Flatten a piece of scraped text onto a single line.

    Leading and trailing whitespace is stripped first; afterwards every
    newline, tab and non-breaking space is swapped for one ordinary space
    (runs of such characters are not collapsed).

    Args:
        text (str): The text to be formatted.
    Returns:
        str: The cleaned text.
    """
    cleaned = text.strip()
    for blank in ('\n', '\t', '\xa0'):
        cleaned = cleaned.replace(blank, ' ')
    return cleaned

### save_progress()

In [None]:
def save_progress(forum, last_completed, post_dict):
    """ Saves the progress of the scraping operation to a file and the scraped data to a pickle file.

    Two files are written under ./data/BeyondBlue/ (the directory is created
    when it does not exist yet):
      * <id>_progress_file - the entries of `last_completed` joined with "|"
      * <id>_data_post.pkl - `post_dict` serialized with pickle

    Args:
        forum (dict): A dictionary containing forum metadata such as 'id'.
        last_completed (list): Values describing the progress so far; each is converted with str().
        post_dict (dict): A dictionary containing the scraped post data.
    Returns:
        None
    """
    out_dir = './data/BeyondBlue'
    # A fresh checkout has no data directory; without this open() raises FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)

    status_string = "|".join(str(num) for num in last_completed)
    with open(f'./data/BeyondBlue/{forum["id"]}_progress_file', 'w') as f:
        f.write(status_string)
    with open(f'./data/BeyondBlue/{forum["id"]}_data_post.pkl', 'wb') as f:
        pickle.dump(post_dict, f)

### load_progress()

In [None]:
def load_progress(progress_file, data_post_file):
    """ Reads back the progress marker and the pickled post data.

    Args:
        progress_file (str): Path of the text file holding "|"-separated progress values.
        data_post_file (str): Path of the pickle file holding the scraped post data.
    Returns:
        tuple: (last_completed, post_dict) where last_completed is the list of
        strings obtained by splitting the progress file on "|". If either file
        is missing the sentinel pair (0, 0) is returned instead.
    """
    if not os.path.exists(progress_file):
        return 0, 0
    with open(progress_file, 'r') as handle:
        last_completed = str(handle.read()).split("|")

    if not os.path.exists(data_post_file):
        return 0, 0
    # NOTE: pickle.load executes arbitrary code; only load files this notebook wrote itself.
    with open(data_post_file, 'rb') as handle:
        post_dict = pickle.load(handle)

    return last_completed, post_dict


### find_last_page()

In [None]:
def find_last_page(soup):
    """ Determines the highest page number mentioned in a page's inline script.

    The first <script> whose text matches 'LITHIUM.Cache.CustomEvent.set' is
    searched for every `"page":<digits>` occurrence and the largest value wins.

    Args:
        soup (BeautifulSoup): Parsed HTML of the web page.
    Returns:
        int: The largest page number found, or 1 when there is no matching
        script tag or the script contains no page numbers.
    """
    script_tag = soup.find('script', string=re.compile(r'LITHIUM.Cache.CustomEvent.set'))
    if not script_tag:
        return 1

    found = [int(number) for number in re.findall(r'"page":(\d+)', script_tag.string)]
    if not found:
        return 1
    return max(found)

### check_post_lengths()

In [None]:
def check_post_lengths(id, title, content, author, author_rank, date, category, comments, urls):
    """ Verifies that all per-post metadata lists contain the same number of entries.

    Args:
        id (list): List of post IDs.
        title (list): List of post titles.
        content (list): List of post content.
        author (list): List of post authors.
        author_rank (list): List of author ranks.
        date (list): List of post dates.
        category (list): List of post categories.
        comments (list): List of comment counts of the posts.
        urls (list): List of URLs for the posts.
    Returns:
        bool: True if all lists have the same length; otherwise the individual
        lengths are printed and False is returned.
    """
    fields = (id, title, content, author, author_rank, date, category, comments, urls)
    sizes = [len(field) for field in fields]

    # A single distinct size means every list agrees.
    if len(set(sizes)) == 1:
        return True

    n_id, n_title, n_content, n_author, n_rank, n_date, n_category, n_comments, n_urls = sizes
    print(f"Lengths - ID: {n_id}, Title: {n_title}, Content: {n_content}, Author: {n_author}, Author Rank: {n_rank}, Date: {n_date}, Category: {n_category}, Comments: {n_comments}, URLs: {n_urls}")
    return False

## Main Functions

### check_num_posts()

In [None]:
def check_num_posts(scrape = False, show_changes_from_last = False):
    """ Tracks the number of posts per Beyond Blue forum category in a CSV history file.

    The history lives in ./data/BeyondBlue/post_count_history.csv with one row
    per recording: 'Date', 'Time', then one column per category.

    Args:
        scrape (bool): If True, open the forum home page in Chrome, read the current
            counts, append them as a new row and rewrite the CSV file.
        show_changes_from_last (bool): If True, return only the differences between the
            last two recorded rows, restricted to categories whose count changed.
    Returns:
        pd.DataFrame | None: The full history, or the one-row table of changes when
        `show_changes_from_last` is True. None is returned when the page fails to
        load, when no history exists (and nothing was scraped), or when changes are
        requested but fewer than two rows have been recorded.
    """
    file = Path("./data/BeyondBlue/post_count_history.csv")
    # df stays None until a history is available so later steps can detect "no data"
    # instead of failing on an unbound variable.
    df = None
    if file.exists():
        df = pd.read_csv(file)
    else:
        print("file doesn't exist")

    if scrape:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service)
        try:
            driver.get("https://forums.beyondblue.org.au/")
            try:
                # Wait up to 10 seconds for the element to appear
                WebDriverWait(driver, 10).until(
                    expected_conditions.visibility_of_element_located((By.CLASS_NAME, "lia-message-count"))
                )
            except Exception:
                print("Page failed to load correctly. Check again later.")
                return None
            soup = BeautifulSoup(driver.page_source, 'html.parser')
        finally:
            # Always release the browser, including on the early return above.
            driver.quit()

        # One timestamp for both columns so date and time cannot straddle midnight.
        now = Datetime.now()
        counts = [now.strftime("%d-%m-%Y"), now.strftime("%H:%M:%S")]
        counts.extend(int(tag.get_text().replace("\n","").strip()) for tag in soup.select('span.lia-message-count'))

        if df is not None:
            df.loc[len(df)] = counts
        else:
            # First recording: the column names come from the category links on the page.
            categories = ['Date', 'Time']
            categories.extend(tag.get_text() for tag in soup.select('td > h2 > a.lia-link-navigation'))
            df = pd.DataFrame([counts], columns=categories)

        file.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(file, index = False)

    if df is None:
        # No history on disk and nothing scraped: there is nothing to report.
        return None

    if show_changes_from_last:
        if len(df) < 2:
            print("At least two recorded rows are needed to show changes")
            return None
        headers = df.columns.to_list()[2:]
        last_row = df.iloc[-1][2:]
        scd_last_row = df.iloc[-2][2:]
        diff = [x - y for x, y in zip(last_row, scd_last_row)]
        df = pd.DataFrame([diff], columns=headers)
        # Keep only the categories whose count actually moved.
        df = df.loc[:, df.iloc[0] != 0]
        if df.shape[1] == 0:
            print("There are no changes to the data")

    return df

### pickle_to_dataframe()

In [None]:
def pickle_to_dataframe(forum, save_to_csv=False):
    """
    Load a forum's pickled post data and return it as a pandas DataFrame.

    Args:
        forum (dict): forum information containing 'id'; the id selects
            ./data/BeyondBlue/<id>_data_post.pkl.
        save_to_csv (bool): If True, also write the DataFrame (without index)
            to ./data/BeyondBlue/<id>_data_post_temp.csv.

    Returns:
        pd.DataFrame: DataFrame containing the data from the pickle file.
    """
    # NOTE: pickle.load executes arbitrary code; only load files this notebook wrote itself.
    with open(f'./data/BeyondBlue/{forum["id"]}_data_post.pkl', 'rb') as handle:
        payload = pickle.load(handle)

    if isinstance(payload, dict):
        frame = pd.DataFrame(payload)
    else:
        frame = pd.DataFrame.from_records(payload)

    if save_to_csv:
        frame.to_csv(f'./data/BeyondBlue/{forum["id"]}_data_post_temp.csv', index=False)
    return frame

### stitch_data()

In [None]:
def stitch_data(group, fora, save_to_csv=False):
    """
    Stitch together multiple pickle files into a single DataFrame.

    For every forum key the pickle ./data/BeyondBlue/<id>_data_post.pkl is loaded,
    where <id> is looked up in `level_2_grouping[group][forum]["id"]`. Missing
    files are reported and skipped. The frames are concatenated once, in the
    order given, with a fresh 0..n-1 index.

    Args:
        group (str): The group name for which the data is being stitched (a key of `level_2_grouping`).
        fora (list): List of forum identifiers to be stitched (keys of `level_2_grouping[group]`).
        save_to_csv (bool): If True, save the stitched DataFrame to
            ./data/BeyondBlue/<group>_data_post_stitched.csv.
    Returns:
        pd.DataFrame: A DataFrame containing the stitched data from all specified fora
        (empty when none of the pickle files exist).
    Raises:
        KeyError: If `group` or one of `fora` is not present in `level_2_grouping`.
    """
    frames = []
    for forum in fora:
        # Load the pickle file
        pickle_file = f'./data/BeyondBlue/{level_2_grouping[group][forum]["id"]}_data_post.pkl'
        if not os.path.exists(pickle_file):
            print(f"Pickle file {pickle_file} does not exist.")
            continue

        # NOTE: pickle.load executes arbitrary code; only load files this notebook wrote itself.
        with open(pickle_file, 'rb') as f:
            data = pickle.load(f)

        # Convert to DataFrame
        frames.append(pd.DataFrame(data) if isinstance(data, dict) else pd.DataFrame.from_records(data))

    # Concatenate once at the end instead of re-copying the accumulated frame per forum.
    stitch_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save to CSV if required
    if save_to_csv:
        csv_file = f'./data/BeyondBlue/{group}_data_post_stitched.csv'
        stitch_df.to_csv(csv_file, index=False)

    # Return the DataFrame
    return stitch_df

### scrape_posts()

In [None]:
def scrape_posts(forum, restart=False):
    """ Scrapes posts from a specific forum category on the Beyond Blue website and save to csv file.

    The board listing is walked page by page (<url>/page/<n>). After every page
    the accumulated data and the pair [page, max_page] are persisted through
    save_progress(), so an interrupted run can be resumed by calling the function
    again with restart=False. When all pages are done the data is written to
    ./data/BeyondBlue/<id>_data_post.csv.

    Args:
        forum (dict): A dictionary containing forum metadata such as 'id', 'name', and 'url'.
        restart (bool): If True, the scraping will restart from the beginning, ignoring any existing progress.
    Returns:
        None
    Raises:
        AssertionError: If the metadata lists extracted from one page differ in length.
    """
    url_base = forum['url']
    forum_id = forum['id']
    forum_name = forum['name']
    print(f"Scraping posts from category: {forum_name}")

    # Check if progress file exists
    progress_file = f'./data/BeyondBlue/{forum_id}_progress_file'
    data_post_file = f'./data/BeyondBlue/{forum_id}_data_post.pkl'
    print("Checking if progress file and data_post file exists and not restarting the scrape")
    if os.path.exists(progress_file) and os.path.exists(data_post_file) and not restart:
        print("progress file and data_post file exists")
        print("getting data from progress file and data_post file")
        last_completed, post_dict = load_progress(progress_file, data_post_file)
        if last_completed == 0 and post_dict == 0:
            # (0, 0) is load_progress()'s failure sentinel; indexing it below would crash.
            print("Error in loading the progress. Please check the file.")
            return None
        start_page = int(last_completed[0])
        max_page = int(last_completed[1])
        # Continue numbering after the last stored ID ("<prefix>-<n>"); 0 if nothing stored yet.
        stored_ids = post_dict['Post_ID']
        post_id_cnt = int(stored_ids[-1].rsplit('-', 1)[-1]) if stored_ids else 0

    # If no existing data, create new
    else:
        print("progress.txt and data_post.pkl does not exist")
        print("setting up new variables")
        start_page = 0
        post_id_cnt = 0
        max_page = None  # Set to None to find the last page later
        # Initialize post_dict with empty lists for each key
        post_dict = {
            'Post_ID':[],
            'Post_Title':[],
            'Post_Content':[],
            'Post_Author':[],
            'Post_Author_Rank':[],
            'Post_Date':[],
            'Post_Category':[],
            'Number_of_Comments':[],
            'Post_URL':[],
        }

    # Setup Chrome WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    pbar = None

    try:
        if not max_page:
            print("max_page is not set, finding the last page number from the website.")
            driver.get(url_base)
            try:
                # Wait up to 10 seconds for the element to appear
                WebDriverWait(driver, 10).until(
                    expected_conditions.visibility_of_element_located((By.CLASS_NAME, "lia-paging-page-last"))
                )
            except Exception:
                print(f"{url_base} failed to load correctly.")
            # Parse only after the wait so the page source is as complete as possible.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            max_page = find_last_page(soup)
            print(f"max_page is set to {max_page}")

        pbar = tqdm(total=max_page, desc="Scraping in progress...", dynamic_ncols=True)
        if start_page:
            # Resuming: show the pages that are already done.
            pbar.update(start_page)

        for page in range(start_page + 1, max_page + 1):
            # go to level 2 grouping page
            url = url_base + '/page/' + str(page)
            driver.get(url)
            try:
                # Wait up to 10 seconds for the element to appear
                WebDriverWait(driver, 10).until(
                    expected_conditions.visibility_of_element_located((By.CLASS_NAME, "lia-paging-page-last"))
                )
            except Exception:
                # Abort instead of parsing a half-loaded page; progress up to the
                # previous page is already saved and can be resumed.
                print(f"Page {page} failed to load correctly.")
                print("Failed to fetch the webpage.")
                return None

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # post title
            post_title_list = [tag.find_next_sibling('a').get_text() for tag in soup.select('div.all-discussions > section > article > div > h3 > a.UserAvatar.lia-link-navigation')]

            # post id
            post_id_list = [post_id_cnt + i for i in range(1,len(post_title_list) + 1)]
            post_id_cnt = post_id_cnt + len(post_id_list)

            # post content
            post_content_list = [format_text(content.get_text()) for content in soup.select('div.all-discussions > section > article > div > p.body-text')]

            # post author
            post_author_list = [tag.get_text() for tag in soup.select('div.all-discussions > section > article > aside > div.custom-tile-author-info > div.custom-author-name > strong > a > span')]

            # author rank
            post_author_rank_list = [tag.get_text() for tag in soup.select('div.all-discussions > section > article > aside > div.custom-tile-author-info > div.custom-tile-author-rank > em')]

            # post date
            post_date_list = [tag.get_text() for tag in soup.select('div.all-discussions > section > article > aside > div.custom-tile-category-content > div.custom-tile-date > time')]
            post_date_cleaned_list = []
            reference_date = Datetime.now()
            for text in post_date_list:
                match = re.search(r'\b\d{2}-\d{2}-\d{4}\b', text)
                if match:
                    post_date_cleaned_list.append(match.group())
                    continue
                # No dd-mm-yyyy date in the text: let dateparser interpret it relative to now.
                parsed = dateparser.parse(text, settings={'RELATIVE_BASE': reference_date})
                if parsed is None:
                    # Keep the raw text rather than crashing so the row is not lost.
                    print(f"Could not parse post date {text!r} on page {page}; keeping raw text.")
                    post_date_cleaned_list.append(format_text(text))
                else:
                    post_date_cleaned_list.append(parsed.strftime("%d-%m-%Y"))

            # post category
            post_category_list = [tag.get_text() for tag in soup.select('div.all-discussions > section > article > aside > div.custom-tile-category-content > div.custom-tile-category-board > div.custom-tile-category > strong > a')]

            # update id to reflect the category
            post_id_list = [category[:4]+"-"+str(num_id) for category, num_id in zip(post_category_list, post_id_list)]

            # number of comments
            post_num_comments_list = [int(tag.get_text()) for tag in soup.select('div.all-discussions > section > article > div > h3 > ul.custom-tile-statistics > li.custom-tile-replies > b')]

            # urls
            url_list = [tag.get('href', '') for tag in soup.select('div.all-discussions > section > article > div > h3 > a')]
            post_url_list = [url for url in url_list if '/viewprofilepage/' not in url]

            # Explicit raise (not `assert`) so the check also runs under `python -O`.
            if not check_post_lengths(
                post_id_list, post_title_list,
                post_content_list, post_author_list,
                post_author_rank_list, post_date_cleaned_list,
                post_category_list, post_num_comments_list,
                post_url_list
            ):
                raise AssertionError("Post metadata fields dont have same length.")

            post_dict['Post_ID'].extend(post_id_list)
            post_dict['Post_Title'].extend(post_title_list)
            post_dict['Post_Content'].extend(post_content_list)
            post_dict['Post_Author'].extend(post_author_list)
            post_dict['Post_Author_Rank'].extend(post_author_rank_list)
            post_dict['Post_Date'].extend(post_date_cleaned_list)
            post_dict['Post_Category'].extend(post_category_list)
            post_dict['Number_of_Comments'].extend(post_num_comments_list)
            post_dict['Post_URL'].extend(post_url_list)

            # saving progress (pickle serializes immediately, so no defensive copy is needed)
            save_progress(forum = forum, last_completed=[page, max_page], post_dict=post_dict)
            pbar.update(1)
            pbar.set_postfix({"Processing for": forum_name,"Page No": page, "Total Posts": len(post_dict['Post_ID'])})

            # Pause for 30 seconds after every 30th page.
            if page % 30 == 0:
                time.sleep(30)
    finally:
        # Release the progress bar and the browser on every exit path.
        if pbar is not None:
            pbar.close()
        driver.quit()

    # Storing at the Overall level
    data_post = pd.DataFrame(post_dict)
    data_post.to_csv(f'./data/BeyondBlue/{forum_id}_data_post.csv', index=False)

    print("scraping complete")

In [37]:
# Scrape the "PTSD and trauma" board from the first page; restart=True ignores any saved progress files.
scrape_posts(level_2_grouping['conditions']['ptsd'], restart=True)

Scraping posts from category: PTSD and trauma
Checking if progress file and data_post file exists and not restarting the scrape
progress.txt and data_post.pkl does not exist
setting up new variables
max_page is not set, finding the last page number from the website.
max_page is set to 206


Scraping in progress...:   0%|          | 0/206 [00:00<?, ?it/s]

scraping complete


In [40]:
# Load the saved Anxiety pickle as a DataFrame and also write it to Anxi_data_post_temp.csv.
pickle_to_dataframe(level_2_grouping['conditions']['anxiety'], save_to_csv=True)

Loading data from ./data/BeyondBlue/Anxi_data_post.pkl


Unnamed: 0,Post_ID,Post_Title,Post_Content,Post_Author,Post_Author_Rank,Post_Date,Post_Category,Number_of_Comments,Post_URL
0,Anxi-1,"Anxiety, Injustice, and Fear: Workplace Exploi...","Hi everyone,I’m going through one of the most ...",Joker_J,Community Member,09-07-2025,Anxiety,0,/t5/anxiety/anxiety-injustice-and-fear-workpla...
1,Anxi-2,Just broke free from rude friend and I still f...,"Recently, I have left my old friend who would ...",waffle_puppy,Community Member,25-06-2025,Anxiety,3,/t5/anxiety/just-broke-free-from-rude-friend-a...
2,Anxi-3,Parental Anxiety,"Hi Everyone,I'm not quite sure how to put this...",Amenace,Community Member,03-07-2025,Anxiety,3,/t5/anxiety/parental-anxiety/td-p/611895
3,Anxi-4,Strategies for home sickness on holiday,"Hello, I struggle with feeling very home sick ...",Olive83,Community Member,25-06-2025,Anxiety,2,/t5/anxiety/strategies-for-home-sickness-on-ho...
4,Anxi-5,Going backwards,Good morning.I had severe depression resulting...,Guest_96807279,Community Member,02-07-2025,Anxiety,1,/t5/anxiety/going-backwards/td-p/611848
...,...,...,...,...,...,...,...,...,...
535,Anxi-536,I I think I have accidentally made myself bad ...,I've been employed at a cafe for approximately...,jcjc06,Community Member,26-01-2024,Anxiety,2,/t5/anxiety/i-i-think-i-have-accidentally-made...
536,Anxi-537,24/7 muscle twitches and my story,For the last 5 weeks I’ve had 24/7 muscle twit...,Anthony_a,Community Member,04-01-2021,Anxiety,4,/t5/anxiety/24-7-muscle-twitches-and-my-story/...
537,Anxi-538,Reality of anxiety and social health,"Hi All, I would like to discuss the best optio...",Red-Rex,Community Member,15-01-2024,Anxiety,1,/t5/anxiety/reality-of-anxiety-and-social-heal...
538,Anxi-539,Becoming more anxious after seeking outside gu...,"Hi everyone, I'm struggling today and wasn't a...",Clover9312,Community Member,15-01-2024,Anxiety,9,/t5/anxiety/becoming-more-anxious-after-seekin...


In [39]:
# Combine the saved pickles of the four "conditions" boards and write conditions_data_post_stitched.csv.
stitch_data('conditions', ['anxiety', 'depression', 'ptsd', 'suicidal'], save_to_csv=True)

Unnamed: 0,Post_ID,Post_Title,Post_Content,Post_Author,Post_Author_Rank,Post_Date,Post_Category,Number_of_Comments,Post_URL
0,Anxi-1,I’m stuck!,I’ve never written on a forum like this before...,Guest_39557583,Community Member,20-06-2025,Anxiety,6,/t5/anxiety/i-m-stuck/td-p/611578
1,Anxi-2,Back injury anxiety,Hi. I'm new here. I am extremely anxious when ...,Guest42,Community Member,10-07-2025,Anxiety,0,/t5/anxiety/back-injury-anxiety/td-p/612114
2,Anxi-3,"Anxiety, Injustice, and Fear: Workplace Exploi...","Hi everyone,I’m going through one of the most ...",Joker_J,Community Member,09-07-2025,Anxiety,1,/t5/anxiety/anxiety-injustice-and-fear-workpla...
3,Anxi-4,Im lost and wasn't sure what should I do next.,I am international student to Tasmania in 2021...,tevont,Community Member,20-06-2025,Anxiety,2,/t5/anxiety/im-lost-and-wasn-t-sure-what-shoul...
4,Anxi-5,Just broke free from rude friend and I still f...,"Recently, I have left my old friend who would ...",waffle_puppy,Community Member,27-06-2025,Anxiety,3,/t5/anxiety/just-broke-free-from-rude-friend-a...
...,...,...,...,...,...,...,...,...,...
17494,Suic-1312,"Feeling used, abused and discarded",Been good friends with my brother in law and a...,Cookie64,Community Member,17-07-2019,Suicidal thoughts and self-harm,1,/t5/suicidal-thoughts-and-self-harm/feeling-us...
17495,Suic-1313,13 Reasons Why,I'm not sure if anyone has seen the Netflix se...,lizzie50,Community Member,09-04-2017,Suicidal thoughts and self-harm,17,/t5/suicidal-thoughts-and-self-harm/13-reasons...
17496,Suic-1314,Feeling really down and struggling,"Hi guys, I am new to Beyond Blue and to comple...",Gray_13,Community Member,27-11-2016,Suicidal thoughts and self-harm,12,/t5/suicidal-thoughts-and-self-harm/feeling-re...
17497,Suic-1315,PLEASE READ THIS FIRST: posting in this section,Life gets pretty hard sometimes and many peopl...,Sophie_M,Moderator,07-09-2015,Suicidal thoughts and self-harm,0,/t5/suicidal-thoughts-and-self-harm/please-rea...
