In [1]:
import os
import time
import sys
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options


In [None]:
# Chrome Setup
options = Options()
# options.add_argument("--headless")  # Uncomment to run without GUI

# Initialise the driver
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Chrome check
driver.get("https://www.google.com")
print("Chrome opened successfully.")

Chrome opened successfully.


In [None]:
# Cleaning
def format_text(text):
    return text.strip().replace('\n', ' ').replace('\t', ' ').replace('\xa0', ' ')



In [4]:
def scrape_forum(forum_path, forum_slug, forum_name, start_page=1, end_page=664):
    base_url = f"https://forums.beyondblue.org.au/t5/{forum_path}/bd-p/{forum_slug}"
    print(f"Starting scrape: {base_url}")
    all_data = []

    for page in range(start_page, end_page + 1):
        forum_page = f"{base_url}/page/{page}"
        print(f"Reading forum page: {forum_page}")
        try:
            driver.get(forum_page)
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            thread_links = []

            for a in soup.find_all("a", href=True):
                href = a.get("href")
                if "/td-p/" in href and "/user/" not in href:
                    full_url = "https://forums.beyondblue.org.au" + href.split("?")[0]
                    if full_url not in thread_links:
                        thread_links.append(full_url)

            for thread_url in tqdm(thread_links, desc=f"Threads in {forum_name}"):
                try:
                    thread_base = thread_url.split("/page")[0]

                    # Load first page to check if page 2 exists
                    driver.get(thread_base)
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "lia-message-body-content"))
                    )
                    soup = BeautifulSoup(driver.page_source, 'html.parser')

                    # Determine if second page exists
                    pagination_links = soup.select(".lia-paging-page-link")
                    has_page_2 = any(link.get_text(strip=True) == "2" for link in pagination_links)

                    # Only include page 2 if it exists
                    pages_to_scrape = [thread_base]
                    if has_page_2:
                        pages_to_scrape.append(thread_base + "/page/2")

                    posts, authors, dates, times_ = [], [], [], []

                    for page_url in pages_to_scrape:
                        driver.get(page_url)
                        time.sleep(2)
                        soup = BeautifulSoup(driver.page_source, 'html.parser')
                        posts.extend(soup.find_all("div", class_="lia-message-body-content"))
                        authors.extend(soup.find_all("img", class_="lia-user-avatar-message"))
                        dates.extend(soup.find_all("span", class_="local-date"))
                        times_.extend(soup.find_all("span", class_="local-time"))

                    if len(posts) <= 2:
                        continue  # Skip threads with ≤2 total posts

                    try:
                        date_str = dates[0].text.strip().replace("\u200e", "") if dates else None
                        time_str = times_[0].text.strip().replace("\u200e", "") if times_ else "12:00 AM"
                        post_datetime_str = f"{date_str} {time_str}"
                        post_datetime = datetime.strptime(post_datetime_str, "%d-%m-%Y %I:%M %p")

                        if not (datetime(2015, 1, 1) <= post_datetime <= datetime(2025, 6, 30)):
                            continue  # Skip outside date range
                    except Exception as e:
                        print(f" Skipping due to date error: {thread_url}\n{e}")
                        continue

                    thread_id = thread_url.split("/")[-1]
                    thread_title_tag = soup.title.string.strip() if soup.title else "Untitled"
                    thread_title = thread_title_tag.replace(" - Beyond Blue", "").strip()

                    post_content = format_text(posts[0].text)
                    post_author = authors[0].get("title") if authors and authors[0] else "Unknown"

                    comments = []
                    author_replies = []
                    comment_authors = []

                    for i in range(1, len(posts)):
                        comment_text = format_text(posts[i].text)
                        comment_author = (
                            authors[i].get("title") if i < len(authors) and authors[i] else "Unknown"
                        )

                        if comment_author == post_author:
                            author_replies.append(comment_text)
                        else:
                            comments.append(f"{comment_author}: {comment_text}")
                            comment_authors.append(comment_author or "Unknown")

                    if len(comments) < 2:
                        continue  # Must have >1 real (non-author) comments

                    all_data.append({
                        "forum_name": forum_name,
                        "thread_id": thread_id,
                        "thread_title": thread_title,
                        "post_content": post_content,
                        "author": post_author,
                        "post_timestamp": post_datetime.strftime("%d-%m-%Y %I:%M %p"),
                        "no_of_comments": len(comments),
                        "comments_content": " ||| ".join(comments),
                        "authors_comment": " ||| ".join(author_replies),
                        "comment_authors": " ||| ".join(comment_authors)
                    })

                except Exception as e:
                    print(f" Error in thread {thread_url}:\n{e}")
                    continue

        except Exception as e:
            print(f"Failed to load forum page {forum_page}:\n{e}")
            continue

    return all_data

In [None]:
# Forum 
forums = {
    "depression": {"path": "depression", "slug": "c1-sc2-b2"},
    ##"ptsd-trauma": {"path": "ptsd-and-trauma", "slug": "c1-sc2-b3"},
    ##"suicidal-thoughts-and-self-harm": {"path": "suicidal-thoughts-and-self-harm", "slug": "c1-sc2-b4"},
    ##"anxiety": {"path": "anxiety", "slug": "c1-sc2-b1"}
}


In [6]:
final_rows = []
START_PAGE = 1
END_PAGE = 664

for forum_name, info in forums.items():
    print(f"\nScraping forum: {forum_name} (Pages {START_PAGE}–{END_PAGE})")
    forum_data = scrape_forum(info["path"], info["slug"], forum_name, start_page=START_PAGE, end_page=END_PAGE)
    final_rows.extend(forum_data)

df = pd.DataFrame(final_rows)
df.to_csv("depression_2015_25.csv", index=False)
print(f"\nScraping complete. Saved {len(df)} threads with >2 comments (excluding author replies in main comment list).")

Threads in depression: 100%|██████████| 13/13 [00:58<00:00,  4.53s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/640


Threads in depression: 100%|██████████| 13/13 [00:55<00:00,  4.26s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/641


Threads in depression: 100%|██████████| 13/13 [00:56<00:00,  4.34s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/642


Threads in depression: 100%|██████████| 13/13 [00:59<00:00,  4.59s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/643


Threads in depression: 100%|██████████| 13/13 [01:00<00:00,  4.63s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/644


Threads in depression: 100%|██████████| 13/13 [00:57<00:00,  4.43s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/645


Threads in depression: 100%|██████████| 13/13 [00:55<00:00,  4.30s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/646


Threads in depression: 100%|██████████| 13/13 [01:02<00:00,  4.79s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/647


Threads in depression: 100%|██████████| 13/13 [01:00<00:00,  4.68s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/648


Threads in depression: 100%|██████████| 13/13 [01:00<00:00,  4.63s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/649


Threads in depression: 100%|██████████| 13/13 [00:56<00:00,  4.36s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/650


Threads in depression: 100%|██████████| 13/13 [00:59<00:00,  4.58s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/651


Threads in depression: 100%|██████████| 13/13 [01:03<00:00,  4.88s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/652


Threads in depression: 100%|██████████| 13/13 [00:59<00:00,  4.57s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/653


Threads in depression: 100%|██████████| 13/13 [00:55<00:00,  4.29s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/654


Threads in depression: 100%|██████████| 13/13 [01:00<00:00,  4.62s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/655


Threads in depression: 100%|██████████| 13/13 [01:00<00:00,  4.62s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/656


Threads in depression: 100%|██████████| 13/13 [00:59<00:00,  4.57s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/657


Threads in depression: 100%|██████████| 13/13 [00:56<00:00,  4.33s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/658


Threads in depression: 100%|██████████| 13/13 [00:59<00:00,  4.59s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/659


Threads in depression: 100%|██████████| 13/13 [01:00<00:00,  4.64s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/660


Threads in depression: 100%|██████████| 13/13 [00:55<00:00,  4.28s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/661


Threads in depression: 100%|██████████| 13/13 [00:55<00:00,  4.25s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/662


Threads in depression: 100%|██████████| 13/13 [00:59<00:00,  4.58s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/663


Threads in depression: 100%|██████████| 11/11 [00:47<00:00,  4.32s/it]


Reading forum page: https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2/page/664


Threads in depression: 100%|██████████| 11/11 [00:51<00:00,  4.66s/it]



Scraping complete. Saved 5198 threads with >2 comments (excluding author replies in main comment list).


In [7]:
print(len(df))

5198
