In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import urllib.parse 
# Function to extract comments from a thread
def get_comments(thread_url):
    comments = []
    try:
        response = requests.get(thread_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        comment_divs = soup.find_all('div', class_='lia-message-body-content')
        for comment in comment_divs[1:]:  # Skip the main post
            text = comment.get_text(strip=True)
            if text:
                comments.append(text)
    except Exception as e:
        comments.append(f"Error fetching comments: {e}")
    return comments

# Main scraping function
def scrape_beyondblue_combined(keywords, default_pages=100, custom_pages={}):
    all_data = []

    for keyword in keywords:
        # Use 2500 pages for 'anxiety' and 'depression', else use default
        pages = custom_pages.get(keyword, default_pages)

        encoded_keyword = urllib.parse.quote(keyword)
        base_url = f"https://forums.beyondblue.org.au/t5/forums/searchpage/tab/message?advanced=false&allow_punctuation=false&q={encoded_keyword}"


        for page in range(1, pages + 1):
            url = f"{base_url}&page={page}"
            print(f"🔍 Scraping: {url}")
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            results = soup.find_all('div', class_='lia-message-view-wrapper')

            for result in results:
                post_tag = result.find('div', class_='lia-message-body-content')
                post_content = post_tag.get_text(strip=True) if post_tag else ""

                # Extract thread URL
                link_tag = result.find('a', class_='page-link')
                thread_url = 'https://forums.beyondblue.org.au' + link_tag['href'] if link_tag else "N/A"

                # Extract post date
                date_tag = result.find('span', class_='local-date')
                post_date = date_tag.get_text(strip=True) if date_tag else "Unknown"

                # Extract comments
                comments = get_comments(thread_url) if thread_url != "N/A" else []
                num_comments = len(comments)

                # Append all information
                all_data.append({
                    'keyword': keyword,
                    'post_content_preview': post_content,
                    'thread_url': thread_url,
                    'post_date': post_date,
                    'comments': comments,
                    'num_comments': num_comments
                })

                time.sleep(0.25)
            time.sleep(1)

    # Save all to one CSV
    df = pd.DataFrame(all_data)
    df.to_csv('beyondblue_combined_with_comments_timestamp.csv', index=False)
    print(f"\n✅ Done! Total posts saved: {len(df)}")
    print(df.head())

# 🔧 Topics and their respective custom page settings
topics = [
    'anxiety', 'depression', 'social anxiety', 'loneliness',
    'panic attack','PTSD','Suicidal thoughts','trauma',
    'OCD','self harm','people like me','staying well'
]

# Custom page counts
custom_page_counts = {
    'anxiety': 100,
    'depression': 100
}

# Run the function
scrape_beyondblue_combined(topics, default_pages=100, custom_pages=custom_page_counts)


🔍 Scraping: https://forums.beyondblue.org.au/t5/forums/searchpage/tab/message?advanced=false&allow_punctuation=false&q=anxiety&page=1
🔍 Scraping: https://forums.beyondblue.org.au/t5/forums/searchpage/tab/message?advanced=false&allow_punctuation=false&q=anxiety&page=2
🔍 Scraping: https://forums.beyondblue.org.au/t5/forums/searchpage/tab/message?advanced=false&allow_punctuation=false&q=anxiety&page=3
🔍 Scraping: https://forums.beyondblue.org.au/t5/forums/searchpage/tab/message?advanced=false&allow_punctuation=false&q=anxiety&page=4
🔍 Scraping: https://forums.beyondblue.org.au/t5/forums/searchpage/tab/message?advanced=false&allow_punctuation=false&q=anxiety&page=5
🔍 Scraping: https://forums.beyondblue.org.au/t5/forums/searchpage/tab/message?advanced=false&allow_punctuation=false&q=anxiety&page=6
🔍 Scraping: https://forums.beyondblue.org.au/t5/forums/searchpage/tab/message?advanced=false&allow_punctuation=false&q=anxiety&page=7
🔍 Scraping: https://forums.beyondblue.org.au/t5/forums/searchp

In [7]:
df_new=pd.read_csv("bb_posts_demo2.csv")
df_new

Unnamed: 0,keyword,page,post_preview,thread_url,post_date,num_comments,comments
0,anxiety,1,Hi my name is Michelle and I have had Anxiety ...,https://forums.beyondblue.org.au/t5/anxiety/an...,‎23-05-2025,2,Dear Michelle ~ Welcome here to the Support Fo...
1,anxiety,1,"Hi, I’ve suffered on and off from anxiety ov...",https://forums.beyondblue.org.au/t5/welcome-an...,‎28-05-2025,4,Hello new member and welcome to the forum. Ple...
2,anxiety,1,"I have had chronic anxiety since 2016, but I h...",https://forums.beyondblue.org.au/t5/anxiety/an...,‎06-01-2025,9,"Dear Guest-65543263, I can very much relate to..."
3,anxiety,1,Hello I struggle with very bad anxiety and pan...,https://forums.beyondblue.org.au/t5/anxiety/an...,‎04-03-2025,4,Hello any advice please would help ||| Hi don'...
4,anxiety,1,I’ve recently sold my mortgage free home that ...,https://forums.beyondblue.org.au/t5/anxiety/an...,‎04-06-2025,3,Hi KCT welcome Anxiety is a serious condition ...
...,...,...,...,...,...,...,...
11995,staying well,100,"Hi, \n Got my knickers in a twist trying to wo...",https://forums.beyondblue.org.au/t5/anxiety/ph...,‎26-11-2019,9,"Hi, welcome Anxiety is a serious illness, more..."
11996,staying well,100,"As the saying goes ""life begins at 60"" \n Sinc...",https://forums.beyondblue.org.au/t5/staying-we...,‎08-11-2017,9,"hi Meercat , absolutely, for me it has greatly..."
11997,staying well,100,"Hi everyone, \n \nUnder each post on the for...",https://forums.beyondblue.org.au/t5/welcome-an...,‎12-12-2018,9,Thanks for the opportunity for everyone to hav...
11998,staying well,100,"Lately I have been doing okay, I haven’t been ...",https://forums.beyondblue.org.au/t5/anxiety/a-...,‎31-01-2023,2,"Hi, welcome Re: ""I just want to know myself I ..."


In [9]:
df_my=pd.read_csv("beyondblue_combined_with_comments_timestamp.csv")
df_my

Unnamed: 0,keyword,post_content_preview,thread_url,post_date,comments,num_comments
0,anxiety,Hi my name is Michelle and I have hadAnxietymy...,https://forums.beyondblue.org.au/t5/anxiety/an...,‎23-05-2025,"[""Dear Michelle ~Welcome here to the Support F...",2
1,anxiety,"Hi, I’ve suffered on and off fromanxietyover...",https://forums.beyondblue.org.au/t5/welcome-an...,‎28-05-2025,['Hello new member and welcome to the forum.Pl...,4
2,anxiety,taking breaks and ambient noise doesn't seem t...,https://forums.beyondblue.org.au/t5/staying-we...,Unknown,"[""will make this brief as its a contextual pos...",9
3,anxiety,"I have had chronicanxietysince 2016, but I hav...",https://forums.beyondblue.org.au/t5/anxiety/an...,‎06-01-2025,"['Dear Guest-65543263,I can very much relate t...",9
4,anxiety,Hello I struggle with very badanxietyand panic...,https://forums.beyondblue.org.au/t5/anxiety/an...,‎04-03-2025,"['Hello any advice please would help', ""Hi don...",4
...,...,...,...,...,...,...
11995,staying well,Hi all \n At this point in time I have been ba...,https://forums.beyondblue.org.au/t5/relationsh...,‎13-12-2018,"['Dear ABoot,I am so very sorry to hear of the...",9
11996,staying well,"Hey everyone, hope someone can give me an outs...",https://forums.beyondblue.org.au/t5/young-peop...,‎09-01-2018,"[""Hi Spl spl,I cannot say I have come across t...",9
11997,staying well,"Hi, \n Got my knickers in a twist trying to wo...",https://forums.beyondblue.org.au/t5/anxiety/ph...,‎26-11-2019,"[""Hi, welcomeAnxiety is a serious illness, mor...",9
11998,staying well,"Hi everyone, \n \nUnder each post on the for...",https://forums.beyondblue.org.au/t5/welcome-an...,‎12-12-2018,['Thanks for the opportunity for everyone to h...,9


In [13]:
# Clean URLs just in case there’s whitespace
df_my['thread_url'] = df_my['thread_url'].astype(str).str.strip()
df_new['thread_url'] = df_new['thread_url'].astype(str).str.strip()

# Identify new rows that are in df_new but not in df_my
new_rows = df_new[~df_new['thread_url'].isin(df_my['thread_url'])]

# Optional: Drop the 'page' column from df_new if not needed
if 'page' in new_rows.columns:
    new_rows = new_rows.drop(columns=['page'])

# Rename column if necessary to match (post_preview → post_content_preview)
if 'post_preview' in new_rows.columns:
    new_rows = new_rows.rename(columns={'post_preview': 'post_content_preview'})

# Append new rows to your dataset
df_combined = pd.concat([df_my, new_rows], ignore_index=True)

# Save result
df_combined.to_csv("beyondblue_merged.csv", index=False)
print(f"✅ Appended {len(new_rows)} new rows to the dataset.")

✅ Appended 12000 new rows to the dataset.


In [15]:

df_merged = pd.read_csv("beyondblue_merged.csv")
duplicate_count = df_merged.duplicated(subset=['post_content_preview']).sum()
print(f"Number of repeated posts: {duplicate_count}")

if duplicate_count > 0:
    print("There are repeated posts in the merged dataset.")
else:
    print("All posts in the merged dataset are unique.")

Number of repeated posts: 5602
There are repeated posts in the merged dataset.


In [18]:
# Remove duplicated posts based on 'post_content_preview'
df_merged_unique = df_merged.drop_duplicates(subset=['post_content_preview'], keep='first')
df_merged_unique.to_csv("beyondblue_merged_unique.csv", index=False)
print(f"✅ Removed duplicates. Unique posts saved: {len(df_merged_unique)}")

✅ Removed duplicates. Unique posts saved: 18398
