# Mental Health Analysis ProjectA 

The purpose of this project is ...

# Data Scrapping

In [None]:
general_symptom_categories = [
    "Depression",
    "Anxiety",
    "Stress/Burnout",
    "Loneliness",
    "Low Self-Esteem",
    "Trauma/PTSD",
    "Anger/Irritability",
    "Obsessive Thoughts",
    "Addiction"
]

suicide_and_selfharm_labels = [
    "Suicidal Ideation",
    "Suicide Attempt",
    "Self-Harm",
    "Despair",
    "Urgent Help Request",
    "Grief After Suicide",
    "Coping with Suicidal Thoughts"
]


Reddit data

In [12]:
# scrapping data from reddit api
import requests
import pandas as pd
import praw
import emoji
import emot
import asyncpraw
import asyncio
from tqdm import tqdm


import os
from dotenv import load_dotenv
load_dotenv()




class RedditScraper:


    def __init__(self):

        self.auth = requests.auth.HTTPBasicAuth(os.getenv('CLIENT_ID'), os.getenv('CLIENT_SECRET'))
        self.data = {'grant_type': 'password',
                     'username': os.getenv('USERNAME'),
                     'password': os.getenv('PASSWORD')}
        self.headers = {'User-Agent': 'MyAPI/0.0.1'}
        self.res = requests.post('https://www.reddit.com/api/v1/access_token',
                                auth=self.auth, data=self.data, headers=self.headers)
        
        self.headers["Authorization"] = f'bearer {self.res.json()["access_token"]}'



        self.client_id = os.getenv('CLIENT_ID')
        self.client_secret = os.getenv('CLIENT_SECRET')
        self.username = os.getenv('USERNAME')
        self.password = os.getenv('PASSWORD')





    def get_posts_byrequests(self, subreddit, limit=1000):
        url = f'https://oauth.reddit.com/r/{subreddit}/hot'
        params = {'limit': limit}
        response = requests.get(url, headers=self.headers, params=params)
        
        if response.status_code == 200:
            return response
        else:
            raise Exception(f"Error fetching posts: {response.status_code} - {response.text}")
        


    def convert_emojis_emoticons(self, text):
        e = emot.core.emot()

        # Extract emoticons
        emoticon_results = e.emoticons(text)
        for original, meaning in zip(emoticon_results['value'], emoticon_results['mean']):
            text = text.replace(original, f" {meaning} ")

        # Extract emojis
        # emoji_results = e.emoji(text)
        # for original, meaning in zip(emoji_results['value'], emoji_results['mean']):
        #     text = text.replace(original, f" {meaning} ")

        text = emoji.demojize(text)



        return text.strip().lower()
        


    async def get_posts_byprawn(self, subreddits, limit=1000, mental="mental_"):

        reddit = asyncpraw.Reddit(client_id=self.client_id,
                             client_secret=self.client_secret,
                             user_agent='windows:mentalhealth.scraper:v1.0 (by u/IceWorth5480)',
                             username=self.username,
                             password=self.password)
        
        all_posts = []
        # , 'top', 'new'
        sort_types = ['hot']


        for subreddit_name in tqdm(subreddits, desc="Subreddits Progress"):
            subreddit = await reddit.subreddit(subreddit_name)
            for sort in sort_types:
                if sort == 'hot':
                    posts = subreddit.hot(limit=limit)
                elif sort == 'top':
                    posts = subreddit.top(limit=limit)
                elif sort == 'new':
                    posts = subreddit.new(limit=limit)

                async for post in posts:
                    if post is None:
                        continue

                    # Load top-level comments (non-blocking)
                    await post.load()
                    await post.comments.replace_more(limit=0)
                    top_comments_raw = [comment.body for comment in post.comments[:5]]
                    top_comments = [self.convert_emojis_emoticons(c) for c in top_comments_raw]

                    all_posts.append({
                        'id': post.id,
                        'subreddit': subreddit_name,
                        'sort': sort,
                        'title': post.title,
                        'selftext': self.convert_emojis_emoticons(post.selftext),
                        'created_utc': post.created_utc,
                        'score': post.score,
                        'num_comments': post.num_comments,
                        'author': str(post.author),
                        'post_url': post.url,
                        'over_18': post.over_18,
                        'flair': post.link_flair_text,
                        'top_comments': top_comments
                    })

        await reddit.close()



        df = pd.DataFrame(all_posts)
        # Drop duplicates by post ID
        df = df.drop_duplicates(subset='id').reset_index(drop=True)
        df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
        df = df.sort_values(by='created_utc', ascending=False).reset_index(drop=True)
        # Convert list of comments to string for CSV storage
        df['top_comments'] = df['top_comments'].apply(lambda x: ' | '.join(x) if isinstance(x, list) else '')



        # check if f'./data/reddit_data/{mental}reddit_posts.csv' exists, if so merging with df and # delete dupicates by id
        if os.path.exists(f'./data/reddit_data/{mental}reddit_posts.csv'):
            existing_df = pd.read_csv(f'./data/reddit_data/{mental}reddit_posts.csv')
            df = pd.concat([existing_df, df]).drop_duplicates(subset='id').reset_index(drop=True)
       
        # save as csv 
        df.to_csv(f'./data/reddit_data/{mental}reddit_posts.csv', index=False)







In [None]:
# import asyncio



# , 'depression', 'anxiety', 'therapy', 'selfhelp', 'bpd', 'ptsd', 'socialanxiety', 'counseling'
mental_subreddits = ['mentalhealth', 'depression', 'anxiety', 'therapy', 'selfhelp', 'bpd', 'ptsd', 'socialanxiety', 'counseling']
australian_regions = ['melbourne','sydney','adelaide','perth','brisbane','canberra']
normal_subreddits = ['popular','all','AskReddit','interestingasfuck']



scraper  = RedditScraper()


await scraper.get_posts_byprawn(mental_subreddits, limit=1000, mental="mental_")




In [17]:
# read the csv file 

df_mental = pd.read_csv('./data/reddit_data/mental_reddit_posts.csv')
print(df_mental.shape)

# df_normal = pd.read_csv('./data/reddit_data/normal_reddit_posts.csv')
# print(df_normal.shape)

(93, 13)


Beyond Blue forums 

In [40]:
'''
TO DO

Extract following information from the reddit webpage 


Post ID: A unique identifier for each post.
Post Content: The text of the post.
Post Author: The author of the post.
Post Date: The date the post was made.
Post Category: Category or forum where the post was made.
Number of Comments: The total number of comments on the post.

From Comment 

Post ID: Link back to the original post.
Comment ID: A unique identifier for each comment.
Comment Content: Text of the comment.
Comment Author: Author of the comment.
Comment Date: Date the comment was posted. (the order of the comments is really important)
other meta data if available
'''


import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time
import re
from datetime import datetime, timedelta
import calendar


from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


from tqdm import tqdm
import emoji
import emot





def parse_post_date(raw_date):

    weekdays = [day.lower() for day in list(calendar.day_name) ]
    today = datetime.now()

    raw_date = raw_date.strip().lower()  

    # Case 1: Day of the week (e.g., "Monday")
    if raw_date in weekdays:
        # Get weekday index: Monday = 0, Sunday = 6
        post_weekday_index = weekdays.index(raw_date)
        today_weekday_index = today.weekday()

        # Calculate difference in days
        delta_days = (today_weekday_index - post_weekday_index) % 7
        # Get actual date
        post_date = today - timedelta(days=delta_days)
        return post_date.strftime('%Y-%m-%d')  # Format as YYYY-MM-DD
    
    # Case 2: "a week ago", "2 weeks ago", etc.
    elif "week" in raw_date:
        match = re.search(r'(\d+)', raw_date)
        weeks = int(match.group(1)) if match else 1
        post_date = today - timedelta(weeks=weeks)
        return post_date.strftime('%Y-%m-%d')

    # Case 3: "a month ago", "2 months ago", etc.
    elif "month" in raw_date:
        match = re.search(r'(\d+)', raw_date)
        months = int(match.group(1)) if match else 1
        # Approximate a month as 30 days
        post_date = today - timedelta(days=30 * months)
        return post_date.strftime('%Y-%m-%d')
    
    # Case 4: Exact date format like "11-05-2025"
    else:
        # Try parsing date in the format like "25-09-2020"
        try:
            post_date = datetime.strptime(raw_date, '%d-%m-%Y')
            return post_date.strftime('%Y-%m-%d')
        except ValueError:
            return 'Unknown date'  # If format is unexpected




def convert_emojis_emoticons(text):
    e = emot.core.emot()

    # Extract emoticons
    emoticon_results = e.emoticons(text)

    for original, meaning in zip(emoticon_results['value'], emoticon_results['mean']):
        text = text.replace(original, f" {meaning} ")

    # # Extract emojis
    # emoji_results = e.emoji(text)
    # for original, meaning in zip(emoji_results['value'], emoji_results['mean']):
    #     text = text.replace(original, f" {meaning} ")

    text = emoji.demojize(text)

    return text.strip().lower()




def comment_scrapping (url, comment_pages = 1):

    
    # Setup Chrome WebDriver
    service = Service(ChromeDriverManager().install())
    comment_driver = webdriver.Chrome(service=service)

    
    

    list_comments = []

    for page in range(1, comment_pages + 1):

        comment_driver.get(url)
        time.sleep(0.05)
        soup = BeautifulSoup(comment_driver.page_source, 'html.parser')
        comments_section = soup.find('div', class_='linear-message-list message-list')
        every_comments = comments_section.find_all('div')


        for comment in every_comments:

            main_section = comment.find('div', class_='lia-quilt-row lia-quilt-row-message-main') if comment.find('div', class_='lia-quilt-row lia-quilt-row-message-main') else None
            if not main_section:
                continue
            # scrapping all the text 
            comment_text = main_section.get_text(separator=' ', strip=True) if main_section else ""
            comment_text = convert_emojis_emoticons(comment_text)
            list_comments.append(comment_text)

            # only scrapping the first 3 comments
            if len(list_comments) >= 3:
                break

        # Find next page link
        next_page = soup.find('li', class_='lia-paging-page-next')
        if next_page and next_page.find('a'):
            next_page_link = next_page.find('a')['href']
            url = next_page_link
        else:
            print("No more pages to scrape.")
            break
        
    # close the driver
    comment_driver.quit()

    # change the list of comments to a string
    list_comments = ' | '.join(list_comments) if list_comments else ""
    

    return list_comments


        



def beyondblue_scrapping(tag,address,pages=2):


    # Setup Chrome WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    url = address

    whole_data = []



    for page in tqdm(range(1, pages + 1), desc="Scraping pages"):

    
        driver.get(url)
        # time.sleep(3)  # Wait for the page to load


        # get the class "custom-message-list all-discussions"
        soup = BeautifulSoup(driver.page_source, 'html.parser')


        discussions = soup.find('div', class_='custom-message-list all-discussions').find_all(('article'))
        

        for post in discussions:

            # Extracting post id
            post_link = post.find('h3').find_all('a')[1].get('href')
            post_id = post_link.split('/')[-1]  


            full_post_link = f"https://forums.beyondblue.org.au{post_link}"
            # extract comments from post_link
            comments = comment_scrapping(full_post_link, comment_pages=1)
            

            # Extracting post title
            title_tag = post.find_all('h3')[0].find_all('a')[1]
            post_title = convert_emojis_emoticons(title_tag.text.strip()) if title_tag else ""

            # Extracting post content
            post_content = convert_emojis_emoticons(post.find('p', class_ = 'body-text').text.strip()) if post.find('p', class_ = 'body-text') else ""
            

            side_info = post.find('aside')
            side_info1 = side_info.find('div', class_='custom-tile-author-info') if side_info else None
            # Extracting post author
            post_author = side_info1.find('a').find('span').text.strip() if side_info and side_info.find('span') else ""

            author_link = post.find('aside').find('div', class_='custom-tile-author-info').find('a').get('href')
            # Extracting user id from author link
            user_id = author_link.split('user-id/')[-1]

            


            side_info2 = side_info.find('div', class_='custom-tile-category-content')
            # Extracting post tag
            post_tag = side_info2.find('a').text.strip() if side_info2 and side_info2.find('a') else ""
            
            
            raw_date = side_info2.find('time').text.strip() if side_info2 and side_info2.find('time') else ""
            # Extracting post date
            post_date = parse_post_date(raw_date)



            side_info3 = side_info.find('div', class_='custom-tile-unread-replies')
            unread = side_info3.find('span').text.strip() if side_info3 and side_info3.find('span') else ""
            # Extracting number of unread replies
            match = re.search(r'\d+', unread)
            post_unread = int(match.group()) if match else 0

            # Extracting number of comments
            number_comments = post.find('li', class_ = 'custom-tile-replies').find('b').text.strip() if post.find('li', class_ = 'custom-tile-replies') else ""


            

            post_data = {
                "Post ID": post_id,
                "Post Title": post_title,
                "Post Content": post_content,
                "Post Author": post_author,
                "User ID": user_id,
                "Post Date": post_date,
                "Post Category": post_tag,
                "Number of Comments": number_comments,
                "Comments": comments
            }
            whole_data.append(post_data)


        # Print the number of posts scraped on the current page
        # print(f"Tag: {tag} Page {page}: Scraped {len(discussions)} posts.")

        # Find next page link
        next_page = soup.find('li', class_='lia-paging-page-next')
        if next_page and next_page.find('a'):
            url = next_page.find('a')['href']
        else:
            print("No more pages to scrape.")
            break

    #close the driver
    driver.quit()


    # Convert to DataFrame 
    df = pd.DataFrame(whole_data)
    # delete duplicates by Post ID
    df = df.drop_duplicates(subset='Post ID').reset_index(drop=True)
    # Convert 'Post Date' to datetime format
    df['Post Date'] = pd.to_datetime(df['Post Date'], errors='coerce')
    # Sort by 'Post Date' in descending order
    df = df.sort_values(by='Post Date', ascending=False).reset_index(drop=True)

    # check if the file already exists
    if os.path.exists(f'./data/beyondblue_data/{tag}_beyondblue_posts.csv'):
        existing_df = pd.read_csv(f'./data/beyondblue_data/{tag}_beyondblue_posts.csv')
        df = pd.concat([existing_df, df]).drop_duplicates(subset='Post ID').reset_index(drop=True)

    df.to_csv(f'./data/beyondblue_data/{tag}_beyondblue_posts.csv', index=False)
        
        
    print(f"Data saved to ./data/beyondblue_data/{tag}_beyondblue_posts.csv")






In [None]:
# post_variables = ["Post ID", "Post Content", "Post Author", "Post Date", "Post Category", "Number of Comments"]
# mental_health_type =['Anxiety','Depression','PTSD and trauma','Suicidal thoughts and self-harm']



mental_health_urls = {
    "Anxiety": "https://forums.beyondblue.org.au/t5/anxiety/bd-p/c1-sc2-b1?&sort=kudos",
    "Depression": "https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2?&sort=kudos",
    "PTSD": "https://forums.beyondblue.org.au/t5/ptsd-and-trauma/bd-p/c1-sc2-b3?&sort=kudos",
    "Suicidal_thoughts_and_self-harm": "https://forums.beyondblue.org.au/t5/suicidal-thoughts-and-self-harm/bd-p/c1-sc2-b4?&sort=kudos",
    "Relationship and family issues":"https://forums.beyondblue.org.au/t5/relationship-and-family-issues/bd-p/c1-sc3-b3?&sort=kudos",
    "Young people":"https://forums.beyondblue.org.au/t5/young-people/bd-p/c1-sc4-b1?&sort=kudos",
    "Sexuality and gender identity":"https://forums.beyondblue.org.au/t5/sexuality-and-gender-identity/bd-p/c1-sc4-b2?&sort=kudos",
    "Multicultural experiences":"https://forums.beyondblue.org.au/t5/multicultural-experiences/bd-p/c1-sc4-b3?&sort=kudos",
    "Grief and loss":"https://forums.beyondblue.org.au/t5/grief-and-loss/bd-p/c1-sc4-b4?&sort=kudos",
}

each_data_number = 2000
pages = each_data_number // 10  # Assuming each page has 10 posts


for tag, address in mental_health_urls.items():
    try:
        beyondblue_scrapping(tag, address, pages = pages)
    except Exception as e:
        print(f"Error scraping {tag}: {e}")
        continue

Scraping pages: 100%|██████████| 199/199 [04:25<00:00,  1.33s/it]


Data saved to ./data/beyondblue_data/Anxiety_beyondblue_posts.csv


Scraping pages: 100%|██████████| 199/199 [05:36<00:00,  1.69s/it]
  df['Post Date'] = pd.to_datetime(df['Post Date'], errors='coerce')


Data saved to ./data/beyondblue_data/Depression_beyondblue_posts.csv


Scraping pages: 100%|██████████| 199/199 [05:34<00:00,  1.68s/it]


Data saved to ./data/beyondblue_data/PTSD_beyondblue_posts.csv


Scraping pages:  65%|██████▌   | 130/199 [03:38<01:55,  1.68s/it]

No more pages to scrape.





Data saved to ./data/beyondblue_data/Suicidal_thoughts_and_self-harm_beyondblue_posts.csv


Scraping pages: 100%|██████████| 199/199 [05:27<00:00,  1.64s/it]


Data saved to ./data/beyondblue_data/Relationship and family issues_beyondblue_posts.csv


Scraping pages: 100%|██████████| 199/199 [04:34<00:00,  1.38s/it]


Data saved to ./data/beyondblue_data/Young people_beyondblue_posts.csv


Scraping pages:  39%|███▊      | 77/199 [02:13<03:31,  1.74s/it]

No more pages to scrape.



  df['Post Date'] = pd.to_datetime(df['Post Date'], errors='coerce')


Data saved to ./data/beyondblue_data/Sexuality and gender identity_beyondblue_posts.csv


Scraping pages:  13%|█▎        | 25/199 [00:46<05:21,  1.85s/it]

No more pages to scrape.





Data saved to ./data/beyondblue_data/Multicultural experiences_beyondblue_posts.csv


Scraping pages:  44%|████▎     | 87/199 [02:29<03:12,  1.72s/it]

No more pages to scrape.





Data saved to ./data/beyondblue_data/Grief and loss_beyondblue_posts.csv


In [None]:

# read the csv file
df_anxiety = pd.read_csv('./data/beyondblue_data/Anxiety_beyondblue_posts.csv')
df_depression = pd.read_csv('./data/beyondblue_data/Depression_beyondblue_posts.csv')
df_ptsd = pd.read_csv('./data/beyondblue_data/PTSD_beyondblue_posts.csv')
df_self_harm = pd.read_csv('./data/beyondblue_data/Suicidal_thoughts_and_self-harm_beyondblue_posts.csv')



print(df_anxiety.shape)
# print(df_anxiety)

print(df_depression.shape)  
print(df_ptsd.shape)
print(df_self_harm.shape)

(1990, 8)
      Post ID                          Post Title  \
0      611174                Peripheral Neuropahy   
1      611132                 Apprentice mechanic   
2      611126              Depression and Anxiety   
3      611099  Easy strategies for quick response   
4      611075                                Help   
...       ...                                 ...   
1985    56941          Scared to death, OF DEATH.   
1986    38359         Severe paranoia and anxiety   
1987    37072                      Health Anxiety   
1988    37038                  Feeling very angry   
1989    94973              Running out of excuses   

                                           Post Content     Post Author  \
0     Has anyone has success with HBOT Therapy for P...  Guest_55166016   
1     I'm 4 months in to my apprenticeship and em st...  Guest_28908038   
2     I am currently an international student. I alr...  Guest_16677420   
3     Recently my anxiety has 'flared'. As soon as I.

Climate data API

# Nature Language Process
