# Mental Health Analysis Project

The purpose of this project is ...

# Data Scraping

In [None]:
# Label vocabulary for general mental-health symptom categories.
# NOTE(review): not referenced by any other cell visible in this notebook —
# presumably intended as classification labels for the scraped posts; confirm.
general_symptom_categories = [
    "Depression",
    "Anxiety",
    "Stress/Burnout",
    "Loneliness",
    "Low Self-Esteem",
    "Trauma/PTSD",
    "Anger/Irritability",
    "Obsessive Thoughts",
    "Addiction"
]

# Label vocabulary specific to suicide and self-harm related content.
# NOTE(review): also unused in the visible cells; confirm where it is consumed.
suicide_and_selfharm_labels = [
    "Suicidal Ideation",
    "Suicide Attempt",
    "Self-Harm",
    "Despair",
    "Urgent Help Request",
    "Grief After Suicide",
    "Coping with Suicidal Thoughts"
]


Reddit data

In [None]:
# Scrape data from the Reddit API
import requests
import json
import pandas as pd
import praw

import os
from dotenv import load_dotenv
load_dotenv()




class RedditScraper:
    """Collect Reddit submissions via the OAuth REST API, PRAW, or Pushshift.

    Credentials are read from the environment variables ``CLIENT_ID``,
    ``CLIENT_SECRET``, ``USERNAME`` and ``PASSWORD``.

    NOTE(review): ``USERNAME`` is also a variable that Windows defines for the
    logged-in OS user; confirm the value picked up here is really the Reddit
    account name and not the OS one.
    """

    # Seconds to wait for any single HTTP request before giving up, so a
    # stalled connection cannot hang the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Request an OAuth access token (password grant) and store auth headers.

        Sets ``self.auth``, ``self.data``, ``self.headers`` and ``self.res``.

        Raises:
            requests.HTTPError: if the token endpoint answers with an error status.
            RuntimeError: if the response carries no ``access_token``.
        """
        self.auth = requests.auth.HTTPBasicAuth(os.getenv('CLIENT_ID'), os.getenv('CLIENT_SECRET'))
        self.data = {'grant_type': 'password',
                     'username': os.getenv('USERNAME'),
                     'password': os.getenv('PASSWORD')}
        self.headers = {'User-Agent': 'MyAPI/0.0.1'}
        self.res = requests.post('https://www.reddit.com/api/v1/access_token',
                                 auth=self.auth, data=self.data, headers=self.headers,
                                 timeout=self.REQUEST_TIMEOUT)
        self.res.raise_for_status()

        # A successful status does not guarantee a token, so check for it
        # explicitly instead of failing with a bare KeyError.
        payload = self.res.json()
        token = payload.get("access_token")
        if not token:
            raise RuntimeError(
                f"Reddit did not return an access token (error: {payload.get('error')!r}); "
                "check CLIENT_ID, CLIENT_SECRET, USERNAME and PASSWORD."
            )

        self.headers["Authorization"] = f'bearer {token}'

    def get_posts_byrequests(self, subreddit, limit=1000):
        """Fetch the "hot" listing of ``subreddit`` through the OAuth REST API.

        Args:
            subreddit: Subreddit name without the ``r/`` prefix.
            limit: Value sent as the ``limit`` query parameter.
                NOTE(review): the API may cap this per request — confirm.

        Returns:
            The raw ``requests.Response`` when the status code is 200.

        Raises:
            Exception: for any non-200 response, with the status and body.
        """
        url = f'https://oauth.reddit.com/r/{subreddit}/hot'
        params = {'limit': limit}
        response = requests.get(url, headers=self.headers, params=params,
                                timeout=self.REQUEST_TIMEOUT)

        if response.status_code == 200:
            return response
        raise Exception(f"Error fetching posts: {response.status_code} - {response.text}")

    @staticmethod
    def _dedupe_by_id(posts):
        """Return ``posts`` with one record per ``'id'``, keeping the first seen.

        Order of first occurrence is preserved.
        """
        unique_posts = {}
        for post in posts:
            unique_posts.setdefault(post['id'], post)
        return list(unique_posts.values())

    def get_posts_byprawn(self, subreddits, limit=1000, mental="mental_"):
        """Scrape hot/top/new submissions with PRAW and store them as CSV.

        Posts are written to ``./data/reddit_data/{mental}reddit_posts.csv``.
        If that file already exists, its rows are kept and only posts with
        new ids are appended.

        Args:
            subreddits: Iterable of subreddit names.
            limit: Maximum number of posts requested per subreddit and sort type.
            mental: Prefix for the output file name.

        Returns:
            None. The result is the CSV file written to disk.
        """
        reddit = praw.Reddit(client_id=os.getenv('CLIENT_ID'),
                             client_secret=os.getenv('CLIENT_SECRET'),
                             user_agent='windows:mentalhealth.scraper:v1.0 (by u/IceWorth5480)',
                             username=os.getenv('USERNAME'),
                             password=os.getenv('PASSWORD'))

        all_posts = []
        sort_types = ['hot', 'top', 'new']

        for subreddit in subreddits:
            listing_source = reddit.subreddit(subreddit)
            for sort in sort_types:
                # The sort names match the PRAW listing methods (.hot/.top/.new).
                posts = getattr(listing_source, sort)(limit=limit)

                for post in posts:
                    if post is None:
                        continue
                    all_posts.append({
                        'id': post.id,
                        'subreddit': subreddit,
                        'sort': sort,  # Add sort type for tracking
                        'title': post.title,
                        'selftext': post.selftext,
                        'created_utc': post.created_utc,
                        'score': post.score,
                        'num_comments': post.num_comments,
                        'author': str(post.author),
                        'url': post.url,
                        'over_18': post.over_18,
                        'flair': post.link_flair_text
                    })

        # Delete duplicates by id. The same post usually appears under several
        # sort types; comparing whole records would keep all of them because
        # the 'sort' field differs.
        all_posts = self._dedupe_by_id(all_posts)

        csv_path = f'./data/reddit_data/{mental}reddit_posts.csv'

        if not all_posts:
            # Nothing fetched: leave any existing file untouched rather than
            # failing on the missing 'created_utc' column below.
            print(f"No posts fetched; {csv_path} left unchanged.")
            return

        df = pd.DataFrame(all_posts)
        df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
        df = df.sort_values(by='created_utc', ascending=False).reset_index(drop=True)

        # Merge with a previous run, if any; existing rows win on id clashes.
        if os.path.exists(csv_path):
            # Read ids as strings so they compare equal to the freshly scraped ones.
            existing_df = pd.read_csv(csv_path, dtype={'id': str})
            df = pd.concat([existing_df, df]).drop_duplicates(subset='id').reset_index(drop=True)

        # Make sure the target directory exists before saving.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        df.to_csv(csv_path, index=False)

    def get_posts_byPushshift(self, subreddit, limit=1000):
        """Query the Pushshift submission search and print the raw response.

        Prints the HTTP status code and the response body; returns None.

        NOTE(review): confirm this endpoint is still publicly reachable.
        """
        url = f'https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&sort=desc&limit={limit}'
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)

        print(response.status_code)
        print(response.text)



In [None]:
# Two groups of subreddits: mental-health communities and general ones.
mental_subreddits = [
    'mentalhealth', 'depression', 'anxiety', 'therapy', 'selfhelp',
    'bpd', 'ptsd', 'socialanxiety', 'counseling',
]
normal_subreddits = ['popular', 'all', 'AskReddit', 'interestingasfuck']

test = RedditScraper()

# Scrape each group in turn; the prefix selects the output CSV name.
for subreddit_group, file_prefix in (
    (mental_subreddits, "mental_"),
    (normal_subreddits, "normal_"),
):
    test.get_posts_byprawn(subreddit_group, limit=1000, mental=file_prefix)

In [2]:
# Load the scraped Reddit posts back from disk and report (rows, columns).
mental_csv_path = './data/reddit_data/mental_reddit_posts.csv'
normal_csv_path = './data/reddit_data/normal_reddit_posts.csv'

df_mental = pd.read_csv(mental_csv_path)
df_normal = pd.read_csv(normal_csv_path)

for reddit_frame in (df_mental, df_normal):
    print(reddit_frame.shape)

(15877, 12)
(6878, 12)


Beyond Blue forums 

In [None]:
'''
TO DO

Extract the following information from the Beyond Blue forum pages


Post ID: A unique identifier for each post.
Post Content: The text of the post.
Post Author: The author of the post.
Post Date: The date the post was made.
Post Category: Category or forum where the post was made.
Number of Comments: The total number of comments on the post.

From Comment 

Post ID: Link back to the original post.
Comment ID: A unique identifier for each comment.
Comment Content: Text of the comment.
Comment Author: Author of the comment.
Comment Date: Date the comment was posted. (the order of the comments is really important)
other meta data if available
'''


import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time
import re
from datetime import datetime, timedelta
import calendar


from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


from tqdm import tqdm





def parse_post_date(raw_date, today=None):
    """Convert a forum date label into an ISO ``YYYY-MM-DD`` string.

    Two label formats are understood:

    * an English weekday name (``"Monday"`` ... ``"Sunday"``), resolved to the
      most recent such day on or before ``today``;
    * an explicit date written as ``DD-MM-YYYY`` (e.g. ``"25-09-2020"``).

    Args:
        raw_date: The label text; surrounding whitespace is ignored and
            ``None`` is treated as an empty label.
        today: Reference ``datetime`` for resolving weekday names. Defaults to
            ``datetime.now()``; pass a fixed value for reproducible results.

    Returns:
        The date as ``'YYYY-MM-DD'``, or ``'Unknown date'`` when the label
        matches neither format.
    """
    # Fixed English names on purpose: calendar.day_name follows the active
    # locale, so on a non-English locale the labels would never match.
    # Monday = 0 ... Sunday = 6, same as datetime.weekday().
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday']

    if today is None:
        today = datetime.now()

    raw_date = (raw_date or '').strip()

    if raw_date in weekdays:
        # Days elapsed since that weekday; 0 when the label names today's weekday.
        delta_days = (today.weekday() - weekdays.index(raw_date)) % 7
        return (today - timedelta(days=delta_days)).strftime('%Y-%m-%d')

    # Otherwise expect an explicit date such as "25-09-2020".
    try:
        return datetime.strptime(raw_date, '%d-%m-%Y').strftime('%Y-%m-%d')
    except ValueError:
        return 'Unknown date'  # If format is unexpected








# post_variables = ["Post ID", "Post Content", "Post Author", "Post Date", "Post Category", "Number of Comments"]
# Names of the forum categories of interest.
# NOTE(review): not referenced by the code visible in this notebook; the
# scraping loop uses its own tag -> URL mapping instead. Confirm it is needed.
mental_health_type =['Anxiety','Depression','PTSD and trauma','Suicidal thoughts and self-harm']






def _find_in(node, *args, **kwargs):
    """Call ``node.find(...)``, or return None when ``node`` itself is None."""
    return node.find(*args, **kwargs) if node is not None else None


def _tag_text(node):
    """Return the stripped text of a parsed HTML node, or "" when it is None."""
    return node.text.strip() if node is not None else ""


def _parse_beyondblue_post(post):
    """Turn one ``<article>`` element of the discussion list into a record dict.

    Any piece of markup that is missing yields an empty string for that field
    instead of raising, so one malformed post cannot abort the whole scrape.
    """
    # The post id is taken from the last CSS class of the article element.
    post_classes = post.get('class')
    post_id = post_classes[-1] if post_classes else ""

    # The title is the second link inside the first <h3>.
    heading_links = post.find('h3').find_all('a') if post.find('h3') is not None else []
    post_title = heading_links[1].text.strip() if len(heading_links) > 1 else ""

    post_content = _tag_text(post.find('p', class_='body-text'))

    side_info = post.find('aside')

    # Author: <aside> -> author-info div -> <a> -> <span>.
    author_box = _find_in(side_info, 'div', class_='custom-tile-author-info')
    post_author = _tag_text(_find_in(_find_in(author_box, 'a'), 'span'))

    # Category link and date label share the same container.
    category_box = _find_in(side_info, 'div', class_='custom-tile-category-content')
    post_tag = _tag_text(_find_in(category_box, 'a'))
    post_date = parse_post_date(_tag_text(_find_in(category_box, 'time')))

    # The number of comments is kept as the text shown on the page.
    replies_item = post.find('li', class_='custom-tile-replies')
    post_comments = _tag_text(_find_in(replies_item, 'b'))

    return {
        "Post ID": post_id,
        "Post Title": post_title,
        "Post Content": post_content,
        "Post Author": post_author,
        "Post Date": post_date,
        "Post Category": post_tag,
        "Number of Comments": post_comments
    }


def beyondblue_scrapping(tag, address, max_pages=19, page_load_wait=3):
    """Scrape post summaries from a Beyond Blue forum board into a CSV file.

    Opens ``address`` in a Chrome WebDriver, extracts every ``<article>`` of
    the discussion list, then follows the "next page" link until there is
    none or ``max_pages`` pages have been read. The result is written to
    ``./data/beyondblue_data/{tag}_beyondblue_posts.csv``.

    Args:
        tag: Label used in the output file name.
        address: URL of the first listing page.
        max_pages: Upper bound on the number of listing pages to visit.
        page_load_wait: Seconds to sleep after each page load.

    Returns:
        None. The result is the CSV file written to disk.
    """
    # Local imports keep this function usable when only this cell has run.
    import os
    from urllib.parse import urljoin

    # Setup Chrome WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    url = address

    whole_data = []

    try:
        for _ in tqdm(range(max_pages), desc="Scraping pages"):
            driver.get(url)
            time.sleep(page_load_wait)  # Wait for the page to load

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            listing = soup.find('div', class_='custom-message-list all-discussions')
            if listing is None:
                # Stop here but still save what was collected from earlier pages.
                print(f"No discussion list found at {url}; stopping.")
                break

            for post in listing.find_all('article'):
                whole_data.append(_parse_beyondblue_post(post))

            # Find next page link
            next_link = _find_in(soup.find('li', class_='lia-paging-page-next'), 'a')
            if next_link is None or not next_link.get('href'):
                print("No more pages to scrape.")
                break
            # urljoin leaves absolute links untouched and resolves relative ones.
            url = urljoin(url, next_link['href'])
    finally:
        # Always close the browser, even when a page fails mid-scrape;
        # otherwise every failed run leaves a Chrome process behind.
        driver.quit()

    # Explicit columns so that an empty scrape still produces a readable CSV header.
    columns = ["Post ID", "Post Title", "Post Content", "Post Author",
               "Post Date", "Post Category", "Number of Comments"]
    df = pd.DataFrame(whole_data, columns=columns)

    # Save to CSV
    output_path = f'./data/beyondblue_data/{tag}_beyondblue_posts.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")






In [28]:
# Output-file tag -> URL of the first listing page of each forum board.
mental_health_urls = {
    "Anxiety": "https://forums.beyondblue.org.au/t5/anxiety/bd-p/c1-sc2-b1?&sort=recent",
    "Depression": "https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2?&sort=recent",
    "PTSD": "https://forums.beyondblue.org.au/t5/ptsd-and-trauma/bd-p/c1-sc2-b3?&sort=recent",
    "Suicidal_thoughts_and_self-harm": "https://forums.beyondblue.org.au/t5/suicidal-thoughts-and-self-harm/bd-p/c1-sc2-b4?&sort=recent",
}

# Scrape every board; a failure on one board is reported and the loop moves on.
for tag, address in mental_health_urls.items():
    try:
        beyondblue_scrapping(tag, address)
    except Exception as scrape_error:
        print(f"Error scraping {tag}: {scrape_error}")

Scraping pages: 100%|██████████| 4/4 [00:18<00:00,  4.65s/it]


Data saved to ./data/beyondblue_data/Anxiety_beyondblue_posts.csv


Scraping pages: 100%|██████████| 4/4 [00:19<00:00,  4.86s/it]


Data saved to ./data/beyondblue_data/Depression_beyondblue_posts.csv


Scraping pages: 100%|██████████| 4/4 [00:21<00:00,  5.43s/it]


Data saved to ./data/beyondblue_data/PTSD_beyondblue_posts.csv


Scraping pages: 100%|██████████| 4/4 [00:20<00:00,  5.16s/it]


Data saved to ./data/beyondblue_data/Suicidal_thoughts_and_self-harm_beyondblue_posts.csv


In [29]:

# Load the four Beyond Blue CSVs back from disk.
df_anxiety = pd.read_csv('./data/beyondblue_data/Anxiety_beyondblue_posts.csv')
df_depression = pd.read_csv('./data/beyondblue_data/Depression_beyondblue_posts.csv')
df_ptsd = pd.read_csv('./data/beyondblue_data/PTSD_beyondblue_posts.csv')
df_self_harm = pd.read_csv('./data/beyondblue_data/Suicidal_thoughts_and_self-harm_beyondblue_posts.csv')

# Report (rows, columns) for each board, in the same order as loaded.
for forum_frame in (df_anxiety, df_depression, df_ptsd, df_self_harm):
    print(forum_frame.shape)

(40, 7)
(40, 7)
(40, 7)
(40, 7)


Climate data API

# Natural Language Processing
