# Mental Health Analysis Project

The purpose of this project is ...

# Data Scraping

In [None]:
# Broad symptom categories used as labels for posts.
general_symptom_categories = [
    "Depression",
    "Anxiety",
    "Stress/Burnout",
    "Loneliness",
    "Low Self-Esteem",
    "Trauma/PTSD",
    "Anger/Irritability",
    "Obsessive Thoughts",
    "Addiction",
]

# Labels specific to suicide and self-harm related content.
suicide_and_selfharm_labels = [
    "Suicidal Ideation",
    "Suicide Attempt",
    "Self-Harm",
    "Despair",
    "Urgent Help Request",
    "Grief After Suicide",
    "Coping with Suicidal Thoughts",
]


Reddit data

In [None]:
# Scraping data from the Reddit API
import requests
import json
import pandas as pd
import praw

import os
from dotenv import load_dotenv
load_dotenv()




class RedditScraper:
    """Collect Reddit submissions via the OAuth REST API, PRAW, or Pushshift.

    Credentials are read from the environment variables ``CLIENT_ID``,
    ``CLIENT_SECRET``, ``USERNAME`` and ``PASSWORD``.

    NOTE(review): on Windows ``USERNAME`` is normally already set to the OS
    login name, and ``load_dotenv()`` does not override existing variables by
    default -- confirm the Reddit user name is really the value being read.
    """

    # Seconds to wait on a single HTTP call before giving up; without a
    # timeout ``requests`` can block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Request an OAuth token (password grant) and store the auth header.

        Sets ``self.auth``, ``self.data``, ``self.headers`` and ``self.res``
        (the raw token response).

        Raises:
            Exception: if the token endpoint does not answer with HTTP 200.
            KeyError: if the response body carries no ``access_token``.
        """
        self.auth = requests.auth.HTTPBasicAuth(os.getenv('CLIENT_ID'), os.getenv('CLIENT_SECRET'))
        self.data = {'grant_type': 'password',
                     'username': os.getenv('USERNAME'),
                     'password': os.getenv('PASSWORD')}
        self.headers = {'User-Agent': 'MyAPI/0.0.1'}
        self.res = requests.post('https://www.reddit.com/api/v1/access_token',
                                 auth=self.auth, data=self.data, headers=self.headers,
                                 timeout=self.REQUEST_TIMEOUT)

        # Fail with the server's own message instead of an opaque KeyError.
        if self.res.status_code != 200:
            raise Exception(f"Error fetching access token: {self.res.status_code} - {self.res.text}")

        token = self.res.json().get("access_token")
        if not token:
            raise KeyError(f"access_token missing from token response: {self.res.text}")

        self.headers["Authorization"] = f'bearer {token}'

    def get_posts_byrequests(self, subreddit, limit=1000):
        """Fetch the ``hot`` listing of ``subreddit`` through the OAuth API.

        Args:
            subreddit: subreddit name without the ``r/`` prefix.
            limit: value sent as the ``limit`` query parameter.
                NOTE(review): the listing endpoint may cap this per request --
                confirm against the Reddit API docs.

        Returns:
            The ``requests.Response`` object when the status code is 200.

        Raises:
            Exception: for any non-200 status code.
        """
        url = f'https://oauth.reddit.com/r/{subreddit}/hot'
        params = {'limit': limit}
        response = requests.get(url, headers=self.headers, params=params,
                                timeout=self.REQUEST_TIMEOUT)

        if response.status_code == 200:
            return response
        raise Exception(f"Error fetching posts: {response.status_code} - {response.text}")

    @staticmethod
    def _dedupe_by_id(posts):
        """Return ``posts`` with only the first record kept for each ``'id'``."""
        seen_ids = set()
        unique_posts = []
        for record in posts:
            if record['id'] in seen_ids:
                continue
            seen_ids.add(record['id'])
            unique_posts.append(record)
        return unique_posts

    @staticmethod
    def _build_frame(posts):
        """Turn post records into a DataFrame, unique by id, newest first."""
        frame = pd.DataFrame(RedditScraper._dedupe_by_id(posts))
        frame['created_utc'] = pd.to_datetime(frame['created_utc'], unit='s')
        return frame.sort_values(by='created_utc', ascending=False).reset_index(drop=True)

    def get_posts_byprawn(self, subreddits, limit=1000, mental="mental_"):
        """Scrape hot/top/new posts with PRAW and merge them into a CSV file.

        The posts are written to ``./data/reddit_data/{mental}reddit_posts.csv``.
        If that file already exists, its rows are kept and only posts with a
        new ``id`` are appended.

        Args:
            subreddits: iterable of subreddit names.
            limit: maximum number of posts requested per subreddit and sort.
            mental: file-name prefix selecting the output CSV.

        Returns:
            None. Nothing is written when no post was collected.
        """
        reddit = praw.Reddit(client_id=os.getenv('CLIENT_ID'),
                             client_secret=os.getenv('CLIENT_SECRET'),
                             user_agent='windows:mentalhealth.scraper:v1.0 (by u/IceWorth5480)',
                             username=os.getenv('USERNAME'),
                             password=os.getenv('PASSWORD'))

        all_posts = []
        sort_types = ['hot', 'top', 'new']

        for subreddit in subreddits:
            listing_source = reddit.subreddit(subreddit)
            for sort in sort_types:
                # The sort name doubles as the listing method name.
                posts = getattr(listing_source, sort)(limit=limit)

                for post in posts:
                    if post is None:
                        continue
                    all_posts.append({
                        'id': post.id,
                        'subreddit': subreddit,
                        'sort': sort,  # Add sort type for tracking
                        'title': post.title,
                        'selftext': post.selftext,
                        'created_utc': post.created_utc,
                        'score': post.score,
                        'num_comments': post.num_comments,
                        'author': str(post.author),
                        'url': post.url,
                        'over_18': post.over_18,
                        'flair': post.link_flair_text
                    })

        # Nothing scraped: leave any existing CSV untouched instead of
        # failing on the missing 'created_utc' column of an empty frame.
        if not all_posts:
            return

        # Delete duplicates by id. A post that shows up under several sort
        # types keeps only its first occurrence (hot, then top, then new).
        df = self._build_frame(all_posts)

        csv_path = f'./data/reddit_data/{mental}reddit_posts.csv'

        # If the CSV already exists, merge with it and delete duplicates by id
        # (rows already on disk win over freshly scraped ones).
        if os.path.exists(csv_path):
            existing_df = pd.read_csv(csv_path)
            df = pd.concat([existing_df, df]).drop_duplicates(subset='id').reset_index(drop=True)

        # Make sure the target folder exists on a first run, then save as csv.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        df.to_csv(csv_path, index=False)

    def get_posts_byPushshift(self, subreddit, limit=1000):
        """Query the Pushshift submission search and print the raw response.

        Prints the HTTP status code and the response text; returns None.

        NOTE(review): public access to Pushshift appears to have been
        restricted -- confirm the endpoint is still usable.
        """
        url = f'https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&sort=desc&limit={limit}'
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)

        print(response.status_code)
        print(response.text)



In [None]:
# Subreddits focused on mental health topics.
mental_subreddits = [
    'mentalhealth',
    'depression',
    'anxiety',
    'therapy',
    'selfhelp',
    'bpd',
    'ptsd',
    'socialanxiety',
    'counseling',
]
# General-interest subreddits scraped for comparison.
normal_subreddits = [
    'popular',
    'all',
    'AskReddit',
    'interestingasfuck',
]

test = RedditScraper()

# Each call writes (or extends) the CSV selected by the `mental` prefix.
test.get_posts_byprawn(subreddits=mental_subreddits, limit=1000, mental="mental_")
test.get_posts_byprawn(subreddits=normal_subreddits, limit=1000, mental="normal_")

In [2]:
# Load the two scraped data sets back from disk.
df_mental = pd.read_csv('./data/reddit_data/mental_reddit_posts.csv')
df_normal = pd.read_csv('./data/reddit_data/normal_reddit_posts.csv')

# Report (rows, columns) of each data set, one per line.
print(df_mental.shape, df_normal.shape, sep="\n")

(15877, 12)
(6878, 12)


Beyond Blue forums 

In [None]:
'''
TO DO

Extract the following information from the Beyond Blue forum webpage


Post ID: A unique identifier for each post.
Post Content: The text of the post.
Post Author: The author of the post.
Post Date: The date the post was made.
Post Category: Category or forum where the post was made.
Number of Comments: The total number of comments on the post.

From Comment 

Post ID: Link back to the original post.
Comment ID: A unique identifier for each comment.
Comment Content: Text of the comment.
Comment Author: Author of the comment.
Comment Date: Date the comment was posted. (the order of the comments is really important)
other meta data if available
'''


import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time


from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


from tqdm import tqdm






post_variables = ["Post ID", "Post Content", "Post Author", "Post Date", "Post Category", "Number of Comments"]


# def beyondblue_scrapping():


# Setup Chrome WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

try:
    # Scrape the "anxiety" board of the Beyond Blue forums, most recent first.
    url = "https://forums.beyondblue.org.au/t5/anxiety/bd-p/c1-sc2-b1?&sort=recent"
    driver.get(url)
    time.sleep(3)  # Wait for the page to load

    # Parse a snapshot of the rendered page; the browser is not needed afterwards.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the driver, even if loading the page failed.
    driver.quit()


# Get the container with class "custom-message-list all-discussions" and
# extract all the article elements within it (none if the container is missing).
discussion_list = soup.find('div', class_='custom-message-list all-discussions')
discussions = discussion_list.find_all('article') if discussion_list else []

# One dict per post, so the parsed values are not lost between iterations.
beyondblue_posts = []

for post in discussions:

    # NOTE(review): the last CSS class looks like "custom-message-number-0",
    # which may be a position on the page rather than a unique id; the number
    # at the end of the title link's href may be a better id -- confirm.
    css_classes = post.get('class') or []
    post_id = css_classes[-1] if css_classes else ""

    # In the captured markup the <h3> holds two links: the author avatar
    # first, then the thread title.
    heading = post.find('h3')
    heading_links = heading.find_all('a') if heading else []
    title_tag = heading_links[1] if len(heading_links) > 1 else None
    post_title = title_tag.text.strip() if title_tag else ""

    # The captured markup uses <p class="truncate-text"> for the excerpt;
    # "body-text" is kept as a fallback.
    content_tag = post.find('p', class_='truncate-text') or post.find('p', class_='body-text')
    post_content = content_tag.text.strip() if content_tag else ""

    aside = post.find('aside')
    side_info = aside.find('div', class_='custom-title-author-info') if aside else None
    author_tag = side_info.find('span') if side_info else None
    post_author = author_tag.text.strip() if author_tag else ""

    post_date = None

    post_tag = None

    # Reply count is rendered as <li class="custom-tile-replies">...<b>3</b></li>.
    replies_item = post.find('li', class_='custom-tile-replies')
    replies_value = replies_item.find('b') if replies_item else None
    replies_text = replies_value.text.strip() if replies_value else ""
    post_reply = int(replies_text) if replies_text.isdigit() else None

    post_comments = None

    beyondblue_posts.append({
        'post_id': post_id,
        'post_title': post_title,
        'post_content': post_content,
        'post_author': post_author,
        'post_date': post_date,
        'post_tag': post_tag,
        'post_reply': post_reply,
        'post_comments': post_comments,
    })






[<article class="custom-message-tile custom-thread-unread custom-message-number-0">
<div>
<h3>
<a class="UserAvatar lia-link-navigation" href="/t5/user/viewprofilepage/user-id/55389" title="View profile">
<img alt="Guest_28908038" class="lia-user-avatar-message" src="https://forums.beyondblue.org.au/t5/image/serverpage/avatar-name/default_messageCopy/avatar-theme/candy/avatar-collection/Default_Avatar/avatar-display-size/profile/version/2?xdesc=1.0"/>
</a>
<a href="/t5/anxiety/apprentice-mechanic/td-p/611132" title="Apprentice mechanic">Apprentice mechanic</a>
<ul class="custom-tile-statistics">
<li class="custom-tile-replies"><span>replies: </span><b>3</b></li>
</ul>
</h3>
<p class="truncate-text">
                    I'm 4 months in to my apprenticeship and em struggling all day just to
get to the end I'm struggling with tasks that I have done for years with
no issues I em constantly worried that I'm going to forget something
blow a engine up leave a sump plug lo...
                 

Climate data API

# Natural Language Processing
