# Mental Health Analysis ProjectA 

The purpose of this project is ...

# Data Scraping

Reddit data

In [1]:
# scrapping data from reddit api
import requests
import pandas as pd
# import praw
import emoji
import emot
import asyncpraw
# import asyncio
from tqdm import tqdm


import os
from dotenv import load_dotenv
load_dotenv()




class RedditScraper:
    """Scrape Reddit posts via the OAuth REST API (``requests``) or asyncpraw.

    Credentials come from the ``CLIENT_ID``, ``CLIENT_SECRET``, ``USERNAME``
    and ``PASSWORD`` environment variables. Creating an instance immediately
    performs a network request to obtain an OAuth access token.

    NOTE(review): on Windows ``USERNAME`` is normally already set to the OS
    login name, and ``load_dotenv()`` keeps existing variables unless called
    with ``override=True`` -- confirm the Reddit username is what gets read.
    """

    TOKEN_URL = 'https://www.reddit.com/api/v1/access_token'
    # Seconds before an HTTP call made through ``requests`` gives up instead
    # of blocking the notebook forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Read the credentials and fetch a bearer token for ``self.headers``.

        Raises:
            KeyError: if the token endpoint's JSON reply has no
                ``access_token`` entry (e.g. wrong credentials).
        """
        self.client_id = os.getenv('CLIENT_ID')
        self.client_secret = os.getenv('CLIENT_SECRET')
        self.username = os.getenv('USERNAME')
        self.password = os.getenv('PASSWORD')

        self.auth = requests.auth.HTTPBasicAuth(self.client_id, self.client_secret)
        self.data = {'grant_type': 'password',
                     'username': self.username,
                     'password': self.password}
        self.headers = {'User-Agent': 'MyAPI/0.0.1'}
        self.res = requests.post(self.TOKEN_URL,
                                 auth=self.auth, data=self.data, headers=self.headers,
                                 timeout=self.REQUEST_TIMEOUT)

        payload = self.res.json()
        if 'access_token' not in payload:
            # Same exception type as the bare lookup, but with the server's
            # answer attached so the cause is visible.
            raise KeyError(f"Reddit token response has no 'access_token': {payload}")
        self.headers["Authorization"] = f'bearer {payload["access_token"]}'

    def get_posts_byrequests(self, subreddit, limit=1000):
        """Fetch the ``hot`` listing of ``subreddit`` through the REST API.

        Args:
            subreddit: subreddit name without the ``r/`` prefix.
            limit: value sent as the ``limit`` query parameter.
                NOTE(review): Reddit listings are believed to cap ``limit``
                at 100 per request -- confirm before relying on 1000.

        Returns:
            The ``requests.Response`` when the status code is 200.

        Raises:
            Exception: for any other status code.
        """
        url = f'https://oauth.reddit.com/r/{subreddit}/hot'
        params = {'limit': limit}
        response = requests.get(url, headers=self.headers, params=params,
                                timeout=self.REQUEST_TIMEOUT)

        if response.status_code != 200:
            raise Exception(f"Error fetching posts: {response.status_code} - {response.text}")
        return response

    def convert_emojis_emoticons(self, text):
        """Replace emoticons and emojis in ``text`` with words.

        Emoticons found by ``emot`` are replaced by their meaning padded with
        spaces, emojis are converted with ``emoji.demojize``, and the result
        is stripped and lower-cased.
        """
        e = emot.core.emot()

        # Extract emoticons
        emoticon_results = e.emoticons(text)
        for original, meaning in zip(emoticon_results['value'], emoticon_results['mean']):
            text = text.replace(original, f" {meaning} ")

        text = emoji.demojize(text)

        return text.strip().lower()

    @staticmethod
    def _save_posts(all_posts, mental):
        """Merge ``all_posts`` into ``./data/reddit_data/{mental}reddit_posts.csv``.

        Rows are de-duplicated by ``id``, sorted newest first and the list of
        top comments is flattened to a ``' | '`` separated string. Rows that
        already exist in the CSV win over freshly scraped duplicates.

        Returns:
            The CSV path, or ``None`` when ``all_posts`` is empty (nothing is
            written in that case).
        """
        csv_path = f'./data/reddit_data/{mental}reddit_posts.csv'
        if not all_posts:
            # An empty frame has no 'id' column, so the steps below would
            # raise KeyError.
            print("No posts collected; nothing saved.")
            return None

        df = pd.DataFrame(all_posts)
        # Drop duplicates by post ID
        df = df.drop_duplicates(subset='id').reset_index(drop=True)
        df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
        df = df.sort_values(by='created_utc', ascending=False).reset_index(drop=True)
        # Convert list of comments to string for CSV storage
        df['top_comments'] = df['top_comments'].apply(
            lambda x: ' | '.join(x) if isinstance(x, list) else '')

        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        if os.path.exists(csv_path):
            # Read ids as text: an all-digit id column would otherwise come
            # back as integers and never match the freshly scraped strings.
            existing_df = pd.read_csv(csv_path, dtype={'id': str})
            df = pd.concat([existing_df, df]).drop_duplicates(subset='id').reset_index(drop=True)

        df.to_csv(csv_path, index=False)
        return csv_path

    async def get_posts_byprawn(self, subreddits, limit=1000, mental="mental_"):
        """Scrape posts plus their first five top-level comments with asyncpraw.

        Args:
            subreddits: iterable of subreddit names.
            limit: ``limit`` passed to each subreddit listing.
            mental: prefix of the output file
                ``./data/reddit_data/{mental}reddit_posts.csv``.

        Whatever was collected is written to the CSV even when scraping is
        interrupted by an exception, and the asyncpraw client is always
        closed; the exception then propagates to the caller.
        """
        reddit = asyncpraw.Reddit(client_id=self.client_id,
                                  client_secret=self.client_secret,
                                  user_agent='windows:mentalhealth.scraper:v1.0 (by u/IceWorth5480)',
                                  username=self.username,
                                  password=self.password)

        all_posts = []
        # Listing methods to call on each subreddit; 'top' and 'new' are the
        # other names this loop supports.
        sort_types = ['hot']

        try:
            for subreddit_name in tqdm(subreddits, desc="Subreddits Progress"):
                subreddit = await reddit.subreddit(subreddit_name)
                for sort in sort_types:
                    # subreddit.hot / subreddit.top / subreddit.new
                    posts = getattr(subreddit, sort)(limit=limit)

                    async for post in posts:
                        if post is None:
                            continue

                        # Load top-level comments (non-blocking)
                        await post.load()
                        await post.comments.replace_more(limit=0)
                        top_comments_raw = [comment.body for comment in post.comments[:5]]
                        top_comments = [self.convert_emojis_emoticons(c) for c in top_comments_raw]

                        all_posts.append({
                            'id': post.id,
                            'subreddit': subreddit_name,
                            'sort': sort,
                            'title': post.title,
                            'selftext': self.convert_emojis_emoticons(post.selftext),
                            'created_utc': post.created_utc,
                            'score': post.score,
                            'num_comments': post.num_comments,
                            'author': str(post.author),
                            'post_url': post.url,
                            'over_18': post.over_18,
                            'flair': post.link_flair_text,
                            'top_comments': top_comments
                        })
        finally:
            try:
                await reddit.close()
            finally:
                # Runs on success and on failure so a long scrape is not lost.
                self._save_posts(all_posts, mental)







In [6]:




# , 'depression', 'anxiety', 'therapy', 'selfhelp', 'bpd', 'ptsd', 'socialanxiety', 'counseling'
mental_subreddits = ['mentalhealth', 'depression', 'anxiety', 'therapy', 'selfhelp', 'bpd', 'ptsd', 'socialanxiety', 'counseling']
australian_regions = ['melbourne','sydney','adelaide','perth','brisbane','canberra']
normal_subreddits = ['popular','all','AskReddit','interestingasfuck']



scraper  = RedditScraper()

# await scraper.get_posts_byprawn(mental_subreddits, limit=1000, mental="mental_")
await scraper.get_posts_byprawn(normal_subreddits, limit=1000, mental="normal_")




Subreddits Progress: 100%|██████████| 4/4 [1:20:33<00:00, 1208.28s/it]


In [None]:
# Read both scraped Reddit CSV files back in and print their (rows, columns)
# shapes as a quick sanity check.

df_mental = pd.read_csv('./data/reddit_data/mental_reddit_posts.csv')
print(df_mental.shape)

df_normal = pd.read_csv('./data/reddit_data/normal_reddit_posts.csv')
print(df_normal.shape)

(7646, 13)
(180, 13)


Beyond Blue forums 

In [2]:
'''
TO DO

Extract the following information from the Beyond Blue forum pages


Post ID: A unique identifier for each post.
Post Content: The text of the post.
Post Author: The author of the post.
Post Date: The date the post was made.
Post Category: Category or forum where the post was made.
Number of Comments: The total number of comments on the post.

From Comment 

Post ID: Link back to the original post.
Comment ID: A unique identifier for each comment.
Comment Content: Text of the comment.
Comment Author: Author of the comment.
Comment Date: Date the comment was posted. (the order of the comments is really important)
other meta data if available
'''



from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime, timedelta
import calendar
import os


from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


from tqdm import tqdm
import emoji
import emot





def parse_post_date(raw_date, today=None):
    """Convert a forum date label into a ``YYYY-MM-DD`` string.

    Supported labels (case-insensitive, surrounding whitespace ignored):

    * an English weekday name -> the most recent such day, today included;
    * anything containing ``week`` ("a week ago", "2 weeks ago");
    * anything containing ``month`` (a month is approximated as 30 days);
    * an exact ``DD-MM-YYYY`` date;
    * ``today`` / ``just now`` / ``yesterday`` and
      ``<a|an|N> <second|minute|hour|day|year>(s) ago``
      (a year is approximated as 365 days).

    Args:
        raw_date: the label text.
        today: reference "now" as a ``datetime``; defaults to
            ``datetime.now()``. Pass a fixed value for reproducible results.

    Returns:
        The date as ``'YYYY-MM-DD'``, or ``'Unknown date'`` when the label
        matches none of the formats above.
    """
    # Spelled out rather than taken from calendar.day_name, which follows the
    # process locale while the forum labels are English.
    weekdays = ('monday', 'tuesday', 'wednesday', 'thursday',
                'friday', 'saturday', 'sunday')
    today = datetime.now() if today is None else today

    raw_date = raw_date.strip().lower()

    # Case 1: Day of the week (e.g., "Monday"); Monday = 0, Sunday = 6
    if raw_date in weekdays:
        delta_days = (today.weekday() - weekdays.index(raw_date)) % 7
        post_date = today - timedelta(days=delta_days)

    # Case 2: "a week ago", "2 weeks ago", etc. -- no number means one
    elif "week" in raw_date:
        match = re.search(r'(\d+)', raw_date)
        weeks = int(match.group(1)) if match else 1
        post_date = today - timedelta(weeks=weeks)

    # Case 3: "a month ago", "2 months ago", etc.
    elif "month" in raw_date:
        match = re.search(r'(\d+)', raw_date)
        months = int(match.group(1)) if match else 1
        # Approximate a month as 30 days
        post_date = today - timedelta(days=30 * months)

    else:
        try:
            # Case 4: Exact date format like "25-09-2020"
            post_date = datetime.strptime(raw_date, '%d-%m-%Y')
        except ValueError:
            # Case 5: remaining relative labels
            if raw_date in ('today', 'just now'):
                post_date = today
            elif raw_date == 'yesterday':
                post_date = today - timedelta(days=1)
            else:
                match = re.fullmatch(
                    r'(an?|\d+)\s+(second|minute|hour|day|year)s?\s+ago', raw_date)
                if match is None:
                    return 'Unknown date'  # If format is unexpected
                amount = 1 if match.group(1) in ('a', 'an') else int(match.group(1))
                unit = match.group(2)
                if unit == 'year':
                    post_date = today - timedelta(days=365 * amount)
                else:
                    post_date = today - timedelta(**{f'{unit}s': amount})

    return post_date.strftime('%Y-%m-%d')




def convert_emojis_emoticons(text):
    """Return ``text`` with emoticons and emojis spelled out in words.

    Each emoticon reported by ``emot`` is swapped for its meaning (padded
    with a space on both sides), emojis are rewritten by ``emoji.demojize``,
    and the final string is stripped and lower-cased.
    """
    detector = emot.core.emot()
    found = detector.emoticons(text)

    # Pair every detected emoticon with its textual meaning.
    replacements = zip(found['value'], found['mean'])
    for symbol, description in replacements:
        text = text.replace(symbol, f" {description} ")

    demojized = emoji.demojize(text)
    return demojized.strip().lower()




def comment_scrapping(url, comment_pages=1, max_comments=3):
    """Scrape the first messages of a Beyond Blue thread with Selenium.

    Args:
        url: absolute URL of the thread page.
        comment_pages: maximum number of thread pages to visit.
        max_comments: total number of messages to collect across all pages.

    Returns:
        The collected message texts (emojis/emoticons converted to words)
        joined with ``' | '``; an empty string when nothing was found.

    The Chrome driver is always shut down, even when loading or parsing a
    page raises.
    """
    # Setup Chrome WebDriver
    service = Service(ChromeDriverManager().install())
    comment_driver = webdriver.Chrome(service=service)

    list_comments = []

    try:
        for page in range(1, comment_pages + 1):

            comment_driver.get(url)
            time.sleep(0.005)
            soup = BeautifulSoup(comment_driver.page_source, 'html.parser')
            comments_section = soup.find('div', class_='linear-message-list message-list')
            if comments_section is None:
                # Page without a message list: nothing to read here.
                print("No message list found on the page.")
                break

            # Select the message rows themselves. Walking every nested <div>
            # and taking its first descendant row yields the same message
            # once per wrapper level.
            main_sections = comments_section.find_all(
                'div', class_='lia-quilt-row lia-quilt-row-message-main')

            for main_section in main_sections:
                comment_text = main_section.get_text(separator=' ', strip=True)
                list_comments.append(convert_emojis_emoticons(comment_text))

                if len(list_comments) >= max_comments:
                    break

            # The cap applies to the whole thread, so stop paging as well.
            if len(list_comments) >= max_comments:
                break

            # Find next page link
            next_page = soup.find('li', class_='lia-paging-page-next')
            if next_page and next_page.find('a'):
                url = next_page.find('a')['href']
            else:
                print("No more pages to scrape.")
                break
    finally:
        # close the driver
        comment_driver.quit()

    # change the list of comments to a string ('' when the list is empty)
    return ' | '.join(list_comments)


        



def _parse_beyondblue_post(post):
    """Read the listing-page fields of one ``<article>`` tile.

    Returns ``(post_link, row)`` where ``row`` holds every output column
    except ``"Comments"``, or ``None`` when the tile's ``<h3>`` has no second
    link with an ``href`` (no post id can be derived then). Missing optional
    elements produce empty strings instead of raising.
    """
    heading = post.find('h3')
    anchors = heading.find_all('a') if heading is not None else []
    if len(anchors) < 2 or not anchors[1].get('href'):
        return None
    title_tag = anchors[1]
    post_link = title_tag.get('href')

    body = post.find('p', class_='body-text')

    side_info = post.find('aside')
    author_box = (side_info.find('div', class_='custom-tile-author-info')
                  if side_info is not None else None)
    author_anchor = author_box.find('a') if author_box is not None else None
    author_span = author_anchor.find('span') if author_anchor is not None else None
    author_href = author_anchor.get('href') if author_anchor is not None else None

    category_box = (side_info.find('div', class_='custom-tile-category-content')
                    if side_info is not None else None)
    category_anchor = category_box.find('a') if category_box is not None else None
    time_tag = category_box.find('time') if category_box is not None else None

    replies = post.find('li', class_='custom-tile-replies')
    replies_count = replies.find('b') if replies is not None else None

    raw_date = time_tag.text.strip() if time_tag is not None else ""

    row = {
        "Post ID": post_link.split('/')[-1],
        "Post Title": convert_emojis_emoticons(title_tag.text.strip()),
        "Post Content": convert_emojis_emoticons(body.text.strip()) if body is not None else "",
        "Post Author": author_span.text.strip() if author_span is not None else "",
        # Extracting user id from author link
        "User ID": author_href.split('user-id/')[-1] if author_href else "",
        "Post Date": parse_post_date(raw_date),
        "Post Category": category_anchor.text.strip() if category_anchor is not None else "",
        "Number of Comments": replies_count.text.strip() if replies_count is not None else "",
    }
    return post_link, row


def _save_beyondblue_posts(tag, rows):
    """Merge ``rows`` into ``./data/beyondblue_data/{tag}_beyondblue_posts.csv``.

    Rows are de-duplicated by ``Post ID`` and sorted newest first; rows that
    are already in the CSV win over freshly scraped duplicates.

    Returns the CSV path, or ``None`` when ``rows`` is empty (nothing is
    written in that case).
    """
    csv_path = f'./data/beyondblue_data/{tag}_beyondblue_posts.csv'
    if not rows:
        # An empty frame has no 'Post ID' column to de-duplicate on.
        return None

    df = pd.DataFrame(rows)
    df['Post ID'] = df['Post ID'].astype(str)
    # delete duplicates by Post ID
    df = df.drop_duplicates(subset='Post ID').reset_index(drop=True)
    # Convert 'Post Date' to datetime format ('Unknown date' becomes NaT)
    df['Post Date'] = pd.to_datetime(df['Post Date'], errors='coerce')
    # Sort by 'Post Date' in descending order
    df = df.sort_values(by='Post Date', ascending=False).reset_index(drop=True)

    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    if os.path.exists(csv_path):
        # Read ids as text: numeric ids otherwise come back as integers, never
        # equal the scraped strings, and every merge keeps both copies.
        existing_df = pd.read_csv(csv_path, dtype={'Post ID': str})
        df = pd.concat([existing_df, df]).drop_duplicates(subset='Post ID').reset_index(drop=True)

    df.to_csv(csv_path, index=False)
    return csv_path


def beyondblue_scrapping(tag, address, pages=2):
    """Scrape a Beyond Blue forum board and store the posts as CSV.

    Args:
        tag: label used in the output file name
            ``./data/beyondblue_data/{tag}_beyondblue_posts.csv``.
        address: URL of the first listing page of the board.
        pages: maximum number of listing pages to walk through.

    Every post tile is parsed, its thread is opened with
    ``comment_scrapping`` to collect the first messages, and the result is
    merged into the CSV every 50 pages and once more at the end. The final
    save and the driver shutdown also run when scraping raises, so partial
    results are kept; the exception then propagates to the caller.
    """
    # Setup Chrome WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    url = address

    whole_data = []

    try:
        for page in tqdm(range(1, pages + 1), desc="Scraping pages"):

            driver.get(url)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # get the class "custom-message-list all-discussions"
            listing = soup.find('div', class_='custom-message-list all-discussions')
            if listing is None:
                print(f"No discussion list found on page {page} for tag {tag}.")
                break

            for post in listing.find_all('article'):
                parsed = _parse_beyondblue_post(post)
                if parsed is None:
                    # Tile without a usable thread link; nothing to record.
                    continue
                post_link, post_data = parsed

                # extract comments from post_link
                full_post_link = f"https://forums.beyondblue.org.au{post_link}"
                post_data["Comments"] = comment_scrapping(full_post_link, comment_pages=1)

                whole_data.append(post_data)

            # Find next page link
            next_page = soup.find('li', class_='lia-paging-page-next')
            if next_page and next_page.find('a'):
                url = next_page.find('a')['href']
            else:
                print("No more pages to scrape.")
                break

            if page % 50 == 0:
                # Periodic checkpoint so a crash costs at most 50 pages.
                _save_beyondblue_posts(tag, whole_data)
                print(f'The data for tag {tag} has been scraped to page {page}')
    finally:
        try:
            # close the driver
            driver.quit()
        finally:
            csv_path = _save_beyondblue_posts(tag, whole_data)
            if csv_path is None:
                print(f"No posts were scraped for tag {tag}; nothing saved.")
            else:
                print(f"Data saved to {csv_path}")






In [None]:
# post_variables = ["Post ID", "Post Content", "Post Author", "Post Date", "Post Category", "Number of Comments"]
# mental_health_type =['Anxiety','Depression','PTSD and trauma','Suicidal thoughts and self-harm']

# Board label -> first listing page (most recent first). Commented-out
# entries are switched off for this run.
mental_health_urls = {
    # "Anxiety": "https://forums.beyondblue.org.au/t5/anxiety/bd-p/c1-sc2-b1?&sort=recent",
    # "Depression": "https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2?&sort=recent",
    # "PTSD": "https://forums.beyondblue.org.au/t5/ptsd-and-trauma/bd-p/c1-sc2-b3?&sort=recent",
    "Suicide_selfharm": "https://forums.beyondblue.org.au/t5/suicidal-thoughts-and-self-harm/bd-p/c1-sc2-b4?&sort=recent",
    # "Staying_well": "https://forums.beyondblue.org.au/t5/staying-well/bd-p/c1-sc3-b1?&sort=recent",
    # "Treament": "https://forums.beyondblue.org.au/t5/treatments-health-professionals/bd-p/c1-sc3-b2?&sort=recent",
    # "Relationship_family_issues":"https://forums.beyondblue.org.au/t5/relationship-and-family-issues/bd-p/c1-sc3-b3?&sort=recent",
    "Youth": "https://forums.beyondblue.org.au/t5/young-people/bd-p/c1-sc4-b1?&sort=recent",
    "Sex_identity": "https://forums.beyondblue.org.au/t5/sexuality-and-gender-identity/bd-p/c1-sc4-b2?&sort=recent",
    "Multiculture": "https://forums.beyondblue.org.au/t5/multicultural-experiences/bd-p/c1-sc4-b3?&sort=recent",
    "Grief_loss": "https://forums.beyondblue.org.au/t5/grief-and-loss/bd-p/c1-sc4-b4?&sort=recent",
}

# Target number of posts per board, converted to listing pages
# (assuming each page has 10 posts).
each_tag_number_data = 1500
pages = each_tag_number_data // 10

# A failure on one board is reported and the next board is still scraped.
for tag, address in mental_health_urls.items():
    try:
        beyondblue_scrapping(tag, address, pages=pages)
    except Exception as e:
        print(f"Error scraping {tag}: {e}")

Scraping pages:  33%|███▎      | 50/150 [1:09:34<2:19:39, 83.79s/it]

The data for tag Anxiety has been scraped to page 50


Scraping pages:  67%|██████▋   | 100/150 [2:18:21<1:09:11, 83.02s/it]

The data for tag Anxiety has been scraped to page 100


Scraping pages: 100%|██████████| 150/150 [3:27:40<00:00, 83.07s/it]  

The data for tag Anxiety has been scraped to page 150





Data saved to ./data/beyondblue_data/Anxiety_beyondblue_posts.csv


  temp_df['Post Date'] = pd.to_datetime(temp_df['Post Date'], errors='coerce')
Scraping pages:  33%|███▎      | 50/150 [1:09:02<2:17:28, 82.48s/it]

The data for tag Depression has been scraped to page 50


  temp_df['Post Date'] = pd.to_datetime(temp_df['Post Date'], errors='coerce')
Scraping pages:  67%|██████▋   | 100/150 [2:17:26<1:09:42, 83.66s/it]

The data for tag Depression has been scraped to page 100


  temp_df['Post Date'] = pd.to_datetime(temp_df['Post Date'], errors='coerce')
Scraping pages: 100%|██████████| 150/150 [3:26:46<00:00, 82.71s/it]

The data for tag Depression has been scraped to page 150



  df['Post Date'] = pd.to_datetime(df['Post Date'], errors='coerce')


Data saved to ./data/beyondblue_data/Depression_beyondblue_posts.csv


Scraping pages:  33%|███▎      | 50/150 [1:08:19<2:18:45, 83.26s/it]

The data for tag PTSD has been scraped to page 50


Scraping pages:  67%|██████▋   | 100/150 [2:17:02<1:09:31, 83.44s/it]

The data for tag PTSD has been scraped to page 100


Scraping pages: 100%|██████████| 150/150 [3:25:18<00:00, 82.13s/it]  

The data for tag PTSD has been scraped to page 150





Data saved to ./data/beyondblue_data/PTSD_beyondblue_posts.csv


Scraping pages:   0%|          | 0/150 [00:45<?, ?it/s]


KeyboardInterrupt: 

In [9]:

# Read the scraped Anxiety board CSV back in and print its (rows, columns)
# shape as a sanity check.

df_anxiety = pd.read_csv('./data/beyondblue_data/Anxiety_beyondblue_posts.csv')
print(df_anxiety.shape)
# print(df_anxiety)


(3000, 9)


Climate data API

In [9]:

import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from tqdm import tqdm

# Collect, for every (weather type, city) pair, the "All years of data" ZIP
# link from the BOM climate-data page by driving the search form with
# Selenium, then store the links in ./data/AUS_weather/complement.csv.
url = "https://reg.bom.gov.au/climate/data/"

# Full configuration (kept for reference):
# weather_types = ['Rainfall', 'Temp_Max', 'Temp_Min', 'Solar exposure']
# australian_regions = ['NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT']
# australian_citys = ['Sydney', 'Melbourne', 'Brisbane', 'Perth', 'Adelaide', 'Hobart', 'Canberra', 'Darwin']

# Subset processed by this run; regions and cities are paired by position.
weather_types = ['Temp_Min']
australian_regions = ['SA', 'TAS']
australian_citys = ['Adelaide', 'Hobart']

# '{region}_{city}_{weather_type}' -> ZIP download URL
zip_url = {}

for weather_type in weather_types:
    for region, city in zip(australian_regions, australian_citys):
        # A fresh Chrome session is started for every (weather type, city) pair.
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service)

        try:
            driver.get(url)
            wait = WebDriverWait(driver, 10)

            # Select weather type
            data_about_dropdown = wait.until(EC.presence_of_element_located((By.ID, 'ncc_obs_code_group')))

            # NOTE(review): the XPaths below start with '//', which searches the
            # whole document even when called on an element; './/option[...]'
            # would restrict the search to the dropdown -- confirm and tighten.
            if weather_type == 'Rainfall':
                data_about_dropdown.find_element(By.XPATH, "//option[text()='Rainfall']").click()

            elif weather_type == 'Temp_Max':
                data_about_dropdown.find_element(By.XPATH, "//option[text()='Temperature']").click()

                # Wait for the secondary dropdown to become visible
                sub_select = wait.until(EC.visibility_of_element_located((By.ID, 'elementSubSelectLine')))
                time.sleep(1)

                # Select Maximum temperature in the secondary dropdown
                element_select = driver.find_element(By.ID, 'elementSubSelect')
                element_select.find_element(By.XPATH, "//option[text()='Maximum temperature']").click()

            elif weather_type == 'Temp_Min':
                data_about_dropdown.find_element(By.XPATH, "//option[text()='Temperature']").click()

                # Wait for the secondary dropdown to become visible
                sub_select = wait.until(EC.visibility_of_element_located((By.ID, 'elementSubSelectLine')))
                time.sleep(1)

                # Select Minimum temperature in the secondary dropdown
                element_select = driver.find_element(By.ID, 'elementSubSelect')
                element_select.find_element(By.XPATH, "//option[text()='Minimum temperature']").click()


            elif weather_type == 'Solar exposure':
                data_about_dropdown.find_element(By.XPATH, "//option[text()='Solar exposure']").click()

            # The fixed sleeps below give the page time to react to each step.
            time.sleep(1)

            # Input city
            location_input = driver.find_element(By.ID, 'p_locSearch')
            location_input.clear()
            location_input.send_keys(city)

            # Click Find button
            find_button = driver.find_element(By.ID, 'text')
            find_button.click()

            time.sleep(4)

            # Select the first matching town
            match_list = wait.until(EC.presence_of_element_located((By.ID, 'matchList')))
            match_list.find_elements(By.TAG_NAME, 'option')[0].click()

            time.sleep(1)

            # Tick 'Only show open stations'
            open_station_checkbox = driver.find_element(By.ID, 'openStation')
            if not open_station_checkbox.is_selected():
                open_station_checkbox.click()

            time.sleep(2)

            # Select the first available station
            nearest_stations = wait.until(EC.presence_of_element_located((By.ID, 'nearest10')))
            station_options = nearest_stations.find_elements(By.TAG_NAME, 'option')
            if len(station_options) == 0:
                print(f'No stations found for {city} - {weather_type}')
                # NOTE(review): the finally clause below quits the driver again
                # on this path, so this explicit quit looks redundant.
                driver.quit()
                continue

            station_options[0].click()

            time.sleep(1)

            # Click Get Data button
            get_data_button = driver.find_element(By.ID, 'getData')
            time.sleep(5)
            get_data_button.click()

            time.sleep(5)  # Allow time for the new page to load

            # Switch to the most recently opened window/tab (the data page)
            driver.switch_to.window(driver.window_handles[-1])

            # Wait for the downloads list to appear
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'downloads')))

            # Scrape the All Years of Data link
            all_years_link = driver.find_element(By.LINK_TEXT, 'All years of data')
            href = all_years_link.get_attribute('href')

            # Save the link
            zip_url_key = f'{region}_{city}_{weather_type}'
            zip_url[zip_url_key] = href

            print(f'Collected ZIP URL for {city} - {weather_type}: {href}')

            driver.close()  # Close the data tab
            driver.switch_to.window(driver.window_handles[0])  # Switch back to the main tab

        except Exception as e:
            # Report the failure and carry on with the next pair.
            print(f'Error processing {city} - {weather_type}: {e}')
        finally:
            # Always end the browser session for this pair.
            driver.quit()

# Display all collected URLs
print("\nAll collected ZIP URLs:")
for key, link in zip_url.items():
    print(f'{key}: {link}')


# Save the URLs to a CSV file (the target directory must already exist)
df_urls = pd.DataFrame(list(zip_url.items()), columns=['Region_City_WeatherType', 'ZIP_URL'])
df_urls.to_csv('./data/AUS_weather/complement.csv', index=False)

Collected ZIP URL for Adelaide - Temp_Min: https://reg.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile&p_stn_num=023000&p_c=-105845614&p_nccObsCode=123&p_startYear=2025
Collected ZIP URL for Hobart - Temp_Min: https://reg.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile&p_stn_num=094029&p_c=-1768336182&p_nccObsCode=123&p_startYear=2025

All collected ZIP URLs:
SA_Adelaide_Temp_Min: https://reg.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile&p_stn_num=023000&p_c=-105845614&p_nccObsCode=123&p_startYear=2025
TAS_Hobart_Temp_Min: https://reg.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile&p_stn_num=094029&p_c=-1768336182&p_nccObsCode=123&p_startYear=2025


In [None]:


# Planned steps for each region/city pair:
#   1. download the ZIP file from the BOM website and extract the data,
#   2. save the data to a CSV file named after the region and city,
#   3. delete the ZIP file after extracting the data.

for region, city in zip(australian_regions, australian_citys):
    # TODO: not implemented yet. The explicit body keeps the cell valid
    # Python (a `for` statement without a body is a SyntaxError).
    pass
    



# Natural Language Processing


In [10]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------------------- -------------------- 6.3/12.8 MB 38.6 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 33.5 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [11]:
import nltk

# Resources used by the preprocessing class defined later in this notebook.
nltk.download('stopwords')
nltk.download('punkt')  # For tokenization
# Recent NLTK releases load the 'punkt_tab' tables in word_tokenize instead
# of the pickled 'punkt' models, so fetch both to work with either version.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [1]:

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
import emoji
import emot

class NLP_OPERATORS:
    """Text cleaning and tokenisation helpers built on NLTK and spaCy."""

    def __init__(self):
        """Load the English stop-word list and the spaCy pipeline."""
        self.stop_words = set(stopwords.words('english'))
        self.nlp = spacy.load('en_core_web_sm', disable=["parser", "ner", "textcat"])
        self.nlp.max_length = 5_000_000

    def convert_emojis_emoticons(self, text):
        """Replace emoticons and emojis in ``text`` with words.

        Emoticons found by ``emot`` are replaced by their meaning padded with
        spaces, emojis are converted with ``emoji.demojize``, and the result
        is stripped and lower-cased.
        """
        e = emot.core.emot()

        # Extract emoticons
        emoticon_results = e.emoticons(text)
        for original, meaning in zip(emoticon_results['value'], emoticon_results['mean']):
            text = text.replace(original, f" {meaning} ")

        # Convert emojis to text
        text = emoji.demojize(text)

        return text.strip().lower()

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string; ``None`` and float NaN become ``''``.

        Empty CSV cells are read back by pandas as NaN floats, which the
        string-based steps cannot process.
        """
        if value is None:
            return ""
        if isinstance(value, float) and value != value:  # NaN is the only float != itself
            return ""
        return value if isinstance(value, str) else str(value)

    @staticmethod
    def _strip_noise(text):
        """Reduce ``text`` to ASCII letters separated by single spaces.

        Removes URLs and HTML tags, decodes HTML entities, splits demojized
        names such as ``:red_heart:`` into separate words, then drops every
        character that is not an ASCII letter or a space.
        """
        import html  # local import: only needed by this helper

        # Line breaks first so that tags spanning lines can be matched
        text = text.replace('\n', ' ').replace('\r', ' ')
        # Remove URLs
        text = re.sub(r'https?://\S+', '', text)
        # Remove HTML tags
        text = re.sub(r'<.*?>', '', text)
        # Decode entities (&quot; &amp; &#x200B; ...) so that the symbol filter
        # below removes them instead of leaving tokens like 'amp' behind
        text = html.unescape(text)
        # ':red_heart:' -> ' red heart ' (otherwise it collapses to 'redheart')
        text = re.sub(r':([A-Za-z0-9_+\-]+):',
                      lambda m: ' ' + m.group(1).replace('_', ' ') + ' ',
                      text)
        # Remove digits and every non-letter character except spaces
        text = re.sub(r'[^a-zA-Z ]', '', text)
        # Collapse whitespace last so removed symbols leave no double spaces
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def basic_cleaning(self, text):
        """Convert emojis/emoticons to words and strip markup and symbols.

        Returns the cleaned, lower-cased text; ``None``, NaN and blank input
        give an empty string.
        """
        text = self._as_text(text)
        if not text.strip():
            return ""

        # Convert emojis and emoticons to text (also lower-cases)
        text = self.convert_emojis_emoticons(text)
        return self._strip_noise(text)

    def text_preprocessing(self, text, regex=False, remove_stop_word=False, lemmatisation=False, lower_case=False, return_tokens=False):
        """Tokenise ``text`` with optional cleaning steps.

        Args:
            text: raw text; ``None`` and NaN are treated as empty.
            regex: apply ``basic_cleaning`` first.
            remove_stop_word: drop English stop words (case-insensitive).
            lemmatisation: replace tokens by their spaCy lemmas.
            lower_case: lower-case the text before tokenising.
            return_tokens: return a list of tokens instead of a string.

        Returns:
            A list of tokens, or the tokens joined by single spaces.
        """
        text = self._as_text(text)

        # Apply basic cleaning
        if regex:
            text = self.basic_cleaning(text)

        if lower_case:
            text = text.lower()

        if not text.strip():
            # Nothing to tokenise.
            return [] if return_tokens else ""

        tokens = word_tokenize(text)

        if remove_stop_word:
            # The stop-word list is lower-case, so compare case-insensitively.
            tokens = [word for word in tokens if word.lower() not in self.stop_words]

        if lemmatisation:
            doc = self.nlp(' '.join(tokens))
            tokens = [token.lemma_ for token in doc]

        return tokens if return_tokens else " ".join(tokens)

    def feature_extraction(self, tokens, ngram_range=(1, 1)):
        """Placeholder: feature extraction is not implemented yet."""
        pass
        

In [None]:
import os 


clean_operator = NLP_OPERATORS()

# get the current path
current_path = os.getcwd()


def _to_clean_tokens(text):
    """Run the full preprocessing pipeline on one cell and return its tokens."""
    return clean_operator.text_preprocessing(
        text, regex=True, remove_stop_word=True, lemmatisation=True,
        lower_case=True, return_tokens=True)


def _add_token_columns(frame, column_map):
    """Add one token-list column per ``source -> target`` pair in ``column_map``.

    Empty cells (NaN after the CSV round-trip) are replaced by '' first,
    because the cleaning steps only accept strings.
    """
    for source_column, token_column in column_map.items():
        frame[token_column] = frame[source_column].fillna('').astype(str).apply(_to_clean_tokens)


# ---- Reddit ---------------------------------------------------------------
reddit_data_path = os.path.join(current_path, 'data', 'reddit_data')

# read the csv file
df_mental = pd.read_csv(os.path.join(reddit_data_path, 'mental_reddit_posts.csv'))
df_normal = pd.read_csv(os.path.join(reddit_data_path, 'normal_reddit_posts.csv'))

print(df_mental.shape)
print(df_normal.shape)

# NOTE(review): only df_mental is preprocessed and saved; df_normal is loaded
# but not cleaned here -- confirm whether that is intended.
_add_token_columns(df_mental, {
    'title': 'preprocessed_token_title',
    'selftext': 'preprocessed_token_selftext',
    'top_comments': 'preprocessed_token_top_comments',
})
# The token columns hold Python lists; to_csv stores their string form.
os.makedirs(os.path.join(reddit_data_path, 'clean'), exist_ok=True)
df_mental.to_csv(os.path.join(reddit_data_path, 'clean', 'clean_mental_reddit_posts.csv'), index=False)


# ---- Beyond Blue ----------------------------------------------------------
# read all the csv files in beyondblue_data repository and concatenate them into a single dataframe
beyondblue_data_path = os.path.join(current_path, 'data', 'beyondblue_data')
beyondblue_frames = [
    # ids are read as text so duplicates match across files
    pd.read_csv(os.path.join(beyondblue_data_path, file), dtype={'Post ID': str})
    for file in sorted(os.listdir(beyondblue_data_path))
    if file.endswith('.csv')
]
if not beyondblue_frames:
    raise FileNotFoundError(f"No CSV files found in {beyondblue_data_path}")
# One concat call instead of growing the frame file by file
df_beyondblue = pd.concat(beyondblue_frames, ignore_index=True)

# remove duplicates by Post ID
df_beyondblue = df_beyondblue.drop_duplicates(subset='Post ID').reset_index(drop=True)
# Convert 'Post Date' to datetime format
df_beyondblue['Post Date'] = pd.to_datetime(df_beyondblue['Post Date'], errors='coerce')
# Sort by 'Post Date' in descending order
df_beyondblue = df_beyondblue.sort_values(by='Post Date', ascending=False).reset_index(drop=True)

print(df_beyondblue.shape)

_add_token_columns(df_beyondblue, {
    'Post Title': 'preprocessed_token_post_title',
    'Post Content': 'preprocessed_token_post_content',
    'Comments': 'preprocessed_token_comments',
})
os.makedirs(os.path.join(beyondblue_data_path, 'clean'), exist_ok=True)
df_beyondblue.to_csv(os.path.join(beyondblue_data_path, 'clean', 'clean_beyondblue_posts.csv'), index=False)




(7646, 13)
(2434, 13)


feature extraction

# CATEGORIZATION

In [None]:
# Label names for general mental-health symptom categories.
general_symptom_categories = [
    "Depression",
    "Anxiety",
    "Stress/Burnout",
    "Loneliness",
    "Low Self-Esteem",
    "Trauma/PTSD",
    "Anger/Irritability",
    "Obsessive Thoughts",
    "Addiction"
]

# Label names specific to suicide and self-harm related content.
suicide_and_selfharm_labels = [
    "Suicidal Ideation",
    "Suicide Attempt",
    "Self-Harm",
    "Despair",
    "Urgent Help Request",
    "Grief After Suicide",
    "Coping with Suicidal Thoughts"
]

# NETWORK VISUALIZATION

# MODEL TRAINING