# Mental Health Analysis Project

The purpose of this project is ...

# Data Scraping

Reddit data

In [1]:
# Scraping data from the Reddit API
import requests
import pandas as pd
# import praw
import emoji
import emot
import asyncpraw
# import asyncio
from tqdm import tqdm


import os
from dotenv import load_dotenv
load_dotenv()




class RedditScraper:
    """Scrape Reddit posts and their top comments and store them as CSV files.

    Credentials are read from the environment variables ``CLIENT_ID``,
    ``CLIENT_SECRET``, ``USERNAME`` and ``PASSWORD``.

    NOTE(review): ``USERNAME`` is normally pre-defined by Windows itself and
    python-dotenv presumably does not override variables that already exist,
    so the Reddit user name from ``.env`` may be ignored there - confirm.
    """

    # Seconds to wait on Reddit's HTTP endpoints before giving up.
    REQUEST_TIMEOUT = 30
    # Directory receiving the ``<prefix>reddit_posts.csv`` files.
    OUTPUT_DIR = './data/reddit_data'
    # Column layout of the frames produced by ``_posts_to_frame``.
    _POST_COLUMNS = ['id', 'subreddit', 'sort', 'title', 'selftext',
                     'created_utc', 'score', 'num_comments', 'author',
                     'post_url', 'over_18', 'flair', 'top_comments']
    # emot detector, created on first use and then shared by every call.
    _emot = None

    def __init__(self):
        """Read the credentials and fetch an OAuth token for the plain-requests API.

        Raises:
            requests.HTTPError: if the token endpoint answers with an error status.
            RuntimeError: if the response carries no ``access_token``.
        """
        self.client_id = os.getenv('CLIENT_ID')
        self.client_secret = os.getenv('CLIENT_SECRET')
        self.username = os.getenv('USERNAME')
        self.password = os.getenv('PASSWORD')

        self.auth = requests.auth.HTTPBasicAuth(self.client_id, self.client_secret)
        self.data = {'grant_type': 'password',
                     'username': self.username,
                     'password': self.password}
        self.headers = {'User-Agent': 'MyAPI/0.0.1'}
        self.res = requests.post('https://www.reddit.com/api/v1/access_token',
                                 auth=self.auth, data=self.data, headers=self.headers,
                                 timeout=self.REQUEST_TIMEOUT)
        self.res.raise_for_status()

        # Fail with a readable message instead of a bare KeyError when the
        # response body does not contain a token.
        token = self.res.json().get("access_token")
        if not token:
            raise RuntimeError(f"Reddit did not return an access token: {self.res.text}")
        self.headers["Authorization"] = f'bearer {token}'

    def get_posts_byrequests(self, subreddit, limit=1000):
        """Fetch the "hot" listing of ``subreddit`` through the OAuth REST API.

        Args:
            subreddit: Subreddit name without the ``r/`` prefix.
            limit: Value sent as the ``limit`` query parameter.
                NOTE(review): Reddit listings are believed to cap this at 100
                per request - confirm.

        Returns:
            The ``requests.Response`` of a successful (HTTP 200) call.

        Raises:
            Exception: if the status code is anything other than 200.
        """
        url = f'https://oauth.reddit.com/r/{subreddit}/hot'
        params = {'limit': limit}
        response = requests.get(url, headers=self.headers, params=params,
                                timeout=self.REQUEST_TIMEOUT)

        if response.status_code == 200:
            return response
        raise Exception(f"Error fetching posts: {response.status_code} - {response.text}")

    def convert_emojis_emoticons(self, text):
        """Replace emoticons and emojis in ``text`` with words; return it stripped and lower-cased.

        Empty or missing text yields an empty string.
        """
        if not text:
            return ""

        # Building the detector is loop-invariant work, so do it only once.
        detector = RedditScraper._emot
        if detector is None:
            detector = RedditScraper._emot = emot.core.emot()

        # Text emoticons such as ":)" become their textual meaning.
        emoticon_results = detector.emoticons(text)
        for original, meaning in zip(emoticon_results['value'], emoticon_results['mean']):
            text = text.replace(original, f" {meaning} ")

        # Unicode emojis are handled by the emoji package.
        text = emoji.demojize(text)

        return text.strip().lower()

    @staticmethod
    def _posts_to_frame(all_posts):
        """Turn scraped post dicts into a de-duplicated frame, newest post first.

        ``created_utc`` is converted from epoch seconds to datetimes and the
        ``top_comments`` lists are joined with ``' | '`` for CSV storage.
        An empty input yields an empty frame with the expected columns.
        """
        if not all_posts:
            return pd.DataFrame(columns=RedditScraper._POST_COLUMNS)

        df = pd.DataFrame(all_posts)
        df = df.drop_duplicates(subset='id').reset_index(drop=True)
        df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
        df = df.sort_values(by='created_utc', ascending=False).reset_index(drop=True)
        df['top_comments'] = df['top_comments'].apply(
            lambda x: ' | '.join(x) if isinstance(x, list) else '')
        return df

    def _save_posts(self, df, mental):
        """Merge ``df`` into ``<OUTPUT_DIR>/<mental>reddit_posts.csv``, de-duplicated by id."""
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        csv_path = f'{self.OUTPUT_DIR}/{mental}reddit_posts.csv'

        if os.path.exists(csv_path):
            # Read ids as text so they compare equal to freshly scraped ids.
            existing_df = pd.read_csv(csv_path, dtype={'id': str})
            df = pd.concat([existing_df, df]).drop_duplicates(subset='id').reset_index(drop=True)

        df.to_csv(csv_path, index=False)

    async def get_posts_byprawn(self, subreddits, limit=1000, mental="mental_"):
        """Scrape posts plus their first five top-level comments with asyncpraw.

        The result is merged into ``./data/reddit_data/<mental>reddit_posts.csv``.
        Nothing is written when no post was collected.

        Args:
            subreddits: Iterable of subreddit names.
            limit: Maximum number of posts requested per subreddit and sort order.
            mental: File-name prefix of the output CSV.
        """
        reddit = asyncpraw.Reddit(client_id=self.client_id,
                                  client_secret=self.client_secret,
                                  user_agent='windows:mentalhealth.scraper:v1.0 (by u/IceWorth5480)',
                                  username=self.username,
                                  password=self.password)

        all_posts = []
        # Listing methods to query on each subreddit ('top' and 'new' also work).
        sort_types = ['hot']

        try:
            for subreddit_name in tqdm(subreddits, desc="Subreddits Progress"):
                subreddit = await reddit.subreddit(subreddit_name)
                for sort in sort_types:
                    # Resolve the listing method by name so an unknown sort
                    # fails loudly instead of reusing a stale listing.
                    posts = getattr(subreddit, sort)(limit=limit)

                    async for post in posts:
                        if post is None:
                            continue

                        # Load the submission and drop "load more" stubs so
                        # only real top-level comments remain.
                        await post.load()
                        await post.comments.replace_more(limit=0)
                        top_comments = [self.convert_emojis_emoticons(comment.body)
                                        for comment in post.comments[:5]]

                        all_posts.append({
                            'id': post.id,
                            'subreddit': subreddit_name,
                            'sort': sort,
                            'title': post.title,
                            'selftext': self.convert_emojis_emoticons(post.selftext),
                            'created_utc': post.created_utc,
                            'score': post.score,
                            'num_comments': post.num_comments,
                            'author': str(post.author),
                            'post_url': post.url,
                            'over_18': post.over_18,
                            'flair': post.link_flair_text,
                            'top_comments': top_comments
                        })
        finally:
            # Always release the HTTP session, even when scraping fails midway.
            await reddit.close()

        if not all_posts:
            print("No posts were collected; nothing to save.")
            return

        self._save_posts(self._posts_to_frame(all_posts), mental)







In [None]:



mental_subreddits = ['mentalhealth', 'depression', 'anxiety', 'therapy', 'selfhelp', 'bpd', 'ptsd', 'socialanxiety', 'counseling']
normal_subreddits = ['popular','all','AskReddit','interestingasfuck']
# australian_regions = ['melbourne','sydney','adelaide','perth','brisbane','canberra']


scraper  = RedditScraper()

# await scraper.get_posts_byprawn(mental_subreddits, limit=1000, mental="mental_")
await scraper.get_posts_byprawn(normal_subreddits, limit=1000, mental="normal_")




Subreddits Progress: 100%|██████████| 4/4 [1:20:33<00:00, 1208.28s/it]


In [None]:
# Load both scraped Reddit datasets and report (rows, columns) for each.
mental_csv_path = './data/reddit_data/mental_reddit_posts.csv'
df_mental = pd.read_csv(mental_csv_path)
print(df_mental.shape)

normal_csv_path = './data/reddit_data/normal_reddit_posts.csv'
df_normal = pd.read_csv(normal_csv_path)
print(df_normal.shape)

(7646, 13)
(180, 13)


Beyond Blue forums 

In [2]:
'''
TO DO

Extract the following information from the Beyond Blue forum pages


Post ID: A unique identifier for each post.
Post Content: The text of the post.
Post Author: The author of the post.
Post Date: The date the post was made.
Post Category: Category or forum where the post was made.
Number of Comments: The total number of comments on the post.

From Comment 

Post ID: Link back to the original post.
Comment ID: A unique identifier for each comment.
Comment Content: Text of the comment.
Comment Author: Author of the comment.
Comment Date: Date the comment was posted. (the order of the comments is really important)
other meta data if available
'''



from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime, timedelta
import calendar
import os


from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


from tqdm import tqdm
import emoji
import emot





def parse_post_date(raw_date):
    """Convert a forum date label into a ``YYYY-MM-DD`` string.

    Recognised inputs (case-insensitive, surrounding whitespace ignored):

    * a weekday name ("Monday"): the most recent such day, today included;
    * anything containing "week" or "month" ("a week ago", "2 months ago"),
      a month being approximated as 30 days;
    * "today", "yesterday" and "<n|a|an> <second|minute|hour|day|year>(s) ago",
      a year being approximated as 365 days;
    * an exact ``DD-MM-YYYY`` date.

    Relative labels are resolved against the current local time.

    Args:
        raw_date: The date text as shown on the page.

    Returns:
        The date as ``YYYY-MM-DD``, or ``'Unknown date'`` when the input is
        not a string or matches none of the formats above.
    """
    if not isinstance(raw_date, str):
        return 'Unknown date'

    def _fmt(moment):
        return moment.strftime('%Y-%m-%d')

    today = datetime.now()
    raw_date = raw_date.strip().lower()

    # Case 1: day of the week; calendar.day_name starts on Monday, matching
    # datetime.weekday() (Monday = 0).
    weekdays = [day.lower() for day in calendar.day_name]
    if raw_date in weekdays:
        delta_days = (today.weekday() - weekdays.index(raw_date)) % 7
        return _fmt(today - timedelta(days=delta_days))

    # Case 2: "a week ago", "2 weeks ago", ... (no number means one week).
    if "week" in raw_date:
        match = re.search(r'(\d+)', raw_date)
        weeks = int(match.group(1)) if match else 1
        return _fmt(today - timedelta(weeks=weeks))

    # Case 3: "a month ago", "2 months ago", ... (30 days per month).
    if "month" in raw_date:
        match = re.search(r'(\d+)', raw_date)
        months = int(match.group(1)) if match else 1
        return _fmt(today - timedelta(days=30 * months))

    # Case 4: other relative labels. These are matched strictly so unrelated
    # text that merely contains "day" is not misread.
    if raw_date == 'today':
        return _fmt(today)
    if raw_date == 'yesterday':
        return _fmt(today - timedelta(days=1))
    match = re.fullmatch(r'(?:(\d+)|an?)\s+(second|minute|hour|day|year)s?\s+ago', raw_date)
    if match:
        count = int(match.group(1)) if match.group(1) else 1
        unit = match.group(2)
        if unit == 'year':
            delta = timedelta(days=365 * count)
        else:
            delta = timedelta(**{f'{unit}s': count})
        return _fmt(today - delta)

    # Case 5: exact date such as "25-09-2020".
    try:
        return _fmt(datetime.strptime(raw_date, '%d-%m-%Y'))
    except ValueError:
        return 'Unknown date'  # If format is unexpected




def convert_emojis_emoticons(text):
    """Spell out emoticons and emojis in ``text``; return it stripped and lower-cased."""
    # Text emoticons: swap each detected one for its meaning, padded with
    # spaces so it does not fuse with neighbouring words.
    detected = emot.core.emot().emoticons(text)
    replacements = zip(detected['value'], detected['mean'])
    for emoticon, description in replacements:
        text = text.replace(emoticon, f" {description} ")

    # Unicode emojis are converted by the emoji package rather than emot.
    demojized = emoji.demojize(text)
    cleaned = demojized.strip()
    return cleaned.lower()




def comment_scrapping (url, comment_pages = 1, driver=None):
    """Collect the text of the first three comments of a forum thread.

    Args:
        url: Address of the thread's first page.
        comment_pages: Maximum number of thread pages to visit.
        driver: Optional Selenium WebDriver to reuse. When omitted, a Chrome
            driver is started for this call and shut down again before
            returning; a driver passed in is left open for the caller.

    Returns:
        The comment texts (emojis/emoticons spelled out, lower-cased) joined
        with ``' | '``; an empty string when the thread shows no comments.
    """
    max_comments = 3
    main_row_class = 'lia-quilt-row lia-quilt-row-message-main'

    owns_driver = driver is None
    if owns_driver:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service)

    list_comments = []

    try:
        for _ in range(comment_pages):
            driver.get(url)
            time.sleep(0.005)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            comments_section = soup.find('div', class_='linear-message-list message-list')
            if comments_section is None:
                # No comment container on this page: nothing (more) to collect.
                break

            # Select the message rows directly. Walking every nested <div> and
            # searching inside each one returns the same row once per ancestor
            # wrapper, which duplicated comments.
            for main_section in comments_section.find_all('div', class_=main_row_class):
                if len(list_comments) >= max_comments:
                    break
                comment_text = main_section.get_text(separator=' ', strip=True)
                list_comments.append(convert_emojis_emoticons(comment_text))

            if len(list_comments) >= max_comments:
                break

            # Follow the pager to the next page of the thread, if any.
            next_page = soup.find('li', class_='lia-paging-page-next')
            if next_page and next_page.find('a'):
                url = next_page.find('a')['href']
            else:
                print("No more pages to scrape.")
                break
    finally:
        # Only close a browser this function started itself.
        if owns_driver:
            driver.quit()

    # Flatten to a single string for CSV storage.
    return ' | '.join(list_comments)


        



def beyondblue_scrapping(tag,address,pages=2):
    """Scrape a Beyond Blue forum board and store its threads as CSV.

    For every thread tile on up to ``pages`` listing pages this collects the
    tile's metadata plus the thread's first comments (via
    ``comment_scrapping``). Results are merged into
    ``./data/beyondblue_data/<tag>_beyondblue_posts.csv``: as a checkpoint
    after every 10th page, and once more when scraping ends, including when
    it ends because of an error, so collected posts are not lost.

    Args:
        tag: Label used in progress messages and in the CSV file name.
        address: URL of the board's first listing page.
        pages: Maximum number of listing pages to visit.
    """
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    url = address

    whole_data = []

    try:
        for page in tqdm(range(1, pages + 1), desc="Scraping pages"):
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            listing = soup.find('div', class_='custom-message-list all-discussions')
            if listing is None:
                print(f"No discussion list found on page {page} for tag {tag}.")
                break

            for post in listing.find_all('article'):
                post_data = _parse_beyondblue_post(post)
                if post_data is None:
                    # Tile without a usable thread link; skip it.
                    continue

                full_post_link = f"https://forums.beyondblue.org.au{post_data.pop('_link')}"
                post_data["Comments"] = comment_scrapping(full_post_link, comment_pages=1)
                whole_data.append(post_data)

            # Find next page link
            next_page = soup.find('li', class_='lia-paging-page-next')
            if next_page and next_page.find('a'):
                url = next_page.find('a')['href']
            else:
                print("No more pages to scrape.")
                break

            if page % 10 == 0:
                # Periodic checkpoint so a later failure loses little work.
                _save_beyondblue_posts(whole_data, tag)
                print(f'The data for tag {tag} has been scraped to page {page}')
    finally:
        # Always shut the browser down; callers catch errors and move on to
        # the next tag, so a leaked driver would pile up Chrome processes.
        driver.quit()
        if whole_data:
            _save_beyondblue_posts(whole_data, tag)
            print(f"Data saved to ./data/beyondblue_data/{tag}_beyondblue_posts.csv")
        else:
            print(f"No posts were scraped for tag {tag}; nothing saved.")


def _parse_beyondblue_post(post):
    """Extract one thread tile's fields; return None when it has no thread link.

    The returned dict holds the CSV columns (without "Comments") plus a
    temporary ``'_link'`` entry carrying the thread's relative URL.
    """
    heading = post.find('h3')
    heading_links = heading.find_all('a') if heading else []
    if len(heading_links) < 2:
        return None
    # The second link in the heading is used for both the thread URL and title.
    title_tag = heading_links[1]
    post_link = title_tag.get('href')
    if not post_link:
        return None

    body = post.find('p', class_='body-text')

    side_info = post.find('aside')
    author_box = side_info.find('div', class_='custom-tile-author-info') if side_info else None
    author_anchor = author_box.find('a') if author_box else None
    author_span = author_anchor.find('span') if author_anchor else None
    author_link = author_anchor.get('href') if author_anchor else None

    category_box = side_info.find('div', class_='custom-tile-category-content') if side_info else None
    category_anchor = category_box.find('a') if category_box else None
    time_tag = category_box.find('time') if category_box else None
    raw_date = time_tag.text.strip() if time_tag else ""

    replies_item = post.find('li', class_='custom-tile-replies')
    replies_count = replies_item.find('b') if replies_item else None

    return {
        "Post ID": post_link.split('/')[-1],
        "Post Title": convert_emojis_emoticons(title_tag.text.strip()),
        "Post Content": convert_emojis_emoticons(body.text.strip()) if body else "",
        "Post Author": author_span.text.strip() if author_span else "",
        "User ID": author_link.split('user-id/')[-1] if author_link else "",
        "Post Date": parse_post_date(raw_date),
        "Post Category": category_anchor.text.strip() if category_anchor else "",
        "Number of Comments": replies_count.text.strip() if replies_count else "",
        "_link": post_link,
    }


def _save_beyondblue_posts(whole_data, tag):
    """Merge ``whole_data`` into the tag's CSV, de-duplicated by "Post ID"."""
    if not whole_data:
        return

    output_dir = './data/beyondblue_data'
    csv_path = f'{output_dir}/{tag}_beyondblue_posts.csv'

    df = pd.DataFrame(whole_data)
    df = df.drop_duplicates(subset='Post ID').reset_index(drop=True)
    df['Post Date'] = pd.to_datetime(df['Post Date'], errors='coerce')
    df = df.sort_values(by='Post Date', ascending=False).reset_index(drop=True)

    os.makedirs(output_dir, exist_ok=True)
    if os.path.exists(csv_path):
        # Read ids back as text: numeric-looking ids would otherwise load as
        # integers and never match the freshly scraped string ids, so
        # duplicates would survive.
        existing_df = pd.read_csv(csv_path, dtype={'Post ID': str})
        df = pd.concat([existing_df, df]).drop_duplicates(subset='Post ID').reset_index(drop=True)

    df.to_csv(csv_path, index=False)






In [None]:

# Beyond Blue boards to scrape; each key is the tag used in the output file name.
mental_health_urls = {
    "Anxiety": "https://forums.beyondblue.org.au/t5/anxiety/bd-p/c1-sc2-b1?&sort=recent",
    "Depression": "https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2?&sort=recent",
    "PTSD": "https://forums.beyondblue.org.au/t5/ptsd-and-trauma/bd-p/c1-sc2-b3?&sort=recent",
    "Suicide_selfharm": "https://forums.beyondblue.org.au/t5/suicidal-thoughts-and-self-harm/bd-p/c1-sc2-b4?&sort=recent",
    "Staying_well": "https://forums.beyondblue.org.au/t5/staying-well/bd-p/c1-sc3-b1?&sort=recent",
    "Treament": "https://forums.beyondblue.org.au/t5/treatments-health-professionals/bd-p/c1-sc3-b2?&sort=recent",
    "Relationship_family_issues": "https://forums.beyondblue.org.au/t5/relationship-and-family-issues/bd-p/c1-sc3-b3?&sort=recent",
    "Youth": "https://forums.beyondblue.org.au/t5/young-people/bd-p/c1-sc4-b1?&sort=recent",
    "Sex_identity": "https://forums.beyondblue.org.au/t5/sexuality-and-gender-identity/bd-p/c1-sc4-b2?&sort=recent",
    "Multiculture": "https://forums.beyondblue.org.au/t5/multicultural-experiences/bd-p/c1-sc4-b3?&sort=recent",
    "Grief_loss": "https://forums.beyondblue.org.au/t5/grief-and-loss/bd-p/c1-sc4-b4?&sort=recent",
}

# Target number of posts per board, converted to listing pages
# (assuming each page has 10 posts).
each_tag_number_data = 1500
pages = each_tag_number_data // 10

# Scrape the boards one after another; a failing board is reported and the
# loop moves on to the next one.
for tag, address in mental_health_urls.items():
    try:
        beyondblue_scrapping(tag, address, pages=pages)
    except Exception as e:
        print(f"Error scraping {tag}: {e}")

Scraping pages:   7%|▋         | 10/150 [13:56<3:14:03, 83.17s/it]

The data for tag Relationship_family_issues has been scraped to page 10


Scraping pages:  13%|█▎        | 20/150 [27:34<2:56:11, 81.32s/it]

The data for tag Relationship_family_issues has been scraped to page 20


Scraping pages:  20%|██        | 30/150 [41:03<2:41:22, 80.68s/it]

The data for tag Relationship_family_issues has been scraped to page 30


Scraping pages:  27%|██▋       | 40/150 [54:35<2:28:14, 80.85s/it]

The data for tag Relationship_family_issues has been scraped to page 40


Scraping pages:  33%|███▎      | 50/150 [1:08:00<2:15:13, 81.13s/it]

The data for tag Relationship_family_issues has been scraped to page 50


Scraping pages:  40%|████      | 60/150 [1:21:43<2:05:01, 83.35s/it]

The data for tag Relationship_family_issues has been scraped to page 60


Scraping pages:  47%|████▋     | 70/150 [1:35:30<1:49:06, 81.83s/it]

The data for tag Relationship_family_issues has been scraped to page 70


Scraping pages:  53%|█████▎    | 80/150 [1:49:00<1:34:56, 81.39s/it]

The data for tag Relationship_family_issues has been scraped to page 80


Scraping pages:  60%|██████    | 90/150 [2:02:33<1:21:09, 81.17s/it]

The data for tag Relationship_family_issues has been scraped to page 90


Scraping pages:  67%|██████▋   | 100/150 [2:16:09<1:08:32, 82.25s/it]

The data for tag Relationship_family_issues has been scraped to page 100


Scraping pages:  73%|███████▎  | 110/150 [2:29:57<55:00, 82.52s/it]  

The data for tag Relationship_family_issues has been scraped to page 110


Scraping pages:  80%|████████  | 120/150 [2:43:46<41:34, 83.14s/it]

The data for tag Relationship_family_issues has been scraped to page 120


Scraping pages:  87%|████████▋ | 130/150 [2:57:29<27:30, 82.53s/it]

The data for tag Relationship_family_issues has been scraped to page 130


Scraping pages:  93%|█████████▎| 140/150 [3:11:10<13:45, 82.55s/it]

The data for tag Relationship_family_issues has been scraped to page 140


Scraping pages: 100%|██████████| 150/150 [3:24:55<00:00, 81.97s/it]

The data for tag Relationship_family_issues has been scraped to page 150





Data saved to ./data/beyondblue_data/Relationship_family_issues_beyondblue_posts.csv


Climate data API

In [6]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Collect the "All years of data" ZIP link from the BOM climate data page for
# every (weather type, capital city) pair and store the links in a CSV.
import os

url = "https://reg.bom.gov.au/climate/data/"

weather_types = ['Rainfall', 'Temp_Max', 'Temp_Min', 'Solar exposure']
australian_regions = ['NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT']
australian_citys = ['Sydney', 'Melbourne', 'Brisbane', 'Perth', 'Adelaide', 'Hobart', 'Canberra', 'Darwin']

# Text of the "data about" <option> to choose for each weather type.
dropdown_xpath = {
    'Rainfall': "Rainfall",
    'Temp_Max': "Temperature",
    'Temp_Min': "Temperature",
    'Solar exposure': "Solar exposure"
}

zip_url = {}

# Resolve the ChromeDriver binary once instead of once per browser session.
chromedriver_path = ChromeDriverManager().install()

for weather_type in weather_types:
    for region, city in zip(australian_regions, australian_citys):
        # A fresh browser per combination keeps the form state independent.
        driver = webdriver.Chrome(service=Service(chromedriver_path))

        try:
            driver.get(url)
            wait = WebDriverWait(driver, 20)

            # Select weather type
            data_about_dropdown = wait.until(EC.presence_of_element_located((By.ID, 'ncc_obs_code_group')))
            data_about_dropdown.find_element(By.XPATH, f"//option[text()='{dropdown_xpath[weather_type]}']").click()
            time.sleep(1)

            # Select sub-option for Temperature
            if weather_type in ['Temp_Max', 'Temp_Min']:
                wait.until(EC.visibility_of_element_located((By.ID, 'elementSubSelectLine')))
                element_select = wait.until(EC.presence_of_element_located((By.ID, 'elementSubSelect')))
                option_text = 'Maximum temperature' if weather_type == 'Temp_Max' else 'Minimum temperature'
                element_select.find_element(By.XPATH, f"//option[text()='{option_text}']").click()
                time.sleep(1)

            # Input city
            location_input = wait.until(EC.presence_of_element_located((By.ID, 'p_locSearch')))
            location_input.clear()
            location_input.send_keys(city)

            # Click Find button, wait for the match list and take its first entry
            driver.find_element(By.ID, 'text').click()
            wait.until(EC.presence_of_element_located((By.ID, 'matchList')))
            time.sleep(1)
            match_list = driver.find_element(By.ID, 'matchList')
            match_list.find_elements(By.TAG_NAME, 'option')[0].click()
            time.sleep(1)

            # Show open stations only
            open_station_checkbox = wait.until(EC.element_to_be_clickable((By.ID, 'openStation')))
            if not open_station_checkbox.is_selected():
                open_station_checkbox.click()
                time.sleep(2)  # Wait for list to refresh

            # Select first station
            nearest_stations = wait.until(EC.presence_of_element_located((By.ID, 'nearest10')))
            station_options = nearest_stations.find_elements(By.TAG_NAME, 'option')
            if len(station_options) == 0:
                print(f'!!! No stations found for {city} - {weather_type}')
                continue

            station_options[0].click()
            time.sleep(1)

            # Wait for station number to load
            wait.until(lambda d: d.find_element(By.ID, 'p_stn_num').get_attribute('value').strip() != '')

            # Get data
            get_data_button = wait.until(EC.element_to_be_clickable((By.ID, 'getData')))
            get_data_button.click()

            # The data page opens in a new tab: wait for it and switch to it
            wait.until(lambda d: len(d.window_handles) > 1)
            driver.switch_to.window(driver.window_handles[-1])
            time.sleep(2)

            # Wait for downloads section
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'downloads')))
            all_years_link = driver.find_element(By.LINK_TEXT, 'All years of data')
            href = all_years_link.get_attribute('href')

            # Save link
            zip_url_key = f'{region}_{city}_{weather_type}'
            zip_url[zip_url_key] = href
            print(f'Collected ZIP URL for {city} - {weather_type}: {href}')

            # No need to close the tab here: the finally-block quits the whole
            # browser, and an extra close/switch could only raise after the
            # link has already been collected.

        except Exception as e:
            print(f'Error processing {city} - {weather_type}: {e}')
            print("=================================================")
        finally:
            driver.quit()
            print("=================================================")

# Save the results
print("\nAll collected ZIP URLs:")
for key, link in zip_url.items():
    print(f'{key}: {link}')

df_urls = pd.DataFrame(list(zip_url.items()), columns=['Region_City_WeatherType', 'ZIP_URL'])
# Make sure the target folder exists so the final write cannot fail after a
# long scraping run.
os.makedirs('./data/AUS_weather', exist_ok=True)
df_urls.to_csv('./data/AUS_weather/zip_urls.csv', index=False)

Error processing Sydney - Rainfall: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=137.0.7151.104)
Stacktrace:
	GetHandleVerifier [0x0x943b03+62899]
	GetHandleVerifier [0x0x943b44+62964]
	(No symbol) [0x0x7710f3]
	(No symbol) [0x0x74ff59]
	(No symbol) [0x0x7e4f7e]
	(No symbol) [0x0x7ff6a9]
	(No symbol) [0x0x7de306]
	(No symbol) [0x0x7ad670]
	(No symbol) [0x0x7ae4e4]
	GetHandleVerifier [0x0xba4793+2556483]
	GetHandleVerifier [0x0xb9fd02+2537394]
	GetHandleVerifier [0x0x96a2fa+220586]
	GetHandleVerifier [0x0x95aae8+157080]
	GetHandleVerifier [0x0x96141d+184013]
	GetHandleVerifier [0x0x94ba68+95512]
	GetHandleVerifier [0x0x94bc10+95936]
	GetHandleVerifier [0x0x936b5a+9738]
	BaseThreadInitThunk [0x0x76075d49+25]
	RtlInitializeExceptionChain [0x0x7780d09b+107]
	RtlGetAppContainerNamedObjectPath [0x0x7780d021+561]



KeyboardInterrupt: 

In [None]:


import os
import requests
import zipfile
import pandas as pd

# Download every collected BOM ZIP, read the CSV inside it and merge the
# weather types of each city into one "<Region>_<City>_merged.csv".

# Load ZIP URLs from CSV
df_urls = pd.read_csv('./data/AUS_weather/zip_urls.csv')
dict_zip_urls = dict(zip(df_urls['Region_City_WeatherType'], df_urls['ZIP_URL']))

# Group keys by Region_City prefix (e.g., NSW_Sydney)
city_groups = {}
for key in dict_zip_urls:
    city = '_'.join(key.split('_')[:2])
    city_groups.setdefault(city, []).append(key)

# Create output folder
output_dir = './data/AUS_weather/merged_per_city'
os.makedirs(output_dir, exist_ok=True)

# Iterate over each city
for city, keys in city_groups.items():
    print(f"\n Processing: {city}")

    dfs = []
    for key in keys:
        zip_filename = f"{key}.zip"
        csv_path = None
        try:
            url = dict_zip_urls[key]
            print(f"  ⬇️ Downloading: {key}")
            response = requests.get(url, timeout=60)
            print(f"    Content-Type: {response.headers.get('Content-Type')}")

            # Check the status before touching the disk so an error page is
            # never stored as a ".zip" file.
            if response.status_code != 200:
                print(f"    !!! Failed to download {key}")
                continue

            # Save the ZIP file to disk (it is kept for inspection if the
            # extraction below fails).
            with open(zip_filename, 'wb') as temp_file:
                temp_file.write(response.content)
            print(f"    Saved {zip_filename} for inspection.")

            # Extract the CSV file from the ZIP
            with zipfile.ZipFile(zip_filename, 'r') as z:
                members = z.namelist()
                # Prefer a real CSV member; fall back to the first entry.
                csv_name = next((m for m in members if m.lower().endswith('.csv')), members[0])
                z.extract(csv_name, output_dir)
                csv_path = os.path.join(output_dir, csv_name)
                print(f"    Extracted {csv_name} to ./data/AUS_weather/merged_per_city")

            # Read the CSV into a DataFrame
            df = pd.read_csv(csv_path)

            # Remove the ZIP file
            os.remove(zip_filename)
            print(f"    Deleted ZIP file: {zip_filename}")

            # Keep the date columns first and prefix every other column with
            # the weather type so the per-type frames can be merged.
            df.drop(columns=['Product code', 'Bureau of Meteorology station number'], errors='ignore', inplace=True)
            base_cols = ['Year', 'Month', 'Day']
            weather_cols = [col for col in df.columns if col not in base_cols]
            df = df[base_cols + weather_cols]
            weather_type = '_'.join(key.split('_')[2:])
            df.rename(columns={col: f"{weather_type}_{col}" for col in weather_cols}, inplace=True)
            dfs.append(df)

        except Exception as e:
            print(f"    !!! Error processing {key}: {e}")
        finally:
            # Never leave the extracted raw CSV behind, not even when reading
            # it failed: every CSV in this folder is later concatenated into
            # the combined dataset.
            if csv_path and os.path.exists(csv_path):
                os.remove(csv_path)

    # Merge all weather types on Year-Month-Day
    if dfs:
        merged_df = dfs[0]
        for other_df in dfs[1:]:
            merged_df = pd.merge(merged_df, other_df, on=['Year', 'Month', 'Day'], how='outer')

        # Add 'location' column
        merged_df['location'] = city

        # Save merged file
        merged_df.to_csv(f'./data/AUS_weather/merged_per_city/{city}_merged.csv', index=False)
        print(f" Saved: {city}_merged.csv")
        print("==================================================")
    else:
        print(f"!!! Skipping {city}, no valid data.")
        print("==================================================")

        
        


In [None]:

import pandas as pd
import os

# Concatenate the per-city weather CSVs into one combined file.
current_dir = os.getcwd()
target_path = os.path.join(current_dir, 'data', 'AUS_weather', 'merged_per_city')

output_name = 'AUS_bigcity_weather_data.csv'
# Skip the disaster data set and the combined output itself: on a re-run the
# output file already sits in this folder and would otherwise be read back
# in, duplicating every row.
excluded_files = {'Australia_disaster-mapper-data-21-03-2023.csv', output_name}

# Sorted so the row order of the result does not depend on directory order.
all_files = sorted(f for f in os.listdir(target_path)
                   if f.endswith('.csv') and f not in excluded_files)
dfs = []
for file in all_files:
    file_path = os.path.join(target_path, file)
    df = pd.read_csv(file_path)
    dfs.append(df)

if dfs:
    # Concatenate all dataframes
    merged_df = pd.concat(dfs, ignore_index=True)
    # Save the merged dataframe to a new CSV file
    merged_df.to_csv(os.path.join(target_path, output_name), index=False)
else:
    print(f"No per-city CSV files found in {target_path}; nothing to concatenate.")




In [None]:

from datetime import timedelta
import pandas as pd
import os
current_dir = os.getcwd()
target_path = os.path.join(current_dir, 'data', 'AUS_weather', 'merged_per_city')

# australian_regions = ['NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT']
# australian_citys = ['Sydney', 'Melbourne', 'Brisbane', 'Perth', 'Adelaide', 'Hobart', 'Canberra', 'Darwin']

# State/territory abbreviation -> full name. The abbreviation is taken from the
# part of 'location' before the first '_'; the full name is compared against
# the comma-separated 'Zone' column of the disaster catalogue.
zone_retrival = {
    'NSW': 'New South Wales',
    'VIC': 'Victoria',
    'QLD': 'Queensland',
    'WA': 'Western Australia',
    'SA': 'South Australia',
    'TAS': 'Tasmania',
    'ACT': 'Australian Capital Territory',
    'NT': 'Northern Territory'
}

# Relabelling applied to the catalogue's 'Category' values; any category not
# listed here is kept unchanged.
category_labels = {
    'Environment': 'Drought',
    'Health': 'Heatwave',
    'Industrial': 'Industrial Accident',
    'Maritime/Coastal': 'Shipwreck',
    'Transport': 'Transport Accident',
}

# Duration assumed for an event when only one end of its date range is known.
default_event_span = timedelta(days=10)


normal_weather_df = pd.read_csv(os.path.join(target_path, 'AUS_bigcity_weather_data.csv'))

# Assemble Year/Month/Day directly into a datetime column. No string round
# trip is needed; rows that cannot be assembled become NaT and therefore never
# fall inside any event's date range.
normal_weather_df['Date'] = pd.to_datetime(normal_weather_df[['Year', 'Month', 'Day']], errors='coerce')
# Drop the original Year, Month, Day columns
normal_weather_df.drop(columns=['Year', 'Month', 'Day'], inplace=True)
# Reorder columns to have 'Date' first
normal_weather_df = normal_weather_df[['Date'] + [col for col in normal_weather_df.columns if col != 'Date']]

normal_weather_df["Extreme_weather"] = ""

# The full zone name of each weather row does not depend on the event being
# processed, so resolve it once here instead of once per disaster row.
# An unknown abbreviation still raises KeyError, as before.
location_zone = normal_weather_df['location'].apply(
    lambda x: zone_retrival[x.split('_')[0].strip()]
)


# Read the extreme weather data
extreme_weather_df = pd.read_csv(os.path.join(target_path, 'Australia_disaster-mapper-data-21-03-2023.csv'))

# Convert to datetime (keep as datetime)
extreme_weather_df['Start Date'] = pd.to_datetime(extreme_weather_df['Start Date'], errors='coerce', dayfirst=True)
extreme_weather_df['End Date'] = pd.to_datetime(extreme_weather_df['End Date'], errors='coerce', dayfirst=True)


for index, row in extreme_weather_df.iterrows():
    start_date = row['Start Date']
    end_date = row['End Date']

    if pd.isna(start_date) and pd.isna(end_date):
        print(f"Skipping row {index} due to missing Start Date and End Date")
        continue

    # Exactly one end may be missing at this point: derive it from the other.
    if pd.isna(end_date):
        end_date = start_date + default_event_span
    elif pd.isna(start_date):
        start_date = end_date - default_event_span

    # Split multiple regions and clean
    regions = [r.strip() for r in str(row['Zone']).split(',')]
    category = category_labels.get(row['Category'], row['Category'])

    # Rows whose zone is one of the event's regions and whose date lies in the
    # (inclusive) event range.
    mask = (
        location_zone.isin(regions)
        & (normal_weather_df['Date'] >= start_date)
        & (normal_weather_df['Date'] <= end_date)
    )
    if not mask.any():
        continue

    # If there's already a category, append the new one
    normal_weather_df.loc[mask, 'Extreme_weather'] = normal_weather_df.loc[mask, 'Extreme_weather'].apply(
        lambda x: f"{x} | {category}" if x else category
    )


# NOTE(review): extreme_weather_df is not written to disk in this cell; its
# dates are converted to ISO strings here, presumably for later display/export.
extreme_weather_df['Start Date'] = extreme_weather_df['Start Date'].dt.strftime('%Y-%m-%d')
extreme_weather_df['End Date'] = extreme_weather_df['End Date'].dt.strftime('%Y-%m-%d')

# Save the tagged weather DataFrame to a new CSV file
normal_weather_df.to_csv(os.path.join(target_path, 'AUS_bigcity_weather_data_code_cleaned.csv'), index=False)



Skipping row 0 due to missing Start Date and End Date
Skipping row 75 due to missing Start Date and End Date
Skipping row 97 due to missing Start Date and End Date
Skipping row 143 due to missing Start Date and End Date
Skipping row 154 due to missing Start Date and End Date
Skipping row 169 due to missing Start Date and End Date
Skipping row 195 due to missing Start Date and End Date
Skipping row 285 due to missing Start Date and End Date
Skipping row 340 due to missing Start Date and End Date
Skipping row 345 due to missing Start Date and End Date
Skipping row 348 due to missing Start Date and End Date
Skipping row 424 due to missing Start Date and End Date
Skipping row 444 due to missing Start Date and End Date
Skipping row 453 due to missing Start Date and End Date
Skipping row 590 due to missing Start Date and End Date
Skipping row 593 due to missing Start Date and End Date
Skipping row 707 due to missing Start Date and End Date
Skipping row 710 due to missing Start Date and End D

# Natural Language Processing


In [10]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------------------- -------------------- 6.3/12.8 MB 38.6 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 33.5 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [11]:
import nltk
# One-off downloads of the NLTK resources used by the preprocessing code below.
# 'stopwords' backs stopwords.words('english'), used for stop-word removal.
nltk.download('stopwords')
# 'punkt' provides the tokenizer models; presumably what word_tokenize relies on.
nltk.download('punkt')  # For tokenization

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [1]:

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
import emoji
import emot

from transformers import BertTokenizer
# import torch  # if you want to use token IDs later

class NLP_OPERATORS:
    """Text-cleaning and tokenisation helpers used by the preprocessing cells.

    The instance loads the NLTK English stop-word list, a spaCy pipeline
    (used for lemmatisation), a BERT word-piece tokenizer and an emoticon
    extractor once, so the per-text methods can be applied row by row.
    """

    # Patterns are compiled once at class creation rather than per call.
    _URL_RE = re.compile(r'https?://\S+', re.IGNORECASE)
    _HTML_TAG_RE = re.compile(r'<.*?>')
    _WHITESPACE_RE = re.compile(r'\s+')
    _NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9 ]')
    _DIGITS_RE = re.compile(r'\d+')

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.nlp = spacy.load('en_core_web_sm', disable=["parser", "ner", "textcat"])
        self.nlp.max_length = 5_000_000
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Built once here instead of on every convert_emojis_emoticons call.
        self.emot_extractor = emot.core.emot()

    @staticmethod
    def _coerce_text(text):
        """Return *text* as a str; None and float NaN (empty CSV cells) become ''."""
        if text is None:
            return ""
        if isinstance(text, float) and text != text:  # NaN is the only float != itself
            return ""
        return text if isinstance(text, str) else str(text)

    @classmethod
    def _strip_markup(cls, text):
        """Remove URLs, HTML tags and ``&quot;`` entities from *text*.

        Line breaks are turned into spaces first so a tag spanning two lines
        is still matched by the tag pattern.
        """
        text = text.replace('\n', ' ').replace('\r', ' ')
        text = cls._URL_RE.sub('', text)
        text = cls._HTML_TAG_RE.sub('', text)
        return text.replace('&quot;', '')

    def convert_emojis_emoticons(self, text):
        """Replace emoticons and emojis in *text* with their textual meaning.

        Returns the text stripped and lower-cased, with every ':' and '_'
        replaced by a space (this also affects colons/underscores that were
        not part of an emoji code).
        """
        emoticon_results = self.emot_extractor.emoticons(text)
        for original, meaning in zip(emoticon_results['value'], emoticon_results['mean']):
            text = text.replace(original, f" {meaning} ")

        # Convert emojis to text
        text = emoji.demojize(text)

        # Make BERT-compatible
        text = text.replace(":", " ").replace("_", " ")
        return text.strip().lower()

    def basic_cleaning(self, text):
        """Return *text* reduced to lower-case letters separated by single spaces.

        URLs, HTML tags and ``&quot;`` are removed *before* emoticon/emoji
        conversion: that conversion rewrites ':' (and may treat ':/' as an
        emoticon), after which the URL pattern can no longer match.
        Whitespace is collapsed again at the end because removing characters
        and digits can leave runs of spaces behind.
        """
        text = self._strip_markup(self._coerce_text(text))
        text = self.convert_emojis_emoticons(text)
        # Normalise tabs etc. to spaces so the next step does not glue words together.
        text = self._WHITESPACE_RE.sub(' ', text)
        text = self._NON_ALNUM_RE.sub('', text)
        text = self._DIGITS_RE.sub('', text)
        text = self._WHITESPACE_RE.sub(' ', text)
        return text.strip()

    def text_preprocessing(self, text, regex=False, remove_stop_word=False, lemmatisation=False, lower_case=False,
                           return_tokens=True, use_bert_tokenizer=False):
        """Tokenise *text*, optionally cleaning, filtering and lemmatising it.

        Args:
            text: Raw text. None / NaN is treated as an empty string and other
                non-string values are converted with ``str``.
            regex: Apply ``basic_cleaning`` first.
            remove_stop_word: Drop English stop words (compared case-insensitively).
            lemmatisation: Replace tokens by their spaCy lemmas.
            lower_case: Lower-case the text before tokenising.
            return_tokens: Return a list of tokens; otherwise a space-joined string.
            use_bert_tokenizer: Clean with ``basic_cleaning`` and tokenise with
                the BERT tokenizer; all other switches except ``return_tokens``
                are ignored in this mode.
        """
        text = self._coerce_text(text)

        if use_bert_tokenizer:
            tokens = self.bert_tokenizer.tokenize(self.basic_cleaning(text))
            return tokens if return_tokens else " ".join(tokens)

        if regex:
            text = self.basic_cleaning(text)

        if lower_case:
            text = text.lower()

        tokens = word_tokenize(text)

        if remove_stop_word:
            # The stop-word list is lower-case, so compare case-insensitively.
            tokens = [word for word in tokens if word.lower() not in self.stop_words]

        if lemmatisation:
            doc = self.nlp(' '.join(tokens))
            tokens = [token.lemma_ for token in doc]

        return tokens if return_tokens else " ".join(tokens)


preprocess for statistical model 

In [None]:

import os 
import pandas as pd



clean_operator = NLP_OPERATORS()

# Input and output locations, relative to the current working directory
current_path = os.getcwd()
output_folder = os.path.join(current_path, 'data', 'reddit_data', 'clean')
os.makedirs(output_folder, exist_ok=True)

# read the csv file
df_mental = pd.read_csv(os.path.join(current_path, 'data', 'reddit_data', 'mental_reddit_posts.csv'))
# print(df_mental.shape)

# Missing cells are replaced by '' before the string conversion; otherwise
# astype(str) turns NaN into the literal word "nan", which then survives
# cleaning and pollutes the corpus.
df_mental["title_selftext_topcomments_text"] = (
    df_mental["title"].fillna("").astype(str)
    + " | " + df_mental["selftext"].fillna("").astype(str)
    + " | " + df_mental["top_comments"].fillna("").astype(str)
)
df_mental['clean_title_selftext_topcomments_text'] = df_mental['title_selftext_topcomments_text'].apply(
    lambda x: clean_operator.text_preprocessing(x, regex=True, remove_stop_word=True, lemmatisation=True, lower_case=True, return_tokens=False))

# combine all the text in the column into a single string
clean_all_strings = " ".join(df_mental['clean_title_selftext_topcomments_text'].tolist())
token_clean_all_strings = clean_operator.text_preprocessing(clean_all_strings, return_tokens=True)

# save this into a text file (written as the Python repr of the token list);
# the encoding is pinned so the file does not depend on the platform default
with open(os.path.join(output_folder, 'token_cleaned_mental_all_text.txt'), 'w', encoding='utf-8') as f:
    f.write(str(token_clean_all_strings))

#save the df_mental into a csv file
df_mental.to_csv(os.path.join(output_folder, 'cleaned_mental_all_text.csv'), index=False)




# df_normal = pd.read_csv(os.path.join(current_path, 'data', 'reddit_data', 'normal_reddit_posts.csv'))
# print(df_normal.shape)      


preprocess for deep learning model

In [13]:

import pickle

# Join the combined (uncleaned) title/selftext/comments text of every post into
# one string and run it through the BERT tokenizer path of the operator.
not_clean_all_strings = " ".join(df_mental['title_selftext_topcomments_text'].tolist())
bert_token_not_clean_all_strings = clean_operator.text_preprocessing(
    not_clean_all_strings,
    use_bert_tokenizer=True,
    return_tokens=True,
)

# Persist the token list as a pickle next to the other cleaned reddit outputs
bert_pickle_path = os.path.join(output_folder, 'bert_token_not_clean_all_text.pkl')
with open(bert_pickle_path, 'wb') as f:
    pickle.dump(bert_token_not_clean_all_strings, f)


In [None]:


clean_operator = NLP_OPERATORS()

# Per-forum exports, loaded individually to report each one's shape
df_anxiety = pd.read_csv('./data/beyondblue_data/Anxiety_beyondblue_posts.csv')
print(df_anxiety.shape)

df_depression = pd.read_csv('./data/beyondblue_data/Depression_beyondblue_posts.csv')
print(df_depression.shape)

df_ptsd = pd.read_csv('./data/beyondblue_data/PTSD_beyondblue_posts.csv')
print(df_ptsd.shape)

df_suicide_selfharm = pd.read_csv('./data/beyondblue_data/Suicide_selfharm_beyondblue_posts.csv')
print(df_suicide_selfharm.shape)

df_Staying_well = pd.read_csv('./data/beyondblue_data/Staying_well_beyondblue_posts.csv')
print(df_Staying_well.shape)

df_treament = pd.read_csv('./data/beyondblue_data/Treament_beyondblue_posts.csv')
print(df_treament.shape)

df_relationship = pd.read_csv('./data/beyondblue_data/Relationship_family_issues_beyondblue_posts.csv')
print(df_relationship.shape)

df_youth = pd.read_csv('./data/beyondblue_data/Youth_beyondblue_posts.csv')
print(df_youth.shape)

df_sex = pd.read_csv('./data/beyondblue_data/Sex_identity_beyondblue_posts.csv')
print(df_sex.shape)

df_multiculture = pd.read_csv('./data/beyondblue_data/Multiculture_beyondblue_posts.csv')
print(df_multiculture.shape)

df_grief = pd.read_csv('./data/beyondblue_data/Grief_loss_beyondblue_posts.csv')
print(df_grief.shape)


# Read every csv file in the beyondblue_data folder and concatenate them once.
# The listing is sorted so that which copy of a duplicated 'Post ID' survives
# drop_duplicates does not depend on the order returned by os.listdir.
# NOTE: current_path is defined by an earlier cell.
beyondblue_data_path = os.path.join(current_path, 'data', 'beyondblue_data')
beyondblue_files = sorted(f for f in os.listdir(beyondblue_data_path) if f.endswith('.csv'))
if not beyondblue_files:
    raise FileNotFoundError(f"No CSV files found in {beyondblue_data_path}")
df_beyondblue = pd.concat(
    [pd.read_csv(os.path.join(beyondblue_data_path, file)) for file in beyondblue_files],
    ignore_index=True,
)
# remove duplicates by Post ID
df_beyondblue = df_beyondblue.drop_duplicates(subset='Post ID').reset_index(drop=True)
# Convert 'Post Date' to datetime format
df_beyondblue['Post Date'] = pd.to_datetime(df_beyondblue['Post Date'], errors='coerce')
# Sort by 'Post Date' in descending order
df_beyondblue = df_beyondblue.sort_values(by='Post Date', ascending=False).reset_index(drop=True)

print(df_beyondblue.shape)

# Source text column -> column receiving its cleaned token list
token_columns = {
    'Post Title': 'preprocessed_token_post_title',
    'Post Content': 'preprocessed_token_post_content',
    'Comments': 'preprocessed_token_comments',
}
for source_col, token_col in token_columns.items():
    # Empty cells are read as NaN (a float); feed '' to the tokenizer instead
    # so that a post without comments does not abort the whole run.
    df_beyondblue[token_col] = df_beyondblue[source_col].fillna('').astype(str).apply(
        lambda x: clean_operator.text_preprocessing(x, regex=True, remove_stop_word=True, lemmatisation=True, lower_case=True, return_tokens=True))

# The output folder may not exist yet; to_csv does not create directories.
beyondblue_clean_folder = os.path.join(current_path, 'data', 'beyondblue_data', 'clean')
os.makedirs(beyondblue_clean_folder, exist_ok=True)
df_beyondblue.to_csv(os.path.join(beyondblue_clean_folder, 'clean_beyondblue_posts.csv'), index=False)



feature extraction

# CATEGORIZATION

In [None]:
# Label set for general mental-health symptom categories.
general_symptom_categories = [
    "Depression", "Anxiety", "Stress/Burnout",
    "Loneliness", "Low Self-Esteem", "Trauma/PTSD",
    "Anger/Irritability", "Obsessive Thoughts", "Addiction",
]

# Label set for suicide and self-harm related content.
suicide_and_selfharm_labels = [
    "Suicidal Ideation", "Suicide Attempt", "Self-Harm",
    "Despair", "Urgent Help Request", "Grief After Suicide",
    "Coping with Suicidal Thoughts",
]

# NETWORK VISUALIZATION

# MODEL TRAINING