In [44]:
!pip install praw
!pip install asyncpraw
!pip install langdetect



In [45]:
import pandas as pd
from datasets import load_dataset
import os

import praw
from time import sleep

from langdetect import detect, DetectorFactory

In [46]:
# Parameters
TARGET_PER_LABEL = 2200
MAX_LENGTH = 800             # max characters per post
SLEEP_TIME = 0.2             # seconds between requests to avoid rate limit
DetectorFactory.seed = 0  # for consistent results

# Part 2   
fetching 4 class from reddit


In [47]:
df_hf = pd.read_csv('/content/huggingFace_mental_health_dataset.csv')
save_file = "reddit_mental_health_dataset.csv"

if os.path.exists(save_file):
    df_existing = pd.read_csv(save_file)
    data_reddit = df_existing.to_dict("records") + df_hf.to_dict("records")
    seen_texts = set(df_existing["body"].tolist()) | set(df_hf["body"].tolist())  # avoid duplicates
    print(f"🔄 Resuming... already have {len(df_existing)} posts")
else:
    data_reddit = df_hf.to_dict("records")
    seen_texts = set(df_hf["body"].tolist())

🔄 Resuming... already have 12454 posts


In [48]:
# 1. Setup Reddit API
reddit = praw.Reddit(client_id="YOUR_CLIENT_ID",
                     client_secret="YOUR_CLIENT_SECRET",
                     user_agent="YOUR_USER_AGENT")

In [49]:
# 2. Define rest of subreddits

mental_subreddits = {
    "Anxiety": {
        "sub_list": ["anxiety", "socialanxiety", "PanicAttack", "HealthAnxiety"],
        "MIN_LENGTH": 80
    },
    "Suicidal": {
        "sub_list": ["SuicideWatch", "Suicidal_Thoughts", "SuicideBereavement"],
        "MIN_LENGTH": 50
    },
    "Addiction": {
        "sub_list": ["StopSmoking", "addiction", "StopDrinking", "stopgaming"],
        "MIN_LENGTH": 80
    },
    "Normal": {
        "sub_list": ["happy", "CasualConversation", "wholesomememes", "LifeProTips"],
        "MIN_LENGTH": 50
    }
}


In [50]:
def is_english(text):
    try:
        return detect(text) == "en"
    except:
        return False

In [51]:
def fetch_posts_for_label(label, subreddits, MIN_LENGTH):

    for subreddit_name in subreddits:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            print(f"\nFetching '{label}' posts from r/{subreddit_name}...")

            for post in subreddit.new(limit=None):
                title = post.title.strip()
                body = post.selftext.strip()

                if len(body) < MIN_LENGTH or len(body) > MAX_LENGTH or body in seen_texts:
                    continue
                if not is_english(body):
                    continue

                seen_texts.add(body)

                data_reddit.append({
                    "post_id": post.id,
                    "subreddit": subreddit_name,
                    "title": title,
                    "body": body,
                    "label": label
                })

                sleep(SLEEP_TIME)

            # Save after each subreddit
            df_reddit = pd.DataFrame(data_reddit)
            df_reddit.to_csv(save_file, index=False)
            print(f"💾 Progress saved: {len(df_reddit)} posts so far")

            current_label_count = sum(1 for d in data_reddit if d["label"] == label)
            if current_label_count >= TARGET_PER_LABEL:
              return

        except Exception as e:
            print(f"⚠️ Could not fetch from r/{subreddit_name}: {e}")


In [52]:
# Example fetch call
for label, subreddits in mental_subreddits.items():
    fetch_posts_for_label(label, subreddits["sub_list"], subreddits["MIN_LENGTH"])


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.




Fetching 'Anxiety' posts from r/anxiety...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 0 posts for r/anxiety (Anxiety)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



💾 Progress saved: 20985 posts so far

Fetching 'Anxiety' posts from r/socialanxiety...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 0 posts for r/socialanxiety (Anxiety)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



💾 Progress saved: 21496 posts so far

Fetching 'Suicidal' posts from r/SuicideWatch...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 0 posts for r/SuicideWatch (Suicidal)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



💾 Progress saved: 22081 posts so far

Fetching 'Suicidal' posts from r/Suicidal_Thoughts...
⚠️ Could not fetch from r/Suicidal_Thoughts: received 404 HTTP response

Fetching 'Suicidal' posts from r/SuicideBereavement...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 0 posts for r/SuicideBereavement (Suicidal)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



💾 Progress saved: 22510 posts so far

Fetching 'Addiction' posts from r/StopSmoking...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 0 posts for r/StopSmoking (Addiction)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



💾 Progress saved: 23077 posts so far

Fetching 'Addiction' posts from r/addiction...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 0 posts for r/addiction (Addiction)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



💾 Progress saved: 23531 posts so far

Fetching 'Addiction' posts from r/StopDrinking...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 0 posts for r/StopDrinking (Addiction)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



💾 Progress saved: 24098 posts so far

Fetching 'Normal' posts from r/happy...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 0 posts for r/happy (Normal)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



💾 Progress saved: 24548 posts so far

Fetching 'Normal' posts from r/CasualConversation...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 0 posts for r/CasualConversation (Normal)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



💾 Progress saved: 25258 posts so far

Fetching 'Normal' posts from r/wholesomememes...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 0 posts for r/wholesomememes (Normal)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



💾 Progress saved: 25281 posts so far

Fetching 'Normal' posts from r/LifeProTips...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 0 posts for r/LifeProTips (Normal)
💾 Progress saved: 25855 posts so far


In [53]:
df_reddit = pd.DataFrame(data_reddit)
df_reddit = df_reddit.drop_duplicates(subset="body").reset_index(drop=True)
df_reddit['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Addiction,2747
Normal,2638
Anxiety,2557
ADHD,2000
PTSD,2000
OCD,2000
Depression,2000
Suicidal,1913


In [56]:
BALANCE_SIZE = 2000

balanced_df = (
    df_reddit
    .groupby("label", group_keys=False)   # group by label
    .apply(lambda x: x.sample(n=min(BALANCE_SIZE, len(x)), random_state=42))  # sample within each class
    .reset_index(drop=True)
)

balanced_df["label"].value_counts()

  .apply(lambda x: x.sample(n=min(BALANCE_SIZE, len(x)), random_state=42))  # sample within each class


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ADHD,2000
Addiction,2000
Anxiety,2000
Depression,2000
Normal,2000
OCD,2000
PTSD,2000
Suicidal,1913


In [57]:
from google.colab import files
balanced_df.to_csv(save_file, index=False)
files.download(save_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>