In [1]:
!pip install praw pandas numpy matplotlib seaborn nltk textblob spacy gensim scikit-learn xgboost tensorflow transformers

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 1.3/124.9 MB 6.7 MB/s eta 0:00:19
    --------------------------------------- 2.9/124.9 MB 7.0 MB/s eta 0:00:18
   - -------------------------------------- 3.9/124.9 MB 6.5 MB/s eta 0:00:19
   - -------------------------------------- 5.2/124.9 MB 6.5 MB/s eta 0:00:19
   -- ------------------------------------- 6.6/124.9 MB 6.5 MB/s eta 0:00:19
   -- ------------------------------------- 7.9/124.9 MB 6.2 MB/s eta 0:00:19
   -- ------------------------------------- 9.2/124.9 MB 6.3 MB/s eta 0:00:19
   --- ------------------------------------ 10.2/124.9 MB 6.2 MB/s eta 0:00:19
   --- ------------------------------------ 11.3/124.9 MB 6.1 MB/s eta 0:00:19
   ---- ----------------------------------- 12.6/124.9 MB 6.1 MB/s eta 0:00:1


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import json
from tqdm import tqdm

np.random.seed(42)

In [6]:

import praw
import pandas as pd
import time
from tqdm import tqdm
import os
import json

def setup_reddit_credentials():
    """Set up Reddit API credentials either from file or user input"""
    credentials_path = './reddit_credentials.json'

    try:
        with open(credentials_path, 'r') as f:
            credentials = json.load(f)
            print("Loaded existing Reddit API credentials")
            return credentials
    except FileNotFoundError:
        print("Reddit API credentials not found. Please enter them:")
        credentials = {
            'client_id': input("Enter your Reddit client ID: "),
            'client_secret': input("Enter your Reddit client secret: "),
            'user_agent': input("Enter your Reddit user agent (e.g., 'mental_health_research/0.1 by YOUR_USERNAME'): ")
        }

        with open(credentials_path, 'w') as f:
            json.dump(credentials, f)

        print("Credentials saved for future use")
        return credentials

def initialize_reddit_api(credentials):
    """Initialize the Reddit API client using PRAW"""
    reddit = praw.Reddit(
        client_id=credentials['client_id'],
        client_secret=credentials['client_secret'],
        user_agent=credentials['user_agent']
    )
    return reddit

def collect_posts_from_subreddit(reddit, subreddit_name, target_count=30000):
    """Collect posts from a specific subreddit until reaching target count"""
    subreddit = reddit.subreddit(subreddit_name)
    posts = []

    print(f"Collecting data from r/{subreddit_name}...")

    sorting_methods = [
        ('hot', 5000),
        ('top', 'all', 5000),
        ('new', 5000),
        ('rising', 5000),
        ('controversial', 'all', 5000)
    ]

    for method in sorting_methods:
        if len(posts) >= target_count:
            break

        remaining = target_count - len(posts)
        print(f"Using {method[0]} sorting to collect {remaining} more posts...")

        try:
            if len(method) == 2:
                method_name, limit = method
                submissions = getattr(subreddit, method_name)(limit=limit)
            else:
                method_name, time_filter, limit = method
                submissions = getattr(subreddit, method_name)(time_filter=time_filter, limit=limit)

            for submission in tqdm(submissions, total=limit, leave=False):
                if not submission.stickied and submission.selftext and len(submission.selftext) > 50:
                    posts.append({
                        'id': submission.id,
                        'title': submission.title,
                        'text': submission.selftext,
                        'created_utc': submission.created_utc,
                        'score': submission.score,
                        'num_comments': submission.num_comments,
                        'subreddit': subreddit_name
                    })

                    if len(posts) >= target_count:
                        break

        except Exception as e:
            print(f"Error collecting from {method[0]}: {str(e)}")
            continue

    if len(posts) < target_count:
        remaining = target_count - len(posts)
        print(f"Still need {remaining} posts. Using search to find more...")

        search_terms = ["the", "I", "and", "to", "of", "a", "in", "is", "my", "that", "this", "for", "with"]

        for term in search_terms:
            search_results = subreddit.search(term, sort="relevance", limit=30000)
            for submission in search_results:
                if not submission.stickied and submission.selftext and len(submission.selftext) > 50:
                    # Check if we already have this post (by ID)
                    if not any(post['id'] == submission.id for post in posts):
                        posts.append({
                            'id': submission.id,
                            'title': submission.title,
                            'text': submission.selftext,
                            'created_utc': submission.created_utc,
                            'score': submission.score,
                            'num_comments': submission.num_comments,
                            'subreddit': subreddit_name
                        })

                        if len(posts) >= target_count:
                            break

            if len(posts) >= target_count:
                break

    print(f"Collected {len(posts)} posts from r/{subreddit_name}")
    return posts

def collect_clinical_and_non_clinical_data(reddit, posts_per_subreddit=30000):
    """Collect data from both clinical and non-clinical subreddits"""
    base_path = './'

    clinical_subreddits = {
        'ADHD': 'ADHD',
        'Anxiety': 'Anxiety',
        'Bipolar': 'bipolar',
        'Depression': 'depression',
        'EatingDisorders': 'EatingDisorders',
        'OCD': 'OCD',
        'PTSD': 'ptsd',
        'Schizophrenia': 'schizophrenia',
        'SuicideWatch': 'SuicideWatch',
        'BPD': 'BPD',  
        'Stress': 'Stress'
    }

    non_clinical_subreddits = {
        'AskReddit': 'AskReddit',
        'CasualConversation': 'CasualConversation',
        'Fitness': 'Fitness',
        'Cooking': 'Cooking',
        'Gaming': 'gaming',
        'Movies': 'movies',
        'Music': 'Music',
        'Science': 'science',
        'Technology': 'technology',
        'Travel': 'travel'
    }

    clinical_posts = []
    for disorder, subreddit in clinical_subreddits.items():
        try:
            file_path = f"{base_path}data_raw_{subreddit}.csv"
            if os.path.exists(file_path):
                print(f"Found existing data for r/{subreddit}. Loading...")
                df_subreddit = pd.read_csv(file_path)
                posts = df_subreddit.to_dict('records')
                print(f"Loaded {len(posts)} posts from r/{subreddit}")
            else:
                posts = collect_posts_from_subreddit(reddit, subreddit, posts_per_subreddit)
                for post in posts:
                    post['category'] = 'clinical'
                    post['disorder'] = disorder

                df_subreddit = pd.DataFrame(posts)
                df_subreddit.to_csv(file_path, index=False)
                print(f"Saved {len(posts)} posts from r/{subreddit}")

            clinical_posts.extend(posts)

            time.sleep(2)

        except Exception as e:
            print(f"Error collecting from r/{subreddit}: {str(e)}")
            continue

    non_clinical_posts = []
    for category, subreddit in non_clinical_subreddits.items():
        try:
            file_path = f"{base_path}data_raw_{subreddit}.csv"
            if os.path.exists(file_path):
                print(f"Found existing data for r/{subreddit}. Loading...")
                df_subreddit = pd.read_csv(file_path)
                posts = df_subreddit.to_dict('records')
                print(f"Loaded {len(posts)} posts from r/{subreddit}")
            else:
                posts = collect_posts_from_subreddit(reddit, subreddit, posts_per_subreddit)
                for post in posts:
                    post['category'] = 'non-clinical'
                    post['disorder'] = 'none'  # No disorder for non-clinical posts

                df_subreddit = pd.DataFrame(posts)
                df_subreddit.to_csv(file_path, index=False)
                print(f"Saved {len(posts)} posts from r/{subreddit}")

            non_clinical_posts.extend(posts)

            time.sleep(2)

        except Exception as e:
            print(f"Error collecting from r/{subreddit}: {str(e)}")
            continue

    all_posts = clinical_posts + non_clinical_posts
    df_all = pd.DataFrame(all_posts)

    df_all.to_csv(f"{base_path}reddit_mental_health_dataset_raw.csv", index=False)

    df_clinical = pd.DataFrame(clinical_posts)
    df_clinical.to_csv(f"{base_path}reddit_clinical_dataset_raw.csv", index=False)

    df_non_clinical = pd.DataFrame(non_clinical_posts)
    df_non_clinical.to_csv(f"{base_path}reddit_non_clinical_dataset_raw.csv", index=False)

    print("\nData Collection Summary:")
    print(f"Total posts collected: {len(all_posts)}")
    print(f"Clinical posts: {len(clinical_posts)}")
    print(f"Non-clinical posts: {len(non_clinical_posts)}")

    print("\nPosts by subreddit:")
    subreddit_counts = df_all['subreddit'].value_counts()
    for subreddit, count in subreddit_counts.items():
        print(f"  r/{subreddit}: {count} posts")

    return df_clinical, df_non_clinical, df_all

def run_data_collection():
    print("Starting Reddit data collection for mental health detection...\n")

    try:
        credentials = setup_reddit_credentials()
        reddit = initialize_reddit_api(credentials)

        print(f"Connected to Reddit as: {reddit.user.me() or 'Read-only mode'}")

        posts_per_subreddit = 30000

        df_clinical, df_non_clinical, df_all = collect_clinical_and_non_clinical_data(reddit, posts_per_subreddit)

        print("\nData collection completed successfully!")
        print(f"Clinical dataset shape: {df_clinical.shape}")
        print(f"Non-clinical dataset shape: {df_non_clinical.shape}")
        print(f"Combined dataset shape: {df_all.shape}")

        return df_clinical, df_non_clinical, df_all

    except Exception as e:
        print(f"An error occurred during data collection: {str(e)}")
        return None, None, None

In [7]:
# Execute data collection
clinical_df, non_clinical_df, all_df = run_data_collection()

# Optionally display some collected data
if all_df is not None:
    display(all_df.head())
    print("\nDataset info:")
    all_df.info()

Starting Reddit data collection for mental health detection...

Reddit API credentials not found. Please enter them:


Enter your Reddit client ID:  PU2yUedhMR9BEs170mGbhQ
Enter your Reddit client secret:  wM4k9ThQV2gdjQjfz59bWLgGXFwEeQ
Enter your Reddit user agent (e.g., 'mental_health_research/0.1 by YOUR_USERNAME'):  blackwarrant1


Credentials saved for future use
Connected to Reddit as: Read-only mode
Collecting data from r/ADHD...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29465 more posts...


                                                                                                                       

Using new sorting to collect 28505 more posts...


                                                                                                                       

Using rising sorting to collect 27525 more posts...


                                                                                                                       

Using controversial sorting to collect 27500 more posts...


                                                                                                                       

Still need 26572 posts. Using search to find more...
Collected 4341 posts from r/ADHD
Saved 4341 posts from r/ADHD
Collecting data from r/Anxiety...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29049 more posts...


                                                                                                                       

Using new sorting to collect 28327 more posts...


                                                                                                                       

Using rising sorting to collect 27381 more posts...


                                                                                                                       

Using controversial sorting to collect 27357 more posts...


                                                                                                                       

Still need 26492 posts. Using search to find more...
Collected 4368 posts from r/Anxiety
Saved 4368 posts from r/Anxiety
Collecting data from r/bipolar...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29168 more posts...


                                                                                                                       

Using new sorting to collect 29106 more posts...


                                                                                                                       

Using rising sorting to collect 28121 more posts...


                                                                                                                       

Using controversial sorting to collect 28096 more posts...


                                                                                                                       

Still need 27334 posts. Using search to find more...
Collected 3774 posts from r/bipolar
Saved 3774 posts from r/bipolar
Collecting data from r/depression...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29208 more posts...


                                                                                                                       

Using new sorting to collect 28316 more posts...


                                                                                                                       

Using rising sorting to collect 27348 more posts...


                                                                                                                       

Using controversial sorting to collect 27323 more posts...


                                                                                                                       

Still need 26424 posts. Using search to find more...
Collected 4429 posts from r/depression
Saved 4429 posts from r/depression
Collecting data from r/EatingDisorders...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29022 more posts...


                                                                                                                       

Using new sorting to collect 28041 more posts...


                                                                                                                       

Using rising sorting to collect 27054 more posts...


                                                                                                                       

Using controversial sorting to collect 27030 more posts...


                                                                                                                       

Still need 26061 posts. Using search to find more...
Collected 5045 posts from r/EatingDisorders
Saved 5045 posts from r/EatingDisorders
Collecting data from r/OCD...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29041 more posts...


                                                                                                                       

Using new sorting to collect 28862 more posts...


                                                                                                                       

Using rising sorting to collect 27892 more posts...


                                                                                                                       

Using controversial sorting to collect 27868 more posts...


                                                                                                                       

Still need 27015 posts. Using search to find more...
Collected 4135 posts from r/OCD
Saved 4135 posts from r/OCD
Collecting data from r/ptsd...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29024 more posts...


                                                                                                                       

Using new sorting to collect 28080 more posts...


                                                                                                                       

Using rising sorting to collect 27102 more posts...


                                                                                                                       

Using controversial sorting to collect 27077 more posts...


                                                                                                                       

Still need 26139 posts. Using search to find more...
Collected 4735 posts from r/ptsd
Saved 4735 posts from r/ptsd
Collecting data from r/schizophrenia...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29152 more posts...


                                                                                                                       

Using new sorting to collect 29031 more posts...


                                                                                                                       

Using rising sorting to collect 28176 more posts...


                                                                                                                       

Using controversial sorting to collect 28156 more posts...


                                                                                                                       

Still need 27362 posts. Using search to find more...
Collected 3606 posts from r/schizophrenia
Saved 3606 posts from r/schizophrenia
Collecting data from r/SuicideWatch...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29203 more posts...


                                                                                                                       

Using new sorting to collect 28381 more posts...


                                                                                                                       

Using rising sorting to collect 27454 more posts...


                                                                                                                       

Using controversial sorting to collect 27429 more posts...


                                                                                                                       

Still need 26562 posts. Using search to find more...
Collected 4444 posts from r/SuicideWatch
Saved 4444 posts from r/SuicideWatch
Collecting data from r/BPD...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29148 more posts...


                                                                                                                       

Using new sorting to collect 28182 more posts...


                                                                                                                       

Using rising sorting to collect 27189 more posts...


                                                                                                                       

Using controversial sorting to collect 27164 more posts...


                                                                                                                       

Still need 26197 posts. Using search to find more...
Collected 5092 posts from r/BPD
Saved 5092 posts from r/BPD
Collecting data from r/Stress...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29061 more posts...


                                                                                                                       

Using new sorting to collect 28153 more posts...


                                                                                                                       

Using rising sorting to collect 27248 more posts...


                                                                                                                       

Using controversial sorting to collect 27224 more posts...


                                                                                                                       

Still need 26352 posts. Using search to find more...
Collected 4270 posts from r/Stress
Saved 4270 posts from r/Stress
Collecting data from r/AskReddit...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 30000 more posts...


                                                                                                                       

Using new sorting to collect 29987 more posts...


                                                                                                                       

Using rising sorting to collect 29987 more posts...


                                                                                                                       

Using controversial sorting to collect 29987 more posts...


                                                                                                                       

Still need 29633 posts. Using search to find more...
Collected 476 posts from r/AskReddit
Saved 476 posts from r/AskReddit
Collecting data from r/CasualConversation...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29257 more posts...


                                                                                                                       

Using new sorting to collect 28289 more posts...


                                                                                                                       

Using rising sorting to collect 27413 more posts...


                                                                                                                       

Using controversial sorting to collect 27388 more posts...


                                                                                                                       

Still need 26421 posts. Using search to find more...
Collected 4298 posts from r/CasualConversation
Saved 4298 posts from r/CasualConversation
Collecting data from r/Fitness...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29965 more posts...


                                                                                                                       

Using new sorting to collect 28981 more posts...


                                                                                                                       

Using rising sorting to collect 27990 more posts...


                                                                                                                       

Using controversial sorting to collect 27967 more posts...


                                                                                                                       

Still need 27063 posts. Using search to find more...
Collected 3673 posts from r/Fitness
Saved 3673 posts from r/Fitness
Collecting data from r/Cooking...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29087 more posts...


                                                                                                                       

Using new sorting to collect 28232 more posts...


                                                                                                                       

Using rising sorting to collect 27343 more posts...


                                                                                                                       

Using controversial sorting to collect 27318 more posts...


                                                                                                                       

Still need 26475 posts. Using search to find more...
Collected 4292 posts from r/Cooking
Saved 4292 posts from r/Cooking
Collecting data from r/gaming...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29641 more posts...


                                                                                                                       

Using new sorting to collect 29630 more posts...


                                                                                                                       

Using rising sorting to collect 29088 more posts...


                                                                                                                       

Using controversial sorting to collect 29077 more posts...


                                                                                                                       

Still need 29019 posts. Using search to find more...
Collected 1451 posts from r/gaming
Saved 1451 posts from r/gaming
Collecting data from r/movies...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29525 more posts...


                                                                                                                       

Using new sorting to collect 29469 more posts...


                                                                                                                       

Using rising sorting to collect 28728 more posts...


                                                                                                                       

Using controversial sorting to collect 28727 more posts...


                                                                                                                       

Still need 28135 posts. Using search to find more...
Collected 2602 posts from r/movies
Saved 2602 posts from r/movies
Collecting data from r/Music...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29842 more posts...


                                                                                                                       

Using new sorting to collect 29674 more posts...


                                                                                                                       

Using rising sorting to collect 29413 more posts...


                                                                                                                       

Using controversial sorting to collect 29412 more posts...


                                                                                                                       

Still need 29040 posts. Using search to find more...
Collected 1469 posts from r/Music
Saved 1469 posts from r/Music
Collecting data from r/science...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 30000 more posts...


                                                                                                                       

Using new sorting to collect 29994 more posts...


                                                                                                                       

Using rising sorting to collect 29994 more posts...


                                                                                                                       

Using controversial sorting to collect 29994 more posts...


                                                                                                                       

Still need 29973 posts. Using search to find more...
Collected 325 posts from r/science
Saved 325 posts from r/science
Collecting data from r/technology...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 30000 more posts...


                                                                                                                       

Using new sorting to collect 29982 more posts...


                                                                                                                       

Using rising sorting to collect 29982 more posts...


                                                                                                                       

Using controversial sorting to collect 29982 more posts...


                                                                                                                       

Still need 29957 posts. Using search to find more...
Collected 59 posts from r/technology
Saved 59 posts from r/technology
Collecting data from r/travel...
Using hot sorting to collect 30000 more posts...


                                                                                                                       

Using top sorting to collect 29353 more posts...


                                                                                                                       

Using new sorting to collect 29296 more posts...


                                                                                                                       

Using rising sorting to collect 29117 more posts...


                                                                                                                       

Using controversial sorting to collect 29094 more posts...


                                                                                                                       

Still need 28201 posts. Using search to find more...
Collected 2566 posts from r/travel
Saved 2566 posts from r/travel

Data Collection Summary:
Total posts collected: 69450
Clinical posts: 48239
Non-clinical posts: 21211

Posts by subreddit:
  r/BPD: 5092 posts
  r/EatingDisorders: 5045 posts
  r/ptsd: 4735 posts
  r/SuicideWatch: 4444 posts
  r/depression: 4429 posts
  r/Anxiety: 4368 posts
  r/ADHD: 4341 posts
  r/CasualConversation: 4298 posts
  r/Cooking: 4292 posts
  r/Stress: 4270 posts
  r/OCD: 4135 posts
  r/bipolar: 3774 posts
  r/Fitness: 3673 posts
  r/schizophrenia: 3606 posts
  r/movies: 2602 posts
  r/travel: 2566 posts
  r/Music: 1469 posts
  r/gaming: 1451 posts
  r/AskReddit: 476 posts
  r/science: 325 posts
  r/technology: 59 posts

Data collection completed successfully!
Clinical dataset shape: (48239, 9)
Non-clinical dataset shape: (21211, 9)
Combined dataset shape: (69450, 9)


Unnamed: 0,id,title,text,created_utc,score,num_comments,subreddit,category,disorder
0,1joyhmu,"People who were diagnosed late in life, what's...",For me it was my exceptional ability to make i...,1743521000.0,1472,793,ADHD,clinical,ADHD
1,1jp2iex,Accidentally managed my ADHD at work,"I (AuDHD) am at work, at a no-phone-calls offi...",1743531000.0,477,82,ADHD,clinical,ADHD
2,1jpf7td,Does your ADHD cause you to skip and stutter y...,Sometimes I just cannot get words out at all.....,1743564000.0,67,34,ADHD,clinical,ADHD
3,1jp5gle,What is a hobby that actually stuck for you?,A common experience for people with adhd is ho...,1743538000.0,144,411,ADHD,clinical,ADHD
4,1jpc5di,I think I figured out why I/we eat faster than...,So I noticed while I was eating dinner just no...,1743556000.0,39,153,ADHD,clinical,ADHD



Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69450 entries, 0 to 69449
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            69450 non-null  object 
 1   title         69450 non-null  object 
 2   text          69450 non-null  object 
 3   created_utc   69450 non-null  float64
 4   score         69450 non-null  int64  
 5   num_comments  69450 non-null  int64  
 6   subreddit     69450 non-null  object 
 7   category      69450 non-null  object 
 8   disorder      69450 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 4.8+ MB


In [12]:
import pandas as pd

# File path
file_path = r'C:\Users\User\Downloads\SMA2\reddit_mental_health_dataset_raw.csv'

# Subreddits to remove
unwanted_subreddits = [
    'Cooking', 'Fitness', 'movies', 'travel',
    'Music', 'gaming', 'AskReddit', 'science', 'technology'
]

# Read dataset
df = pd.read_csv(file_path)

# Remove unwanted subreddits
filtered_df = df[~df['subreddit'].isin(unwanted_subreddits)]

# Save the cleaned dataset
filtered_df.to_csv('reddit_mental_health_dataset_cleaned1.csv', index=False)

print("Unwanted subreddits removed successfully.")


Unwanted subreddits removed successfully.


In [17]:
if filtered_df is not None:
    display(filtered_df.head())
    print("\nDataset info:")
    filtered_df.info()

Unnamed: 0,id,title,text,created_utc,score,num_comments,subreddit,category,disorder
0,1joyhmu,"People who were diagnosed late in life, what's...",For me it was my exceptional ability to make i...,1743521000.0,1472,793,ADHD,clinical,ADHD
1,1jp2iex,Accidentally managed my ADHD at work,"I (AuDHD) am at work, at a no-phone-calls offi...",1743531000.0,477,82,ADHD,clinical,ADHD
2,1jpf7td,Does your ADHD cause you to skip and stutter y...,Sometimes I just cannot get words out at all.....,1743564000.0,67,34,ADHD,clinical,ADHD
3,1jp5gle,What is a hobby that actually stuck for you?,A common experience for people with adhd is ho...,1743538000.0,144,411,ADHD,clinical,ADHD
4,1jpc5di,I think I figured out why I/we eat faster than...,So I noticed while I was eating dinner just no...,1743556000.0,39,153,ADHD,clinical,ADHD



Dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 52537 entries, 0 to 53012
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            52537 non-null  object 
 1   title         52537 non-null  object 
 2   text          52537 non-null  object 
 3   created_utc   52537 non-null  float64
 4   score         52537 non-null  int64  
 5   num_comments  52537 non-null  int64  
 6   subreddit     52537 non-null  object 
 7   category      52537 non-null  object 
 8   disorder      52537 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 4.0+ MB


In [15]:
print(f"Combined dataset shape: {filtered_df.shape}")


Combined dataset shape: (52537, 9)


In [20]:

# Replace 'None' in the 'disorder' column with 'Normal'
filtered_df.loc[filtered_df['disorder'] == 'none', 'disorder'] = 'Normal'

# Save the cleaned dataset
filtered_df.to_csv('reddit_mental_health_dataset_cleaned_normal.csv', index=False)

print("Unwanted subreddits removed and 'None' replaced with 'Normal' in disorder column.")


Unwanted subreddits removed and 'None' replaced with 'Normal' in disorder column.
