In [2]:
import datetime as dt
import os
import time

import pandas as pd
import praw

# Reddit API credentials.
# The secrets are read from the environment so they are never stored in (or
# committed with) the notebook. Export REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before starting the kernel; a missing variable raises
# KeyError right here.
# NOTE(review): the id/secret pair that used to be hard-coded in this cell has
# been exposed in the notebook history and should be revoked and regenerated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="Mental_health"
)

# Date range (inclusive on both ends) that a post's creation time must fall in
start_date = dt.datetime(2024, 1, 1)
end_date = dt.datetime(2025, 6, 25)

# Subreddits to scrape; the subreddit name is also stored as the row's 'keyword'
subreddits = [
    'depression', 'Anxiety', 'SuicideWatch', 'socialanxiety',
    'mentalhealth', 'PTSD', 'Stress', 'loneliness',
    'addiction', 'PanicAttack', 'OCD', 'mentalillness', 'BipolarReddit', 'BPD'
]

# Sort methods to try; each is looked up by name as a listing method on the subreddit
sort_methods = ['top', 'hot', 'new', 'controversial', 'rising']

# Storage
all_data = []      # one dict per saved post
seen_ids = set()  # Used to avoid duplicates by tracking seen post IDs

# Scrape every subreddit with every sort order. A post that shows up under
# several sort orders (or subreddits) is only considered the first time it is
# seen, so the CSV ends up with at most one row per post id.
for sub_name in subreddits:
    subreddit = reddit.subreddit(sub_name)
    print(f"🔍 Scraping subreddit: {sub_name}")

    for method in sort_methods:
        print(f"   ➤ Using sort: {method}")
        count = 0

        try:
            # Only 'top' and 'controversial' are called with a time window.
            # NOTE(review): PRAW documents that most Reddit listings contain at
            # most ~1000 items, so limit=3000 effectively means "everything
            # the listing offers" (the run log shows just under 1000 per sort).
            # NOTE(review): time_filter='year' presumably restricts results to
            # about the last 12 months, so posts close to start_date may never
            # be returned — confirm this is intended.
            if method in ['top', 'controversial']:
                submissions = getattr(subreddit, method)(time_filter='year', limit=3000)
            else:
                submissions = getattr(subreddit, method)(limit=3000)

            for submission in submissions:
                # created_utc is an epoch timestamp. Convert it explicitly in
                # UTC (plain fromtimestamp() would use the machine's local
                # timezone and make the date filter host-dependent), then drop
                # tzinfo so it compares with the naive start_date/end_date and
                # is written to the CSV in the same format as before.
                post_time = dt.datetime.fromtimestamp(
                    submission.created_utc, tz=dt.timezone.utc
                ).replace(tzinfo=None)

                # ✅ Check for duplicates
                if submission.id in seen_ids:
                    continue
                seen_ids.add(submission.id)

                # ✅ Date range filter
                if not (start_date <= post_time <= end_date):
                    continue

                # Extract up to 10 comments. replace_more(limit=0) drops the
                # "load more comments" stubs without extra API requests.
                try:
                    submission.comments.replace_more(limit=0)
                    comment_bodies = [comment.body for comment in submission.comments.list()[:10]]
                except Exception as e:
                    # Keep the post, but do not store the error text as if it
                    # were a real comment — that would pollute the dataset.
                    print(f"     ⚠️ Could not fetch comments for post {submission.id}: {e}")
                    comment_bodies = []

                # ✅ Add post_id to the saved data
                all_data.append({
                    'post_id': submission.id,
                    'keyword': sub_name,
                    'post_title': submission.title,
                    'post_text': submission.selftext,
                    'post_date': post_time,
                    'comments': comment_bodies,
                    'source_method': method
                })
                count += 1

            print(f"     ✅ {count} new posts saved from '{method}'\n")
            time.sleep(2)

        except Exception as e:
            # Best effort: a failing listing must not abort the whole scrape.
            # Posts collected before the failure stay in all_data.
            print(f"     ❌ Failed with method '{method}' on {sub_name}: {e}")

        time.sleep(1)

# Save to CSV. The column list is explicit so that even an empty run writes a
# header row and the file can still be loaded with pd.read_csv afterwards.
output_columns = [
    'post_id', 'keyword', 'post_title', 'post_text',
    'post_date', 'comments', 'source_method'
]
df = pd.DataFrame(all_data, columns=output_columns)
df.to_csv('reddit_mental_health_combined_with_comments.csv', index=False)
print(f"\n🎯 All done! Saved {len(df)} unique posts in total.")


🔍 Scraping subreddit: depression
   ➤ Using sort: top
     ✅ 971 new posts saved from 'top'

   ➤ Using sort: hot
     ✅ 1 new posts saved from 'hot'

   ➤ Using sort: new
     ✅ 0 new posts saved from 'new'

   ➤ Using sort: controversial
     ✅ 926 new posts saved from 'controversial'

   ➤ Using sort: rising
     ✅ 0 new posts saved from 'rising'

🔍 Scraping subreddit: Anxiety
   ➤ Using sort: top
     ✅ 986 new posts saved from 'top'

   ➤ Using sort: hot
     ✅ 2 new posts saved from 'hot'

   ➤ Using sort: new
     ✅ 0 new posts saved from 'new'

   ➤ Using sort: controversial
     ✅ 939 new posts saved from 'controversial'

   ➤ Using sort: rising
     ✅ 0 new posts saved from 'rising'

🔍 Scraping subreddit: SuicideWatch
   ➤ Using sort: top
     ✅ 964 new posts saved from 'top'

   ➤ Using sort: hot
     ✅ 0 new posts saved from 'hot'

   ➤ Using sort: new
     ✅ 0 new posts saved from 'new'

   ➤ Using sort: controversial
     ✅ 899 new posts saved from 'controversial'

   ➤ U

In [None]:
import datetime as dt
import os
import time

import pandas as pd
import praw

# Load previously scraped data; ids are compared as strings
existing_df = pd.read_csv('reddit_mental_health_combined_with_comments.csv')
existing_post_ids = set(existing_df['post_id'].astype(str))

# Reddit API credentials.
# The secrets are read from the environment so they are never stored in (or
# committed with) the notebook. Export REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before starting the kernel; a missing variable raises
# KeyError right here.
# NOTE(review): the id/secret pair that used to be hard-coded in this cell has
# been exposed in the notebook history and should be revoked and regenerated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="Mental_health"
)

# Subreddits to scrape; the subreddit name is also stored as the row's 'keyword'
subreddits = [
    'depression', 'Anxiety', 'SuicideWatch', 'socialanxiety',
    'mentalhealth', 'PTSD', 'Stress', 'loneliness',
    'addiction', 'PanicAttack', 'OCD', 'mentalillness', 'BipolarReddit', 'BPD'
]

# Storage for new data
new_data = []          # one dict per newly discovered post
seen_new_ids = set()   # ids already handled during this run

# Pull only from subreddit.new() and keep the posts that are not in the CSV yet
for sub_name in subreddits:
    subreddit = reddit.subreddit(sub_name)
    print(f"📥 Checking subreddit.new() in: {sub_name}")

    try:
        for submission in subreddit.new(limit=300):  # You can increase this over time
            post_id = submission.id
            # created_utc is an epoch timestamp. Convert it explicitly in UTC
            # (plain fromtimestamp() would use the machine's local timezone),
            # then drop tzinfo so the CSV keeps its naive timestamp format.
            post_time = dt.datetime.fromtimestamp(
                submission.created_utc, tz=dt.timezone.utc
            ).replace(tzinfo=None)

            # Skip if already in old CSV or seen in this run
            if post_id in existing_post_ids or post_id in seen_new_ids:
                continue
            seen_new_ids.add(post_id)

            try:
                # Only the first 10 comments are kept, so expanding "load more
                # comments" stubs is wasted work: per the PRAW docs every
                # replacement costs one API request, while limit=0 simply
                # removes the stubs without any extra request.
                submission.comments.replace_more(limit=0)
                comment_bodies = [comment.body for comment in submission.comments.list()[:10]]
            except Exception as e:
                # Keep the post, but do not store the error text as if it were
                # a real comment — that would pollute the dataset.
                print(f"   ⚠️ Could not fetch comments for post {post_id}: {e}")
                comment_bodies = []

            new_data.append({
                'post_id': post_id,
                'keyword': sub_name,
                'post_title': submission.title,
                'post_text': submission.selftext,
                'post_date': post_time,
                'comments': comment_bodies,
                'source_method': 'new'
            })

        time.sleep(2)

    except Exception as e:
        # Best effort: one failing subreddit must not abort the whole run.
        # Posts collected before the failure stay in new_data.
        print(f"Failed to process {sub_name}: {e}")

# Save new data: append to the existing rows and overwrite the CSV in place
if new_data:
    new_df = pd.DataFrame(new_data)
    combined_df = pd.concat([existing_df, new_df], ignore_index=True)
    combined_df.to_csv('reddit_mental_health_combined_with_comments.csv', index=False)
    print(f"\n Appended {len(new_df)} new posts. Total is now {len(combined_df)}.")
else:
    print("\n  No new posts to add today.")


📥 Checking subreddit.new() in: depression
📥 Checking subreddit.new() in: Anxiety
📥 Checking subreddit.new() in: SuicideWatch
📥 Checking subreddit.new() in: socialanxiety
📥 Checking subreddit.new() in: mentalhealth
📥 Checking subreddit.new() in: PTSD
📥 Checking subreddit.new() in: Stress
📥 Checking subreddit.new() in: loneliness
📥 Checking subreddit.new() in: addiction
📥 Checking subreddit.new() in: PanicAttack
📥 Checking subreddit.new() in: OCD
📥 Checking subreddit.new() in: mentalillness
📥 Checking subreddit.new() in: BipolarReddit
📥 Checking subreddit.new() in: BPD

✅ Appended 2720 new posts. Total is now 49404.


In [2]:
import datetime as dt
import os
import time

import pandas as pd
import praw

# Load previously scraped data; ids are normalised to strings for comparisons
existing_df = pd.read_csv('reddit_mental_health_combined_with_comments.csv')
existing_df['post_id'] = existing_df['post_id'].astype(str)
existing_post_ids = set(existing_df['post_id'])

# Filter only "new" posts (to check for updates)
new_only_df = existing_df[existing_df['source_method'] == 'new']

# Reddit API credentials.
# The secrets are read from the environment so they are never stored in (or
# committed with) the notebook. Export REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before starting the kernel; a missing variable raises
# KeyError right here.
# NOTE(review): the id/secret pair that used to be hard-coded in this cell has
# been exposed in the notebook history and should be revoked and regenerated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="Mental_health"
)

# Storage for updated posts (one pandas Series per row that gained comments)
updated_rows = []

def _count_saved_comments(raw_comments):
    """Return how many real comments a CSV row already holds.

    Args:
        raw_comments: value of the row's 'comments' cell as loaded from the
            CSV. A string is expected to be the repr of a Python list of
            comment texts; anything else (e.g. NaN for an empty cell) counts
            as zero comments.

    Returns:
        Number of list entries, not counting the "Error fetching comments: ..."
        placeholder that the collection step stores when a fetch fails.

    Raises:
        ValueError, SyntaxError: if the string is not a valid Python literal.
    """
    # Function-scope import keeps this helper self-contained.
    import ast

    if not isinstance(raw_comments, str):
        return 0
    # literal_eval only parses literals. The cell embeds user-written Reddit
    # text, so it must never be handed to eval(), which executes arbitrary code.
    saved = ast.literal_eval(raw_comments)
    return sum(
        1 for body in saved
        if not str(body).startswith("Error fetching comments:")
    )


# Re-fetch every post that was collected through the 'new' listing and keep
# the fresh comment list when it is longer than the stored one.
for index, row in new_only_df.iterrows():
    try:
        submission = reddit.submission(id=row['post_id'])
        # Only the first 10 comments are kept, so expanding "load more
        # comments" stubs is wasted work: per the PRAW docs every replacement
        # costs one API request, while limit=0 removes the stubs for free.
        submission.comments.replace_more(limit=0)
        updated_comments = [comment.body for comment in submission.comments.list()[:10]]

        # Only update if new comments are added
        old_comment_count = _count_saved_comments(row['comments'])
        new_comment_count = len(updated_comments)

        if new_comment_count > old_comment_count:
            print(f"🔁 Updating post {row['post_id']} with more comments: {new_comment_count} > {old_comment_count}")
            # Work on a copy instead of mutating the row handed out by iterrows
            refreshed_row = row.copy()
            refreshed_row['comments'] = updated_comments
            updated_rows.append(refreshed_row)
        else:
            print(f"✅ No update needed for post {row['post_id']}")

        time.sleep(1)

    except Exception as e:
        # Best effort: a post that cannot be fetched or parsed keeps its old row
        print(f"⚠️ Could not update post {row['post_id']}: {e}")

# Replace updated posts in original DataFrame
if updated_rows:
    updated_df = pd.DataFrame(updated_rows)
    # Drop old versions of updated posts
    combined_df = existing_df[~existing_df['post_id'].isin(updated_df['post_id'])]
    # Append updated posts (they end up at the bottom of the file)
    combined_df = pd.concat([combined_df, updated_df], ignore_index=True)
    # Save
    combined_df.to_csv('reddit_mental_health_combined_with_comments.csv', index=False)
    print(f"\n✅ Updated {len(updated_df)} posts with new comments.")
else:
    print("\nℹ️ No posts needed updating.")


✅ No update needed for post 1l9kf5q
✅ No update needed for post 1l9d0kq
✅ No update needed for post 1l99bb5
✅ No update needed for post 1l97kn4
✅ No update needed for post 1l97k53
✅ No update needed for post 1l974pf
✅ No update needed for post 1l971s1
✅ No update needed for post 1l96h95
✅ No update needed for post 1l95zvs
✅ No update needed for post 1l95te1
✅ No update needed for post 1l945e9
✅ No update needed for post 1l93ktf
✅ No update needed for post 1l93igz
✅ No update needed for post 1l92mzg
✅ No update needed for post 1l92khf
✅ No update needed for post 1l924l3
✅ No update needed for post 1l923rx
✅ No update needed for post 1l91cc9
✅ No update needed for post 1l8xyx1
✅ No update needed for post 1l8wskl
✅ No update needed for post 1l8wopa
✅ No update needed for post 1l8vdft
✅ No update needed for post 1l8vbh7
✅ No update needed for post 1kt46oe
✅ No update needed for post 1k91wud
✅ No update needed for post 1k8xgl3
✅ No update needed for post 1k8swza
✅ No update needed for post 