In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Make sure the third-party packages this notebook relies on can be imported,
# installing whichever ones are missing.
import subprocess
import sys

packages = ['tqdm', 'pandas', 'requests']
for package in packages:
    try:
        __import__(package)
    except ImportError:
        # Invoke pip through the running interpreter so the install lands in
        # the same environment the kernel is using.
        pip_command = [sys.executable, "-m", "pip", "install", package]
        subprocess.check_call(pip_command)

print("✅ Setup completed!")

# Configuration

In [None]:
# Output directory for batch files, checkpoints and the final summary
# (replace with your own path, or a folder on the mounted Drive in Colab)
DRIVE_FOLDER = '/inputPath/FileName'

# Communities to collect; edit this list to target other subreddits
SUBREDDITS = ['alzheimers', 'dementia']

# Collection window (roughly 15-16 years, as requested)
START_DATE = '2010-01-01'
END_DATE = '2025-07-20'

# Number of items buffered in memory before a batch file is written
POST_BATCH_SIZE = 1_000
COMMENT_BATCH_SIZE = 2_000

# Echo the active settings so the run log records them
print(
    f"📁 Data will be saved to: {DRIVE_FOLDER}",
    f"🎯 Target subreddits: {SUBREDDITS}",
    f"📅 Date range: {START_DATE} to {END_DATE}",
    sep="\n",
)

In [None]:
import requests
import json
import pandas as pd
import time
from tqdm import tqdm
import os
from datetime import datetime

# Configuration
# Root URL that every endpoint path passed to make_request is appended to
BASE_URL = 'https://arctic-shift.photon-reddit.com/api'
# HTTP headers sent with every request
HEADERS = {'User-Agent': 'RedditResearchScraper/1.0'}
# Seconds to sleep between successive page requests while scraping
REQUEST_DELAY = 1
# Default number of attempts make_request performs before giving up
MAX_RETRIES = 5
# Seconds to wait before retrying after an HTTP 525 response
RETRY_DELAY = 60

In [None]:
class ColabRedditScraper:
    """Download posts and comments of a subreddit from the API at BASE_URL.

    Items are fetched page by page in ascending ``created_utc`` order and
    written as JSON batch files into ``drive_folder``. A small JSON checkpoint
    per (subreddit, data type) records where the last full batch ended so an
    interrupted run can resume from there.
    """

    def __init__(self, drive_folder=DRIVE_FOLDER):
        """Remember the output folder and create it if it does not exist.

        Args:
            drive_folder: Directory that receives batch, checkpoint and
                summary files.
        """
        self.drive_folder = drive_folder
        os.makedirs(drive_folder, exist_ok=True)
        print(f"📁 Created/verified folder: {drive_folder}")

    def _wait_if_rate_limit_low(self, headers):
        """Sleep until the advertised reset time when few requests remain.

        Malformed or missing rate-limit headers are ignored so that they can
        never cause an otherwise valid response to be thrown away.
        """
        remaining = headers.get('X-RateLimit-Remaining')
        reset_time = headers.get('X-RateLimit-Reset')
        try:
            if not remaining or int(remaining) >= 5 or not reset_time:
                return
            # NOTE(review): this treats X-RateLimit-Reset as an absolute epoch
            # timestamp in seconds — confirm against the API documentation.
            wait_time = max(1, int(reset_time) - int(time.time()))
        except (TypeError, ValueError):
            return

        print(f"⚠️ Rate limit low. Waiting {wait_time}s...")
        time.sleep(wait_time)

    def make_request(self, endpoint, params, max_retries=MAX_RETRIES):
        """Perform a GET against ``BASE_URL/endpoint`` with retry logic.

        Args:
            endpoint: Path appended to BASE_URL (e.g. 'posts/search').
            params: Query parameters forwarded to ``requests.get``.
            max_retries: Maximum number of attempts before giving up.

        Returns:
            The decoded JSON body of the first HTTP 200 response, or None
            when every attempt failed.
        """
        url = f"{BASE_URL}/{endpoint}"

        for attempt in range(1, max_retries + 1):
            try:
                response = requests.get(url, headers=HEADERS, params=params, timeout=30)

                if response.status_code == 200:
                    self._wait_if_rate_limit_low(response.headers)
                    return response.json()

                if response.status_code == 429:
                    print("⚠️ Rate limited. Waiting 60 seconds...")
                    time.sleep(60)
                elif response.status_code == 525:
                    print(f"⚠️ Cloudflare error. Retrying in {RETRY_DELAY}s...")
                    time.sleep(RETRY_DELAY)
                else:
                    print(f"⚠️ HTTP {response.status_code}: {response.text[:200]}")

            except Exception as e:
                # Best effort: network errors and undecodable bodies are
                # reported and retried rather than aborting the scrape.
                print(f"❌ Request error: {e}")

            # No further attempt follows the last one, so do not announce a
            # retry or spend time backing off before returning.
            if attempt == max_retries:
                break

            wait_time = min(300, 2 ** attempt)  # exponential backoff, capped at 5 min
            print(f"🔁 Retry {attempt}/{max_retries} in {wait_time}s...")
            time.sleep(wait_time)

        return None

    def save_checkpoint(self, subreddit, data_type, last_timestamp, batch_num, total_count):
        """Persist the resume position for one (subreddit, data type) pair.

        Args:
            subreddit: Subreddit name; part of the checkpoint file name.
            data_type: 'posts' or 'comments'; part of the checkpoint file name.
            last_timestamp: ``created_utc`` of the last item already saved.
            batch_num: Number the next batch file should use.
            total_count: Total number of items fetched so far.
        """
        checkpoint = {
            'subreddit': subreddit,
            'data_type': data_type,
            'last_timestamp': last_timestamp,
            'batch_num': batch_num,
            'total_count': total_count,
            'timestamp': datetime.now().isoformat()
        }

        checkpoint_file = os.path.join(self.drive_folder, f"checkpoint_{subreddit}_{data_type}.json")
        with open(checkpoint_file, 'w', encoding='utf-8') as f:
            json.dump(checkpoint, f, indent=2)
        print(f"💾 Saved checkpoint: {subreddit} {data_type} - {total_count} items")

    def load_checkpoint(self, subreddit, data_type):
        """Return the stored checkpoint dict, or None if no file exists."""
        checkpoint_file = os.path.join(self.drive_folder, f"checkpoint_{subreddit}_{data_type}.json")
        if os.path.exists(checkpoint_file):
            with open(checkpoint_file, 'r', encoding='utf-8') as f:
                checkpoint = json.load(f)
            print(f"📂 Resuming from checkpoint: {checkpoint['total_count']} {data_type}")
            return checkpoint
        return None

    def _scrape_items(self, subreddit, data_type, endpoint, fields, batch_size, resume):
        """Shared pagination loop behind scrape_posts and scrape_comments.

        Args:
            subreddit: Subreddit to query.
            data_type: 'posts' or 'comments'; used in file names and messages.
            endpoint: API path handed to make_request.
            fields: Comma-separated field list requested from the API.
            batch_size: Number of buffered items that triggers a batch write.
            resume: When true, continue from a stored checkpoint if present.

        Returns:
            Total number of items fetched, including any count restored from
            the checkpoint.
        """
        checkpoint = self.load_checkpoint(subreddit, data_type) if resume else None

        params = {
            'subreddit': subreddit,
            'after': checkpoint['last_timestamp'] if checkpoint else START_DATE,
            'before': END_DATE,
            'limit': 100,
            'sort': 'asc',
            'fields': fields
        }

        batch_num = checkpoint['batch_num'] if checkpoint else 1
        total_count = checkpoint['total_count'] if checkpoint else 0
        current_batch = []

        progress_bar = tqdm(desc=f"r/{subreddit} {data_type}", initial=total_count)
        try:
            while True:
                data = self.make_request(endpoint, params)
                if not data:
                    print("❌ Failed to fetch data")
                    break

                items = data.get('data', [])
                if not items:
                    print(f"✅ No more {data_type} found")
                    break

                current_batch.extend(items)
                total_count += len(items)
                progress_bar.update(len(items))

                # Flush the buffer and record the resume point once it is full
                if len(current_batch) >= batch_size:
                    self.save_batch(current_batch, f"{subreddit}_{data_type}_batch_{batch_num}")
                    self.save_checkpoint(subreddit, data_type, items[-1]['created_utc'], batch_num + 1, total_count)
                    current_batch = []
                    batch_num += 1

                # Advance the cursor to the newest item of this page.
                # NOTE(review): whether 'after' is inclusive or exclusive is not
                # visible here; items sharing the boundary timestamp could be
                # skipped or repeated — confirm against the API documentation.
                params['after'] = items[-1]['created_utc']
                time.sleep(REQUEST_DELAY)

            # Write whatever is left in the buffer as a final, partial batch
            if current_batch:
                self.save_batch(current_batch, f"{subreddit}_{data_type}_batch_{batch_num}")
        finally:
            # Close the bar even when an error or Ctrl-C escapes the loop
            progress_bar.close()

        print(f"✅ Completed {data_type} for r/{subreddit}: {total_count:,} {data_type}")
        return total_count

    def scrape_posts(self, subreddit, resume=True):
        """Scrape all posts of ``subreddit`` between START_DATE and END_DATE.

        Args:
            subreddit: Subreddit name.
            resume: Continue from a stored checkpoint when one exists.

        Returns:
            Total number of posts fetched.
        """
        print(f"\n📝 Starting post scraping for r/{subreddit}")
        return self._scrape_items(
            subreddit,
            'posts',
            'posts/search',
            'id,title,selftext,author,created_utc,score,num_comments,url,over_18',
            POST_BATCH_SIZE,
            resume,
        )

    def scrape_comments(self, subreddit, resume=True):
        """Scrape all comments of ``subreddit`` between START_DATE and END_DATE.

        Args:
            subreddit: Subreddit name.
            resume: Continue from a stored checkpoint when one exists.

        Returns:
            Total number of comments fetched.
        """
        print(f"\n💬 Starting comment scraping for r/{subreddit}")
        return self._scrape_items(
            subreddit,
            'comments',
            'comments/search',
            'id,body,author,link_id,parent_id,created_utc,score',
            COMMENT_BATCH_SIZE,
            resume,
        )

    def save_batch(self, data, filename):
        """Write ``data`` as indented JSON to ``<drive_folder>/<filename>.json``.

        Values that json cannot serialise natively are converted with ``str``.
        """
        filepath = os.path.join(self.drive_folder, f"{filename}.json")
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, default=str)
        print(f"💾 Saved {len(data)} items to {filename}.json")


In [None]:

def run_complete_scraping():
    """Scrape posts and comments for every subreddit in SUBREDDITS.

    Counts are added to the summary as soon as each scrape returns, so a
    failure or interruption during comment scraping no longer drops the post
    count that was already collected. The summary is written to
    ``final_summary.json`` in the scraper's output folder. A Ctrl-C stops the
    loop; any other exception skips to the next subreddit.
    """
    scraper = ColabRedditScraper()

    total_stats = {
        'start_time': datetime.now().isoformat(),
        'subreddits': {},
        'total_posts': 0,
        'total_comments': 0
    }

    for subreddit in SUBREDDITS:
        print(f"\n🎯 Processing r/{subreddit}")

        try:
            # Scrape posts and record them immediately
            posts_count = scraper.scrape_posts(subreddit)
            subreddit_stats = {'posts': posts_count, 'comments': 0}
            total_stats['subreddits'][subreddit] = subreddit_stats
            total_stats['total_posts'] += posts_count

            # Scrape comments and complete the entry
            comments_count = scraper.scrape_comments(subreddit)
            subreddit_stats['comments'] = comments_count
            total_stats['total_comments'] += comments_count

            print(f"🎉 Completed r/{subreddit}: {posts_count:,} posts, {comments_count:,} comments")

        except KeyboardInterrupt:
            print("⚠️ Scraping interrupted by user")
            break
        except Exception as e:
            # Best effort: report the failure and move on to the next subreddit
            print(f"❌ Error processing r/{subreddit}: {e}")
            continue

    # Save final summary
    total_stats['end_time'] = datetime.now().isoformat()
    summary_file = os.path.join(scraper.drive_folder, 'final_summary.json')
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(total_stats, f, indent=2, default=str)

    print(f"\n🎉 SCRAPING COMPLETED!")
    print(f"📊 Total Posts: {total_stats['total_posts']:,}")
    print(f"📊 Total Comments: {total_stats['total_comments']:,}")
    print(f"📁 Data saved to: {scraper.drive_folder}")

In [None]:

# Start the full scrape only when this code is executed as the main program.
if __name__ == "__main__":
    # Banner line followed by a 50-character rule
    print("🚀 Starting Reddit Data Scraping for Research", "=" * 50, sep="\n")
    run_complete_scraping()