In [13]:
import datetime
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import praw

In [15]:
# Build the Reddit API client from environment variables so that no secret is
# stored in the notebook (or in its saved outputs / version history).
# Required: REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET. Optional: REDDIT_USER_AGENT.
# NOTE(review): any credentials that were ever committed in this cell should be
# treated as leaked and rotated in the Reddit app settings.
_missing_reddit_env = [
    var for var in ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET") if not os.environ.get(var)
]
if _missing_reddit_env:
    raise RuntimeError(
        "Missing Reddit API credentials; set environment variable(s): "
        + ", ".join(_missing_reddit_env)
    )

reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    # The user agent is not a secret; keep the previous value as the default.
    user_agent=os.environ.get("REDDIT_USER_AGENT", "ThreadInsight/0.1 by Dry_Wolverine7741"),
)

In [65]:
# Scrape settings. The fetch functions in this notebook read these as
# module-level globals, so the names must stay exactly as they are.
subreddit_name: str = "funny"  # subreddit to scan, without the "r/" prefix
days_to_fetch: int = 1  # look-back window, in days
max_posts: int = 500  # limit passed to the subreddit's "new" listing
subscribers = None  # subscriber count; assigned by fetch_recent_posts()
post_workers: int = 2  # thread-pool size used while building post records
comment_workers: int = 3  # thread-pool size used while fetching comments

In [73]:
def fetch_recent_posts():
    """Collect metadata for posts made in the last ``days_to_fetch`` days.

    Reads the module-level settings ``subreddit_name``, ``days_to_fetch``,
    ``max_posts`` and ``post_workers`` together with the ``reddit`` client, and
    stores the subreddit's subscriber count in the module-level ``subscribers``.

    Returns:
        list[dict]: one record per post, newest first, with the keys ``id``,
        ``author``, ``upvotes``, ``created_utc``, ``post_type``, ``flair``,
        ``subscribers``, ``crosspost_parent_list`` and ``account_age_days``.

    Raises:
        praw.exceptions.APIException: for any API error that is not a rate limit.
    """
    global subscribers
    all_posts = []
    subreddit_instance = reddit.subreddit(subreddit_name)

    # Use an aware UTC datetime: calling .timestamp() on the naive value returned
    # by utcnow() interprets it as *local* time and shifts the cutoff by the
    # machine's UTC offset.
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    start_timestamp = int((now_utc - datetime.timedelta(days=days_to_fetch)).timestamp())

    print(f"🔍 Fetching only posts from the last {days_to_fetch} days in r/{subreddit_name}...")

    print("Fetching subreddit metadata (subscribers)...")
    subscribers = subreddit_instance.subscribers
    time.sleep(1)

    backoff_time = 1
    while True:
        # Start every attempt from an empty list; otherwise posts gathered before
        # a rate-limit error would be collected a second time on the retry.
        post_list = []
        try:
            for post in subreddit_instance.new(limit=max_posts):
                # The early exit relies on the listing being newest-first.
                if post.created_utc < start_timestamp:
                    break
                post_list.append(post)
            break

        except praw.exceptions.APIException as e:
            if "RATELIMIT" not in str(e):
                raise
            print(f"⚠️ Rate limit exceeded. Sleeping {backoff_time}s before retrying...")
            time.sleep(backoff_time)
            backoff_time *= 2

    post_list.sort(key=lambda x: x.created_utc, reverse=True)
    print(f"🔍 Found {len(post_list)} recent posts.")

    def process_post(post):
        """Turn one submission object into a flat record dict."""
        post_type = "Text"
        if post.url.endswith((".jpg", ".png", ".gif", ".jpeg")):
            post_type = "Image"
        elif "v.redd.it" in post.url:
            post_type = "Video"

        author = post.author
        account_age_days = None
        if author:
            try:
                created = datetime.datetime.fromtimestamp(
                    author.created_utc, datetime.timezone.utc
                )
                account_age_days = (datetime.datetime.now(datetime.timezone.utc) - created).days
            except Exception:
                # Best effort: the age stays None when the attribute is missing or
                # cannot be loaded. Unlike a bare ``except``, this still lets
                # KeyboardInterrupt / SystemExit propagate.
                account_age_days = None

        return {
            "id": post.id,
            "author": author.name if author else "[deleted]",
            "upvotes": post.score,
            "created_utc": post.created_utc,
            "post_type": post_type,
            "flair": post.link_flair_text,
            "subscribers": subscribers,
            "crosspost_parent_list": getattr(post, "crosspost_parent_list", None),
            "account_age_days": account_age_days
        }

    with ThreadPoolExecutor(max_workers=post_workers) as executor:
        futures = [executor.submit(process_post, post) for post in post_list]

        # Collect in submission order so the newest-first ordering established by
        # the sort above is kept in the returned list.
        for count, future in enumerate(futures, start=1):
            result = future.result()
            if result:
                all_posts.append(result)

            if count % 100 == 0:
                # Every future was already submitted above, so this pause only
                # paces result collection.
                print(f"✅ Processed {count}/{len(post_list)} posts... Sleeping for 1s.")
                time.sleep(1)

    print(f"✅ Done! Fetched {len(all_posts)} recent posts from r/{subreddit_name}.")
    return all_posts


def fetch_comments(post_id):
    """Return a flat list of comment records for the submission ``post_id``.

    "More comments" placeholders are dropped (``replace_more(limit=0)``) before
    the comment tree is flattened. A rate-limit API error is retried with a
    doubling delay that starts at 2 seconds; any other API error is printed and
    an empty list is returned instead.

    Each record has the keys ``comment_id``, ``comment_author``,
    ``comment_upvotes``, ``comment_created_utc`` and ``post_id``.
    """
    delay = 2
    while True:
        try:
            thread = reddit.submission(id=post_id)
            thread.comments.replace_more(limit=0)

            return [
                {
                    "comment_id": item.id,
                    "comment_author": item.author.name if item.author else "[deleted]",
                    "comment_upvotes": item.score,
                    "comment_created_utc": item.created_utc,
                    "post_id": post_id
                }
                for item in thread.comments.list()
            ]

        except praw.exceptions.APIException as err:
            if "RATELIMIT" not in str(err):
                # Not a rate limit: report it and give up on this post.
                print(f"⚠️ Error fetching comments for post {post_id}: {err}")
                return []

            print(f"⚠️ Rate limit on comments. Sleeping {delay}s before retrying...")
            time.sleep(delay)
            delay *= 2


def fetch_comments_parallel(posts):
    """Fetch comments for every post in ``posts`` using a thread pool.

    ``posts`` is an iterable of dicts that each carry an ``"id"`` key. One
    ``fetch_comments`` task is submitted per post on a pool of
    ``comment_workers`` threads, and results are merged in completion order.
    A task that raises is reported with ``print`` and contributes no comments.

    Returns:
        list: all comment records gathered from the successful tasks.
    """
    gathered = []
    with ThreadPoolExecutor(max_workers=comment_workers) as pool:
        # Map each future back to its post id for error reporting.
        id_by_future = {}
        for entry in posts:
            id_by_future[pool.submit(fetch_comments, entry["id"])] = entry["id"]

        finished = 0
        for done in as_completed(id_by_future):
            finished += 1
            post_id = id_by_future[done]
            try:
                gathered.extend(done.result())
            except Exception as exc:
                print(f"⚠️ Exception fetching comments for post {post_id}: {exc}")

            if finished % 50 == 0:
                print(f"🗨️ Processed comments for {finished} posts... Sleeping for 0.5s.")
                time.sleep(0.5)

    return gathered

# Run the full scrape (posts first, then their comments) and time it with
# wall-clock timestamps.
start_time = time.time()

# `posts` and `all_comments` are kept at module level: later cells inspect them.
posts = fetch_recent_posts()
all_comments = fetch_comments_parallel(posts)

end_time = time.time()
elapsed_time = end_time - start_time  # seconds
print(f"⏳ Total Execution Time: {elapsed_time:.2f} seconds.")

🔍 Fetching only posts from the last 1 days in r/funny...
Fetching subreddit metadata (subscribers)...
🔍 Found 29 recent posts.
✅ Done! Fetched 29 recent posts from r/funny.
⏳ Total Execution Time: 25.21 seconds.


In [75]:
len(all_comments)

1771

In [77]:
len(posts)

29