In [1]:
import pandas as pd
import praw
import numpy as np
import re #RegEx : Regular expression
import os
from dotenv import load_dotenv
from datetime import datetime, timedelta
from typing import Optional, Literal, Tuple, List

In [2]:
# Load environment variables from .env file
load_dotenv()

client_id = os.getenv('client_id')
client_secret = os.getenv('client_secret')
user_agent = os.getenv('user_agent')

# Reddit

In [18]:
class RedditScraper:
    def __init__(self):
        self.reddit = praw.Reddit(
            client_id=os.getenv('client_id'),
            client_secret=os.getenv('client_secret'),
            user_agent=os.getenv('user_agent'),
        )

    def get_hot_posts(self, subreddit: str, count: int, start_date: datetime, end_date: datetime) -> List[dict]:
        """
        Gets the top count hot posts from a certain subreddit within a specified date range
        """
        posts = []
        current_date = start_date
        while current_date <= end_date:
            hot_posts = self.reddit.subreddit(subreddit).top(time_filter="day", limit=count, params={"after": int(current_date.timestamp())})
            posts.extend([
                {
                    "title": post.title,
                    "url": post.url,
                    "audio_url": None,
                    "filename": str(hash(post.url))[:15] + ".jpg" if ".jpg" in post.url else str(hash(post.url))[:15] + ".png",
                    "score": post.score,
                    "source": subreddit,
                    "Timestamp": post.created_utc,
                    "type": "image",
                    "last_updated": datetime.now(),
                    "added": False,
                    "approved": None,
                }
                for post in hot_posts
                if (".jpg" in post.url) or (".png" in post.url)
            ])
            current_date += timedelta(days=1)
        return posts

    def convert_posts_to_dataframe(self, posts: List[dict]) -> pd.DataFrame:
        """
        Converts a list of posts to a dataframe
        'title', 'url', 'filename', 'score', 'source', 'type'
        """
        columns = [
            "title",
            "url",
            "audio_url",
            "filename",
            "score",
            "source",
            "Timestamp",
            "type",
            "last_updated",
            "added",
            "approved",
        ]
        dataframe = pd.DataFrame(posts, columns=columns)
        return dataframe

    def scrape_reddit_for_top_posts(
        self, list_of_subreddits_to_search: None, start_date: datetime, end_date: datetime
    ) -> pd.DataFrame:
        """
        Scrapes top posts from specified subreddits within a specified date range
        """
        all_posts = []
        for subreddit in list_of_subreddits_to_search:
            posts = self.get_hot_posts(subreddit["subreddit"], subreddit["count"], start_date, end_date)
            all_posts.extend(posts)

        # return all of the posts within the date range as a dataframe
        return self.convert_posts_to_dataframe(all_posts)


In [21]:
rs = RedditScraper()
post_df = rs.scrape_reddit_for_top_posts(list_of_subreddits_to_search= [
                {"subreddit": "bitcoin", "count": 5000},
                {"subreddit": "ethereum", "count": 1000},
                {"subreddit": "cryptocurrency", "count": 1000}
            ], start_date=datetime(2018, 1, 1), end_date=datetime(2022, 1, 3))


TooManyRequests: received 429 HTTP response

In [20]:
post_df

Unnamed: 0,title,url,audio_url,filename,score,source,Timestamp,type,last_updated,added,approved
