In [1]:
import pandas as pd
import numpy as np
import requests
import json
import time
from datetime import datetime, timezone, timedelta

In [2]:
def _keep_post(child, avoid_distinguished):
    """Return True when a listing child should be kept.

    Posts hosted on i.redd.it / v.redd.it and reddit gallery links are always
    dropped. When ``avoid_distinguished`` is true, stickied, archived and
    distinguished posts are dropped as well.
    """
    info = child['data']
    if avoid_distinguished and (info['stickied'] or info['archived'] or info['distinguished']):
        return False
    if info['domain'].startswith(('i.redd.it', 'v.redd.it')):
        return False
    return not info['url'].startswith('https://www.reddit.com/gallery')


def get_recent_posts(base_time, num_posts, avoid_distinguished=True, attached=None, max_retries=3):
    """Page through the reddit front-page listing and collect filtered posts.

    Parameters
    ----------
    base_time : datetime
        Its Unix timestamp is sent as the ``before`` query parameter.
    num_posts : int or None
        Maximum number of posts to return; ``None`` means "no cap" (paging
        then stops only when the listing has no further page).
    avoid_distinguished : bool
        Also drop stickied, archived and distinguished posts.
    attached : list or None
        Posts to start from. The list is copied, so the caller's list is
        never modified.
    max_retries : int
        Number of consecutive failed requests that are retried before giving
        up and returning what has been collected so far.

    Returns
    -------
    list
        At most ``num_posts`` listing children (dicts with a ``'data'`` key).
    """
    # Copy so that extending `posts` does not mutate the caller's list.
    posts = list(attached) if attached else []
    # NOTE(review): reddit's listing docs describe `before`/`after` as post
    # fullnames, not timestamps -- confirm that `before` has any effect here.
    params = {'t': 'timestamp', 'before': int(base_time.timestamp())}
    after = None
    failures = 0  # consecutive non-200 responses

    while num_posts is None or len(posts) < num_posts:
        if after is not None:
            params['after'] = after
        res = requests.get('https://reddit.com/.json', params=params, headers={'User-agent': 'Dodge Bot 0.1'})
        if res.status_code != 200:
            print('API request failed.')
            failures += 1
            if failures > max_retries:
                break
            time.sleep(2)
            continue
        failures = 0

        listing = res.json()['data']
        posts.extend(child for child in listing['children'] if _keep_post(child, avoid_distinguished))
        after = listing['after']
        time.sleep(1)  # stay polite between successive page requests
        if after is None:
            # No further page available.
            break
    return posts[:num_posts]

In [3]:
def get_posts_by_day(base_time, num_days, avoid_distinguished=True, max_posts_per_day=None):
    """Collect posts for ``num_days`` consecutive days, walking backwards from ``base_time``.

    For each day the end of that calendar day (next midnight, UTC) is handed to
    ``get_recent_posts`` as its ``base_time``.

    Parameters
    ----------
    base_time : datetime
        The most recent day to cover; only its year/month/day are used.
    num_days : int
        How many days to cover, counting back from ``base_time``.
    avoid_distinguished : bool
        Forwarded to ``get_recent_posts``.
    max_posts_per_day : int or None
        Per-day cap forwarded as ``num_posts``. ``None`` means "no cap".

    Returns
    -------
    list
        All posts returned for every day, concatenated in day order.
    """
    import sys  # local import: only needed to express "no cap" as an int

    # get_recent_posts compares and slices with num_posts, so it must be an
    # integer; passing None through would raise a TypeError.
    per_day_limit = sys.maxsize if max_posts_per_day is None else max_posts_per_day

    all_posts = []
    for offset in range(num_days):
        current_day = base_time - timedelta(days=offset)
        day_start = datetime(current_day.year, current_day.month, current_day.day, tzinfo=timezone.utc)
        day_end = day_start + timedelta(days=1)

        posts = get_recent_posts(day_end, num_posts=per_day_limit, avoid_distinguished=avoid_distinguished)
        if posts:
            all_posts.extend(posts)

    return all_posts

# Reference moment for the crawl and the number of days to walk back from it.
base_time = datetime(year=2024, month=5, day=18, hour=10, minute=0, second=0)
num_days = 10

# Fetch up to 7000 posts for each of the covered days.
all_posts = get_posts_by_day(
    base_time=base_time,
    num_days=num_days,
    max_posts_per_day=7000,
)

print(len(all_posts))


API request failed.
API request failed.
API request failed.
API request failed.
API request failed.
API request failed.
API request failed.
API request failed.
API request failed.
API request failed.
409


In [4]:
def get_subreddit_subscribers(subreddit, retry_delay=3, max_retries=None):
    """Return the subscriber count reported by ``/r/<subreddit>/about.json``.

    Parameters
    ----------
    subreddit : str
        Subreddit name without the ``r/`` prefix.
    retry_delay : int or float
        Seconds to wait after a non-200 response before trying again.
    max_retries : int or None
        Number of failed requests that are retried. ``None`` (the default)
        retries indefinitely.

    Returns
    -------
    The ``data.subscribers`` value from the response, or ``None`` when
    ``max_retries`` is set and has been exhausted.
    """
    url = f'https://www.reddit.com/r/{subreddit}/about.json'
    headers = {'User-agent': 'Dodge Bot 0.1'}
    failures = 0

    while True:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()['data']['subscribers']

        failures += 1
        if max_retries is not None and failures > max_retries:
            print(f"Giving up on subreddit {subreddit} after {failures} failed attempts.")
            return None

        # Report the delay that is actually used, so the log matches the wait.
        print(f"Failed to retrieve data for subreddit {subreddit}. Retrying in {retry_delay} seconds...")
        time.sleep(retry_delay)

In [5]:
# Subreddit name for every post, in post order. Duplicates are kept so the
# list stays aligned with all_posts.
subreddit_names = []
for post in all_posts:
    subreddit_name = post['data']['subreddit_name_prefixed'].split('r/')[-1]
    subreddit_names.append(subreddit_name)

# Many posts share a subreddit, so look each one up only once and reuse the
# result instead of issuing one HTTP request per post.
subscriber_cache = {}
subscribers = []
for subreddit_name in subreddit_names:
    if subreddit_name not in subscriber_cache:
        subscriber_cache[subreddit_name] = get_subreddit_subscribers(subreddit_name)
    subscribers_count = subscriber_cache[subreddit_name]
    subscribers.append(subscribers_count)


Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 seconds...
Failed to retrieve data for subreddit Money. Retrying in 0.1 sec

In [6]:
# Fields copied verbatim from each post's payload, in output column order.
post_fields = (
    'created_utc',
    'title',
    'selftext',
    'subreddit',
    'num_comments',
    'domain',
    'ups',
    'url',
)

# One record per post, paired with the subscriber count looked up for it.
data = []
for post, subscriber_count in zip(all_posts, subscribers):
    payload = post['data']
    post_data = {field: payload[field] for field in post_fields}
    post_data['base_time'] = base_time
    post_data['subscribers'] = subscriber_count
    data.append(post_data)

df = pd.DataFrame(data)

df

Unnamed: 0,created_utc,title,selftext,subreddit,num_comments,domain,ups,url,base_time,subscribers
0,1.715955e+09,"Grandpa passed away and left me 167,000 USD on...",She wants it for herself. She told me that dir...,Money,6315,self.Money,12672,https://www.reddit.com/r/Money/comments/1cu68c...,2024-05-18 10:00:43.672513+00:00,517657
1,1.715966e+09,WIBTA for divorcing my wife because she couldn...,I 28M and my Wife 29F were recently visited Ca...,AITAH,5995,self.AITAH,10621,https://www.reddit.com/r/AITAH/comments/1cuasu...,2024-05-18 10:00:43.672513+00:00,1635997
2,1.715987e+09,Your nudes just got leaked. What’s your response?,,AskReddit,4706,self.AskReddit,3767,https://www.reddit.com/r/AskReddit/comments/1c...,2024-05-18 10:00:43.672513+00:00,46394934
3,1.715955e+09,"Rudy Giuliani is missing, last seen in Palm Be...",,law,2803,bocanewsnow.com,24649,https://bocanewsnow.com/2024/05/16/rudy-giulia...,2024-05-18 10:00:43.672513+00:00,291203
4,1.715958e+09,College Football 25 Reveal Trailer,,CFB,1971,youtu.be,4800,https://youtu.be/W1QDaXkufCo?si=4et05QoLJRnE9USU,2024-05-18 10:00:43.672513+00:00,3323271
...,...,...,...,...,...,...,...,...,...,...
404,1.716005e+09,Must read for 24 aspirants (and repeaters),This is gonna be long but I feel it might help...,CATpreparation,64,self.CATpreparation,103,https://www.reddit.com/r/CATpreparation/commen...,2024-05-18 10:00:43.672513+00:00,74008
405,1.716010e+09,Indian Students In Kyrgyzstan Asked To Stay In...,,GeopoliticsIndia,20,ndtv.com,81,https://www.ndtv.com/indians-abroad/indian-stu...,2024-05-18 10:00:43.672513+00:00,18571
406,1.715960e+09,Men with dark triad traits accurately detect s...,,psychology,143,psypost.org,965,https://www.psypost.org/men-with-dark-triad-tr...,2024-05-18 10:00:43.672513+00:00,2910627
407,1.715994e+09,i'm gonna die single atp,throwaway because friends know my main\n\nearl...,SGExams,71,self.SGExams,164,https://www.reddit.com/r/SGExams/comments/1cul...,2024-05-18 10:00:43.672513+00:00,234462


In [7]:
from pymongo import MongoClient

# Connect to the local MongoDB instance and select the target collection.
client = MongoClient(host='localhost', port=27017)
db = client.get_database('data_mining')
collection = db.get_collection('reddit_data')

# One document per DataFrame row.
records = df.to_dict('records')

collection.insert_many(records)

InsertManyResult([ObjectId('664887edec26dc4d1e32ccb7'), ObjectId('664887edec26dc4d1e32ccb8'), ObjectId('664887edec26dc4d1e32ccb9'), ObjectId('664887edec26dc4d1e32ccba'), ObjectId('664887edec26dc4d1e32ccbb'), ObjectId('664887edec26dc4d1e32ccbc'), ObjectId('664887edec26dc4d1e32ccbd'), ObjectId('664887edec26dc4d1e32ccbe'), ObjectId('664887edec26dc4d1e32ccbf'), ObjectId('664887edec26dc4d1e32ccc0'), ObjectId('664887edec26dc4d1e32ccc1'), ObjectId('664887edec26dc4d1e32ccc2'), ObjectId('664887edec26dc4d1e32ccc3'), ObjectId('664887edec26dc4d1e32ccc4'), ObjectId('664887edec26dc4d1e32ccc5'), ObjectId('664887edec26dc4d1e32ccc6'), ObjectId('664887edec26dc4d1e32ccc7'), ObjectId('664887edec26dc4d1e32ccc8'), ObjectId('664887edec26dc4d1e32ccc9'), ObjectId('664887edec26dc4d1e32ccca'), ObjectId('664887edec26dc4d1e32cccb'), ObjectId('664887edec26dc4d1e32cccc'), ObjectId('664887edec26dc4d1e32cccd'), ObjectId('664887edec26dc4d1e32ccce'), ObjectId('664887edec26dc4d1e32cccf'), ObjectId('664887edec26dc4d1e32cc