Module imports

In [10]:
import pandas as pd
import numpy as np
import requests
import json
import csv
import time
from datetime import datetime, timedelta

Function declarations

In [11]:
def calcTimeStamps(startDaysAgo: int, endDaysAgo: int) -> '[startStamp, endStamp]':
    """Build Unix timestamps for two moments expressed as "days before now".

    Both offsets are subtracted from one shared ``datetime.now()`` reading and
    each resulting datetime is truncated to whole seconds.

    Returns a two-element list: the stamp for ``startDaysAgo`` first, followed
    by the stamp for ``endDaysAgo``.
    """
    reference = datetime.now()

    # One pass over both offsets: shift back in time, then convert to an int stamp
    return [
        int((reference - timedelta(days=daysBack)).timestamp())
        for daysBack in (startDaysAgo, endDaysAgo)
    ]

In [12]:
def getPushShiftData(afterDateStamp: "Start timestamp", beforeDateStamp: "End timestamp", subredditName: "Subreddit's name",
                     size: "Max submissions requested per query" = 500, timeout: "Seconds to wait for the server" = 30):
    """Returns the list of submissions received from PushShift's API for one query.

    Parameters:
        afterDateStamp: only submissions created after this timestamp are requested
        beforeDateStamp: only submissions created before this timestamp are requested
        subredditName: name of the subreddit to search in
        size: value sent as the ``size`` query parameter (defaults to 500)
        timeout: passed to ``requests.get`` so a stalled connection cannot hang forever

    Returns:
        The value stored under the ``'data'`` key of the decoded JSON response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status
        requests.Timeout: if the server does not answer within ``timeout`` seconds
    """
    # Let requests build (and URL-encode) the query string instead of concatenating it by hand
    queryParams = {
        'size': size,
        'after': afterDateStamp,
        'before': beforeDateStamp,
        'subreddit': subredditName,
    }
    response = requests.get('https://api.pushshift.io/reddit/search/submission/',
                            params=queryParams, timeout=timeout)

    # Fail loudly on an error status rather than trying to parse an error page as JSON
    response.raise_for_status()

    # Load and convert: JSON -> dictionary
    data = json.loads(response.text)

    return data['data']

In [13]:
def collectDataFromSubmission(submissionsList: 'list of json-origin dictionaries', yummyDataDictionary: 'a dictionary the data is being appended to'):
    """Copies selected fields of each submission into yummyDataDictionary.

    Every submission becomes one tuple stored under its ``id``:
    (id, title, url, author, score, created, num_comments, permalink, flair).
    A submission without a ``link_flair_text`` key gets ``np.nan`` as its flair.
    The dictionary is modified in place; nothing is returned.
    """
    for post in submissionsList:
        headline, link = post['title'], post['url']
        # Flair is the only optional field; fall back to NaN when it is absent
        flairText = post.get('link_flair_text', np.nan)
        poster, postId, points = post['author'], post['id'], post['score']
        postedAt = datetime.fromtimestamp(post['created_utc'])
        commentCount, relativeLink = post['num_comments'], post['permalink']

        # A later submission with the same id overwrites the earlier entry
        yummyDataDictionary[postId] = (
            postId, headline, link, poster, points,
            postedAt, commentCount, relativeLink, flairText,
        )

In [20]:
def forceGetData(afterDateStamp: "Start timestamp", beforeDateStamp: "End timestamp", subredditName: "Subreddit's name",
                 delaySeconds: "Pause before each follow-up query" = 0.25):
    """Collects more submissions than a single Pushshift query returns by issuing repeated queries.

    After every non-empty chunk the ``after`` bound is moved to the
    ``created_utc`` of the chunk's last submission and the query is repeated,
    until an empty chunk comes back.

    Parameters:
        afterDateStamp: lower time bound of the first query
        beforeDateStamp: upper time bound of every query
        subredditName: subreddit to search in
        delaySeconds: seconds slept before each follow-up query (default 0.25).
            NOTE(review): the API limit is stated here as 120 queries per
            minute, which needs at least 0.5 s between requests; the default
            relies on network latency for the remainder.

    Returns:
        Dictionary mapping submission id -> tuple of collected fields.
    """
    # Stores all the collected entries
    finalDictionary = {}
    # Last stamp used to narrow the window; None until the first chunk is processed
    previousStamp = None

    # Get the first chunk of data
    dataChunk = getPushShiftData(afterDateStamp, beforeDateStamp, subredditName)

    # While data is still being received
    while len(dataChunk) > 0:
        # Collect interesting fields
        collectDataFromSubmission(dataChunk, finalDictionary)

        # Narrow the time frame
        newAfterStamp = dataChunk[-1]['created_utc']

        # Periodic output for the user's comfort
        print(f"Current date: {datetime.fromtimestamp(newAfterStamp)}")

        # If the window did not move forward, querying again would return the
        # same page forever, so stop instead of looping indefinitely
        if previousStamp is not None and newAfterStamp <= previousStamp:
            break
        previousStamp = newAfterStamp

        # Throttle before (not after) the next request so the pause always
        # separates two real queries
        time.sleep(delaySeconds)

        # Query again
        dataChunk = getPushShiftData(newAfterStamp, beforeDateStamp, subredditName)

    return finalDictionary

In [21]:
def writeDataToCSV(dataDictionary: 'dict with all the yummy data', fileName='default.csv'):
    """Dumps the collected submissions into a comma-separated file.

    The file is (re)created in UTF-8, starts with a fixed header row and then
    holds one row per dictionary value, in the dictionary's iteration order.
    """
    columnNames = ["Post ID", "Title", "URL", "Author", "Score", "Publish Date", "Total number of comments", "Permalink", "Flair"]

    # newline='' leaves line-ending handling to the csv module
    with open(fileName, 'w', newline='', encoding='utf-8') as outFile:
        rowWriter = csv.writer(outFile, delimiter=',')
        rowWriter.writerow(columnNames)
        # Keys are not written separately; each stored value is already a full row
        rowWriter.writerows(dataDictionary.values())

Main code

In [22]:
# Timestamps for 1 day ago (index 0) and 91 days ago (index 1)
datesList = calcTimeStamps(startDaysAgo=1, endDaysAgo=91)

In [23]:
# Download r/worldnews submissions; datesList[1] (91 days ago) is the lower
# time bound and datesList[0] (1 day ago) the upper one
DATA = forceGetData(
    afterDateStamp=datesList[1],
    beforeDateStamp=datesList[0],
    subredditName='worldnews',
)

Current date: 2019-12-27 02:47:28
Current date: 2019-12-27 18:32:06
Current date: 2019-12-28 13:57:39
Current date: 2019-12-29 09:14:53
Current date: 2019-12-30 03:16:22
Current date: 2019-12-30 19:07:01
Current date: 2019-12-31 13:12:29
Current date: 2020-01-01 09:49:08
Current date: 2020-01-02 07:00:56
Current date: 2020-01-02 21:22:39
Current date: 2020-01-03 09:51:17
Current date: 2020-01-03 19:55:18
Current date: 2020-01-04 10:05:32
Current date: 2020-01-04 22:42:01
Current date: 2020-01-05 10:47:55
Current date: 2020-01-05 19:27:48
Current date: 2020-01-06 04:28:58
Current date: 2020-01-06 17:23:27
Current date: 2020-01-07 04:13:05
Current date: 2020-01-07 15:29:33
Current date: 2020-01-08 00:37:41
Current date: 2020-01-08 04:42:47
Current date: 2020-01-08 11:12:17
Current date: 2020-01-08 18:19:38
Current date: 2020-01-09 02:25:43
Current date: 2020-01-09 15:33:17
Current date: 2020-01-09 23:00:14
Current date: 2020-01-10 11:06:21
Current date: 2020-01-10 20:58:27
Current date: 

In [24]:
# Number of collected submissions (DATA holds one entry per submission id)
print(len(DATA))

106599


In [25]:
# Persist the collected submissions as a CSV file
writeDataToCSV(DATA, fileName='last3Months.csv')