In [1]:
import pandas as pd
import numpy as np
import requests
import json
import csv
import time
from datetime import datetime, timedelta

# 1. Function declarations

This section defines the functions responsible for scraping data with the Pushshift API.

In [2]:
def calcTimeStamps(startDaysAgo: int, endDaysAgo: int) -> '[startStamp, endStamp]':
    """Computes Unix timestamps for two moments lying a given number of days in the past.

    Both offsets are counted back from one single reading of the current
    local time, so equal offsets always give equal timestamps.

    Args:
        startDaysAgo: number of days to go back for the first timestamp.
        endDaysAgo: number of days to go back for the second timestamp.

    Returns:
        A two-element list [startStamp, endStamp] of whole-second timestamps
        (the fractional part is truncated by int()).
    """
    reference = datetime.now()

    # Shift the shared reference moment back by each offset, then convert
    # the resulting datetime to an integer timestamp.
    return [
        int((reference - timedelta(days=daysBack)).timestamp())
        for daysBack in (startDaysAgo, endDaysAgo)
    ]

In [3]:
def getPushShiftData(afterDateStamp: "Start timestamp", beforeDateStamp: "End timestamp", subredditName: "Subreddit's name", timeout: "Seconds to wait for the server" = 30):
    """Queries PushShift's submission search and returns the payload stored under the 'data' key.

    Args:
        afterDateStamp: value sent as the 'after' query parameter.
        beforeDateStamp: value sent as the 'before' query parameter.
        subredditName: value sent as the 'subreddit' query parameter.
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole scrape.

    Returns:
        The value found under 'data' in the decoded JSON response
        (forceGetData treats it as a list of submission dictionaries).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no answer arrives within `timeout` seconds.
        KeyError: if the decoded response has no 'data' key.
    """
    # Let requests build and URL-encode the query string instead of gluing it by hand
    queryParams = {
        'size': 500,
        'after': afterDateStamp,
        'before': beforeDateStamp,
        'subreddit': subredditName,
    }
    response = requests.get('https://api.pushshift.io/reddit/search/submission/',
                            params=queryParams, timeout=timeout)
    # Fail loudly on HTTP errors rather than trying to parse an error page as JSON
    response.raise_for_status()

    # Load and convert: JSON -> dictionary, then keep only the payload
    return response.json()['data']

In [4]:
def collectDataFromSubmission(submissionsList: 'list of json-origin dictionaries', yummyDataDictionary: 'a dictionary the data is being appended to'):
    """Extracts the interesting fields of every submission into yummyDataDictionary.

    Each submission becomes one tuple
    (id, title, url, author, score, created, num_comments, permalink, flair)
    stored under the submission's id, so a repeated id overwrites the earlier
    entry. The dictionary is modified in place and nothing is returned.

    A submission without 'link_flair_text' gets np.nan as its flair; any other
    missing field raises KeyError. 'created_utc' is converted with
    datetime.fromtimestamp, i.e. to a naive datetime in the machine's local time.
    """
    for post in submissionsList:
        postTitle, postUrl = post['title'], post['url']
        # The flair is the only optional field
        postFlair = post.get('link_flair_text', np.nan)
        postAuthor, postId, postScore = post['author'], post['id'], post['score']
        publishedAt = datetime.fromtimestamp(post['created_utc'])
        commentCount, postPermalink = post['num_comments'], post['permalink']

        # Structuring the data: the key doubles as the first element of the record
        yummyDataDictionary[postId] = (
            postId, postTitle, postUrl, postAuthor, postScore,
            publishedAt, commentCount, postPermalink, postFlair,
        )

In [9]:
def forceGetData(afterDateStamp: "Start timestamp", beforeDateStamp: "End timestamp", subredditName: "Subreddit's name", requestDelay: "Seconds to pause between queries" = 0.5):
    """Collects more submissions than a single query returns by issuing repeated queries.

    After every chunk the 'after' bound is moved to the 'created_utc' of the
    chunk's last submission and the query is repeated until an empty chunk
    comes back.

    Args:
        afterDateStamp: lower bound passed to the first query.
        beforeDateStamp: upper bound passed to every query.
        subredditName: subreddit to search.
        requestDelay: pause in seconds before each follow-up query. The default
            of 0.5 s keeps the loop at or below 120 queries per minute.

    Returns:
        A dictionary mapping submission id -> data tuple, as filled in by
        collectDataFromSubmission.
    """
    # Stores all the collected entries
    finalDictionary = {}
    # 'after' bound used by the most recent follow-up query (None until one is made)
    previousAfterStamp = None

    # Get the first chunk of data
    dataChunk = getPushShiftData(afterDateStamp, beforeDateStamp, subredditName)

    # While data is still being received
    while len(dataChunk) > 0:
        # Collect interesting fields
        collectDataFromSubmission(dataChunk, finalDictionary)

        # Narrow the time frame
        newAfterStamp = dataChunk[-1]['created_utc']

        # If the bound did not move forward the next query would ask for the
        # same window again, so stop instead of looping forever.
        if previousAfterStamp is not None and newAfterStamp <= previousAfterStamp:
            print(f"Stopping: time frame did not advance past {datetime.fromtimestamp(previousAfterStamp)}")
            break
        previousAfterStamp = newAfterStamp

        # Cannot exceed 120 queries per minute: wait before every follow-up query
        time.sleep(requestDelay)
        # Query again
        dataChunk = getPushShiftData(newAfterStamp, beforeDateStamp, subredditName)

        # Periodic output for users comfort
        date = datetime.fromtimestamp(newAfterStamp)
        print(f"Current date: {date}")

    return finalDictionary

In [10]:
def writeDataToCSV(dataDictionary: 'dict with all the yummy data', fileName='default.csv'):
    """Dumps the collected submissions into a comma-separated UTF-8 file.

    The file is created (or overwritten) at fileName. Its first row is a fixed
    nine-column header; every value of dataDictionary then becomes one row, in
    the dictionary's iteration order. The keys themselves are not written.
    """
    columnNames = ["Post ID", "Title", "URL", "Author", "Score", "Publish Date", "Total number of comments", "Permalink", "Flair"]

    # newline='' leaves line-ending handling to the csv module
    with open(fileName, 'w', newline='', encoding='utf-8') as outputFile:
        rowWriter = csv.writer(outputFile, delimiter=',')
        # Header first, then one row per stored submission
        rowWriter.writerow(columnNames)
        rowWriter.writerows(dataDictionary[key] for key in dataDictionary)

# 2. Main code

In [11]:
# Timestamps bounding the scraped period:
# datesList[0] is 1 day ago (newer bound), datesList[1] is 240 days ago (older bound)
datesList = calcTimeStamps(startDaysAgo=1, endDaysAgo=240)

In [12]:
# Downloading the data.
# The older stamp (datesList[1]) opens the time frame and the newer one (datesList[0]) closes it.
DATA = forceGetData(
    afterDateStamp=datesList[1],
    beforeDateStamp=datesList[0],
    subredditName='worldnews',
)

8:25:28
Current date: 2020-06-06 11:34:22
Current date: 2020-06-06 13:40:51
Current date: 2020-06-06 15:21:17
Current date: 2020-06-06 17:26:59
Current date: 2020-06-06 19:23:27
Current date: 2020-06-06 21:50:41
Current date: 2020-06-07 00:43:36
Current date: 2020-06-07 03:38:31
Current date: 2020-06-07 06:11:31
Current date: 2020-06-07 07:28:29
Current date: 2020-06-07 09:53:45
Current date: 2020-06-07 13:17:41
Current date: 2020-06-07 15:40:47
Current date: 2020-06-07 18:01:44
Current date: 2020-06-07 20:12:38
Current date: 2020-06-07 22:30:52
Current date: 2020-06-08 00:45:28
Current date: 2020-06-08 03:16:35
Current date: 2020-06-08 05:00:37
Current date: 2020-06-08 06:11:37
Current date: 2020-06-08 07:31:15
Current date: 2020-06-08 08:57:14
Current date: 2020-06-08 10:41:40
Current date: 2020-06-08 12:09:14
Current date: 2020-06-08 13:43:04
Current date: 2020-06-08 15:21:00
Current date: 2020-06-08 16:57:01
Current date: 2020-06-08 18:17:26
Current date: 2020-06-08 19:20:11
Curren

In [13]:
# Number of unique submissions collected (DATA is keyed by submission id)
submissionCount = len(DATA)
print(submissionCount)

295688


In [14]:
# Saving every collected submission to a .csv file in the working directory
writeDataToCSV(dataDictionary=DATA, fileName='last240Days.csv')