In [1]:
import json
from pathlib import Path
import datetime as dt
from pprint import pprint
from collections import defaultdict

import praw
from psaw import PushshiftAPI
import pandas as pd

In [12]:
# Notebook configuration

# Root folder under which all data is stored
DATA = Path('..') / 'data'

# JSON file containing the reddit app credentials
CREDENTIALS = Path('..') / 'credentials.json'

# Subreddits that comments/posts are pulled from
subreddits = [
    'Democrats',
    'The_Donald',
    'Politics',
    'Conservative',
    'Liberal',
]

# Search terms used for each candidate (key: candidate label, value: terms)
candidates = dict(
    JoeBiden=['joe', 'biden'],
    ElizabethWarren=['elizabeth', 'warren'],
    BernieSanders=['bernie', 'sanders'],
    DonaldTrump=['donald', 'trump'],
)

# First and last instant of the time period to get data for
START = dt.datetime(2016, 1, 1)
END = dt.datetime(2019, 12, 13, 23, 59, 59)

In [3]:
def create_dirs(PATH, dirs):
    """
    Creates the required folders inside PATH, skipping those already present.

    Args:
        PATH: Pathlib object pointing to the parent directory. It does not
            have to exist yet: missing parents are created together with
            the first folder.
        dirs: Iterable of folder names to create inside PATH.

    Returns:
        None
    """

    for directory in dirs:
        target = PATH/f'{directory}'

        # Anything already sitting at this path is left untouched
        if target.exists():
            continue

        # parents=True lets this work on a fresh checkout where PATH itself
        # is missing; exist_ok=True tolerates the folder appearing between
        # the check above and this call.
        target.mkdir(parents=True, exist_ok=True)
            
# Top-level folders under DATA; create_dirs skips any that already exist
create_dirs(DATA, dirs=['posts', 'comments', 'test'])

### 1. Set up API access

In [4]:
# Pushshift client; get_posts / get_comments below use it for their searches
api = PushshiftAPI()

# Auth credentials: read the whole file and parse it, so no file handle is
# left open (json.loads accepts the raw bytes directly)
creds = json.loads(CREDENTIALS.read_bytes())

# Authorize app: every key in the credentials file is forwarded as a
# keyword argument to praw.Reddit
reddit = praw.Reddit(**creds)

### 2. Get all posts

In [5]:
def get_posts(FILELOC, subreddit, timeperiod, nchunks=1000,
              filter_params = ['url', 'author', 'title', 'id', 'full_link', 'score', 'subreddit',
                               'subreddit_subscribers'],
              logging=True
             ):
    """
    Gets all the posts from a specific subreddit and writes them to a file.

    The time period is split into `nchunks` consecutive windows and each
    window is queried separately through the module-level Pushshift client
    `api`. Every post is written to FILELOC as one JSON object per line.
    An existing file at FILELOC is overwritten.

    Args:
        FILELOC: Pathlib object specifying full path to file (including name and format)
        subreddit: String specifying the name of the subreddit to get data from (e.g. learnpython).
        timeperiod: Tuple of two datetime objects (start, end) specifying the time period
            for data curation
        nchunks: Number of chunks to divide the timeperiod in (default is good).
        filter_params: List containing params to return for each post.
        logging: If True, print a progress line for every chunk.

    Returns:
        None. The posts are written to FILELOC.

    Raises:
        AssertionError: If the end of `timeperiod` is not after its start.
    """

    # Work in whole epoch seconds so the bounds sent to the API are integers
    start, end = [int(t.timestamp()) for t in timeperiod]
    difference = end - start
    assert difference > 0, "Please enter the timeperiod in the correct format"
    increment = difference/nchunks

    # nchunks + 1 boundaries; window i spans bounds[i] .. bounds[i+1], so the
    # windows tile the period exactly once and the last one stops at `end`.
    # NOTE(review): if the API treats both bounds as exclusive, posts stamped
    # exactly on a boundary second could be skipped - confirm.
    bounds = [int(start + i*increment) for i in range(nchunks)] + [end]

    with open(FILELOC, 'w') as f:
        for i in range(nchunks):
            if logging:
                print(f"Getting chunk: {i+1}/{nchunks}")

            # A copy of filter_params is handed over so the shared default
            # list can never be modified by the callee.
            for p in api.search_submissions(after=bounds[i],
                                            before=bounds[i+1],
                                            subreddit=subreddit,
                                            filter=list(filter_params),
                                            limit=1e6):
                f.write(json.dumps(p.d_) + '\n')

In [8]:
# Make sure the raw/processed folders for posts are in place
create_dirs(DATA/'posts', ['raw', 'processed'])

# One output file per subreddit; the loop's own subreddit is the one queried,
# so each file holds the posts of the subreddit it is named after
for subreddit in subreddits:
    print(f"Getting posts for subreddit: {subreddit}")
    get_posts(DATA/f'posts/raw/{subreddit}.json', subreddit, (START,END))

### 3. Get comments mentioning candidates

In [15]:
def get_comments(FILELOC, query_term, subreddits, timeperiod, nchunks=1000,
                 filter_params = ['url', 'author', 'body', 'id', 'full_link', 'score', 'subreddit'],
                 logging=True):
    """
    Gets all comments containing a specific term from the subreddits specified
    and writes them to a file.

    The time period is split into `nchunks` consecutive windows. For every
    window each subreddit is queried separately through the module-level
    Pushshift client `api`. Every comment is written to FILELOC as one JSON
    object per line. An existing file at FILELOC is overwritten.

    Args:
        FILELOC: Pathlib object specifying full path to file (including name and format)
        query_term: String passed to the API as the search query (`q`).
        subreddits: List containing the names of the subreddits from which comments will be scraped.
        timeperiod: Tuple of two datetime objects (start, end) specifying the time period
            for data curation
        nchunks: Number of chunks to divide the timeperiod in (default is good).
        filter_params: List containing params to return for each comment.
        logging: If True, print a progress line for every chunk.

    Returns:
        None. The comments are written to FILELOC.

    Raises:
        AssertionError: If the end of `timeperiod` is not after its start.
    """

    # Work in whole epoch seconds so the bounds sent to the API are integers
    start, end = [int(t.timestamp()) for t in timeperiod]
    difference = end - start
    assert difference > 0, "Please enter the timeperiod in the correct format"
    increment = difference/nchunks

    # nchunks + 1 boundaries; window i spans bounds[i] .. bounds[i+1], so the
    # windows tile the period exactly once and the last one stops at `end`.
    # NOTE(review): if the API treats both bounds as exclusive, comments
    # stamped exactly on a boundary second could be skipped - confirm.
    bounds = [int(start + i*increment) for i in range(nchunks)] + [end]

    with open(FILELOC, 'w') as f:
        for i in range(nchunks):
            if logging:
                print(f"Getting chunk: {i+1}/{nchunks}")

            for subreddit in subreddits:
                # Results are streamed straight to disk rather than collected
                # in memory. A copy of filter_params is handed over so the
                # shared default list can never be modified by the callee.
                for p in api.search_comments(q=query_term,
                                             after=bounds[i],
                                             before=bounds[i+1],
                                             subreddit=subreddit,
                                             filter=list(filter_params),
                                             limit=1e6):
                    f.write(json.dumps(p.d_) + '\n')

In [None]:
# Make sure the raw/processed folders for comments are in place
create_dirs(DATA/'comments', dirs=['raw', 'processed'])

# Every search term gets its own output file, named after the term
for candidate in candidates:
    print(f"Getting comments for candidate: {candidate}")

    candidate_terms = candidates[candidate]
    for term in candidate_terms:
        get_comments(DATA/'comments'/'raw'/f'{term}.json', term, subreddits, (START, END))