In [None]:
import config
import requests
import praw
import pandas as pd
import pprint

In [None]:
# User agent string handed to praw.Reddit when the client is created.
user_agent = "python:ministryofedCSC_redditscraper:v0.0.1 (by rahm)"

In [None]:
# Reddit API client shared by every function in this notebook.
# Credentials are read from the local `config` module.
reddit = praw.Reddit(
    client_id=config.client_id,
    client_secret=config.client_secret,
    user_agent=user_agent,
)

In [None]:
def extractSubmissionData(rs):
    """
    Pull the fields of interest out of a single reddit submission.

    rs: a reddit submission object
    return: list of attribute values, in this order:
            id, title, subreddit name, author name, score, upvote ratio,
            number of comments, selftext, url, created_utc
    """
    # a submission without an author object is reported as "[deleted]"
    if rs.author is None:
        author_name = "[deleted]"
    else:
        author_name = rs.author.name

    return [
        rs.id,
        rs.title,
        rs.subreddit.display_name,
        author_name,
        rs.score,
        rs.upvote_ratio,
        rs.num_comments,
        rs.selftext,
        rs.url,
        rs.created_utc,
    ]

In [None]:
# Column names of the submissions table; the order matches the list
# returned by extractSubmissionData.
submissionsCol = [
    "id",
    "title",
    "subreddit",
    "author",
    "score",
    "upvote_ratio",
    "num_comments",
    "selftext",
    "url",
    "created_utc",
]
# Empty frame that only carries the expected columns.
totalSubmissionsData = pd.DataFrame([], columns=submissionsCol)


### Get submissions from Reddit

In [None]:
# Query parameters: subreddit to read, listing order and time window.
subToSearch, sort, time_filter = "ontario", "new", "all"

# Upper bound on the number of reddit posts to collect.
limit = 50

# Search expression; an empty string means no search term is set.
# Example of a restricted search:  query = 'title:education'
query = ""


#pd.DataFrame([], columns=submissionsCol)

def getSubmissions(subToSearch, query='', sort='new', time_filter='all', numSubmissions=30):
    """
    Fetch submissions/posts (not comments) from one subreddit according to the
    input parameters, and return the attributes of each submission.

    subToSearch (string): the subreddit to get submissions from
    query (string): any properly formatted reddit query, see: https://www.reddit.com/wiki/search
                    When empty or None the subreddit listing chosen by `sort` is read
                    instead of running a search.
    sort (string): a set of discrete filters that determines which submissions to get. Options
                   change depending on if a query is being set or not.
                   if query is empty/None: {'new', 'hot', 'top', 'rising'}; any other value
                                           falls back to 'new'
                   if query is entered: the value is passed unchanged to the search call;
                                        the PRAW documentation lists
                                        {'relevance', 'hot', 'top', 'new', 'comments'}
                   Default is 'new'

    time_filter (string): discrete set of filters that limit the time period submissions
                          will be retrieved from.
                          options: {'all', 'day', 'hour', 'month', 'week', 'year'}
                          Applied to searches and to the 'top' listing; the 'new', 'hot'
                          and 'rising' listing calls do not take it.
    numSubmissions (int): the number of submissions you want to retrieve, capped at 1000.
                          None requests the maximum of 1000. Note, function will try
                          to return numSubmissions amount of submissions, but may not be possible
                          due to submission filtering from input parameters.

    returns (list of lists): one inner list per submission, as built by extractSubmissionData
    """

    # redditMaxLimit is the total number of submissions that a single listing can retrieve;
    # its value is 1000 according to the documentation.
    # this shouldnt be changed unless the reddit api itself is updated
    redditMaxLimit = 1000

    # throttled limit
    if numSubmissions is None:
        thrLimit = redditMaxLimit
    else:
        thrLimit = min(numSubmissions, redditMaxLimit)

    subreddit = reddit.subreddit(subToSearch)

    if not query:
        if sort == 'top':
            # 'top' is the only plain listing that is ranked within a time window
            submissions = subreddit.top(time_filter=time_filter, limit=thrLimit)
        elif sort == 'hot':
            submissions = subreddit.hot(limit=thrLimit)
        elif sort == 'rising':
            submissions = subreddit.rising(limit=thrLimit)
        else:
            # 'new', and any unrecognised value, sorts by new
            submissions = subreddit.new(limit=thrLimit)
    else:
        submissions = subreddit.search(query, sort=sort, time_filter=time_filter,
                                       limit=thrLimit)

    submissionsData = []
    # progress output: id, title and running count of every submission processed
    for count, sub in enumerate(submissions, start=1):
        print(sub.id)
        print(sub.title)
        submissionsData.append(extractSubmissionData(sub))
        print(count)

    return submissionsData
        

### Loop over all the subreddits in list and collect data from their submissions

In [None]:
# Subreddits to scrape; the rows from each one are appended to a single list.
subreddits = ['ontario', 'education']
submissions = []
for sr in subreddits:
    submissions.extend(
        getSubmissions(
            sr,
            query='education',
            sort='new',
            time_filter='all',
            numSubmissions=5,
        )
    )

In [None]:
# Turn the collected submission rows into a table with named columns.
totalSubmissionsData = pd.DataFrame(data=submissions, columns=submissionsCol)

In [None]:


# Report how many submissions were collected and preview the first rows.
print(f"total number of submissions collected: {len(totalSubmissionsData)}")
totalSubmissionsData.head()

In [None]:
# Column names of the comment table.
commentsCol = [
    "id",
    "subreddit",
    "submission_id",
    "author",
    "body",
    "score",
    "conversation_id",
    "created_utc",
]
# Column names of the conversation table (one row per comment).
conversationCol = [
    "id",
    "top_parent_id",
    "direct_parent_id",
    "child_comment_id",
]


In [None]:
from collections import deque

def getReplyData(cf, top_level_id, convo_id):
    """
    Flatten every reply in a comment forest into comment rows and conversation rows.

    cf: CommentForest object holding the replies to walk (e.g. `comment.replies`)
    top_level_id: id of the top level comment; copied into every conversation row
    convo_id: identifier of the conversation; copied into every row
    return: tuple of list of lists: (list_a, list_b), where list_a
            is a list of commentData rows
              [id, subreddit, submission id, author, body, score, convo_id, created_utc]
            and list_b is a list of convoData rows
              [convo_id, top_level_id, parent_id, id].
            Rows are in depth-first order: each comment is directly followed
            by its own replies.
    """
    # Per the PRAW documentation this resolves the 'MoreComments' placeholders of the
    # whole forest, nested levels included, so the replies of every comment reached
    # below are already loaded. Calling refresh() on each comment would cost one extra
    # request per comment and can raise for comments no longer in the tree.
    cf.replace_more(limit=None)
    commentData = []
    convoData = []

    # Explicit stack instead of recursion; children are pushed in reverse so they
    # are popped in their original order.
    pending = list(cf)[::-1]
    while pending:
        com = pending.pop()

        # in case post is deleted, author will be Nonetype
        author_name = "[deleted]" if com.author is None else com.author.name

        commentData.append([com.id, com.subreddit.display_name, com.submission.id,
                            author_name, com.body, com.score,
                            convo_id, com.created_utc])

        convoData.append([convo_id, top_level_id, com.parent_id, com.id])

        pending.extend(reversed(list(com.replies)))

    return (commentData, convoData)
    
def getCommentsFromSubmission(subm_id):
    """
    Collect every comment of one submission together with its conversation structure.

    subm_id: submission id
    Return: tuple of list of lists: (commentData, convoData)
            commentData rows:
              [id, subreddit, submission id, author, body, score, convo_id, created_utc]
            convoData rows:
              [convo_id, top_level_id, parent_id, id]; top level comments carry
              None for both top_level_id and parent_id.
            Each top level comment starts its own conversation. convo_id counts from 0
            inside every submission, so it is only unique within one submission.
    """
    commentData = []
    convoData = []

    subm = reddit.submission(id=subm_id)
    print(subm.title)

    # get rid of 'MoreComments' instances; per the PRAW documentation this loads the
    # complete comment tree, so no per-comment refresh() is needed afterwards
    subm.comments.replace_more(limit=None)

    for convo_id, com in enumerate(subm.comments):
        # in case post is deleted, author will be Nonetype
        author_name = "[deleted]" if com.author is None else com.author.name

        commentData.append([com.id, com.subreddit.display_name, subm_id,
                            author_name, com.body, com.score,
                            convo_id, com.created_utc])

        convoData.append([convo_id, None, None, com.id])

        # every reply below this comment is tagged with the comment's `name`
        # as its top level id
        replyCommentData, replyConvoData = getReplyData(com.replies, com.name, convo_id)

        commentData += replyCommentData
        convoData += replyConvoData

    return (commentData, convoData)

### Iterate over all the previously collected submissions and get their comments

In [None]:
commentData = []
conversationData = []
# Only the submission id is needed, so walk the "id" column directly instead of
# building a Series per row with iterrows().
for subm_id in totalSubmissionsData["id"]:
    comments, convo = getCommentsFromSubmission(subm_id)
    commentData += comments
    conversationData += convo

### Form the actual data frame

In [None]:
# One table per row list, labelled with the matching column names.
commentsDataDf = pd.DataFrame(data=commentData, columns=commentsCol)
conversationDataDf = pd.DataFrame(data=conversationData, columns=conversationCol)

In [None]:
# Show only the first few raw rows; the full list would flood the cell output.
commentData[:5]

In [None]:
# Number of rows in the comment table and in the conversation table.
print(len(commentsDataDf))
print(len(conversationDataDf))

In [None]:
# Preview the first rows of the comment table.
commentsDataDf.head()