In [1]:
# Collect relevant content through the Reddit API.
import json
import praw
# PRAW documentation:
#  https://praw.readthedocs.io/en/stable/code_overview/reddit_instance.html

In [2]:
# IMPORTANT: enter valid access credentials in the config file (config_reddit.py);
# follow the instructions in reddit_credentials_verify.ipynb
import config_reddit

In [3]:
# Open a Reddit API session with the app credentials from the config module.
reddit = praw.Reddit(
    client_id=config_reddit.app_id,
    client_secret=config_reddit.app_secret,
    user_agent=f"Exploration script by /u/{config_reddit.user_name}",
)
# Display the read-only flag to confirm the access mode (expected output: True).
reddit.read_only

True

In [4]:
# Subreddit to analyze.
# MODIFY this to the community you want to study.
#
# The value is the last path segment of the subreddit URL; for example,
#  https://www.reddit.com/r/ebikes/   ->   'ebikes'
query_subreddit = 'greysanatomy'

In [5]:
# number of posts to request from the subreddit's "hot" listing
nposts = 30

In [6]:
# Gather the ids of the first `nposts` submissions in the subreddit's "hot" listing.
subreddit = reddit.subreddit(query_subreddit)
post_ids = [p.id for p in subreddit.hot(limit=nposts)]
# Show how many posts (submissions) were actually returned.
len(post_ids)

30

In [7]:
# Preview one collected submission (the second id in post_ids):
# its title on the first line, followed by its body text.
post_details = reddit.submission(id=post_ids[1])
print(post_details.title, post_details.selftext, sep="\n")

Season 18 Episode 19 Out For Blood and Episode 20 You Are The Blood Discussion Post
Discussion post for Grey’s Anatomy season 18 finale double-header episode 19 and episode 20. Both are directed by Debbie Allen (AKA Catherine Fox, Jackson’s mom).

SPOILERS AHEAD!! Read at your own risk. This thread is for live blogging as the episodes air, and for looking back on in the future. 

Episode 19: Due to a blood shortage, Grey Sloan Memorial sets up a voluntary donation centre; Nick asks Meredith for help with his patient; Winston is hung up on his relationship with his brother.

Episode 20: The blood shortage continues; Meredith makes a risky decision regarding a patient; Owen's actions to help his fellow veterans come to light.


Original air date for both: 5/26/22

Title song [Out for Blood by Sun 41](https://m.youtube.com/watch?v=3A27bHME73I)

Title song for 20 [You Are The Blood by The Castanets](https://m.youtube.com/watch?v=VJhgwxSTHDs)


In [8]:
# number of top comments to query for each post;
# NOTE: a larger value may dilute the content with irrelevant text
ncomments = 20

In [9]:
# function to collect post data
def collect_post_data(post_id, ncomments, reddit):
    """Fetch one submission and return its title, text and leading comments.

    Parameters
    ----------
    post_id : str
        Id of the submission to fetch.
    ncomments : int or None
        Maximum number of first-level comments to keep. ``None`` keeps every
        first-level comment that was loaded with the submission; negative
        values are treated as 0.
    reddit : praw.Reddit
        API client used to fetch the submission.

    Returns
    -------
    dict
        Keys: 'id', 'title', 'text', 'comments_lev1' (bodies of the kept
        first-level comments) and 'comments_lev2' (bodies of the direct
        replies to those comments).
    """
    psubm = reddit.submission(id = post_id)
    pdata = {'id': post_id, 'title': psubm.title, 'text': psubm.selftext}

    # replace_more(limit=...) bounds how many MoreComments placeholders get
    # expanded (one API request each) -- it does NOT bound the number of
    # comments. limit=0 simply drops the placeholders without extra requests,
    # so every remaining item is a real comment with a .body attribute.
    psubm.comments.replace_more(limit = 0)

    # apply the requested cap to the first-level comments
    keep = None if ncomments is None else max(ncomments, 0)
    top_comments = list(psubm.comments)[:keep]

    # collect first- and second-level comments
    pdata['comments_lev1'] = [top_comment.body for top_comment in top_comments]
    pdata['comments_lev2'] = [lev2_comment.body
                              for top_comment in top_comments
                              for lev2_comment in top_comment.replies]

    return pdata

In [10]:
# Build one data record (title, text, comments) per collected post id.
posts_all = [
    collect_post_data(post_id, ncomments, reddit)
    for post_id in post_ids
]

In [11]:
# Write the collected records to disk as indented JSON.
file_out = "raw_post_comment_data.json"
with open(file_out, mode='w') as f:
    json.dump(posts_all, f, indent=2)