In [None]:
# Installations
!pip install praw
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Imports
import os
from getpass import getpass

import pandas as pd
import praw
from google.colab import files
from tqdm import tqdm

In [None]:
# Initializing Reddit Class
# API credentials are taken from the REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET
# environment variables, falling back to an interactive prompt, so that no
# secret is ever stored in the notebook source or its saved outputs.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID") or getpass("Reddit client id: "),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET") or getpass("Reddit client secret: "),
    user_agent="Mai",
    check_for_async=False
)

# Number of context comments kept for each response. A usable comment path
# therefore needs context_len + 1 comments (the response plus its contexts).
context_len = 7

In [None]:
def get_all_leaves(submission):
  """Walk a submission's comment tree depth-first and index every comment.

  `replace_more(limit=0)` is called on the submission's comments before the
  walk starts.

  Returns a tuple `(leaf_ids, info_by_id)`:
    * leaf_ids   -- ids of comments that have no replies, in pre-order
                    (top-level comments and replies visited in listed order).
    * info_by_id -- dict mapping every visited comment id to
                    {"parent_id": <id of comment.parent()>, "body": <comment body>}.
  """
  submission.comments.replace_more(limit=0)

  leaf_ids = []
  info_by_id = {}

  # The list is used as a stack whose top is its END, so nodes are pushed in
  # reverse to make pop() hand them back in their original order.
  pending = submission.comments[:][::-1]

  while pending:
    node = pending.pop()  # Comment visited
    info_by_id[node.id] = {"parent_id": node.parent().id, "body": node.body}

    children = list(node.replies)
    if not children:  # Leaf is reached
      leaf_ids.append(node.id)
    pending.extend(children[::-1])

  return leaf_ids, info_by_id

In [None]:
def get_paths(submission_id, leaves, data):
  """Build, for every leaf comment, the chain of comment bodies up to its top-level comment.

  Args:
    submission_id: id of the post; a comment whose "parent_id" equals this
      value is a top-level comment and ends the walk.
    leaves: iterable of comment ids to start from.
    data: dict mapping comment id -> {"parent_id": ..., "body": ...}.

  Returns:
    A list with one path per leaf, in the order of `leaves`. Each path lists
    comment bodies from the leaf (index 0) up to the top-level comment (last).
    Comments whose body is "[deleted]" or "[removed]" are left out of the
    path, so a path may be shorter than the thread depth or even empty.
  """
  placeholders = ("[deleted]", "[removed]")
  paths = []

  for leaf in leaves:
    path = []
    x = leaf

    while True:
      body = data[x]["body"]
      # The same filter is applied to every node on the way up, including the
      # top-level comment (previously "[removed]" slipped through there).
      if body not in placeholders:
        path.append(body)

      parent_id = data[x]["parent_id"]
      if parent_id == submission_id:  # Top-level comment reached
        break
      x = parent_id  # Move up to the parent comment

    paths.append(path)

  return paths

In [None]:
def filter_paths(comment_paths, min_comments=context_len+1):
  """Return the paths that contain at least `min_comments` comments.

  The input order is preserved and a new list is returned; the default
  threshold is context_len + 1 as evaluated when this function is defined.
  """
  return [thread for thread in comment_paths if len(thread) >= min_comments]

In [None]:
def create_df(paths, n_context=None):
  """Turn comment paths into a response/context DataFrame.

  Args:
    paths: iterable of comment paths (leaf first). Each path must hold at
      least n_context + 1 entries; extra entries beyond that are ignored.
    n_context: number of context comments per row. Defaults to the
      module-level `context_len` when omitted.

  Returns:
    A DataFrame with columns 'response', 'context', 'context/0', ...,
    'context/<n_context - 2>'. 'response' is path[0] (the leaf comment),
    'context' is path[1], and 'context/k' is path[k + 2].

  Raises:
    IndexError: if a path is shorter than the number of columns.
  """
  if n_context is None:
    n_context = context_len

  columns = ['response', 'context']
  columns = columns + ['context/'+str(i) for i in range(n_context - 1)]

  # Column k is filled from path[k]. All rows are collected first and the
  # frame is built once: DataFrame.append was removed in pandas 2.0 and
  # appending row by row copies the whole frame on every iteration.
  records = [
    {column: path[position] for position, column in enumerate(columns)}
    for path in paths
  ]

  return pd.DataFrame(records, columns=columns)

In [None]:
def _collect_posts(subreddit, limit):
  # Gather [title, url, id] for the unique posts of the hot/top/new/rising listings, first-seen order.
  seen_ids = set()
  posts = []
  for listing in (subreddit.hot, subreddit.top, subreddit.new, subreddit.rising):
    for post in listing(limit=limit):
      # Deduplicate on the post id; comparing the post object against the
      # stored [title, url] rows never matches, so duplicates were kept.
      if post.id in seen_ids:
        continue
      seen_ids.add(post.id)
      posts.append([post.title, post.url, post.id])
  return posts


def scrape_subreddit(subredit, limit=500):
  """Collect usable comment threads from one subreddit.

  Posts are gathered from the hot, top, new and rising listings (up to
  `limit` each, duplicates removed). Posts whose url ends in "jpg" or "png"
  are counted as image posts and skipped. For every other post the comment
  tree is walked, comment paths are filtered with `filter_paths`, and the
  surviving paths are converted to a DataFrame with `create_df`.

  Args:
    subredit: name of the subreddit to scrape.
    limit: maximum number of posts requested from each listing.

  Returns:
    A tuple `(dataframes, small, image_num)`: the list of per-post
    DataFrames, the number of posts without any sufficiently long path, and
    the number of skipped image posts.
  """
  subreddit = reddit.subreddit(subredit)

  print("Retreiving posts...")
  posts = pd.DataFrame(_collect_posts(subreddit, limit), columns=['title', 'url', 'id'])
  print("Retreived posts!")
  print(posts[['title', 'url']])

  dataframes = []
  image_num = 0
  small = 0
  failed = 0

  # Putting everything together
  # For each post, get the comment threads that meet the criteria and make a dataframe for that post
  print("Finding Useful Threads...")
  for url, post_id in tqdm(zip(posts['url'], posts['id']), total=len(posts)):
    if url.split(".")[-1] in ["jpg", "png"]:
      image_num += 1
      continue
    try:
      # Look the post up by id: post.url points to the linked content, which
      # for external link posts is not a reddit URL and cannot be resolved.
      submission = reddit.submission(id=post_id)
      children, data = get_all_leaves(submission)
      comment_paths = get_paths(submission.id, children, data)

      p = filter_paths(comment_paths)
      if p:
        dataframes.append(create_df(p))
      else:
        small += 1
    except Exception as err:
      # Best effort: one failing post must not abort the whole scrape, but
      # the failure is reported instead of being silently dropped.
      # KeyboardInterrupt is deliberately not caught.
      failed += 1
      tqdm.write("Skipping post " + str(post_id) + ": " + repr(err))
  print("Found Useful Threads!")
  if failed:
    print("Posts skipped due to errors:", failed)

  return dataframes, small, image_num # Return all dataframes and stats for the subreddit

In [None]:
# Scrape every subreddit in the list and export one CSV per subreddit.
subreddits = ['Periods', 'menstruation', 'badwomensanatomy', 'TwoXChromosomes']

for subreddit in subreddits:
  print("-------------- WORKING ON SUBREDDIT:", str(subreddit), "--------------")
  dataframes, small, image_num = scrape_subreddit(subreddit)
  print("Total Usable DataFrames:", len(dataframes))
  print("Small, Unusable DataFrames:", small)
  print("Image Posts:", image_num)

  # pd.concat raises ValueError on an empty list; skip the export instead of
  # aborting the remaining subreddits.
  if not dataframes:
    print("No usable threads found; skipping CSV export for", str(subreddit))
    continue

  # For each subreddit, concatenate the per-post dataframes into a single one
  final_df = pd.concat(dataframes, ignore_index=True)
  print("Number of instances in concat DF:", final_df.shape[0])

  # Saving the dataframe
  fname = subreddit+'.csv'
  final_df.to_csv(fname)
  files.download(fname)


-------------- WORKING ON SUBREDDIT: Periods --------------
Retreiving posts...
Retreived posts!
                                              title  \
0                         COVID Vaccine and Periods   
1                         Am I Pregnant? Megathread   
2              Does anyone dread their periods too?   
3                 So this just came out of me.. um?   
4                             Period insomnia sucks   
...                                             ...   
1519                                Inducing period   
1520  Passed out from cramps - is the alc to blame?   
1521                                  Period early?   
1522    Is it normal to get cramps once you hit 18?   
1523                          Light brown discharge   

                                                    url  
0     https://www.reddit.com/r/Periods/comments/oxez...  
1     https://www.reddit.com/r/Periods/comments/w9nz...  
2     https://www.reddit.com/r/Periods/comments/yj5d...  
3         

100%|██████████| 1524/1524 [32:30<00:00,  1.28s/it]

Found Useful Threads!
Total Usable DataFrames: 26
Small, Unusable DataFrames: 1043
Image Posts: 434
Number of instances in concat DF: 68





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

-------------- WORKING ON SUBREDDIT: menstruation --------------
Retreiving posts...
Retreived posts!
                                                  title  \
0                                        Periods in 30s   
1                                        Toilet anxiety   
2                                      Digestive issues   
3                         IBS and Menstruation Research   
4                                  Is 44 cycles normal?   
...                                                 ...   
1519   Missed period but pregnancy test shows negative?   
1520  Recently diagnosed with hypothyroidism. I have...   
1521                                    treating cramps   
1522  Venting/ 20.5 yo that has started getting two ...   
1523                                           Spotting   

                                                    url  
0     https://www.reddit.com/r/menstruation/comments...  
1     https://www.reddit.com/r/menstruation/comments...  
2     https://w

100%|██████████| 1524/1524 [48:16<00:00,  1.90s/it]

Found Useful Threads!
Total Usable DataFrames: 22
Small, Unusable DataFrames: 1362
Image Posts: 86
Number of instances in concat DF: 25





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

-------------- WORKING ON SUBREDDIT: badwomensanatomy --------------
Retreiving posts...
Retreived posts!
                                                  title  \
0                           Rule 7 update - please read   
1                       apparently only ugly women fart   
2                              does this one fut here ?   
3     On a post about someone with ADHD not being ab...   
4                       Explain yourselves lesbians! /s   
...                                                 ...   
1519                                   Wtf is a vulva 🤦   
1520  I made a post about how I had an image of my u...   
1521                        Not the reddit dm people 💀💀   
1522       "Teaching her in the end she's just a woman"   
1523  Those fluids likely had more to do with her gi...   

                                                    url  
0     https://www.reddit.com/r/badwomensanatomy/comm...  
1                   https://i.redd.it/zirzyvubr9x91.png  
2          

100%|██████████| 1524/1524 [10:38<00:00,  2.39it/s]

Found Useful Threads!
Total Usable DataFrames: 103
Small, Unusable DataFrames: 170
Image Posts: 1175
Number of instances in concat DF: 895





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

-------------- WORKING ON SUBREDDIT: TwoXChromosomes --------------
Retreiving posts...
Retreived posts!
                                                  title  \
0     [MINI FAQ] Do I have to be a woman to particip...   
1     Delta paid a doctor to declare a pilot mentall...   
2     How do you respectfully explain that there is ...   
3     My husband is returning home tonight from a 3 ...   
4     Female voter registration for Wisconsin midter...   
...                                                 ...   
1519        I Have Trouble Believing There Are Good Men   
1520  My marriage is going really well but I still f...   
1521                             Rape culture on reddit   
1522                           Roe, Roe, Roe your vote!   
1523          Said goodbye to my Twitter account today.   

                                                    url  
0     https://www.reddit.com/r/TwoXChromosomes/comme...  
1     https://www.dailykos.com/story/2022/10/29/2131...  
2     https:

100%|██████████| 1524/1524 [59:58<00:00,  2.36s/it]

Found Useful Threads!
Total Usable DataFrames: 205
Small, Unusable DataFrames: 992
Image Posts: 38
Number of instances in concat DF: 745





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>