In [1]:
from bs4 import Tag, NavigableString, BeautifulSoup
import requests
import random
import json
import html

In [2]:
# Read the pool of user-agent strings (one per line) that the request
# helpers below pick from.
with open('../common/useragents.txt', 'r') as file:
    lines = list(file)

# Drop the trailing newline and any surrounding whitespace from every entry.
agents = list(map(str.strip, lines))

In [3]:
def getRedditSoup(url):
    """Download ``url`` with a GET request and parse the body as HTML.

    A browser-like header set is sent; the ``user-agent`` value is picked at
    random from the module-level ``agents`` list.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` document built with ``html.parser`` from the
        response text, or an empty ``BeautifulSoup`` document when the
        response status code is not 200.
    """
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US, en; q=0.9",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        # random.choice always yields a valid element. The earlier
        # random.randint(0, len(agents)) is inclusive of len(agents), which
        # is one past the last index and raised IndexError at random.
        "user-agent": random.choice(agents)
    }
    response = requests.get(url, headers=headers)

    # Callers receive an empty document instead of an exception on failure.
    if response.status_code != 200:
        return BeautifulSoup("", "html.parser")

    return BeautifulSoup(response.text, "html.parser")

In [4]:
def postRedditSoup(url, payload):
    """Send ``payload`` to ``url`` with a POST request and parse the reply.

    The request carries XMLHttpRequest-style headers with a form-urlencoded
    content type; the ``user-agent`` value is picked at random from the
    module-level ``agents`` list.

    Args:
        url: Address of the endpoint to post to.
        payload: Form fields, passed to ``requests.post`` as ``data``.

    Returns:
        A ``BeautifulSoup`` document built with ``html.parser`` from the
        response text, or an empty ``BeautifulSoup`` document when the
        response status code is not 200.
    """
    headers = {
        "accept": "application/json, text/javascript, */*; q=0.01",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US, en; q=0.9",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "x-requested-with": "XMLHttpRequest",
        # random.choice always yields a valid element. The earlier
        # random.randint(0, len(agents)) is inclusive of len(agents), which
        # is one past the last index and raised IndexError at random.
        "user-agent": random.choice(agents)
    }

    response = requests.post(url, headers=headers, data=payload)

    # Callers receive an empty document instead of an exception on failure.
    if response.status_code != 200:
        return BeautifulSoup("", "html.parser")

    return BeautifulSoup(response.text, "html.parser")

In [5]:
def parseCommentsFromDocument(document, get_children = False):
    """Collect the comments of one listing page as a list of dicts.

    The first ``div`` with class ``nestedlisting`` is located, a starting
    comment is chosen inside it, and that comment plus each of its following
    siblings is converted with ``extractCommentData``. A sibling with class
    ``morechildren`` is expanded through ``getMoreComments``.

    Args:
        document: Parsed page (``BeautifulSoup``) to read comments from.
        get_children: When true, the first comment ``div`` in the listing is
            skipped and parsing starts at the second one.
            ``extractCommentData`` passes ``True`` for a comment's permalink
            page (presumably the first comment there is the parent itself;
            confirm against the page markup).

    Returns:
        A list of comment dicts; empty when the page has no ``nestedlisting``
        container or not enough comments to start from.
    """
    comment_objects_list = []

    # find() gives None instead of raising when the container is absent,
    # e.g. for the empty document getRedditSoup returns on a non-200 reply.
    comments_container = document.find("div", class_="nestedlisting")
    if comments_container is None:
        return comment_objects_list

    container_comments = comments_container.find_all("div", class_="comment")

    # One check covers "no comments at all" and "only the parent comment".
    first_index = 1 if get_children else 0
    if len(container_comments) <= first_index:
        return comment_objects_list

    first_comment = container_comments[first_index]
    comment_objects_list.append(extractCommentData(first_comment))

    for sibling in first_comment.next_siblings:
        # Siblings can be text nodes; only tags carry attributes, so the type
        # check must happen before any attribute lookup.
        if not isinstance(sibling, Tag):
            continue

        # .get() tolerates tags that have no class attribute at all.
        sibling_classes = sibling.get("class", [])

        if "comment" in sibling_classes:
            comment_objects_list.append(extractCommentData(sibling))
        elif "morechildren" in sibling_classes:
            # Fifth "/"-separated piece of the canonical URL is used as the
            # subreddit name, e.g. https://host/r/<subreddit>/...
            subreddit = document\
                .find("link", {"rel":"canonical"})["href"]\
                .split("/")[4]

            comment_objects_list.extend(getMoreComments(sibling, subreddit))

    return comment_objects_list

In [6]:
def extractCommentData(comment_tag, recursive=True):
    """Convert one comment's markup into a dict, optionally with its replies.

    Args:
        comment_tag: Tag (or parsed fragment) holding a single comment.
        recursive: When true and the comment reports at least one child, the
            comment's permalink page is downloaded with ``getRedditSoup`` and
            its replies are parsed with ``parseCommentsFromDocument``.

    Returns:
        A dict with the keys ``score``, ``author``, ``date_posted_timestamp``,
        ``date_posted_readable``, ``date_edited_timestamp``, ``num_children``,
        ``permalink``, ``comment_formatted`` and ``comment_raw``. A
        ``replies`` key (list of dicts of the same shape) is added only when
        at least one reply was parsed.
    """
    # find() returns None when the span is absent; report the score as None
    # instead of crashing on None["title"].
    # NOTE(review): presumably this happens when the score is hidden; confirm.
    score_tag = comment_tag.find("span", class_="score unvoted")
    score = score_tag["title"] if score_tag is not None else None

    # A comment without an author link is reported as "[deleted]".
    author_tag = comment_tag.find("a", class_="author")
    author = author_tag.text.strip() if author_tag is not None else "[deleted]"

    date_posted = comment_tag.find("time", class_="live-timestamp")
    date_posted_timestamp = date_posted["datetime"]
    date_posted_readable = date_posted["title"]

    # The edited timestamp is optional.
    date_edited = comment_tag.find("time", class_="edited-timestamp")
    date_edited_timestamp = date_edited["datetime"]\
        if date_edited is not None else None

    # The link text is stripped of parentheses and its first space-separated
    # token is read as the child count.
    num_children = int(comment_tag.find("a", class_="numchildren")\
        .text.strip().replace("(", "").replace(")", "").split(" ")[0])

    permalink = comment_tag.find("a", class_="bylink")["href"]

    comment_container = comment_tag\
        .find("div", class_="usertext-body may-blank-within md-container")\
        .find("div", class_="md")

    comment_formatted = comment_container.prettify()
    # Plain text is the concatenation of every <p> inside the comment body.
    comment_raw = "".join([p.text for p in comment_container.find_all("p")]).strip()

    top_level_comment_object = {
        "score": score,
        "author": author,
        "date_posted_timestamp": date_posted_timestamp,
        "date_posted_readable": date_posted_readable,
        "date_edited_timestamp": date_edited_timestamp,
        "num_children": num_children,
        "permalink": permalink,
        "comment_formatted": comment_formatted,
        "comment_raw": comment_raw,
    }

    if num_children == 0 or not recursive:
        return top_level_comment_object

    nested_soup = getRedditSoup(permalink)

    # getRedditSoup hands back an empty document on a non-200 response; keep
    # the data already extracted rather than failing the whole scrape.
    if nested_soup.find("div", class_="nestedlisting") is None:
        return top_level_comment_object

    parsed_replies = parseCommentsFromDocument(nested_soup, True)

    if parsed_replies:
        top_level_comment_object["replies"] = parsed_replies

    return top_level_comment_object

In [7]:
def getMoreComments(morecomment_tag, subreddit):
    """Expand a ``morechildren`` stub into the comments it stands for.

    The arguments embedded in the stub's ``onclick`` handler are turned into
    a form payload, posted to the ``morechildren`` endpoint, and every item in
    the reply is converted with ``extractCommentData``. Items of kind
    ``"more"`` are expanded recursively.

    Args:
        morecomment_tag: Tag with a ``data-fullname`` attribute and an ``<a>``
            child whose ``onclick`` holds the ``morechildren(...)`` call.
        subreddit: Subreddit name, sent as the ``r`` form field.

    Returns:
        A flat list of comment dicts; empty when the endpoint returned no
        body (``postRedditSoup`` yields an empty document on a non-200
        response).
    """
    # Reduce "return morechildren(this, 'a', 'b', ...)" to a plain
    # comma-separated argument list.
    morecomments_args = morecomment_tag.a["onclick"]\
        .replace("return morechildren", "")\
        .replace("(", "")\
        .replace(")", "")\
        .replace("'","")\
        .split(",")

    payload = {
        "id": morecomment_tag["data-fullname"],
        "link_id": morecomments_args[1].strip(),
        "sort": morecomments_args[2].strip(),
        "renderstyle": "html",
        "limit_children": False,
        "r": subreddit,
        # Everything between the sort argument and the last argument is
        # treated as the list of child ids.
        "children": ",".join(morecomments_args[3:-1]).strip()
    }

    more_soup = postRedditSoup("https://old.reddit.com/api/morechildren", payload)

    # The reply body is JSON that was routed through BeautifulSoup, so it is
    # read back via prettify(). An empty document means the request failed;
    # json.loads("") would raise, so report "no comments" instead.
    response_text = more_soup.prettify()
    if not response_text.strip():
        return []

    json_comments = json.loads(response_text)
    # NOTE(review): fixed position of the comment list inside the "jquery"
    # array; verify against a live response if the endpoint changes.
    json_comments_list = json_comments["jquery"][10][3][0]

    more_comments = []
    for comment in json_comments_list:
        # The markup arrives entity-escaped; unescape before parsing it.
        comment_tag_string = html.unescape(comment["data"]["content"])
        comment_tag_soup = BeautifulSoup(comment_tag_string, "html.parser")

        if comment["kind"] == "more":
            # A nested stub: expand it the same way.
            nested_stub = comment_tag_soup.find("div", class_="morechildren")
            more_comments.extend(getMoreComments(nested_stub, subreddit))
        else:
            more_comments.append(extractCommentData(comment_tag_soup))

    return more_comments

Test runs

https://old.reddit.com/r/MechanicalKeyboards/comments/k385xo/mech_mat_giveaway/

or

https://www.reddit.com/r/MechanicalKeyboards/comments/k385xo/mech_mat_giveaway/

for a more up-to-date view; the data is the same either way

In [10]:
# Scrape the Mech Mat giveaway thread. Only the first "www" in the address is
# swapped for "old", so an address that is already on old.reddit.com passes
# through unchanged.
url = "https://old.reddit.com/r/MechanicalKeyboards/comments/k385xo/mech_mat_giveaway/"\
    .replace("www", "old", 1)

soup = getRedditSoup(url)
results = parseCommentsFromDocument(soup)

In [11]:
# Bare expression: the notebook echoes the parsed comment list as this cell's output.
results

[{'score': '334',
  'author': 'IllustriousPhysics',
  'date_posted_timestamp': '2020-11-29T11:52:43+00:00',
  'date_posted_readable': 'Sun Nov 29 11:52:43 2020 UTC',
  'date_edited_timestamp': '2020-12-02T00:21:08+00:00',
  'num_children': 3051,
  'permalink': 'https://old.reddit.com/r/MechanicalKeyboards/comments/k385xo/mech_mat_giveaway/ge0pz8h/',
  'comment_formatted': '<div class="md">\n <p>\n  Inspired by early 2000\'s Gundams.\n </p>\n <p>\n  \u200b\n </p>\n <p>\n  We love to bring high resolution art onto deskmats. The Mech Mat is no different, but for this piece, we knew we needed something different to capture the details of the main design. The illustration is drawn using vectors, which are based on mathematical formulas. This ensures the original high level of detail, regardless of how large the print area is.\n </p>\n <p>\n  Renders :\n  <a href="https://imgur.com/a/tks3kyV">\n   https://imgur.com/a/tks3kyV\n  </a>\n </p>\n <p>\n  \u200b\n </p>\n <p>\n  <strong>\n   We are 

In [13]:
# Persist the parsed comments as pretty-printed JSON (4-space indent).
with open("9k_comments_example", "w") as outfile:
    outfile.write(json.dumps(results, indent=4))

https://old.reddit.com/r/MechanicalKeyboards/comments/jeylnv/gmk_noire_salvun_caps_giveaway/

In [None]:
# Scrape the GMK Noire giveaway thread. The first "www" in the address is
# swapped for "old" before the page is requested.
url = "https://www.reddit.com/r/MechanicalKeyboards/comments/jeylnv/gmk_noire_salvun_caps_giveaway/"\
    .replace("www", "old", 1)

soup = getRedditSoup(url)
results_giv = parseCommentsFromDocument(soup)

In [9]:
# Bare expression: the notebook echoes the parsed comment list as this cell's output.
results_giv

[{'score': '48',
  'author': 'shmarcia',
  'date_posted_timestamp': '2020-10-20T21:04:54+00:00',
  'date_posted_readable': 'Tue Oct 20 21:04:54 2020 UTC',
  'date_edited_timestamp': '2020-10-26T18:03:31+00:00',
  'num_children': 70,
  'permalink': 'https://old.reddit.com/r/MechanicalKeyboards/comments/jeylnv/gmk_noire_salvun_caps_giveaway/g9h3gy0/',
  'comment_formatted': '<div class="md">\n <p>\n  <del>\n   Welcome to a Giveaway for a GMK Noire Brass Salvun Cap ( the middle one) to celebrate our Second week of sale!\n  </del>\n </p>\n <p>\n  Closed, congrats to\n  <a href="/u/EthanRayJohns">\n   /u/EthanRayJohns\n  </a>\n  for winning!\n </p>\n <p>\n  First, info about the set\n </p>\n <p>\n  Featuring an expansive base kit, and inspired by a need for a set that would compliment as many boards as possible regardless of design, layout, budget, or color, I\'m so excited to bring you all something I\'ve put so much time effort and love into!\n </p>\n <p>\n  You can read more about the se

In [14]:
# Persist the parsed comments as pretty-printed JSON (4-space indent).
with open("smaller_example", "w") as outfile:
    outfile.write(json.dumps(results_giv, indent=4))

In [None]:
# given we have a huge html


# whole document
#    - comment 1
#        - comment 1a
#        - comment 1b
#        - ...
#    - comment 2
#    - comment 3
#    - comment 4
#    - comment 5
#    - morecomment

In [None]:
# functionA:
#     make a request given the reddit post url/permalink
#     feed whole document to functionB(document)


# in functionB(document) given document is a soup
#     we find the div with class nestedlisting
#     from nested listing we get the first top level comment
#     extract comment data from first top level comment by using extractCommentData(comment_tag)
#        - if comment has replies or children:
#             make separate request for this comment
#             pass the response document to functionB(document)
#         else just return extracted comment data
    
#     get the siblings of that first top level comment (its siblings will also be top level comments)
#     iterate through siblings 
#     extract comment data from sibling comment by using extractCommentData(comment_tag)
#        - if comment has replies or children:
#             make separate request for this comment
#             pass the response document to functionB(document)
#         else just return extracted comment data
#     if there is morecomments:
#         getMoreComments() which should also handle if there are more comments
    

In [None]:
# def parseCommentsFromDocument(document, get_children = False):
#     comment_objects_list = []
    
#     comments_container = document.find_all("div", class_=["nestedlisting"])[0]
#     first_comment = comments_container.find_all("div", class_="comment")[0]
    
#     comment_objects_list.append(extractCommentData(first_comment))
    
#     for sibling in first_comment.next_siblings:
#         is_tag = isinstance(sibling, Tag)
#         is_comment = "comment" in sibling["class"]
#         is_morechildren = "morechildren" in sibling["class"]
 
#         if (is_tag and is_comment):
#             comment_objects_list.append(extractCommentData(sibling))
#         elif (is_tag and is_morechildren):
#             subreddit = document\
#             .find("link", {"rel":"canonical"})["href"]\
#             .split("/")[4]
            
#             comment_objects_list.extend(getMoreComments(sibling, subreddit))
    
#     return comment_objects_list

In [None]:
# def extractCommentData(comment_tag, recursive=True):
#     top_level_comment_object = {}
#     score = comment_tag.find("span", class_="score unvoted")["title"]
#     author = comment_tag.find("a", class_="author").text.strip()
    
#     date_posted = comment_tag.find("time", class_="live-timestamp")
#     date_posted_timestamp = date_posted["datetime"]
#     date_posted_readable = date_posted["title"]
    
#     date_edited = comment_tag.find("time", class_="edited-timestamp")
#     date_edited_timestamp = date_edited["datetime"]\
#         if date_edited != None else None
    
#     num_children = int(comment_tag.find("a", class_="numchildren")\
#         .text.strip().replace("(", "").replace(")", "").split(" ")[0])
    
#     permalink = comment_tag.find("a", class_="bylink")["href"]
    
#     comment_container = comment_tag\
#         .find("div", class_="usertext-body may-blank-within md-container")\
#         .find("div", class_="md")
    
#     comment_formatted = comment_container.prettify()
#     comment_raw = "".join([p.text for p in comment_container.find_all("p")]).strip().rstrip()
    
# #    top_level_comment_object["score"] = score
#     top_level_comment_object["author"] = author
# #     top_level_comment_object["date_posted_timestamp"] = date_posted_timestamp
# #     top_level_comment_object["date_posted_readable"] = date_posted_readable
# #     top_level_comment_object["date_edited_timestamp"] = date_edited_timestamp
# #     top_level_comment_object["num_children"] = num_children
# #     top_level_comment_object["permalink"] = permalink
# #     top_level_comment_object["comment_formatted"] = comment_formatted
#     top_level_comment_object["comment_raw"] = comment_raw
       
#     if (num_children == 0 or not recursive):
#         return top_level_comment_object
#     else:    
#         nested_soup = getRedditSoup(permalink)
        
#         container = nested_soup.find_all("div", class_=["nestedlisting"])[0]
#         container_comments = container.find_all("div", class_="comment")
        
#         if (len(container_comments) == 1):
#             return top_level_comment_object
        
#         first_reply = container_comments[1] 
        
#         child_comment_objects = []
#         child_comment_objects.append(extractCommentData(first_reply))
#         child_comment_objects.extend([extractCommentData(sibling) for sibling in first_reply.next_siblings
#                                      if (isinstance(sibling, Tag) and "comment" in sibling["class"])])
                                     
#         parent_comment = extractCommentData(container_comments[0], recursive=False) 
#         parent_comment["replies"] = child_comment_objects
                                     
#         return parent_comment        