In [13]:
# Subreddit names (without the "r/" prefix) that the scraping cells iterate over.
subreddits = [
    "Entrepreneur",
    "SaaS",
    "NoStupidQuestions",
    "personalfinance",
    "smallbusiness",
    "socialmedia",
    "askatherapist",
    "productivity",
    "Accounting"
]

# Scraping Subreddits

In [None]:
import json
import asyncio
from typing import List, Dict, Union
from httpx import AsyncClient, Response
from parsel import Selector
from loguru import logger as log

# Async httpx client used for the requests made by the scraping functions.
# NOTE(review): the client is created at cell level and is not closed in the
# visible cells — confirm whether `await client.aclose()` should be called.
client = AsyncClient(
    # enable HTTP/2
    # NOTE(review): presumably needs httpx's optional HTTP/2 support installed — confirm
    http2=True,
    # send browser-like headers to reduce the chance of getting blocked
    headers={
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Cookie": "intl_splash=false"
    },
    # follow HTTP redirects instead of returning the 3xx response
    follow_redirects=True
)

def parse_subreddit(response: Response) -> Dict:
    """Parse subreddit metadata and post previews from a subreddit HTML response.

    Args:
        response: HTTP response whose text is the HTML of a subreddit page
            (or of a pagination batch returned for that subreddit).

    Returns:
        A dict with three keys:
        - "post_data": list of dicts, one per ``<article>`` element found.
        - "info": dict of subreddit-level fields (id, description, members,
          rank, bookmarks, url).
        - "cursor": value of the first ``more-posts-cursor`` attribute on a
          ``<shreddit-post>`` element, or None when there is none.
    """
    selector = Selector(response.text)
    url = str(response.url)
    info = {}
    # Split on "/r/" rather than "/r": the shorter separator also matches the
    # start of any subreddit name beginning with "r" (".../r/relationships/"
    # used to yield "elationships").
    info["id"] = url.split("/r/")[-1].replace("/", "")
    info["description"] = selector.xpath("//shreddit-subreddit-header/@description").get()
    members = selector.xpath("//shreddit-subreddit-header/@subscribers").get()
    rank = selector.xpath("//strong[@id='position']/*/@number").get()
    info["members"] = int(members) if members else None
    info["rank"] = int(rank) if rank else None
    info["bookmarks"] = {}
    for item in selector.xpath("//div[faceplate-tracker[@source='community_menu']]/faceplate-tracker"):
        name = item.xpath(".//a/span/span/span/text()").get()
        link = item.xpath(".//a/@href").get()
        if name is None:
            # A menu entry without a label would be stored under a None key
            # (and several of them would overwrite each other), so skip it.
            continue
        info["bookmarks"][name] = link

    info["url"] = url
    post_data = []
    for box in selector.xpath("//article"):
        link = box.xpath(".//a/@href").get()
        author = box.xpath(".//shreddit-post/@author").get()
        post_label = box.xpath(".//faceplate-tracker[@source='post']/a/span/div/text()").get()
        upvotes = box.xpath(".//shreddit-post/@score").get()
        comment_count = box.xpath(".//shreddit-post/@comment-count").get()
        attachment_type = box.xpath(".//shreddit-post/@post-type").get()
        # The attachment URL lives in a different place depending on the post type.
        if attachment_type == "image":
            attachment_link = box.xpath(".//div[@slot='thumbnail']/*/*/@src").get()
        elif attachment_type == "video":
            attachment_link = box.xpath(".//shreddit-player/@preview").get()
        else:
            attachment_link = box.xpath(".//div[@slot='thumbnail']/a/@href").get()
        post_data.append({
            "authorProfile": "https://www.reddit.com/user/" + author if author else None,
            "authorId": box.xpath(".//shreddit-post/@author-id").get(),
            "title": box.xpath("./@aria-label").get(),
            "link": "https://www.reddit.com" + link if link else None,
            "publishingDate": box.xpath(".//shreddit-post/@created-timestamp").get(),
            "postId": box.xpath(".//shreddit-post/@id").get(),
            "postLabel": post_label.strip() if post_label else None,
            "postUpvotes": int(upvotes) if upvotes else None,
            "commentCount": int(comment_count) if comment_count else None,
            "attachmentType": attachment_type,
            "attachmentLink": attachment_link,
        })
    # id for the next posts batch
    cursor_id = selector.xpath("//shreddit-post/@more-posts-cursor").get()
    return {"post_data": post_data, "info": info, "cursor": cursor_id}


async def scrape_subreddit(subreddit_id: str, sort: str, max_pages: Union[int, None] = None):
    """Scrape post previews from a subreddit, following its pagination cursor.

    Args:
        subreddit_id: subreddit name without the "r/" prefix.
        sort: value sent as the ``sort`` query parameter of the pagination
            requests (e.g. "new" or "hot").
        max_pages: maximum number of pagination requests made after the first
            page; None keeps requesting until no cursor is returned.

    Returns:
        A dict with "info" (subreddit fields parsed from the first page) and
        "posts" (posts of the first page followed by every pagination batch).
    """
    base_url = f"https://www.reddit.com/r/{subreddit_id}/"
    response = await client.get(base_url)
    if response.status_code >= 400:
        # Keep the best-effort behaviour, but make a blocked/failed request visible
        # instead of silently reporting a "successful" empty scrape.
        log.warning(f"got HTTP {response.status_code} for {base_url}")
    data = parse_subreddit(response)
    subreddit_data = {"info": data["info"], "posts": data["post_data"]}
    cursor = data["cursor"]

    def make_pagination_url(cursor_id: str):
        # NOTE(review): the path segment is fixed to "hot" while `sort` is only sent
        # as a query parameter, and the first-page request above does not use `sort`
        # at all — confirm this matches the intended ordering.
        return f"https://www.reddit.com/svc/shreddit/community-more-posts/hot/?after={cursor_id}%3D%3D&t=DAY&name={subreddit_id}&feedLength=3&sort={sort}"

    # Count down on a local copy so the caller-visible parameter is not mutated.
    pages_left = max_pages
    while cursor and (pages_left is None or pages_left > 0):
        url = make_pagination_url(cursor)
        response = await client.get(url)
        if response.status_code >= 400:
            log.warning(f"got HTTP {response.status_code} for {url}")
        data = parse_subreddit(response)
        cursor = data["cursor"]
        subreddit_data["posts"].extend(data["post_data"])
        if pages_left is not None:
            pages_left -= 1
    log.success(f"scraped {len(subreddit_data['posts'])} posts from the subreddit: r/{subreddit_id}")
    return subreddit_data

In [6]:
import os

async def run():
    """Scrape every subreddit in `subreddits` and store each result as JSON.

    Each subreddit is scraped with sort "new" and at most two pagination
    requests; the result is written to ``../data/<subreddit>/subreddit.json``.
    Missing directories are created.
    """
    data_root = "../data/"
    os.makedirs(data_root, exist_ok=True)

    for name in subreddits:
        scraped = await scrape_subreddit(subreddit_id=name, sort="new", max_pages=2)

        # one folder per subreddit
        target_dir = os.path.join(data_root, name)
        os.makedirs(target_dir, exist_ok=True)

        output_path = os.path.join(target_dir, "subreddit.json")
        with open(output_path, "w", encoding="utf-8") as out_file:
            json.dump(scraped, out_file, indent=2, ensure_ascii=False)
    

# Execute the subreddit-scraping coroutine (top-level await in a notebook cell).
await run()

[32m2025-05-26 17:13:38.758[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_subreddit[0m:[36m95[0m - [32m[1mscraped 53 posts from the rubreddit: r/Entrepreneur[0m
[32m2025-05-26 17:13:42.851[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_subreddit[0m:[36m95[0m - [32m[1mscraped 53 posts from the rubreddit: r/SaaS[0m
[32m2025-05-26 17:13:46.310[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_subreddit[0m:[36m95[0m - [32m[1mscraped 53 posts from the rubreddit: r/NoStupidQuestions[0m
[32m2025-05-26 17:13:51.403[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_subreddit[0m:[36m95[0m - [32m[1mscraped 53 posts from the rubreddit: r/personalfinance[0m
[32m2025-05-26 17:13:55.155[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_subreddit[0m:[36m95[0m - [32m[1mscraped 53 posts from the rubreddit: r/smallbusiness[0m
[32m2025-05-26 17:13:59.927[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_

# Scraping comments from posts

In [11]:
import json
import asyncio
from typing import List, Dict, Union
from httpx import AsyncClient, Response
from parsel import Selector
from loguru import logger as log

# Re-create the client with the same settings as the subreddit-scraping cell.
# The bare `AsyncClient()` that stood here was a placeholder: it rebinds `client`
# to an instance without the browser-like headers, HTTP/2 or redirect following,
# so post/comment requests would be sent differently from the subreddit requests.
client = AsyncClient(
    # enable HTTP/2
    http2=True,
    # send browser-like headers to reduce the chance of getting blocked
    headers={
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Cookie": "intl_splash=false"
    },
    # follow HTTP redirects instead of returning the 3xx response
    follow_redirects=True
)

def parse_post_info(response: Response) -> Dict:
    """Extract post-level fields from the HTML of a Reddit post page.

    Most values are read from attributes of the ``<shreddit-post>`` element.
    Any value that is not found is returned as None; the comment and upvote
    counts are converted to int when present.
    """
    page = Selector(response.text)

    raw_label = page.xpath("//faceplate-tracker[@source='post']/a/span/div/text()").get()
    raw_comment_count = page.xpath("//shreddit-post/@comment-count").get()
    raw_score = page.xpath("//shreddit-post/@score").get()
    author_name = page.xpath("//shreddit-post/@author").get()

    # Profile URL only makes sense when an author name was found.
    if author_name:
        profile_url = "https://www.reddit.com/user/" + author_name
    else:
        profile_url = None

    return {
        "authorId": page.xpath("//shreddit-post/@author-id").get(),
        "author": author_name,
        "authorProfile": profile_url,
        "subreddit": page.xpath("//shreddit-post/@subreddit-prefixed-name").get(),
        "postId": page.xpath("//shreddit-post/@id").get(),
        "postLabel": raw_label.strip() if raw_label else None,
        "publishingDate": page.xpath("//shreddit-post/@created-timestamp").get(),
        "postTitle": page.xpath("//shreddit-post/@post-title").get(),
        "postLink": page.xpath("//shreddit-canonical-url-updater/@value").get(),
        "commentCount": int(raw_comment_count) if raw_comment_count else None,
        "upvoteCount": int(raw_score) if raw_score else None,
        "attachmentType": page.xpath("//shreddit-post/@post-type").get(),
        "attachmentLink": page.xpath("//shreddit-post/@content-href").get(),
    }


def parse_post_comments(response: Response) -> List[Dict]:
    """Parse the comment tree of a post page into nested dicts.

    Top-level comments are the ``div[@data-type='comment']`` children of the
    ``sitetable nestedlisting`` container. Each comment dict gets a "replies"
    key (a list of comment dicts of the same shape) only when it has replies.

    Args:
        response: HTTP response whose text is the HTML of the comments page.

    Returns:
        List of top-level comment dicts, in document order.
    """

    def parse_comment(parent_selector) -> Dict:
        """Parse the fields of a single comment element.

        The `.//` lookups take the first match in the comment's subtree.
        NOTE(review): when the comment itself lacks a field, the first match may
        come from a nested reply — confirm whether that matters downstream.
        """
        author = parent_selector.xpath("./@data-author").get()
        link = parent_selector.xpath("./@data-permalink").get()
        dislikes = parent_selector.xpath(".//span[contains(@class, 'dislikes')]/@title").get()
        # Match "likes" as a whole class token: a plain contains(@class, 'likes')
        # also matches the "dislikes" span.
        upvotes = parent_selector.xpath(
            ".//span[contains(concat(' ', normalize-space(@class), ' '), ' likes ')]/@title"
        ).get()
        # NOTE(review): this reads the span whose class contains "unvoted"; whether
        # that value really is a downvote count should be confirmed.
        downvotes = parent_selector.xpath(".//span[contains(@class, 'unvoted')]/@title").get()
        # Collect every text node of the first body container instead of only the
        # first text node of the first paragraph, which truncated multi-paragraph
        # or formatted comments.
        body_parts = parent_selector.xpath("(.//div[@class='md'])[1]//text()").getall()
        body = "".join(body_parts).strip()
        return {
            "authorId": parent_selector.xpath("./@data-author-fullname").get(),
            "author": author,
            "authorProfile": "https://www.reddit.com/user/" + author if author else None,
            "commentId": parent_selector.xpath("./@data-fullname").get(),
            "link": "https://www.reddit.com" + link if link else None,
            "publishingDate": parent_selector.xpath(".//time/@datetime").get(),
            "commentBody": body or None,
            "upvotes": int(upvotes) if upvotes else None,
            "dislikes": int(dislikes) if dislikes else None,
            "downvotes": int(downvotes) if downvotes else None,
        }

    def parse_replies(what) -> List[Dict]:
        """Recursively parse the direct replies of a comment element."""
        # Number of comment elements on the path from the root down to (and
        # including) `what`. A direct reply has exactly that many comment ancestors;
        # deeper descendants have more and are handled by the recursive call.
        # Selecting every descendant here (as before) duplicated nested replies
        # at each ancestor level.
        own_depth = len(what.xpath("ancestor-or-self::div[@data-type='comment']"))
        direct_replies = what.xpath(
            f".//div[@data-type='comment'][count(ancestor::div[@data-type='comment'])={own_depth}]"
        )
        replies = []
        for reply_box in direct_replies:
            reply_comment = parse_comment(reply_box)
            child_replies = parse_replies(reply_box)
            if child_replies:
                reply_comment["replies"] = child_replies
            replies.append(reply_comment)
        return replies

    selector = Selector(response.text)
    data = []
    for item in selector.xpath("//div[@class='sitetable nestedlisting']/div[@data-type='comment']"):
        comment_data = parse_comment(item)
        replies = parse_replies(item)
        if replies:
            comment_data["replies"] = replies
        data.append(comment_data)
    return data


async def scrape_post(url: str, sort: str) -> Dict:
    """Scrape a post's fields and its comments.

    The post page at `url` is fetched and parsed first; the comments are then
    fetched from the old.reddit version of the same post.

    Args:
        url: URL of the post page.
        sort: value sent as the ``sort`` query parameter of the comments
            request (e.g. "old", "new" or "top").

    Returns:
        A dict with "info" (result of `parse_post_info`) and "comments"
        (result of `parse_post_comments`).
    """
    response = await client.get(url)
    post_data = {}
    post_data["info"] = parse_post_info(response)
    # Prefer the canonical link found in the page; fall back to the requested URL.
    source_url = post_data["info"]["postLink"] or url
    # Rewrite only the host prefix. Replacing every "www" would also alter a
    # post slug that happens to contain "www".
    old_reddit_url = source_url.replace("://www.", "://old.", 1)
    # scrape the comments from the old.reddit version, with the same post URL
    bulk_comments_page_url = old_reddit_url + f"?sort={sort}&limit=500"
    response = await client.get(bulk_comments_page_url)
    post_data["comments"] = parse_post_comments(response)
    log.success(f"scraped {len(post_data['comments'])} comments from the post {url}")
    return post_data

In [21]:
from tqdm import tqdm

async def run():
    """Collect post titles and top-level comment bodies into one JSON list.

    For every subreddit in `subreddits`, the saved
    ``../data/<subreddit>/subreddit.json`` is read, its first three posts are
    scraped with `scrape_post` (sort "top"), and the post title followed by the
    body of each top-level comment is appended to a flat list. The list is
    written to ``../data/comments/comments_and_posts.json``.
    """
    comments_and_posts = []
    for subreddit in tqdm(subreddits, desc="Scraping comments from posts"):

        subreddit_folder = os.path.join("../data/", subreddit)
        file_path = os.path.join(subreddit_folder, "subreddit.json")

        print(subreddit_folder, file_path)
        # Use a context manager so the file handle is closed after reading.
        with open(file_path, "r", encoding="utf-8") as f:
            subreddit_data = json.load(f)

        for post in tqdm(
            subreddit_data["posts"][:3],
            desc=f"Scraping posts from r/{subreddit}"
        ):
            # Titles and bodies can be None when the parser found nothing;
            # keep only real text in the output list.
            if post["title"]:
                comments_and_posts.append(post["title"])

            post_url = post["link"]
            if not post_url:
                # The stored link is None when no href was found; there is
                # nothing to request for such a post.
                continue
            post_data = await scrape_post(url=post_url, sort="top")

            comments_from_post = post_data["comments"]
            # Replies are ignored: only top-level comments are collected.
            for comment in comments_from_post:
                if comment["commentBody"]:
                    comments_and_posts.append(comment["commentBody"])

    # save the comments and posts to a file
    output_path = "../data/comments/comments_and_posts.json"
    # Create the target directory first so the write cannot fail after all the
    # scraping work has already been done.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(comments_and_posts, f, indent=2, ensure_ascii=False)

# Execute the comment-scraping coroutine (top-level await in a notebook cell).
await run()

Scraping comments from posts:   0%|          | 0/9 [00:00<?, ?it/s]

../data/Entrepreneur ../data/Entrepreneur\subreddit.json


[32m2025-05-27 12:15:01.846[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 149 comments from the post https://www.reddit.com/r/Entrepreneur/comments/1kvkyh4/whats_an_industry_that_desperately_needs_younger/[0m
[32m2025-05-27 12:15:03.828[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 64 comments from the post https://www.reddit.com/r/Entrepreneur/comments/1kuhsdu/i_keep_seeing_the_same_revenue_leak_in_every/[0m
[32m2025-05-27 12:15:05.352[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 16 comments from the post https://www.reddit.com/r/Entrepreneur/comments/1kw1ui2/what_has_been_the_biggest_win_for_your_career/[0m
Scraping posts from r/Entrepreneur: 100%|██████████| 3/3 [00:08<00:00,  2.82s/it]
Scraping comments from posts:  11%|█         | 1/9 [00:08<01:07,  8.47s/it]

../data/SaaS ../data/SaaS\subreddit.json


[32m2025-05-27 12:15:07.436[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 62 comments from the post https://www.reddit.com/r/SaaS/comments/1kvsq47/i_removed_aipowered_from_all_my_b2b_copy/[0m
[32m2025-05-27 12:15:09.724[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 89 comments from the post https://www.reddit.com/r/SaaS/comments/1krurou/i_spent_6_months_building_an_app_that_made/[0m
[32m2025-05-27 12:15:11.239[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 19 comments from the post https://www.reddit.com/r/SaaS/comments/1kvuzkn/saas_founders_with_more_than_1m_arr_what_tools_do/[0m
Scraping posts from r/SaaS: 100%|██████████| 3/3 [00:05<00:00,  1.96s/it]
Scraping comments from posts:  22%|██▏       | 2/9 [00:14<00:48,  6.95s/it]

../data/NoStupidQuestions ../data/NoStupidQuestions\subreddit.json


[32m2025-05-27 12:15:17.324[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 217 comments from the post https://www.reddit.com/r/NoStupidQuestions/comments/1kvv3ca/what_is_the_hotel_receptionist_doing_on_the/[0m
[32m2025-05-27 12:15:24.172[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 204 comments from the post https://www.reddit.com/r/NoStupidQuestions/comments/1kougfo/why_arent_former_american_slave_plantations/[0m
[32m2025-05-27 12:15:29.960[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 206 comments from the post https://www.reddit.com/r/NoStupidQuestions/comments/1kvvst0/why_dont_evangelical_missionaries_ever_go_to/[0m
Scraping posts from r/NoStupidQuestions: 100%|██████████| 3/3 [00:18<00:00,  6.24s/it]
Scraping comments from posts:  33%|███▎      | 3/9 [00:33<01:13, 12.33s/it]

../data/personalfinance ../data/personalfinance\subreddit.json


[32m2025-05-27 12:15:31.254[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 10 comments from the post https://www.reddit.com/r/personalfinance/comments/1kw2bpv/got_1000mo_raise_after_taxes_diverting_it_all_in/[0m
[32m2025-05-27 12:15:34.380[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 60 comments from the post https://www.reddit.com/r/personalfinance/comments/1ksqnmx/billed_1300_for_a_free_screening_from_my_urologist/[0m
[32m2025-05-27 12:15:35.960[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 29 comments from the post https://www.reddit.com/r/personalfinance/comments/1kvudqm/how_to_continue_the_conversation_with_hr_around/[0m
Scraping posts from r/personalfinance: 100%|██████████| 3/3 [00:05<00:00,  2.00s/it]
Scraping comments from posts:  44%|████▍     | 4/9 [00:39<00:49,  9.83s/it]

../data/smallbusiness ../data/smallbusiness\subreddit.json


[32m2025-05-27 12:15:37.031[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 15 comments from the post https://www.reddit.com/r/smallbusiness/comments/1kw1ajm/today_i_got_my_first_customer/[0m
[32m2025-05-27 12:15:41.423[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 204 comments from the post https://www.reddit.com/r/smallbusiness/comments/1kqo3ts/anybody_ever_question_why_tf_were_still_doing_this/[0m
[32m2025-05-27 12:15:44.125[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 57 comments from the post https://www.reddit.com/r/smallbusiness/comments/1kvumu0/those_who_sold_what_do_you_do_now/[0m
Scraping posts from r/smallbusiness: 100%|██████████| 3/3 [00:08<00:00,  2.72s/it]
Scraping comments from posts:  56%|█████▌    | 5/9 [00:47<00:36,  9.23s/it]

../data/socialmedia ../data/socialmedia\subreddit.json


[32m2025-05-27 12:15:45.999[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 1 comments from the post https://www.reddit.com/r/socialmedia/comments/1kw14i6/whats_your_post_and_pray_metric_that_actually/[0m
[32m2025-05-27 12:15:48.384[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 78 comments from the post https://www.reddit.com/r/socialmedia/comments/1ktkqa3/are_millennials_done_with_social_media_or_just/[0m
[32m2025-05-27 12:15:49.813[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 6 comments from the post https://www.reddit.com/r/socialmedia/comments/1kvpcl5/is_it_better_to_post_daily_or_focus_on/[0m
Scraping posts from r/socialmedia: 100%|██████████| 3/3 [00:05<00:00,  1.89s/it]
Scraping comments from posts:  67%|██████▋   | 6/9 [00:52<00:24,  8.02s/it]

../data/askatherapist ../data/askatherapist\subreddit.json


[32m2025-05-27 12:15:51.039[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 14 comments from the post https://www.reddit.com/r/askatherapist/comments/1kvy7ve/is_longterm_therapy_always_the_clients_decision/[0m
[32m2025-05-27 12:15:52.558[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 17 comments from the post https://www.reddit.com/r/askatherapist/comments/1koccc5/is_it_weird_that_im_fascinated_by_very_basic/[0m
[32m2025-05-27 12:15:53.716[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 6 comments from the post https://www.reddit.com/r/askatherapist/comments/1kvy08u/can_i_still_see_my_therapist_when_i_move_for/[0m
Scraping posts from r/askatherapist: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it]
Scraping comments from posts:  78%|███████▊  | 7/9 [00:56<00:13,  6.68s/it]

../data/productivity ../data/productivity\subreddit.json


[32m2025-05-27 12:15:54.784[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 11 comments from the post https://www.reddit.com/r/productivity/comments/1kvw3iq/whats_something_that_used_to_stress_you_out/[0m
[32m2025-05-27 12:15:56.334[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 50 comments from the post https://www.reddit.com/r/productivity/comments/1krinsq/fake_commuting_helps_me_work/[0m
[32m2025-05-27 12:15:57.770[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 31 comments from the post https://www.reddit.com/r/productivity/comments/1kvwgmn/whats_the_one_productivity_hack_everyone_talks/[0m
Scraping posts from r/productivity: 100%|██████████| 3/3 [00:04<00:00,  1.35s/it]
Scraping comments from posts:  89%|████████▉ | 8/9 [01:00<00:05,  5.84s/it]

../data/Accounting ../data/Accounting\subreddit.json


[32m2025-05-27 12:15:59.608[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 47 comments from the post https://www.reddit.com/r/Accounting/comments/1kw3drg/thank_you_accounting/[0m
[32m2025-05-27 12:16:04.212[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 137 comments from the post https://www.reddit.com/r/Accounting/comments/1kamidy/antiwfh_people_are_the_laziest_employees_weve_got/[0m
[32m2025-05-27 12:16:05.845[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrape_post[0m:[36m94[0m - [32m[1mscraped 24 comments from the post https://www.reddit.com/r/Accounting/comments/1kvvhm6/i_enjoyed_business_school_but_never_want_to_work/[0m
Scraping posts from r/Accounting: 100%|██████████| 3/3 [00:08<00:00,  2.69s/it]
Scraping comments from posts: 100%|██████████| 9/9 [01:08<00:00,  7.66s/it]
