### Data collection (2) - Scraping FORUMS.RED posts and comments

This notebook contains the code used to scrape posts and comments from the FORUMS.RED "What's Hot" page.

This approach allowed us to collect data from a topic-restricted, semi-bounded population. It aligns with the logic of trawling, defined as the systematic collection of already-published online content. In practice, this meant scraping six years of historical forum content rather than tracking user behaviour in real time from the start of the project onward.

In [1]:
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random
import logging
from tqdm import tqdm

# Emit INFO-level log records prefixed with a timestamp.
logging.basicConfig(format="%(asctime)s  %(message)s", level=logging.INFO)

# Desktop browser User-Agent headers (Chrome on Windows, Safari on macOS,
# Chrome on Linux); scrape_posts picks one at random for every request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.199 Safari/537.36",
]

def parse_count(s: str):
    """Convert a human-readable count ('123', '1,234', '1.5K', '2M') to an int.

    Args:
        s: Count text as scraped from the page. Surrounding whitespace and
            letter case are ignored; commas are treated as thousands
            separators and removed.

    Returns:
        The count as an integer, or None when ``s`` is empty/None or cannot
        be parsed as a number.
    """
    if not s:
        return None
    s = s.strip().upper().replace(",", "")
    multipliers = {"K": 1_000, "M": 1_000_000}
    try:
        # s[-1:] (not s[-1]) so an all-whitespace input falls through to
        # int("") and is reported as unparseable instead of raising IndexError.
        if s[-1:] in multipliers:
            # round() rather than int(): products such as 1.005 * 1000 come
            # out as 1004.9999999999999 and would be truncated to 1004.
            return round(float(s[:-1]) * multipliers[s[-1]])
        return int(s)
    except (ValueError, OverflowError):
        # OverflowError covers inputs like 'infK', which float() accepts but
        # which have no integer value.
        return None

def _parse_thread(soup, url):
    """Build the row dicts for one parsed thread page: main post first, then comments.

    Args:
        soup: BeautifulSoup document of the thread page.
        url: URL the page was fetched from (stored as ``post_url``).

    Returns:
        A list of dicts sharing the same keys; ``is_post`` is 1 for the
        main post and 0 for each comment.

    Raises:
        TypeError / AttributeError / KeyError: when a required element of the
            main post (id input, title, body text, author link) or a
            comment's ``id`` attribute is missing from the page.
    """
    # --- MAIN POST METADATA ---
    post_id    = soup.find("input", id="vote_post_id")["value"]
    post_title = soup.select_one(".page-header-title.post-title").get_text(strip=True)
    post_text  = soup.find("span", id=f"posttext-{post_id}").get_text(" ", strip=True)

    # Flair: stays "Uncategorized" (-1) unless the header links to a
    # non-negative flair number.
    flair_nr, flair_cat = -1, "Uncategorized"
    flair_link = soup.select_one("h4.forumheader.panel-title b a[href*='flair=']")
    if flair_link:
        m_flair = re.search(r"flair=(-?\d+)", flair_link["href"])
        if m_flair and int(m_flair.group(1)) >= 0:
            flair_nr  = int(m_flair.group(1))
            flair_cat = flair_link.text.strip()

    # OP's username
    op_user = soup.select_one(
        "div.widget-post-author-info .postheader h4 a"
    ).text.strip()

    # OP's relative timestamp, e.g. the "... ago" part after "Posted"
    p_elem    = soup.select_one(
        "div.widget-post-author-info .postheader p.text-muted.m-0.f-s-11.postlinks"
    )
    raw_ptime = p_elem.get_text(" ", strip=True) if p_elem else ""
    m_time    = re.search(r"Posted\s+(.*?ago)", raw_ptime)
    post_time = m_time.group(1) if m_time else None

    # OP's views: None when the header paragraph or the "<n> Views" text is absent
    v_elem     = soup.select_one("div.widget-post-author-info .postheader p.text-muted")
    m_views    = re.search(r"(\d+\.?\d*[KM]?)\s+Views", v_elem.text) if v_elem else None
    post_views = parse_count(m_views.group(1)) if m_views else None

    # OP's score
    s_tag      = soup.select_one(f"span#score_1_{post_id}")
    post_score = parse_count(s_tag.text) if s_tag else None

    # 1) main post
    rows = [{
        "post_id": post_id,
        "comment_id": None,
        "post_title": post_title,
        "post_url": url,
        "is_post": 1,
        "text": post_text,
        "flair_category": flair_cat,
        "flair_nr": flair_nr,
        "username": op_user,
        "time_since": post_time,
        "post_views": post_views,
        "score": post_score
    }]

    # 2) comments
    comments = soup.select("#commentcontainer .singlecomment")
    for single in tqdm(comments, desc="Comments", leave=False, unit="c"):
        ctop = single.select_one(".commenttop")
        if not ctop:
            continue
        cid = ctop["id"].replace("ctop-", "").strip()

        user_tag = single.select_one(".comment-info a")
        user     = user_tag.text.strip() if user_tag else None

        time_tag = single.select_one(".comment-info .text-muted a")
        tme      = time_tag.text.strip() if time_tag else None

        vote_tag = single.select_one(".novote")
        scr      = parse_count(vote_tag.text) if vote_tag else None

        # NOTE(review): find_next() searches everything after `single` in the
        # document, not only its descendants; if a comment had no body div this
        # would pick up the next comment's text - confirm against the page markup.
        comment_div = single.find_next("div", class_="thecommentinquestion")
        if not comment_div:
            continue
        # Drop quoted text so only the commenter's own words are kept.
        cleaned   = re.sub(r"<blockquote>.*?</blockquote>", "", str(comment_div), flags=re.S)
        full_text = BeautifulSoup(cleaned, "html.parser").get_text(" ", strip=True)
        # NOTE(review): removes one trailing digit, presumably a counter rendered
        # inside the comment div; this also clips comments that genuinely end in
        # a digit and leaves multi-digit counters partly in place - confirm.
        text      = full_text[:-1] if full_text and full_text[-1].isdigit() else full_text

        rows.append({
            "post_id": post_id,
            "comment_id": cid,
            "post_title": post_title,
            "post_url": url,
            "is_post": 0,
            "text": text,
            "flair_category": flair_cat,
            "flair_nr": flair_nr,
            "username": user,
            "time_since": tme,
            "post_views": post_views,
            "score": scr
        })

    return rows


def scrape_posts(urls, delay=(3, 7), timeout=30):
    """Scrape each thread URL into one row per main post and per comment.

    A thread that cannot be fetched (network error, timeout, HTTP error
    status) or whose main post cannot be parsed is logged and skipped, so one
    bad page does not discard the rows already collected.

    Args:
        urls: Iterable of thread URLs.
        delay: ``(min, max)`` seconds; a uniformly random pause in this range
            follows every URL, including skipped ones.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot hang the run.

    Returns:
        pandas.DataFrame with columns post_id, comment_id, post_title,
        post_url, is_post, text, flair_category, flair_nr, username,
        time_since, post_views and score. It has no columns when no thread
        was scraped.
    """
    all_rows = []
    failed = []
    for url in tqdm(urls, desc="Threads", unit="thread"):
        headers = {"User-Agent": random.choice(USER_AGENTS)}

        logging.info(f"Fetching {url}")
        try:
            r = rq.get(url, headers=headers, timeout=timeout)
            # Fail here on 4xx/5xx instead of parsing an error page.
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")
            all_rows.extend(_parse_thread(soup, url))
        except (rq.RequestException, AttributeError, TypeError, KeyError) as exc:
            logging.warning(f"Skipping {url}: {exc!r}")
            failed.append(url)

        wait = random.uniform(*delay)
        logging.info(f"Sleeping for {wait:.1f}s")
        time.sleep(wait)

    if failed:
        logging.warning(f"{len(failed)} thread(s) could not be scraped: {failed}")

    return pd.DataFrame(all_rows)


In [5]:
# Widen pandas' display limits for inspecting scraped text: cell contents are
# cut off after 200 characters, and up to 500 rows / 20 columns are shown.
for option_name, limit in (
    ("display.max_colwidth", 200),
    ("display.max_rows", 500),
    ("display.max_columns", 20),
):
    pd.set_option(option_name, limit)

In [None]:
# Load the table of thread links to scrape; its 'url' column is sliced into
# batches in the next cell. The '.csv' extension belongs inside the file name.
all_links = pd.read_csv('forums_alltime_links.csv')

In [27]:
# Split rows 1500-2676 of the 'url' column into batches: four of 250 URLs
# each, plus a final batch holding the remaining 177.
url_column = all_links['url']
links1, links2, links3, links4 = (
    url_column[start:start + 250] for start in range(1500, 2500, 250)
)
links5 = url_column[2500:2677]

In [37]:
# Scrape every thread URL in `links5`, pausing a random 5-10 s after each one;
# the resulting frame holds one row per main post and per comment.
# NOTE(review): only the fifth batch is scraped here; links1-links4 were
# presumably run by editing this cell's argument - confirm.
links_df = scrape_posts(links5, delay=(5, 10))

Threads:   0%|                                      | 0/177 [00:00<?, ?thread/s]2025-05-07 01:48:59,693  Fetching https://www.forums.red/p/asktrp/168860/still_seeking_treatment_for_cure_to_oneitis

Comments:   0%|                                            | 0/5 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 01:49:00,133  Sleeping for 6.7s
Threads:   1%|▏                             | 1/177 [00:07<20:50,  7.10s/thread]2025-05-07 01:49:06,798  Fetching https://www.forums.red/p/asktrp/211323/gaming_while_dealing_with_a_dead_dick

Comments:   0%|                                           | 0/22 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 01:49:07,368  Sleeping for 6.1s
Threads:   1%|▎                             | 2/177 [00:13<20:01,  6.87s/thread]2025-05-07 01:49:13,499  Fetching https://www.forums.red/p/asktrp/125524/becoming_a_man

Comments:   0%|      

Threads:  11%|███▎                         | 20/177 [02:34<22:57,  8.77s/thread]2025-05-07 01:51:34,555  Fetching https://www.forums.red/p/asktrp/161436/considering_moving_to_upper_west_side_nyc_19m_anybody_here_k

Comments:   0%|                                           | 0/10 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 01:51:35,054  Sleeping for 7.5s
Threads:  12%|███▍                         | 21/177 [02:42<22:12,  8.54s/thread]2025-05-07 01:51:42,557  Fetching https://www.forums.red/p/theredpill/70632/mid_40s_woman_hot_and_flaky_what_is_the_story_behind_her_beh

Comments:   0%|                                           | 0/41 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 01:51:43,284  Sleeping for 9.1s
Threads:  12%|███▌                         | 22/177 [02:52<23:06,  8.94s/thread]2025-05-07 01:51:52,438  Fetching https://www.forums.red/p/asktrp/1

                                                                                [A2025-05-07 01:54:23,277  Sleeping for 8.2s
Threads:  23%|██████▌                      | 40/177 [05:31<19:40,  8.61s/thread]2025-05-07 01:54:31,521  Fetching https://www.forums.red/p/asktrp/136973/need_some_career_guidance

Comments:   0%|                                            | 0/9 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 01:54:32,014  Sleeping for 7.5s
Threads:  23%|██████▋                      | 41/177 [05:39<19:07,  8.44s/thread]2025-05-07 01:54:39,554  Fetching https://www.forums.red/p/theredpill/51346/succeeding_in_the_sexual_market_place_of_nyc

Comments:   0%|                                           | 0/22 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 01:54:40,138  Sleeping for 7.5s
Threads:  24%|██████▉                      | 42/177 [05:47<18:43,  8.32s

Comments:   0%|                                           | 0/12 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 01:57:03,924  Sleeping for 5.8s
Threads:  34%|█████████▊                   | 60/177 [08:10<14:25,  7.40s/thread]2025-05-07 01:57:09,736  Fetching https://www.forums.red/p/asktrp/167866/ltr_started_using_tinder_while_i_m_present_trying_to_figure

Comments:   0%|                                           | 0/25 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 01:57:10,334  Sleeping for 9.0s
Threads:  34%|█████████▉                   | 61/177 [08:19<15:36,  8.07s/thread]2025-05-07 01:57:19,370  Fetching https://www.forums.red/p/redpillwomen/300726/past_betrayal_causing_internal_conflict

Comments:   0%|                                           | 0/13 [00:00<?, ?c/s][A
                                                                                [


Comments:   0%|                                            | 0/1 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 01:59:47,098  Sleeping for 8.7s
Threads:  45%|█████████████                | 80/177 [10:56<14:25,  8.92s/thread]2025-05-07 01:59:55,824  Fetching https://www.forums.red/p/asktrp/128077/worst_oneitis_of_my_life_lost_and_depressed

Comments:   0%|                                           | 0/13 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 01:59:56,394  Sleeping for 9.1s
Threads:  46%|█████████████▎               | 81/177 [11:05<14:37,  9.14s/thread]2025-05-07 02:00:05,488  Fetching https://www.forums.red/p/theredpill/45225/top_post_on_r_askreddit_what_can_you_do_that_is_not_appearan

Comments:   0%|                                           | 0/51 [00:00<?, ?c/s][A
                                                                               

                                                                                [A2025-05-07 02:02:30,699  Sleeping for 5.4s
Threads:  56%|███████████████▊            | 100/177 [13:36<09:23,  7.32s/thread]2025-05-07 02:02:36,109  Fetching https://www.forums.red/p/asktrp/168979/where_in_the_us_can_i_live_that_is_the_least_feminist

Comments:   0%|                                           | 0/48 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 02:02:36,722  Sleeping for 9.4s
Threads:  57%|███████████████▉            | 101/177 [13:46<10:16,  8.12s/thread]2025-05-07 02:02:46,094  Fetching https://www.forums.red/p/theredpill/23416/men_that_graduate_college_are_becoming_a_rare_commodity

Comments:   0%|                                           | 0/62 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 02:02:46,756  Sleeping for 5.4s
Threads:  58%|████████████████▏  

                                                                                [A2025-05-07 02:04:58,810  Sleeping for 10.0s
Threads:  67%|██████████████████▊         | 119/177 [16:09<08:37,  8.92s/thread]2025-05-07 02:05:08,780  Fetching https://www.forums.red/p/asktrp/144819/do_people_here_listen_to_hip_hop_it_seems_like_the_best_type

Comments:   0%|                                           | 0/13 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 02:05:09,276  Sleeping for 6.9s
Threads:  68%|██████████████████▉         | 120/177 [16:16<08:03,  8.48s/thread]2025-05-07 02:05:16,208  Fetching https://www.forums.red/p/redpillwomen/300648/question_help_married_to_rpm

Comments:   0%|                                           | 0/60 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 02:05:16,843  Sleeping for 5.4s
Threads:  68%|███████████████████▏        | 121/1

                                                                                [A2025-05-07 02:07:31,767  Sleeping for 5.4s
Threads:  79%|█████████████████████▉      | 139/177 [18:37<04:38,  7.33s/thread]2025-05-07 02:07:37,188  Fetching https://www.forums.red/p/asktrp/4889/do_you_see_any_way_to_simultaneously_be_able_to_live_a_commi

Comments:   0%|                                          | 0/153 [00:00<?, ?c/s][A
Comments:  98%|█████████████████████████████▍| 150/153 [00:00<00:00, 1492.03c/s][A
                                                                                [A2025-05-07 02:07:38,231  Sleeping for 6.4s
Threads:  79%|██████████████████████▏     | 140/177 [18:44<04:32,  7.35s/thread]2025-05-07 02:07:44,596  Fetching https://www.forums.red/p/theredpill/69455/guy_bought_a_lease_for_his_gf_now_has_to_sell_the_car_none_o

Comments:   0%|                                           | 0/10 [00:00<?, ?c/s][A
                                                                 


Comments:   0%|                                            | 0/2 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 02:10:08,461  Sleeping for 7.5s
Threads:  90%|█████████████████████████▏  | 159/177 [21:16<02:20,  7.83s/thread]2025-05-07 02:10:16,012  Fetching https://www.forums.red/p/redpillwomen/300398/how_many_of_you_make_lunch_for_your_husband_boyfriend_partne

Comments:   0%|                                           | 0/52 [00:00<?, ?c/s][A
                                                                                [A2025-05-07 02:10:16,646  Sleeping for 7.3s
Threads:  90%|█████████████████████████▎  | 160/177 [21:24<02:13,  7.87s/thread]2025-05-07 02:10:23,988  Fetching https://www.forums.red/p/theredpill/41078/divorce_rape_is_for_life_not_just_for_christmas

Comments:   0%|                                           | 0/53 [00:00<?, ?c/s][A
                                                                     

In [38]:
# Save this batch's scraped posts and comments; index=False keeps the
# DataFrame's row index out of the CSV.
links_df.to_csv("links-5.csv", index=False)