### Data collection - Scraping FORUMS.RED posts 

This notebook contains the code used to scrape posts from the FORUMS.RED "What's Hot" page, sorted by newest posts and filtered to the "all-time" time frame.

This approach allowed us to collect data from a topic-restricted, semi-bounded population. It aligns with the logic of trawling, defined as the systematic collection of already-published online content. In practice, this meant scraping six years of historical forum content rather than tracking user behaviour in real time from the start of the project onward.

In [1]:
# Web scraping
import requests
import urllib.parse
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc
from contextlib import contextmanager

# Data handling
import pandas as pd
import re
import os

# Timing and randomness
import time
import datetime
import random

# Progress display
from tqdm import tqdm

In [2]:
import json #For saving intermediate data 
import pandas as pd #for working with structured datasets
import numpy as np #for numerical computations
import datetime  # For handling date/time conversions and computations

In [3]:
# Shared fake_useragent generator; get_driver() reads `ua.random` from it
# each time it builds a new browser.
ua = UserAgent()

# Accept-Language-style strings (locale list with q-weights). get_driver()
# picks one at random per browser it creates.
languages = [
    'en-US,en;q=0.9',
    'en-GB,en;q=0.8',
    'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
    'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
    'es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7',
    'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7',
    'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
]

In [4]:
def get_driver():
    """Build a headless undetected-chromedriver Chrome instance.

    Each call draws a random user agent from the module-level `ua` generator
    and a random entry from the module-level `languages` list, so successive
    browsers do not all present the same user agent and language.

    Returns:
        The `uc.Chrome` driver created with those options. The caller is
        responsible for calling `quit()` on it.
    """
    user_agent = ua.random
    accept_language = random.choice(languages)

    # Entries in `languages` are Accept-Language header strings such as
    # "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7", but Chrome's --lang switch takes
    # a single locale of the form language[-country]. Pass only the primary
    # tag so the flag is well-formed.
    primary_locale = accept_language.split(",", 1)[0].strip()

    options = uc.ChromeOptions()
    options.add_argument(f"--user-agent={user_agent}")
    options.add_argument(f"--lang={primary_locale}")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--headless=chrome")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")

    driver = uc.Chrome(options=options)
    return driver

In [5]:
def scrape_alltime_forum_url(url="https://www.forums.red/i/?sort=2&timeframe=1", delay=5, max_waits=5):
    """Scroll a forum listing to its end and collect every post permalink.

    The page is opened in a browser from `get_driver()` and scrolled to the
    bottom repeatedly. After each scroll the `href` of every anchor whose text
    is exactly "Permalink" is recorded if it has not been seen before.
    Scrolling stops once `max_waits` consecutive scrolls bring neither a new
    link nor more anchors on the page, or as soon as a scroll step raises.

    Progress is written to "forums_alltime_links_partial.csv" every 10 scrolls
    and once more when the function exits (including on error or interrupt).
    If that file exists when the function starts, its links are loaded first
    so an interrupted run can be resumed. The full result is always written to
    "forums_alltime_links_final.csv".

    Note: a resumed run reloads the page from the top and scrolls through the
    already-collected region again before it reaches uncollected posts.

    Args:
        url: Listing page to open.
        delay: Seconds to wait after the initial load, and the base wait after
            each scroll (a random 1-2 s is added per scroll).
        max_waits: Number of consecutive scrolls without progress after which
            scrolling stops.

    Returns:
        pandas.DataFrame with a single "url" column, one row per collected
        permalink, in collection order.
    """
    partial_csv = "forums_alltime_links_partial.csv"
    final_csv = "forums_alltime_links_final.csv"

    post_links = []
    seen_links = set()
    scroll_count = 0

    # Load previously saved links (resume support)
    if os.path.exists(partial_csv):
        df_existing = pd.read_csv(partial_csv)
        # dict.fromkeys drops duplicates while keeping the order in which the
        # links were saved; rebuilding the list from a set would scramble it.
        post_links = list(dict.fromkeys(df_existing["url"].dropna().tolist()))
        seen_links = set(post_links)
        print(f"🔁 Resuming from saved file: {len(post_links)} links already scraped.")

    # Bound before the try so the finally clause can tell whether a browser
    # was actually started.
    driver = None
    try:
        driver = get_driver()
        driver.get(url)
        time.sleep(delay)

        print(f"🔽 Scrolling until end of URL: {url}")

        with tqdm(desc="Scrolling forum", unit="scroll") as pbar:
            prev_anchor_count = 0
            stale_scrolls = 0

            while True:
                try:
                    # Check if browser is still alive
                    _ = driver.title

                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(delay + random.uniform(1.0, 2.0))

                    anchors = driver.find_elements(By.XPATH, "//a[text()='Permalink']")
                    new_links_found = False

                    for a in anchors:
                        href = a.get_attribute("href")
                        if href and href not in seen_links:
                            seen_links.add(href)
                            post_links.append(href)
                            new_links_found = True

                    scroll_count += 1

                    # On a resumed run the first screens contain only links
                    # that are already known. Count growth of the page itself
                    # as progress too, otherwise the loop would give up
                    # before it reaches the uncollected posts.
                    page_grew = len(anchors) > prev_anchor_count
                    prev_anchor_count = len(anchors)

                    if new_links_found or page_grew:
                        stale_scrolls = 0
                    else:
                        stale_scrolls += 1

                    if stale_scrolls >= max_waits:
                        print("⏸️ No new posts loaded for several scrolls — stopping.")
                        break

                    # Autosave every 10 scrolls
                    if scroll_count % 10 == 0:
                        pd.DataFrame(post_links, columns=["url"]).to_csv(partial_csv, index=False)
                        print(f"💾 Autosaved {len(post_links)} links at scroll {scroll_count}")

                    pbar.update(1)

                except Exception as e:
                    print(f"⚠️ Scroll failed: {e}")
                    break

    except Exception as e:
        print(f"❌ Error during scraping: {e}")
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best effort: a failed shutdown must not lose collected links.
                pass
        # Flush the resume file so the scrolls since the last autosave are
        # not lost when the run fails or is interrupted.
        if post_links:
            pd.DataFrame(post_links, columns=["url"]).to_csv(partial_csv, index=False)

    # Final save
    df = pd.DataFrame(post_links, columns=["url"])
    df.to_csv(final_csv, index=False)
    print(f"✅ Final CSV saved: {final_csv} ({len(df)} links)")

    return df

In [None]:
# Run the scraper with its default URL, delay and max_waits. It returns a
# DataFrame with a "url" column and also writes forums_alltime_links_final.csv.
alltime_links = scrape_alltime_forum_url()

In [7]:
# Load the collected permalinks from disk.
# NOTE(review): scrape_alltime_forum_url() writes "forums_alltime_links_final.csv"
# with index=False, but this cell reads "forums_alltime_links.csv" — confirm
# where this file comes from, since no visible cell creates it.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the file was saved with its index — TODO confirm and drop it if unneeded.
data = pd.read_csv('forums_alltime_links.csv')

In [8]:
# Preview the loaded links (pandas truncates the display of long frames).
data

Unnamed: 0,url
0,https://www.forums.red/p/asktrp/324046/girl_di...
1,https://www.forums.red/p/asktrp/324045/anyone_...
2,https://www.forums.red/p/asktrp/324044/plate_a...
3,https://www.forums.red/p/asktrp/324043/what_do...
4,https://www.forums.red/p/asktrp/324041/how_to_...
...,...
18149,https://www.forums.red/p/asktrp/209528/questio...
18150,https://www.forums.red/p/asktrp/209527/thought...
18151,https://www.forums.red/p/asktrp/209526/guys_25...
18152,https://www.forums.red/p/asktrp/209525/how_muc...
