In [1]:
import logging
import os
import threading
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from functools import cache
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from goose3 import Goose
from tqdm import tqdm
from waybackpy import WaybackMachineCDXServerAPI
from waybackpy.exceptions import NoCDXRecordFound

In [2]:
# Default `user_agent` passed to the Wayback Machine CDX client in get_alternate_url.
USER_AGENT = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
# Tag names whose elements clean_html removes from the document by default.
HTML_TAG_IGNORE = ["script", "style", "img", "figure"]
# Attribute names clean_html deletes from every remaining element by default.
HTML_ATTR_IGNORE = ["style"]

# Markdown template for one article; filled from the `title`, `url` and `content` fields.
MD_FORMAT = "# {title}\n\n[{url}]({url})\n\n{content}\n\n"

# Output path templates. `{version}` is substituted later with `.format(version=...)`.
RESULT_DIR = "results/{version}"
RESULT_FILENAME = f"{RESULT_DIR}/results.csv"
RESULT_MD_FILENAME = f"{RESULT_DIR}/results.md"
# The doubled braces survive the f-string as a literal `{url}` placeholder, so this
# template takes both `version` and `url` when formatted.
RESULT_HTML_FILENAME = f"{RESULT_DIR}/html/{{url}}.html"

# URLs of the articles to scrape.
ARTICLES = [
    "https://www.nytimes.com/2024/09/29/us/north-carolina-helene-relief-damage.html",
    "https://www.faz.net/aktuell/wirtschaft/kuenstliche-intelligenz/today-s-ai-can-t-be-trusted-19532136.html",
    "http://www.chinatoday.com.cn/ctenglish/2018/commentaries/202409/t20240925_800378506.html",
    "https://english.elpais.com/economy-and-business/2024-09-28/from-the-hermes-heir-to-nicolas-cage-millionaires-who-went-bankrupt.html",
    "https://insatiable.info/2023/06/30/quels-futur-pour-les-reseaux-sociaux/",
    "https://actu.fr/auvergne-rhone-alpes/lyon_69123/lyon-le-projet-de-reamenagement-des-quais-les-plus-mortels-pour-les-cyclistes-devoile_61667371.html",
]

In [3]:
# Configure the root logger at INFO level so the `logging.info` progress messages
# emitted by the scraping helpers are shown.
logging.basicConfig(level=logging.INFO)

In [4]:
@cache
def get_alternate_url(url: str, *, user_agent: str = USER_AGENT, **kwargs) -> str:
    """Return the Wayback Machine archive URL for ``url``, or ``url`` itself.

    The CDX API is asked for the record returned by ``newest()``; its
    ``archive_url`` is the result. When the API reports no record
    (``NoCDXRecordFound``), the original ``url`` is returned unchanged.

    Results are memoised with ``functools.cache``, so every argument
    (including anything passed through ``kwargs``) must be hashable.

    Args:
        url: Address of the page to look up.
        user_agent: User-Agent string handed to the CDX client.
        **kwargs: Extra keyword arguments forwarded to
            ``WaybackMachineCDXServerAPI``.

    Returns:
        The archive URL when a record exists, otherwise ``url``.
    """
    try:
        newest_record = WaybackMachineCDXServerAPI(
            url=url, user_agent=user_agent, **kwargs
        ).newest()
        archive_url = newest_record.archive_url
    except NoCDXRecordFound:
        # No snapshot available: fall back to the live page.
        logging.info(f"No alternate URL found for {url}. Using original URL.")
        return url

    logging.info(f"Alternate URL for {url} is {archive_url}")
    return archive_url


@cache
def fetch_html(url: str, *, timeout: float = 30.0) -> str:
    """Download ``url`` with an HTTP GET and return the response body as text.

    Results are memoised with ``functools.cache`` (keyed on the arguments), so
    a given URL is only downloaded once per kernel session.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``requests.get`` as ``timeout``. Without a
            timeout a stalled server would block the calling thread forever.

    Returns:
        The decoded response body. Non-success responses are still returned
        (the caller keeps its best-effort behaviour), but a warning is logged
        so that a scraped error page does not go unnoticed.

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            when ``timeout`` is exceeded.
    """
    response = requests.get(url, timeout=timeout)

    if not response.ok:
        logging.warning(
            f"Request for {url} returned HTTP status {response.status_code}"
        )

    return response.text


@cache
def clean_html(
    html: str,
    *,
    html_tag_ignore: list[str] = HTML_TAG_IGNORE,
    html_attr_ignore: list[str] = HTML_ATTR_IGNORE,
) -> BeautifulSoup:
    """Parse ``html`` and strip unwanted elements and attributes.

    Every element whose tag name is in ``html_tag_ignore`` is removed from the
    tree, then each attribute named in ``html_attr_ignore`` is deleted from
    all remaining elements.

    Because the function is memoised with ``functools.cache``:
      * repeated calls with the same arguments return the *same* soup object,
        so callers should treat it as read-only;
      * arguments passed explicitly must be hashable (pass a tuple rather than
        a list when overriding the ignore lists).

    Args:
        html: Raw HTML markup. May be empty.
        html_tag_ignore: Tag names to remove entirely.
        html_attr_ignore: Attribute names to delete from every element.

    Returns:
        The cleaned ``BeautifulSoup`` tree.
    """
    soup = BeautifulSoup(html, "html.parser")

    for el in soup(html_tag_ignore):
        el.decompose()

    for el in soup.find_all(True):
        for attr in html_attr_ignore:
            del el[attr]

    output_html = str(soup)

    if html:
        logging.info(
            f"HTML cleaned. Size reduced to {round(len(output_html) / len(html) * 100, 2)}% of original size"
        )
    else:
        # An empty document has no meaningful size ratio; computing one would
        # divide by zero.
        logging.info("HTML cleaned. Input was empty.")

    return soup


@cache
def extract_data(html: str) -> dict:
    """Extract the article title and main text from ``html`` using Goose.

    Results are memoised with ``functools.cache``; the returned dict is
    therefore shared between calls with the same ``html`` and should be copied
    before being modified.

    Args:
        html: HTML markup of the article page.

    Returns:
        A dict with keys ``title`` and ``content``.
    """
    # Use Goose as a context manager so the extractor is closed as soon as the
    # extraction is done instead of lingering until garbage collection.
    with Goose() as g:
        article = g.extract(raw_html=html)
        title = article.title
        content = article.cleaned_text

    logging.info(f"Data extracted from HTML. Title: {title}")

    return dict(
        title=title,
        content=content,
    )

In [5]:
def scrape_article(url: str) -> dict:
    """Run the full scraping pipeline for one article.

    The page is resolved to its alternate URL, downloaded, cleaned, and passed
    to the extractor.

    Args:
        url: Original address of the article.

    Returns:
        A dict with ``url``, ``alternate_url``, ``html`` (the cleaned soup)
        followed by the fields produced by ``extract_data``.
    """
    source_url = get_alternate_url(url)
    soup = clean_html(fetch_html(source_url))
    extracted_fields = extract_data(str(soup))

    return {
        "url": url,
        "alternate_url": source_url,
        "html": soup,
        **extracted_fields,
    }

In [12]:
# Timestamp of this run (format YYYYmmddHHMMSS); substituted into the result
# path templates so each run writes to its own directory.
version = datetime.now().strftime("%Y%m%d%H%M%S")

# Concrete CSV and Markdown output paths for this run.
result_filename = Path(RESULT_FILENAME.format(version=version))
result_md_filename = Path(RESULT_MD_FILENAME.format(version=version))

# Both result files are built from the same RESULT_DIR template, so creating the
# CSV's parent directory also covers the Markdown file.
result_filename.parent.mkdir(parents=True, exist_ok=True)


# process_article is executed by several worker threads at once; this lock
# serialises the appends to the CSV and Markdown files they all share.
_RESULT_FILES_LOCK = threading.Lock()


def process_article(article_url: str) -> dict:
    """Scrape one article and persist it to the run's result files.

    The article is appended to the shared CSV and Markdown files and its
    cleaned HTML is written to a per-article file under the run's ``html``
    directory.

    Args:
        article_url: Address of the article to scrape.

    Returns:
        The dict produced by ``scrape_article`` with an extra ``id`` key (the
        URL with ``/`` replaced by ``_``).
    """
    logging.info(f"Being scraping {article_url}...")

    article_id = article_url.replace("/", "_")

    article_data = scrape_article(article_url)
    article_data["id"] = article_id

    # The "does the file exist yet?" header check and the two appends must be
    # atomic with respect to other workers; otherwise several threads can each
    # write a header row or interleave their output.
    with _RESULT_FILES_LOCK:
        pd.DataFrame([article_data]).to_csv(
            result_filename,
            mode="a",
            header=not os.path.exists(
                result_filename
            ),  # only write header on the first iteration
            index=False,
        )

        # Explicit UTF-8: article text is multilingual and the platform default
        # encoding may not be able to represent it.
        with open(result_md_filename, "a", encoding="utf-8") as f:
            f.write(MD_FORMAT.format(**article_data))

    # Each article has its own HTML file, so no locking is needed here.
    html_filename = Path(RESULT_HTML_FILENAME.format(version=version, url=article_id))

    html_filename.parent.mkdir(parents=True, exist_ok=True)

    with open(html_filename, "w", encoding="utf-8") as f:
        f.write(article_data["html"].prettify())

    logging.info(f"Scraping of {article_url} done.")

    return article_data


# Scrape all articles concurrently. The tqdm wrapper only advances when it is
# iterated, and executor.map only surfaces a worker's exception when its result
# is retrieved, so the iterator must be consumed: list() drives the progress
# bar and re-raises the first failure instead of silently dropping it.
with ThreadPoolExecutor() as executor:
    articles_data = list(
        tqdm(executor.map(process_article, ARTICLES), total=len(ARTICLES))
    )

INFO:root:Being scraping https://www.nytimes.com/2024/09/29/us/north-carolina-helene-relief-damage.html...
INFO:root:Being scraping https://www.faz.net/aktuell/wirtschaft/kuenstliche-intelligenz/today-s-ai-can-t-be-trusted-19532136.html...
  0%|          | 0/6 [00:00<?, ?it/s]INFO:root:Being scraping http://www.chinatoday.com.cn/ctenglish/2018/commentaries/202409/t20240925_800378506.html...
  0%|          | 0/6 [00:00<?, ?it/s]
INFO:root:Being scraping https://english.elpais.com/economy-and-business/2024-09-28/from-the-hermes-heir-to-nicolas-cage-millionaires-who-went-bankrupt.html...
INFO:root:Being scraping https://insatiable.info/2023/06/30/quels-futur-pour-les-reseaux-sociaux/...
INFO:root:Being scraping https://actu.fr/auvergne-rhone-alpes/lyon_69123/lyon-le-projet-de-reamenagement-des-quais-les-plus-mortels-pour-les-cyclistes-devoile_61667371.html...
INFO:root:Alternate URL for http://www.chinatoday.com.cn/ctenglish/2018/commentaries/202409/t20240925_800378506.html is https://web