In [6]:
import requests

# Page to download.
url = "https://codeavecjonathan.com/scraping/techsport/"

# Plain GET with a 20 s timeout; raise_for_status() raises requests.HTTPError
# on any 4xx/5xx reply, so nothing is written when the request is refused.
response = requests.get(url, timeout=20)
response.raise_for_status()

# Persist the raw HTML as UTF-8.
with open("techsport.html", mode="w", encoding="utf-8") as html_file:
    html_file.write(response.text)

print("✅ Code HTML enregistré sous techsport.html")


HTTPError: 403 Client Error: Forbidden for url: https://codeavecjonathan.com/scraping/techsport/

In [2]:
import requests

# Page to download.
url = "https://codeavecjonathan.com/scraping/techsport/"

# Send a desktop-Chrome User-Agent so the request looks like it comes from a
# regular browser rather than from the requests library.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.0.0 Safari/537.36"
    )
}

# GET with the custom header; raise_for_status() raises requests.HTTPError on
# any 4xx/5xx reply, so nothing is written when the request is refused.
response = requests.get(url, headers=headers, timeout=20)
response.raise_for_status()

# Persist the raw HTML as UTF-8.
with open("techsport_useragent.html", mode="w", encoding="utf-8") as html_file:
    html_file.write(response.text)

print("✅ Code HTML enregistré sous techsport_useragent.html")


HTTPError: 403 Client Error: Forbidden for url: https://codeavecjonathan.com/scraping/techsport/

In [8]:
# save_techsport_html.py
import time, random
import requests
from urllib.parse import urlparse

# Page to download and the local file the raw HTML is written to.
URL = "https://codeavecjonathan.com/scraping/techsport/"
OUT = "techsport.html"

# Desktop browser User-Agent strings to choose from.
UA_POOL = [
    # Chrome on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    # Firefox on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:126.0) Gecko/20100101 Firefox/126.0",
]

# Browser-like headers shared by every request.
BASE_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "DNT": "1",
    "Connection": "keep-alive",
    # "br" is deliberately NOT advertised: requests/urllib3 only decode Brotli
    # when the optional `brotli`/`brotlicffi` package is installed. Without it,
    # a Brotli-compressed reply would come back as undecoded bytes and be saved
    # as if it were valid HTML.
    "Accept-Encoding": "gzip, deflate",
    # Some WAFs inspect these fields (even though they are very Chrome-specific):
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Dest": "document",
}

def build_headers(url: str) -> dict:
    """Return the request headers to use for one attempt at ``url``.

    The result is a fresh dict made of ``BASE_HEADERS`` plus a ``Referer``
    built from the URL's scheme and network location, and a ``User-Agent``
    drawn at random from ``UA_POOL``.
    """
    parts = urlparse(url)
    return {
        **BASE_HEADERS,
        "Referer": f"{parts.scheme}://{parts.netloc}/",
        "User-Agent": random.choice(UA_POOL),
    }

def try_requests(url: str, out: str, tries: int = 3, timeout: int = 20) -> bool:
    """Download ``url`` with requests and save its HTML to ``out``.

    Up to ``tries`` attempts are made through one shared session, each with
    freshly built headers. An attempt counts as successful only when the
    server answers 200 with a non-blank body; any other status or a
    ``requests.RequestException`` is reported and retried.

    Args:
        url: Page to download.
        out: Path of the file the HTML is written to (UTF-8).
        tries: Maximum number of attempts.
        timeout: Per-request timeout in seconds, passed to requests.

    Returns:
        True if the page was saved, False if every attempt failed.
    """
    # The context manager closes the session and its pooled connections on
    # every exit path (success, exhaustion, or an unexpected exception).
    with requests.Session() as s:
        for i in range(1, tries + 1):
            headers = build_headers(url)
            try:
                resp = s.get(url, headers=headers, timeout=timeout, allow_redirects=True)
                if resp.status_code == 200:
                    # When the server sends no charset, requests assumes
                    # ISO-8859-1 for text/* bodies, which garbles accented
                    # characters; fall back to the encoding detected from
                    # the body instead.
                    if "charset" not in resp.headers.get("Content-Type", "").lower():
                        resp.encoding = resp.apparent_encoding
                    html = resp.text
                    if html.strip():
                        with open(out, "w", encoding="utf-8") as f:
                            f.write(html)
                        print(f"✅ HTML enregistré via requests → {out}")
                        return True
                # Some servers answer 403/429 (or an empty 200): retry.
                print(f"⚠️  Tentative {i}/{tries} : statut {resp.status_code}")
            except requests.RequestException as e:
                print(f"⚠️  Tentative {i}/{tries} : {e}")
            # Gentle linear backoff, skipped after the last attempt since
            # there is nothing left to wait for.
            if i < tries:
                time.sleep(1.2 * i)
    return False

def fallback_playwright(url: str, out: str) -> bool:
    """
    Real-browser fallback (headless Chromium driven by Playwright).

    Requires: pip install playwright && playwright install chromium

    Args:
        url: Page to load.
        out: Path of the file the rendered HTML is written to (UTF-8).

    Returns:
        True if the page was loaded and saved; False if Playwright is not
        installed or if launching the browser / loading the page failed.
    """
    try:
        from playwright.sync_api import Error as PlaywrightError, sync_playwright
    except ImportError:
        print("❌ Playwright non installé. Installez-le: pip install playwright && playwright install chromium")
        return False

    ua = random.choice(UA_POOL)
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page(user_agent=ua)
                # Wait until the network is idle so the final HTML is captured.
                page.goto(url, wait_until="networkidle", timeout=30000)
                html = page.content()
            finally:
                # Close the browser even when navigation fails or times out.
                browser.close()
    except PlaywrightError as e:
        # Covers a missing Chromium binary, navigation errors and timeouts
        # (Playwright's TimeoutError derives from Error). Report a failure
        # through the return value, as the ImportError path does.
        print(f"❌ Échec Playwright : {e}")
        return False

    # NOTE(review): the HTTP status of the navigation is not checked, so an
    # error page returned by the server would still be saved — confirm whether
    # that is acceptable.
    with open(out, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"✅ HTML enregistré via Playwright → {out}")
    return True

# Entry point: try the lightweight requests path first and switch to the
# headless browser only when it fails; abort when both paths fail.
if __name__ == "__main__" and not try_requests(URL, OUT):
    print("↩️  Bascule sur Playwright…")
    if not fallback_playwright(URL, OUT):
        raise SystemExit("Échec: 403 persistant et Playwright indisponible.")



✅ HTML enregistré via requests → techsport.html
