In [4]:
"""
Scraper Rakuten – recherche « jeu video »
Récupère : image_url, titre, console
⚠︎  Pensez à vérifier les CGU et le robots.txt de Rakuten avant un usage en production.
"""

import csv
import re
import time
from pathlib import Path
from urllib.parse import urljoin, urlencode

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://fr.shopping.rakuten.com/search/jeu+video"
# 30 pages max au 02‑07‑2025, ajustez si besoin
NB_PAGES  = 30
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/126.0.0.0 Safari/537.36",
    "Accept-Language": "fr-FR,fr;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp",
    "Referer": "https://www.google.com/",
    "DNT": "1",  # Do Not Track
}


# liste (non exhaustive) de plateformes pour détecter la console
CONSOLES = [
    # PlayStation
    "PS5", "PS4", "PS3", "PS2", "PS1", "PS Vita", "PSP",
    # Xbox
    "Xbox Series", "Xbox One", "Xbox 360", "Xbox",
    # Nintendo
    "Switch 2", "Switch", "Wii U", "Wii", "GameCube",
    "Nintendo 3DS", "Nintendo 2DS", "Nintendo DS",
    # Autres
    "PC", "MAC",
]

def normalise_src(src: str) -> str:
    """Complète éventuellement le schéma (//images…) et retire les tailles."""
    if src and src.startswith("//"):
        src = "https:" + src
    # mini‑vignettes Rakuten finissent parfois par &_nopad.jpg – on garde tel quel
    return src

def guess_console(text: str) -> str | None:
    """Renvoie la première console trouvée dans le texte, sinon None."""
    for console in CONSOLES:
        # insensitive + accents supprimés pour piéger « Wii U » / « Wii U »
        if console.lower() in text.lower():
            return console
    return None

def parse_listing(html: str) -> list[dict]:
    soup = BeautifulSoup(html, "lxml")

    data = []
    for img in soup.select("img[alt]"):
        title = img["alt"].strip()
        # on ignore les pictos de l’interface (« Rakuten group », etc.)
        if len(title) < 5 or "Rakuten" in title:
            continue

        console = guess_console(title)
        if not console:
            # Si la console n’est pas dans l’attribut alt, on regarde le
            # texte du conteneur (lien, <figure>, etc.)
            console = guess_console(img.find_parent().get_text(" "))

        if not console:
            # toujours rien ? On passe (sinon risque de bruit)
            continue

        data.append(
            {
                "title":   title,
                "console": console,
                "image":   normalise_src(
                    img.get("src") or img.get("data-src") or img.get("data-lazy") or ""
                ),
            }
        )
    return data

def scrape_pages(n_pages: int = NB_PAGES) -> list[dict]:
    all_rows = []
    session  = requests.Session()
    for page in range(1, n_pages + 1):
        params = {"p": page}            # paramètre de pagination Rakuten
        url    = f"{BASE_URL}?{urlencode(params)}"
        print(f"[+] Fetching page {page} – {url}")
        resp = session.get(url, headers=HEADERS, timeout=20)
        resp.raise_for_status()
        all_rows.extend(parse_listing(resp.text))
        time.sleep(1.5)                 # politesse pour le serveur
    return all_rows

def save_csv(rows: list[dict], dest: str = "rakuten_jeu_video.csv") -> None:
    Path(dest).parent.mkdir(parents=True, exist_ok=True)
    with open(dest, "w", newline="", encoding="utf‑8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "console", "image"])
        writer.writeheader()
        writer.writerows(rows)
    print(f"[✓] {len(rows)} lignes écrites dans {dest}")

if __name__ == "__main__":
    rows = scrape_pages()   # ou scrape_pages(5) pour tester plus vite
    save_csv(rows)


[+] Fetching page 1 – https://fr.shopping.rakuten.com/search/jeu+video?p=1


HTTPError: 403 Client Error: Forbidden for url: https://fr.shopping.rakuten.com/search/jeu+video?p=1

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Configuration
BASE_URL = "https://fr.shopping.rakuten.com/search/jeu+video"
NB_PAGES = 1
WAIT_TIME = 10  # maximum de 10 secondes pour attendre les éléments

# Démarrage navigateur
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

data = []

for page in range(1, NB_PAGES + 1):
    url = f"{BASE_URL}?p={page}"
    print(f"[+] Page {page} – {url}")
    driver.get(url)

    try:
        # Attendre que les produits s'affichent
        WebDriverWait(driver, WAIT_TIME).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "article[data-testid='product-card'] img"))
        )
    except:
        print("[-] Aucun produit trouvé sur cette page.")
        continue

    # Récupérer tous les produits
    products = driver.find_elements(By.CSS_SELECTOR, "article[data-testid='product-card']")
    print(products)
    for product in products:
        try:
            img = product.find_element(By.CSS_SELECTOR, "img")
            title = img.get_attribute("alt")

            # Prendre src, sinon data-frz-src
            src = img.get_attribute("src") or img.get_attribute("data-frz-src")

            if not src:
                src = img.get_attribute("data-src")  # fallback supplémentaire

            # Convertir URL relative en absolue
            if src and src.startswith("/"):
                src = "https://fr.shopping.rakuten.com" + src
                print(src)
            
            if title and src and not "rakuten" in title.lower():
                data.append({
                    "title": title.strip(),
                    "image_url": src.strip()
                })
        except:
            continue


    time.sleep(1)  # petite pause entre les pages

driver.quit()

# Sauvegarde
df = pd.DataFrame(data)
df.drop_duplicates(inplace=True)
df.to_csv("rakuten_jeux_video_images.csv", index=False, encoding="utf-8")
print(f"[✓] {len(df)} images sauvegardées dans 'rakuten_jeux_video_images.csv'")


[+] Page 1 – https://fr.shopping.rakuten.com/search/jeu+video?p=1
[-] Aucun produit trouvé sur cette page.
[+] Page 2 – https://fr.shopping.rakuten.com/search/jeu+video?p=2
[-] Aucun produit trouvé sur cette page.
[+] Page 3 – https://fr.shopping.rakuten.com/search/jeu+video?p=3
[-] Aucun produit trouvé sur cette page.
[+] Page 4 – https://fr.shopping.rakuten.com/search/jeu+video?p=4
[-] Aucun produit trouvé sur cette page.
[+] Page 5 – https://fr.shopping.rakuten.com/search/jeu+video?p=5
[-] Aucun produit trouvé sur cette page.
[✓] 0 images sauvegardées dans 'rakuten_jeux_video_images.csv'


In [7]:
products = driver.find_elements(By.CSS_SELECTOR, "li[data-testid='search-results-item']")

for product in products:
    try:
        img = product.find_element(By.CSS_SELECTOR, "img")
        title = img.get_attribute("alt")
        src = img.get_attribute("src") or img.get_attribute("data-src")

        if title and src and not "rakuten" in title.lower():
            data.append({
                "title": title.strip(),
                "image_url": src.strip()
            })
    except:
        continue


MaxRetryError: HTTPConnectionPool(host='localhost', port=56119): Max retries exceeded with url: /session/d85a464b6c6ea4e73c149f296d22a8e7/elements (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001C416421290>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))