# Import

In [7]:
import csv
import time
import random

import requests
from bs4 import BeautifulSoup

# Constantes + Variables

In [8]:
# Root of the site being scraped and the template for its paginated listing
# (the page number is filled in with str.format).
BASE = "http://quotes.toscrape.com"
URL_PAGE = f"{BASE}/page/{{}}/"

# One shared HTTP session that sends a browser-like User-Agent on every request.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0"

# Shared state filled in by the scraping cells:
#   quotes        -> list of {"text", "author", "tags"} records
#   authors_index -> author name mapped to that author's page URL
#   page          -> number of the next listing page to fetch
quotes, authors_index = [], {}
page = 1

# Citations

In [None]:


# Walk the paginated listing, starting from the current value of `page`.
while True:

    # Fetch the next listing page; any non-200 answer ends the crawl.
    response = session.get(URL_PAGE.format(page), timeout=10)
    if response.status_code != 200:
        # Report where and why the crawl stopped, so that a run cut short by
        # a server error is not mistaken for a normal end of pagination.
        print(f"Stopped at page {page}: HTTP {response.status_code}")
        break

    # Parse the page and collect its quote blocks; a page that contains none
    # also ends the crawl.
    soup = BeautifulSoup(response.text, "html.parser")
    blocks = soup.select("div.quote")
    if not blocks:
        break

    # Record every quote (text + author + tags) and, along the way, remember
    # each author's page URL for the author-details step.
    for b in blocks:

        # Quote fields; the tags are flattened into one space-separated string.
        text = b.select_one("span.text").get_text(strip=True)
        author = b.select_one("small.author").get_text(strip=True)
        tags = " ".join([t.get_text(strip=True) for t in b.select("a.tag")])

        # Author link: the href is site-relative, so prefix it with BASE.
        author_rel = b.select_one("span > a[href*='/author/']")["href"]
        author_url = BASE + author_rel

        quotes.append({"text": text, "author": author, "tags": tags})
        # setdefault keeps the first URL seen for an author across all quotes.
        authors_index.setdefault(author, author_url)

    page += 1
    # Short randomised pause between page requests.
    time.sleep(random.uniform(0.1, 0.2))

In [10]:
# Display the author name -> author page URL mapping built by the scraping loop.
authors_index

{'Albert Einstein': 'http://quotes.toscrape.com/author/Albert-Einstein',
 'J.K. Rowling': 'http://quotes.toscrape.com/author/J-K-Rowling',
 'Jane Austen': 'http://quotes.toscrape.com/author/Jane-Austen',
 'Marilyn Monroe': 'http://quotes.toscrape.com/author/Marilyn-Monroe',
 'André Gide': 'http://quotes.toscrape.com/author/Andre-Gide',
 'Thomas A. Edison': 'http://quotes.toscrape.com/author/Thomas-A-Edison',
 'Eleanor Roosevelt': 'http://quotes.toscrape.com/author/Eleanor-Roosevelt',
 'Steve Martin': 'http://quotes.toscrape.com/author/Steve-Martin',
 'Bob Marley': 'http://quotes.toscrape.com/author/Bob-Marley',
 'Dr. Seuss': 'http://quotes.toscrape.com/author/Dr-Seuss',
 'Douglas Adams': 'http://quotes.toscrape.com/author/Douglas-Adams',
 'Elie Wiesel': 'http://quotes.toscrape.com/author/Elie-Wiesel',
 'Friedrich Nietzsche': 'http://quotes.toscrape.com/author/Friedrich-Nietzsche',
 'Mark Twain': 'http://quotes.toscrape.com/author/Mark-Twain',
 'Allen Saunders': 'http://quotes.toscrape.

# Auteurs

In [None]:
# Author details: one record per unique author found while scraping quotes.
authors = []

# Fetch each author's page and extract birth date, birth place and biography.
for name, url in authors_index.items():
    response = session.get(url, timeout=10)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        born_date = soup.select_one(".author-born-date")
        born_loc = soup.select_one(".author-born-location")
        description = soup.select_one(".author-description")
    else:
        # Do not parse an error page as if it were an author page: report the
        # failure and keep the author in the output with empty fields.
        print(f"Author page failed for {name}: HTTP {response.status_code} ({url})")
        born_date = born_loc = description = None

    # Missing elements become empty strings. get_text(strip=True) already
    # trims the text; the location additionally loses its leading "in ".
    authors.append({
        "name": name,
        "born_date": born_date.get_text(strip=True) if born_date else "",
        "born_location": born_loc.get_text(strip=True).removeprefix("in ").strip() if born_loc else "",
        "bio": description.get_text(strip=True) if description else ""
    })

    # Short randomised pause between author requests.
    time.sleep(random.uniform(0.1, 0.2))

# Persistance des données

In [None]:
def _dump_csv(path, columns, rows):
    """Write `rows` (a list of dicts) to `path` as UTF-8 CSV, header row first."""
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)


# Persist both datasets, quotes first and then authors.
_dump_csv("quotes.csv", ["text", "author", "tags"], quotes)
_dump_csv("authors.csv", ["name", "born_date", "born_location", "bio"], authors)