In [3]:
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [14]:
# Target URL (product listing page)
url = "https://web-scraping.dev/products"

# Request headers: send a browser-like User-Agent with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# requests never times out on its own; without an explicit timeout a stalled
# server would block this cell (and the kernel) indefinitely.
response = requests.get(url, headers=headers, timeout=10)


In [76]:
# Check that the crawler is allowed to fetch a page according to robots.txt
def can_crawl(url, user_agent="*"):
    """Return whether the host's robots.txt allows ``user_agent`` to fetch ``url``.

    Args:
        url: Absolute URL of the page about to be crawled.
        user_agent: Agent name matched against the ``User-agent`` sections of
            robots.txt. Defaults to ``"*"`` (the generic rules).

    Returns:
        ``True`` when the page may be fetched, ``False`` when a rule that
        applies to ``user_agent`` disallows the path of ``url``. When
        robots.txt cannot be retrieved with status 200 the function stays
        permissive and returns ``True``.

    Raises:
        requests.RequestException: if the robots.txt request itself fails
            (connection error, timeout, ...).
    """
    # robots.txt lives at the root of the host, not underneath the page path,
    # so resolve "/robots.txt" against the page URL instead of appending it.
    robots_url = urljoin(url, "/robots.txt")
    response = requests.get(robots_url, timeout=10)
    if response.status_code != 200:
        return True

    # Evaluate the rules per path: a plain substring search for "Disallow: /"
    # would also match partial rules such as "Disallow: /admin" and wrongly
    # forbid the whole site.
    parser = RobotFileParser()
    parser.parse(response.text.splitlines())
    return parser.can_fetch(user_agent, url)

# Parse the HTML and extract product information (name, description, links, ...)
_REVIEW_CLASS_PREFIX = "review-"


def _text_or_none(tag):
    """Return the stripped text of ``tag``, or ``None`` when the tag was not found."""
    return tag.text.strip() if tag is not None else None


def _parse_features(element):
    """Map feature label -> feature value for every complete ``tr.feature`` row."""
    features = {}
    for row in element.find_all("tr", class_="feature"):
        label = _text_or_none(row.find("td", class_="feature-label"))
        value = _text_or_none(row.find("td", class_="feature-value"))
        # A row missing either cell is skipped instead of crashing the parse.
        if label is None or value is None:
            continue
        features[label] = value
    return features


def _parse_reviews(element):
    """Build one ``{"id", "rating", "content"}`` dict per ``div.mt-4`` block."""
    reviews = []
    for block in element.find_all("div", class_="mt-4"):
        # The id is carried by a CSS class of the form "review-<id>"; strip
        # only the leading prefix so ids that contain "review-" stay intact.
        review_id = next(
            (
                cls[len(_REVIEW_CLASS_PREFIX):]
                for cls in block.get("class", [])
                if cls.startswith(_REVIEW_CLASS_PREFIX)
            ),
            None,
        )
        paragraph = block.find("p")
        reviews.append({
            "id": review_id,
            # Rating is the number of <svg> elements inside the block
            # (presumably one icon per star -- verify against the page markup).
            "rating": len(block.find_all("svg")),
            "content": _text_or_none(paragraph),
        })
    return reviews


def parse_html(soup, page_url=None):
    """Extract product data from a parsed page.

    Args:
        soup: Parsed document (a BeautifulSoup object or any tag exposing
            ``find`` / ``find_all``).
        page_url: URL the document was fetched from, stored under ``"url"``.
            When omitted, the notebook-level ``url`` variable is used, which
            matches how the function was called before this parameter existed.

    Returns:
        A list with one dict per ``<body>`` element found, with the keys
        ``url``, ``title``, ``description``, ``links``, ``features`` and
        ``reviews``. ``title`` and ``description`` are ``None`` when the
        corresponding element is absent; anchors without an ``href`` are
        left out of ``links``.
    """
    if page_url is None:
        # Backward-compatible fallback to the global; passing ``page_url``
        # explicitly avoids depending on whatever ``url`` currently holds.
        page_url = globals().get("url")

    products = []
    for element in soup.find_all("body"):
        title_tag = element.find("h3", class_="card-title product-title mb-3")
        description_tag = element.find("p", class_="product-description")
        products.append({
            "url": page_url,
            "title": _text_or_none(title_tag),
            "description": _text_or_none(description_tag),
            "links": [
                anchor.get("href")
                for anchor in element.find_all("a")
                if anchor.get("href") is not None
            ],
            "features": _parse_features(element),
            "reviews": _parse_reviews(element),
        })
    return products

In [None]:
url = "https://web-scraping.dev/product/1"
urls_prioritaire = []
# Links discovered on the page, without duplicates. Defined here so the cell
# runs on a fresh kernel instead of relying on a leftover `urls` variable.
urls = []

# Ask robots.txt first and only download the page when crawling is allowed.
if can_crawl(url):
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, "html.parser")
    print(soup.title)
    products = parse_html(soup)
    list_url = products[0]["links"] if products else []
    # Use a dedicated loop variable: reusing `url` here would overwrite the
    # page URL that the rest of the notebook still refers to.
    for link in list_url:
        if link not in urls:
            urls.append(link)
    print(products)
    print(urls)
else:
    print("Crawling not allowed for this URL.")

<title>web-scraping.dev product Box of Chocolate Candy</title>
[{'url': 'https://web-scraping.dev/product/1', 'title': 'Box of Chocolate Candy', 'description': "Indulge your sweet tooth with our Box of Chocolate Candy. Each box contains an assortment of rich, flavorful chocolates with a smooth, creamy filling. Choose from a variety of flavors including zesty orange and sweet cherry. Whether you're looking for the perfect gift or just want to treat yourself, our Box of Chocolate Candy is sure to satisfy.", 'links': ['https://web-scraping.dev/', '#', '/docs', '/api/graphql', 'https://web-scraping.dev/products', 'https://web-scraping.dev/reviews', 'https://web-scraping.dev/testimonials', 'https://web-scraping.dev/file-download', 'https://web-scraping.dev/login', '/cart', '/', '/products', 'https://web-scraping.dev/product/1?variant=orange-small', 'https://web-scraping.dev/product/1?variant=orange-medium', 'https://web-scraping.dev/product/1?variant=orange-large', 'https://web-scraping.dev