In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
from urllib.parse import urljoin
import json

In [2]:
 # headers
# Default HTTP request headers: identify the crawler as a desktop Chrome browser.
headers = dict()
headers["User-Agent"] = " ".join(
    [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    ]
)

In [3]:
# Check that the crawler is allowed to fetch a page, according to robots.txt
def can_crawl(url, user_agent="*"):
    """Return True when the site's robots.txt allows ``user_agent`` to fetch ``url``.

    Parameters
    ----------
    url : str
        Absolute URL of the page about to be crawled.
    user_agent : str, optional
        User-agent token whose rules are evaluated. The default ``"*"``
        selects the rules that apply to every crawler.

    Returns
    -------
    bool
        False when a rule in robots.txt disallows ``url`` for ``user_agent``.
        True otherwise, including when robots.txt is not served with
        HTTP status 200.

    Raises
    ------
    requests.RequestException
        Propagated unchanged when robots.txt cannot be fetched
        (connection error, timeout, ...).
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from urllib.robotparser import RobotFileParser

    # robots.txt lives at the site root, not underneath the page path, so it
    # must be resolved against the URL rather than appended to it.
    robots_url = urljoin(url, "/robots.txt")
    response = requests.get(robots_url, timeout=10)
    if response.status_code != 200:
        return True

    # Evaluate the actual rules instead of searching for the substring
    # "Disallow: /", which also matches every path-specific rule such as
    # "Disallow: /admin" and would block the whole site.
    parser = RobotFileParser()
    parser.parse(response.text.splitlines())
    return parser.can_fetch(user_agent, url)

# Parse the HTML and extract the product information
# (title, description, features, links, reviews)
def _node_text(node, default=""):
    """Return the stripped text of ``node``, or ``default`` when ``node`` is None."""
    return node.text.strip() if node is not None else default


def _parse_features(element):
    """Map feature labels to values for every ``<tr class="feature">`` row."""
    features = {}
    for row in element.find_all("tr", class_="feature"):
        label = row.find("td", class_="feature-label")
        value = row.find("td", class_="feature-value")
        # Skip incomplete rows instead of failing on a missing cell.
        if label is None or value is None:
            continue
        features[label.text.strip()] = value.text.strip()
    return features


def _parse_reviews(element):
    """Build one ``{"id", "rating", "content"}`` dict per ``<div class="mt-4">``."""
    prefix = "review-"
    reviews = []
    for block in element.find_all("div", class_="mt-4"):
        # The id comes from the first CSS class of the form "review-<id>".
        review_id = None
        for css_class in block.get("class", []):
            if css_class.startswith(prefix):
                review_id = css_class[len(prefix):]
                break

        paragraph = block.find("p")
        reviews.append({
            "id": review_id,
            # The rating is the number of <svg> elements inside the block.
            "rating": len(block.find_all("svg")),
            "content": paragraph.text.strip() if paragraph is not None else None,
        })
    return reviews


def parse_html(soup, url=None):
    """Extract one record per ``<body>`` element found in ``soup``.

    Parameters
    ----------
    soup : parsed document
        Object exposing ``find`` / ``find_all`` (a ``BeautifulSoup`` instance
        in this notebook).
    url : str, optional
        URL stored in each record. When omitted, the module-level variable
        ``url`` is used, which keeps existing ``parse_html(soup)`` calls
        working.

    Returns
    -------
    list of dict
        Each dict has the keys ``url``, ``title``, ``description``,
        ``features``, ``links`` and ``reviews``. ``links`` holds the raw
        ``href`` of every ``<a>`` and contains ``None`` for anchors that
        have no ``href`` attribute.
    """
    if url is None:
        # Backward compatibility: fall back to the global the caller sets.
        url = globals().get("url")

    products = []
    for element in soup.find_all("body"):
        # Prefer the product heading; fall back to the document <title>.
        # A page with neither yields an empty title instead of raising.
        title_node = element.find("h3", class_="card-title product-title mb-3")
        if title_node is None:
            title_node = soup.find("title")

        description_node = element.find("p", class_="product-description")

        products.append({
            "url": url,
            "title": _node_text(title_node),
            "description": _node_text(description_node),
            "features": _parse_features(element),
            "links": [anchor.get("href") for anchor in element.find_all("a")],
            "reviews": _parse_reviews(element),
        })
    return products

In [4]:
url = "https://web-scraping.dev/products"

output = []               # every record returned by parse_html
urls_priority = [url]     # crawl queue, seeded with the start page
urls_non_priority = []    # links that were seen but are never crawled
nb_pages = 5              # maximum number of pages to crawl
i = 0                     # number of pages crawled so far

# Appending to urls_priority while iterating over it is intentional: the for
# loop picks up the newly queued links, so the list works as a FIFO queue.
# NOTE: the loop variable must stay named `url` because parse_html() reads
# the module-level `url` to tag each record with the page it came from.
for url in urls_priority:

    if i >= nb_pages:
        print("Reached maximum number of pages:", nb_pages)
        break

    print("Crawling URL:", url)
    if not url.startswith(("http://", "https://")):
        print(f"Skipping invalid URL: {url}")
        continue

    try:
        if not can_crawl(url):
            print("Crawling not allowed for this URL.", url)
            continue
        # A timeout keeps one unresponsive server from hanging the notebook.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as error:
        print(f"Request failed for {url}: {error}")
        continue

    if not response.ok:
        # Do not parse error pages as if they were product pages.
        print(f"Skipping {url}: HTTP status {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    print(soup.title)
    products = parse_html(soup)

    # Iterate over every record rather than products[0], so a page for
    # which parse_html returns nothing does not raise IndexError.
    for product in products:
        for href in product["links"]:
            if href is None:
                continue
            # Resolve relative links ("/docs", "/cart") against the current
            # page so they are not rejected later as invalid URLs.
            link = urljoin(url, href)
            if "product" in link:
                # Checked separately so that an already-queued product link
                # does not fall through into the non-priority list.
                if link not in urls_priority:
                    urls_priority.append(link)
            elif link not in urls_non_priority:
                urls_non_priority.append(link)

    print(products)
    output.extend(products)
    i += 1
    print(i)

# Persist all collected records as UTF-8 JSON.
products_json = json.dumps(output, ensure_ascii=False, indent=4)
with open("products.json", "w", encoding="utf-8") as f:
    f.write(products_json)

Crawling URL: https://web-scraping.dev/products
<title>web-scraping.dev product page 1</title>
[{'url': 'https://web-scraping.dev/products', 'title': 'web-scraping.dev product page 1', 'description': '', 'features': {}, 'links': ['https://web-scraping.dev/', '#', '/docs', '/api/graphql', 'https://web-scraping.dev/products', 'https://web-scraping.dev/reviews', 'https://web-scraping.dev/testimonials', 'https://web-scraping.dev/file-download', 'https://web-scraping.dev/login', '/cart', None, 'https://web-scraping.dev/products?category=apparel', 'https://web-scraping.dev/products?category=consumables', 'https://web-scraping.dev/products?category=household', 'https://web-scraping.dev/product/1', 'https://web-scraping.dev/product/2', 'https://web-scraping.dev/product/3', 'https://web-scraping.dev/product/4', 'https://web-scraping.dev/product/5', 'https://web-scraping.dev/products?page=1', None, 'https://web-scraping.dev/products?page=2', 'https://web-scraping.dev/products?page=3', 'https://w