In [11]:
# simple_crawler.py
import csv
import time
import urllib.error
import urllib.request
from urllib import robotparser
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

In [12]:
# Root URL of the site being crawled.
BASE_URL = "https://books.toscrape.com/"

# Headers attached to every page request; the User-Agent names the crawler
# and carries a contact URL.
HEADERS = {"User-Agent": "MyCrawler/1.0 (+https://example.com/contact) - educational use"}

# polite crawler: check robots.txt
def can_fetch(url, user_agent=HEADERS["User-Agent"], timeout=10):
    """Report whether the host's robots.txt permits ``user_agent`` to fetch ``url``.

    robots.txt is downloaded from the root of ``url``'s host using the
    crawler's own User-Agent header and a bounded timeout, so an unresponsive
    host cannot stall the crawl indefinitely.

    Args:
        url: Absolute URL to test.
        user_agent: User-Agent string used both for the download and for
            matching robots.txt rules.
        timeout: Seconds to wait for the robots.txt response.

    Returns:
        True when fetching is allowed. False when it is disallowed, when
        robots.txt answers 401/403, or when robots.txt cannot be retrieved or
        decoded (the conservative default).
    """
    rp = robotparser.RobotFileParser()
    robots_url = urljoin(url, "/robots.txt")
    rp.set_url(robots_url)
    try:
        # Built inside the try so a malformed URL is also treated as "not allowed".
        request = urllib.request.Request(robots_url, headers={"User-Agent": user_agent})
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            rp.parse(resp.read().decode("utf-8").splitlines())
    except urllib.error.HTTPError as err:
        # Same status policy as RobotFileParser.read(): access-controlled
        # robots.txt forbids everything, a missing one (other 4xx) allows
        # everything; any other status leaves the parser without rules.
        if err.code in (401, 403):
            rp.disallow_all = True
        elif 400 <= err.code < 500:
            rp.allow_all = True
    except Exception:
        # robots.txt unreachable or unreadable: refuse to crawl to stay on the
        # safe side (return True here instead for a permissive policy).
        return False
    return rp.can_fetch(user_agent, url)


In [13]:

def get_soup(url):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A BeautifulSoup tree built with the "html.parser" backend.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: On connection failures or timeouts.
    """
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    # Parse the raw bytes instead of resp.text: when the Content-Type header
    # carries no charset, requests decodes text/* bodies as ISO-8859-1, which
    # garbles UTF-8 pages (currency symbols gain a stray leading character).
    # A charset the server did declare is still honoured; otherwise
    # BeautifulSoup detects the encoding from the document itself.
    content_type = resp.headers.get("Content-Type", "")
    declared = resp.encoding if "charset" in content_type.lower() else None
    return BeautifulSoup(resp.content, "html.parser", from_encoding=declared)

def parse_product_card(card, base_url):
    """Extract one product's fields from a listing-card element.

    The CSS selectors follow the books.toscrape markup - adjust them when
    targeting a different site.

    Args:
        card: Element exposing ``select_one`` (a BeautifulSoup tag in practice).
        base_url: URL against which the card's link ``href`` is resolved.

    Returns:
        dict with the keys "title", "price", "availability" and "product_url".
    """
    anchor = card.select_one("h3 a")

    # The full title lives in the anchor's "title" attribute.
    record = {"title": anchor["title"].strip()}
    absolute_link = urljoin(base_url, anchor["href"])

    record["price"] = card.select_one(".price_color").get_text(strip=True)
    record["availability"] = card.select_one(".availability").get_text(strip=True)
    record["product_url"] = absolute_link
    return record


In [22]:
def crawl(start_url, max_pages=5, delay=1.0, output_csv="products.csv"):
    """Walk a paginated product listing and write every product found to a CSV file.

    Starting at ``start_url``, each page's ".product_pod" cards are parsed and
    the ".next a" link is followed until there is no next page or
    ``max_pages`` pages have been fetched.

    Args:
        start_url: Absolute URL of the first listing page.
        max_pages: Upper bound on the number of listing pages to fetch.
        delay: Seconds to pause between consecutive page requests.
        output_csv: Path of the CSV file to write (overwritten if it exists).

    Raises:
        SystemExit: If ``can_fetch`` reports that the site root may not be crawled.
        Exception: Errors raised by ``get_soup`` propagate unchanged; nothing
            is written to ``output_csv`` in that case.
    """
    parsed = urlparse(start_url)
    domain_root = f"{parsed.scheme}://{parsed.netloc}/"

    # NOTE: only the site root is tested against robots.txt; the individual
    # page URLs visited below are not re-checked.
    if not can_fetch(domain_root):
        raise SystemExit(f"Robots.txt disallows crawling {domain_root} for this user-agent.")

    products = []
    next_page = start_url
    pages_crawled = 0

    while next_page and pages_crawled < max_pages:
        page_url = next_page
        print(f"Crawling page: {page_url}")
        soup = get_soup(page_url)

        # Find all product cards
        for card in soup.select(".product_pod"):
            try:
                # Card hrefs are relative to the page that lists them, so they
                # must be resolved against the page URL; resolving against the
                # domain root drops intermediate path segments.
                products.append(parse_product_card(card, page_url))
            except Exception as e:
                # Best effort: one malformed card must not abort the whole crawl.
                print("Failed to parse product card:", e)

        # Find "next" link
        next_tag = soup.select_one(".next a")
        next_page = urljoin(page_url, next_tag["href"]) if next_tag else None
        pages_crawled += 1

        # Polite delay, but only when another request will actually follow.
        if next_page and pages_crawled < max_pages:
            time.sleep(delay)

    # Save results
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "price", "availability", "product_url"])
        writer.writeheader()
        writer.writerows(products)

    print(f"Saved {len(products)} products to {output_csv}")


if __name__ == "__main__":
    # Script entry point: crawl up to ten catalogue pages, starting at page 1,
    # and store the collected rows in books.csv.
    start = "https://books.toscrape.com/catalogue/page-1.html"
    crawl(
        start,
        max_pages=10,
        delay=1.0,
        output_csv="books.csv",
    )


Crawling page: https://books.toscrape.com/catalogue/page-1.html
Crawling page: https://books.toscrape.com/catalogue/page-2.html
Crawling page: https://books.toscrape.com/catalogue/page-3.html
Crawling page: https://books.toscrape.com/catalogue/page-4.html
Crawling page: https://books.toscrape.com/catalogue/page-5.html
Crawling page: https://books.toscrape.com/catalogue/page-6.html
Crawling page: https://books.toscrape.com/catalogue/page-7.html
Crawling page: https://books.toscrape.com/catalogue/page-8.html
Crawling page: https://books.toscrape.com/catalogue/page-9.html
Crawling page: https://books.toscrape.com/catalogue/page-10.html
Saved 200 products to books.csv


##### 