In [4]:
"""
Books to Scrape — Data Extraction Script
----------------------------------------
This script scrapes book data from http://books.toscrape.com
and saves it to a CSV file.

Data collected:
- Title
- Price
- Rating
- Availability

Author: Kayode Micheal Olabode
Date: 2025-08-09

Requirements:
    pip install requests bs4, pandas
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

BASE_URL = "http://books.toscrape.com/catalogue/page-{}.html"
HEADERS = {
    "User-Agent": "KayodeBot/1.0 (+https://github.com/yourusername)",
    "Accept-Language": "en-US,en;q=0.9"
}

In [5]:
def fetch_page(url: str) -> BeautifulSoup:
    """Fetch HTML content and return a BeautifulSoup object."""
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.encoding = "utf-8"  # ✅ Force correct encoding
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def parse_books(soup: BeautifulSoup) -> list[dict]:
    """Extract book details from a single page."""
    books = []
    for book in soup.find_all("article", class_="product_pod"):
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").text.strip()
        price = re.sub(r"^[^\£]*", "£", price)  # ✅ Ensure clean £
        rating = book.p["class"][1]  # second class is rating
        availability = book.find("p", class_="instock availability").text.strip()
        
        books.append({
            "Title": title,
            "Price": price,
            "Rating": rating,
            "Availability": availability
        })
    return books

def scrape_all_books() -> list[dict]:
    """Scrape all pages of books and return combined list."""
    all_books = []
    print(" Scraping all pages from BookstoScrape...")
    for page in range(1, 51):  # 50 pages total
        soup = fetch_page(BASE_URL.format(page))
        books = parse_books(soup)
        if not books:
            break
        all_books.extend(books)
    return all_books

In [6]:
def save_to_csv(data: list[dict], filename: str) -> None:
    """Save scraped data to a CSV file with UTF-8 BOM for Excel compatibility."""
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False, encoding="utf-8-sig")  # ✅ Works in Excel
    print(f"✅ Saved {len(df)} books to {filename}")

def main():
    books = scrape_all_books()
    
    # Preview first 20 books in console
    print("\nPreview of scraped books:")
    for book in books[:20]:
        print(f"{book['Title']} | {book['Price']} | {book['Rating']} | {book['Availability']}")
    
    save_to_csv(books, "books.csv")

if __name__ == "__main__":
    main()
# This script scrapes book data from http://books.toscrape.com
# and saves it to a CSV file named "books.csv".

 Scraping all pages from BookstoScrape...

Preview of scraped books:
A Light in the Attic | ££51.77 | Three | In stock
Tipping the Velvet | ££53.74 | One | In stock
Soumission | ££50.10 | One | In stock
Sharp Objects | ££47.82 | Four | In stock
Sapiens: A Brief History of Humankind | ££54.23 | Five | In stock
The Requiem Red | ££22.65 | One | In stock
The Dirty Little Secrets of Getting Your Dream Job | ££33.34 | Four | In stock
The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull | ££17.93 | Three | In stock
The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics | ££22.60 | Four | In stock
The Black Maria | ££52.15 | One | In stock
Starving Hearts (Triangular Trade Trilogy, #1) | ££13.99 | Two | In stock
Shakespeare's Sonnets | ££20.66 | Four | In stock
Set Me Free | ££17.46 | Five | In stock
Scott Pilgrim's Precious Little Life (Scott Pilgrim #1) | ££52.29 | Five | In stock
Rip it Up and Start Again | ££35.02 