In [None]:
!pip -q install beautifulsoup4 lxml requests pandas


In [None]:
import os
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [None]:
# Maps the word used as a CSS class on the star-rating element to its number.
rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}


def get_rating(article):
    """Return the numeric star rating of one listing element.

    Looks up the ``p.star-rating`` element inside ``article`` and returns the
    ``rating_map`` value of the first of its CSS classes that is a key of
    ``rating_map``.

    Args:
        article: Parsed element exposing ``select_one``.

    Returns:
        The mapped integer, or ``None`` when the element is absent or none of
        its classes is a known rating word.
    """
    star_tag = article.select_one("p.star-rating")
    if not star_tag:
        return None

    # Classes are checked in document order; the first recognised word wins.
    return next(
        (rating_map[name] for name in star_tag.get("class", []) if name in rating_map),
        None,
    )


In [None]:
def _parse_book(book, page_url):
    """Turn one ``article.product_pod`` element into a flat record.

    Args:
        book: Parsed element for a single listing entry; only ``select_one``
            is called on it.
        page_url: URL of the page the element came from. It is the base
            against which the book's relative link is resolved.

    Returns:
        dict with the keys ``Title``, ``Price_GBP``, ``Rating``,
        ``Availability`` and ``Book_URL``. ``Price_GBP`` is a string holding
        only digits and dots. ``Rating`` is whatever ``get_rating`` returns.
        Any field whose source tag or attribute is missing is ``None``.
    """
    title_tag = book.select_one("h3 a")
    price_tag = book.select_one("p.price_color")
    stock_tag = book.select_one("p.instock.availability")

    # The title is read from the anchor's ``title`` attribute, not its text.
    title = None
    if title_tag is not None and title_tag.get("title"):
        title = title_tag["title"].strip()

    # Strip everything except digits and the decimal point (currency symbol,
    # stray whitespace); an empty result is reported as missing.
    price_gbp = None
    if price_tag is not None:
        price_gbp = re.sub(r"[^0-9.]", "", price_tag.get_text(strip=True)) or None

    availability = stock_tag.get_text(" ", strip=True) if stock_tag is not None else None

    # Resolve the relative href against the page it was found on, so the
    # result is correct whichever directory the listing page lives in.
    href = title_tag.get("href") if title_tag is not None else None
    book_url = urljoin(page_url, href) if href else None

    return {
        "Title": title,
        "Price_GBP": price_gbp,
        "Rating": get_rating(book),
        "Availability": availability,
        "Book_URL": book_url,
    }


def scrape_page(url, timeout=30):
    """Download one listing page and return a record per book on it.

    Args:
        url: Absolute URL of the listing page.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        list of dicts, one per ``article.product_pod`` element, in page
        order. See ``_parse_book`` for the keys.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: the request itself failed or timed out.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of silently returning no books.
    res.raise_for_status()

    # Hand BeautifulSoup the raw bytes so it detects the charset itself.
    # ``res.text`` is decoded with the charset requests derives from the HTTP
    # headers (ISO-8859-1 when a text response declares none), which can
    # garble characters such as the pound sign or accented titles.
    soup = BeautifulSoup(res.content, "lxml")

    return [_parse_book(book, url) for book in soup.select("article.product_pod")]


In [None]:
# Accumulates one dict per book across every page visited below.
all_data = []

# Visit catalogue pages 1 through 25 in order.
for page in range(1, 26):
    url = f"https://books.toscrape.com/catalogue/page-{page}.html"
    print(f"Scraping page {page}: {url}")

    all_data += scrape_page(url)

    # Half-second pause between requests to go easy on the server.
    time.sleep(0.5)

print("✅ Total books collected:", len(all_data))


Scraping page 1: https://books.toscrape.com/catalogue/page-1.html
Scraping page 2: https://books.toscrape.com/catalogue/page-2.html
Scraping page 3: https://books.toscrape.com/catalogue/page-3.html
Scraping page 4: https://books.toscrape.com/catalogue/page-4.html
Scraping page 5: https://books.toscrape.com/catalogue/page-5.html
Scraping page 6: https://books.toscrape.com/catalogue/page-6.html
Scraping page 7: https://books.toscrape.com/catalogue/page-7.html
Scraping page 8: https://books.toscrape.com/catalogue/page-8.html
Scraping page 9: https://books.toscrape.com/catalogue/page-9.html
Scraping page 10: https://books.toscrape.com/catalogue/page-10.html
Scraping page 11: https://books.toscrape.com/catalogue/page-11.html
Scraping page 12: https://books.toscrape.com/catalogue/page-12.html
Scraping page 13: https://books.toscrape.com/catalogue/page-13.html
Scraping page 14: https://books.toscrape.com/catalogue/page-14.html
Scraping page 15: https://books.toscrape.com/catalogue/page-15.html

In [None]:
# Build the table in one chain:
#   1. prices become numeric (values that cannot be parsed turn into NaN),
#   2. rows without a Title are dropped,
#   3. only the first row is kept for each distinct Title,
#   4. the index is renumbered from 0.
df = (
    pd.DataFrame(all_data)
    .assign(Price_GBP=lambda books: pd.to_numeric(books["Price_GBP"], errors="coerce"))
    .dropna(subset=["Title"])
    .drop_duplicates(subset=["Title"])
    .reset_index(drop=True)
)

print("✅ Final rows:", len(df))
df.head(500)


✅ Final rows: 499


Unnamed: 0,Title,Price_GBP,Rating,Availability,Book_URL
0,A Light in the Attic,51.77,3,In stock,https://books.toscrape.com/catalogue/a-light-i...
1,Tipping the Velvet,53.74,1,In stock,https://books.toscrape.com/catalogue/tipping-t...
2,Soumission,50.10,1,In stock,https://books.toscrape.com/catalogue/soumissio...
3,Sharp Objects,47.82,4,In stock,https://books.toscrape.com/catalogue/sharp-obj...
4,Sapiens: A Brief History of Humankind,54.23,5,In stock,https://books.toscrape.com/catalogue/sapiens-a...
...,...,...,...,...,...
494,Unreasonable Hope: Finding Faith in the God Wh...,46.33,2,In stock,https://books.toscrape.com/catalogue/unreasona...
495,Under the Tuscan Sun,37.33,3,In stock,https://books.toscrape.com/catalogue/under-the...
496,Toddlers Are A**holes: It's Not Your Fault,25.55,1,In stock,https://books.toscrape.com/catalogue/toddlers-...
497,The Year of Living Biblically: One Man's Humbl...,34.72,1,In stock,https://books.toscrape.com/catalogue/the-year-...


In [None]:
# Count rows with a missing Price_GBP, then preview titles next to their prices.
missing_prices = df["Price_GBP"].isna().sum()
print("Missing prices:", missing_prices)
df.loc[:, ["Title", "Price_GBP"]].head(20)


Missing prices: 0


Unnamed: 0,Title,Price_GBP
0,A Light in the Attic,51.77
1,Tipping the Velvet,53.74
2,Soumission,50.1
3,Sharp Objects,47.82
4,Sapiens: A Brief History of Humankind,54.23
5,The Requiem Red,22.65
6,The Dirty Little Secrets of Getting Your Dream...,33.34
7,The Coming Woman: A Novel Based on the Life of...,17.93
8,The Boys in the Boat: Nine Americans and Their...,22.6
9,The Black Maria,52.15


In [None]:
# Write the cleaned table to a CSV in the working directory, without the index column.
output_file = "books_to_scrape_dataset.csv"
df.to_csv(path_or_buf=output_file, index=False)
print("Saved:", output_file)


Saved: books_to_scrape_dataset.csv


In [None]:
# List the working directory to confirm the CSV was written. `os.listdir`
# returns entries in arbitrary order, so sort for a stable, comparable output.
# (`os` is imported in the notebook's import cell.)
print(sorted(os.listdir()))


['.config', 'books_to_scrape_dataset.csv', 'sample_data']


In [None]:
# Offer the CSV as a browser download. `google.colab` is only importable
# inside Colab, so the import is guarded: elsewhere the cell reports that it
# skipped the download instead of raising and breaking a full re-run.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of books_to_scrape_dataset.csv")
else:
    files.download("books_to_scrape_dataset.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>