In [1]:
# Install BeautifulSoup (if needed)
!pip install beautifulsoup4 requests




In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [6]:
# Target URL
url = "https://books.toscrape.com/"

# Fetch HTML content. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() turns an HTTP error (4xx/5xx) into
# an exception instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the raw bytes rather than response.text so BeautifulSoup can detect the
# document's own encoding. A UTF-8 pound sign decoded as Latin-1 is what
# produces the stray "Â" in front of "£".
soup = BeautifulSoup(response.content, 'html.parser')

# Find all book containers
books = soup.find_all("article", class_="product_pod")
if not books:
    # Fail loudly here; an empty DataFrame has no "Price" column, so the
    # clean-up step below would otherwise die with an unhelpful KeyError.
    raise RuntimeError(f"No book entries found at {url}")

# Prepare data list
data = []

for book in books:
    title = book.h3.a["title"]
    price = book.find("p", class_="price_color").text
    # The rating word is the second CSS class on the star-rating <p>
    # (e.g., class="star-rating Three" -> "Three"). Looking the tag up by its
    # class avoids depending on it being the first <p> in the container.
    rating = book.find("p", class_="star-rating")["class"][1]
    data.append({"Title": title, "Price": price, "Rating": rating})

# Convert to DataFrame
df = pd.DataFrame(data)

# Clean price column: keep only digits and the decimal point (this drops the
# pound sign, surrounding whitespace and any mis-decoded characters), then
# convert to float.
df["Price"] = (
    df["Price"]
    .str.replace(r"[^\d.]", "", regex=True)
    .astype(float)
)


# Most expensive book
df.sort_values(by="Price", ascending=False).head(1)



Unnamed: 0,Title,Price,Rating
15,Our Band Could Be Your Life: Scenes from the A...,57.25,Three


In [14]:
# URL template for the paginated catalogue; "{}" is replaced by the page number.
# NOTE(review): base_url was not defined in any visible cell, so this cell
# failed with a NameError on a fresh kernel. The pattern below is the site's
# catalogue pagination scheme, whose pages are numbered from 1 - confirm it
# matches the value that was originally used.
base_url = "https://books.toscrape.com/catalogue/page-{}.html"

# Number of pages to scrape (change here for searching no. of pages)
n = 5

# Number of most expensive books to display (change here to see top no. of books)
top_k = 10

# List to store book data
all_books = []

# Loop through the first n pages. Page numbers start at 1, so iterate 1..n;
# starting at 0 would request a page that does not exist.
for page in range(1, n + 1):
    url = base_url.format(page)

    # The timeout prevents an indefinite hang; raise_for_status() surfaces HTTP
    # errors instead of silently parsing an error page that contains no books.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # Parse the raw bytes rather than response.text so BeautifulSoup can detect
    # the document's own encoding. A UTF-8 pound sign decoded as Latin-1 is
    # what produces the stray "Â" in front of "£".
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all book containers
    books = soup.find_all("article", class_="product_pod")

    for book in books:
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").text
        # The rating word is the second CSS class on the star-rating <p>
        # (e.g., class="star-rating Three" -> "Three").
        rating = book.find("p", class_="star-rating")["class"][1]
        all_books.append({
            "Title": title,
            "Price": price,
            "Rating": rating
        })

if not all_books:
    # Fail loudly here; an empty DataFrame has no "Price" column, so the
    # clean-up step below would otherwise die with an unhelpful KeyError.
    raise RuntimeError(f"No book entries found on the first {n} page(s)")

# Convert to DataFrame
df = pd.DataFrame(all_books)

# Clean price column: keep only digits and the decimal point (this drops the
# pound sign, surrounding whitespace and any mis-decoded characters), then
# convert to float.
df["Price"] = (
    df["Price"]
    .str.replace(r"[^\d.]", "", regex=True)
    .astype(float)
)

# Save to CSV
df.to_csv("scraped_books.csv", index=False)

# Show the top_k most expensive books
df.sort_values(by="Price", ascending=False).head(top_k)


Unnamed: 0,Title,Price,Rating
68,The Death of Humanity: and the Case for Life,58.11,Four
40,Slow States of Collapse: Poems,57.31,Three
15,Our Band Could Be Your Life: Scenes from the A...,57.25,Three
58,The Past Never Ends,56.5,Four
57,The Pioneer Woman Cooks: Dinnertime: Comfort C...,56.41,One
56,The Secret of Dreadwillow Carse,56.13,One
67,The Electric Pencil: Drawings from Inside Stat...,56.06,One
25,Birdsong: A Story in Pictures,54.64,Three
4,Sapiens: A Brief History of Humankind,54.23,Five
61,The Murder That Never Was (Forensic Instincts #5),54.11,Three
