In [25]:
import pandas as pd 
import requests 
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [26]:
# ============================================
# Step 1) Send request and get HTML
# ============================================
url = "https://books.toscrape.com/"

# A timeout keeps this cell from blocking forever when the server never
# answers; without one, requests waits indefinitely.
response = requests.get(url, timeout=10)

# Stop on an HTTP error status (4xx/5xx) instead of only printing a message
# and letting the following cells parse an error page.
response.raise_for_status()

# NOTE: Force UTF-8 decoding to fix encoding issues like "Â£"
response.encoding = "utf-8"

In [27]:
# ============================================
# Step 2) Parse HTML with BeautifulSoup
# ============================================
soup = BeautifulSoup(response.text, "html.parser")

# soup.title is None when the document has no <title>, so guard the access.
# The raw title text is padded with newlines and spaces, so strip it before
# printing.
page_title = soup.title.text.strip() if soup.title else None
print("Page title:", page_title)

Page title: 
    All products | Books to Scrape - Sandbox



In [28]:
# ============================================
# Step 3) Find all book blocks on the page
# ============================================
# Collect every <article> element that carries the "product_pod" class;
# each one is treated as a single book block in the following steps.
books = soup.find_all("article", attrs={"class": "product_pod"})
print(f"Number of books found: {len(books)}")


Number of books found: 20


In [30]:
# ============================================
# Step 4) Create lists to store data
# ============================================
# One list per output column. The loop below appends exactly one value to
# each list per book, so the lists stay aligned row by row. They are reset
# here so that re-running the cell does not duplicate rows.
images = []
titles = []
links = []
stars = []
prices = []
availability = []


def _extract_book(book, base_url):
    """Extract the fields of interest from one book element.

    Parameters
    ----------
    book : bs4.element.Tag
        A single ``<article class="product_pod">`` element.
    base_url : str
        URL of the page the element was taken from; relative ``src`` and
        ``href`` values are resolved against it.

    Returns
    -------
    tuple
        ``(image_url, title, link_url, star, price, available)``. Any field
        whose tag or attribute is missing is ``None``. ``price`` is the
        price text with the currency symbol removed (still a string).
    """
    # ----- Image -----
    image_tag = book.find("img", class_="thumbnail")
    image_src = image_tag.get("src") if image_tag else None
    image_url = urljoin(base_url, image_src) if image_src else None  # make full URL

    # ----- Title + Link -----
    h3_tag = book.find("h3")
    a_tag = h3_tag.find("a") if h3_tag else None

    # NOTE: full title is in the "title" attribute (not text)
    title = a_tag.get("title") if a_tag else None

    href = a_tag.get("href") if a_tag else None
    link_url = urljoin(base_url, href) if href else None  # make full URL

    # ----- Star Rating -----
    # Example: <p class="star-rating Three"></p>
    # Pick the class that is not the "star-rating" marker instead of relying
    # on position: indexing [1] raises IndexError when only one class is
    # present and returns the marker itself if the order is reversed.
    star_tag = book.find("p", class_="star-rating")
    star_classes = (star_tag.get("class") or []) if star_tag else []
    star = next((name for name in star_classes if name != "star-rating"), None)

    # ----- Price -----
    price_tag = book.find("p", class_="price_color")
    price_text = price_tag.text.strip() if price_tag else None

    # NOTE: remove currency symbol for cleaner data ("Â" covers mis-decoded pages)
    price_num = (
        price_text.replace("£", "").replace("Â", "").strip() if price_text else None
    )

    # ----- Availability -----
    # Match on the single "availability" class. A multi-word class_ string is
    # compared against the exact attribute value, so "instock availability"
    # would miss a different class order and any book that is not "instock".
    available_tag = book.find("p", class_="availability")
    available = available_tag.text.strip() if available_tag else None

    return image_url, title, link_url, star, price_num, available


# ============================================
# Step 5) Loop through each book and extract data
# ============================================
for book in books:
    image, title, link, star, price, available = _extract_book(book, url)

    # ----- Store results -----
    images.append(image)
    titles.append(title)
    links.append(link)
    stars.append(star)
    prices.append(price)
    availability.append(available)
    


# ============================================
# Step 6) Create DataFrame
# ============================================
# Map each column label to the list collected for it; the dict order is the
# column order of the resulting frame.
column_data = {
    "Image": images,
    "Title": titles,
    "Link": links,
    "Star": stars,
    "Price": prices,
    "Availability": availability,
}
df = pd.DataFrame(column_data)

print(f"\nDataFrame shape: {df.shape}")
# Last expression of the cell: show the first five rows.
df.head()


DataFrame shape: (20, 6)


Unnamed: 0,Image,Title,Link,Star,Price,Availability
0,https://books.toscrape.com/media/cache/2c/da/2...,A Light in the Attic,https://books.toscrape.com/catalogue/a-light-i...,Three,51.77,In stock
1,https://books.toscrape.com/media/cache/26/0c/2...,Tipping the Velvet,https://books.toscrape.com/catalogue/tipping-t...,One,53.74,In stock
2,https://books.toscrape.com/media/cache/3e/ef/3...,Soumission,https://books.toscrape.com/catalogue/soumissio...,One,50.1,In stock
3,https://books.toscrape.com/media/cache/32/51/3...,Sharp Objects,https://books.toscrape.com/catalogue/sharp-obj...,Four,47.82,In stock
4,https://books.toscrape.com/media/cache/be/a5/b...,Sapiens: A Brief History of Humankind,https://books.toscrape.com/catalogue/sapiens-a...,Five,54.23,In stock


In [31]:
# ============================================
# Step 7) Save results to CSV
# ============================================
# Write the frame without the index column, then report the file name.
output_path = "books_page1.csv"
df.to_csv(output_path, index=False)
print(f"\nSaved: {output_path}")



Saved: books_page1.csv
