# Web Scraping with Python

## Libraries 

In [92]:
import pandas as pd
import requests 
from bs4 import BeautifulSoup
from urllib.parse import urljoin

## Requests: Fetching a Wep Page 

In [93]:
# Fetching the webpage
url = "https://books.toscrape.com"
response = requests.get(url)
response

<Response [200]>

In [94]:
#from pprint import pprint
#pprint(response.content)
response.content



In [95]:
print(response.headers['Content-Type'])

text/html


In [96]:
# Parsing HTML with BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

In [97]:
type(soup)

bs4.BeautifulSoup

In [98]:
print(soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-us">
 <!--<![endif]-->
 <head>
  <title>
   All products | Books to Scrape - Sandbox
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="24th Jun 2016 09:29" name="created"/>
  <meta content="" name="description"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="NOARCHIVE,NOCACHE" name="robots"/>
  <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
  <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
  <link href="static/oscar/favicon.ico" rel="shortcut icon"/>
  <link href="static/oscar/css/styles.css" rel="stylesheet" type="tex

In [99]:
# Function to get the rating from a book element
def get_rating_number(rating_str): 
    ratings = {
        "One": 1, 
        "Two": 2, 
        "Three": 3, 
        "Four": 4,
        "Five": 5 
    }
    return ratings.get(rating_str, 0)

# Function to extract book data about the minimal rating and max. price
def scrape_books(min_rating=4, max_price=20):
    base_url = "https://books.toscrape.com/catalogue/page-{}.html"
    books = []

    for page_num in range(1, 51):  # there are 50 pages
        url = base_url.format(page_num)
        response = requests.get(url)
        if response.status_code != 200:
            break 

        soup = BeautifulSoup(response.content, "html.parser")
        book_items = soup.find_all("article", class_="product_pod")

        for item in book_items: 
            title = item.h3.a["title"]
            price_str = item.select_one("p.price_color").text.strip()
            price = float(price_str.lstrip("£"))
            rating_str = item.p.get("class")[1]
            rating = get_rating_number(rating_str)
            availability = item.select_one("p.instock.availability").text.strip()
            detail_url = urljoin("https://books.toscrape.com/catalogue/", item.h3.a["href"].replace('../../../', ''))

            # Only go to the detail page if the book meets filtering criteria
            if rating >= min_rating and price <= max_price:
                # Fetch book detail page
                detail_response = requests.get(detail_url)
                detail_soup = BeautifulSoup(detail_response.content, "html.parser")

                # Extract UPC and availability from product table
                table = detail_soup.find('table', class_='table table-striped')
                upc_tag = table.find('th', string="UPC")
                availability_tag = table.find('th', string="Availability")

                if not upc_tag or not availability_tag:
                    continue  # skip if key info is missing

                upc = upc_tag.find_next_sibling('td').text
                availability_detail = availability_tag.find_next_sibling('td').text.strip()

                # Extract genre from breadcrumb
                breadcrumb = detail_soup.select('ul.breadcrumb li')
                genre = breadcrumb[2].text.strip() if len(breadcrumb) > 2 else "Unknown"

                # Extract description
                desc_tag = detail_soup.select_one('div#product_description')
                if desc_tag:
                    description = desc_tag.find_next_sibling('p').text.strip()
                else:
                    description = ""

                books.append({
                    "title": title, 
                    "price": price, 
                    "rating": rating, 
                    "availability": availability_detail, 
                    "url": detail_url,
                    "upc": upc,
                    "genre": genre,
                    "description": description
                })

    return pd.DataFrame(books)

In [100]:
df = scrape_books(min_rating = 4, max_price= 20)
print(df)

                                                title  price  rating  \
0                                         Set Me Free  17.46       5   
1   The Four Agreements: A Practical Guide to Pers...  17.66       5   
2                                      Sophie's World  15.94       5   
3             Untitled Collection: Sabbath Poems 2014  14.27       4   
4                                     This One Summer  19.49       4   
..                                                ...    ...     ...   
70                                    The Zombie Room  19.69       5   
71                                    The Silent Wife  12.34       5   
72                                  The Girl You Lost  12.29       5   
73              The Edge of Reason (Bridget Jones #2)  19.18       4   
74  A Spy's Devotion (The Regency Spies of London #1)  16.97       5   

               availability  \
0   In stock (19 available)   
1   In stock (18 available)   
2   In stock (18 available)   
3   In stoc

In [101]:
# Another way to get the data
# list to store extracted data 
UPC = []
Title = []
Price = []
Rating = []
Genre = []
Availability = []
Description = []
URLs = []

base_url = "https://books.toscrape.com/catalogue/"

# Creating a loop to iterate through the pages
for page in range(1, 51): 
    url = base_url + f"page-{page}.html"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    all_books = soup.find_all("article", class_="product_pod")

    if response.status_code != 200: 
        print(f"Failed to retrieve page {page}.")
        continue

all_books = soup.find_all("article", class_= "product_pod")
all_books

[<article class="product_pod">
 <div class="image_container">
 <a href="frankenstein_20/index.html"><img alt="Frankenstein" class="thumbnail" src="../media/cache/00/25/0025515e987a1ebd648773f9ac70bfe6.jpg"/></a>
 </div>
 <p class="star-rating Two">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>
 <h3><a href="frankenstein_20/index.html" title="Frankenstein">Frankenstein</a></h3>
 <div class="product_price">
 <p class="price_color">£38.00</p>
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>
 <form>
 <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
 </form>
 </div>
 </article>,
 <article class="product_pod">
 <div class="image_container">
 <a href="forever-rockers-the-rocker-12_19/index.html"><img alt="Forever Rockers (The Rocker #12)" class="thumbnail" src="../media/cache/7f/b0/7fb03a053c270

In [102]:
for book in all_books:
    try:
        # Extract base data
        title = book.find("h3").a["title"]
        price = float(book.find("p", class_="price_color").text.strip().lstrip("£"))
        rating = book.find("p", class_="star-rating")["class"][1]
        availability = book.find("p", class_="instock availability").text.strip()
        relative_url = book.find("h3").a["href"].replace("../../", "")
        detail_url = "https://books.toscrape.com/catalogue/" + relative_url

        # Get detail page
        response = requests.get(detail_url)
        detail_soup = BeautifulSoup(response.content, "html.parser")

        # Product table
        table = detail_soup.find('table', class_='table table-striped')
        if not table:
            continue

        upc_tag = table.find('th', string="UPC")
        availability_tag = table.find('th', string="Availability")

        if not upc_tag or not availability_tag:
            continue

        upc = upc_tag.find_next_sibling('td').text
        availability_detail = availability_tag.find_next_sibling('td').text.strip()

        # Genre
        breadcrumb = detail_soup.select('ul.breadcrumb li')
        genre = breadcrumb[2].text.strip() if len(breadcrumb) > 2 else "Unknown"

        # Description
        desc_tag = detail_soup.select_one('div#product_description')
        description = desc_tag.find_next_sibling('p').text.strip() if desc_tag else "No description available"

        # Append only if all succeeded
        Title.append(title)
        Price.append(price)
        Rating.append(rating)
        Availability.append(availability_detail)
        UPC.append(upc)
        Genre.append(genre)
        Description.append(description)
        URLs.append(detail_url)

    except Exception as e:
        print(f"Skipping book due to error: {e}")
        continue

In [103]:
print(len(Title), "Title")
print(len(Price), "Price")
print(len(Rating), "Rating")
print(len(Availability), "Availability")
print(len(UPC), "UPC")
print(len(Genre), "Genre")
print(len(Description), "Description")
print(len(URLs), "URLs")

20 Title
20 Price
20 Rating
20 Availability
20 UPC
20 Genre
20 Description
20 URLs


In [104]:
Price

[38.0,
 28.8,
 39.24,
 32.93,
 51.32,
 47.09,
 28.42,
 22.85,
 41.24,
 39.07,
 29.82,
 37.26,
 20.3,
 34.65,
 43.38,
 55.53,
 57.06,
 16.97,
 53.98,
 26.08]

In [105]:
# Creating a dataframe from the lists
df_list = pd.DataFrame({
  "Title": Title,
  "Price": Price,
  "Rating": Rating,
  "Availability": Availability,
  "UPC": UPC,
  "Genre": Genre,
  "URL": [f"https://books.toscrape.com/catalogue/{book.find('h3').a['href'].replace('../../', '')}" for book in all_books]
})
df_list

Unnamed: 0,Title,Price,Rating,Availability,UPC,Genre,URL
0,Frankenstein,38.0,Two,In stock (1 available),a492f49a3e2b6a71,Default,https://books.toscrape.com/catalogue/frankenst...
1,Forever Rockers (The Rocker #12),28.8,Three,In stock (1 available),e564c3f1a93ccf2e,Music,https://books.toscrape.com/catalogue/forever-r...
2,Fighting Fate (Fighting #6),39.24,Three,In stock (1 available),6390772a3094939d,Romance,https://books.toscrape.com/catalogue/fighting-...
3,Emma,32.93,Two,In stock (1 available),2e69730561ed70ad,Classics,https://books.toscrape.com/catalogue/emma_17/i...
4,"Eat, Pray, Love",51.32,Three,In stock (1 available),df56868afd166557,Nonfiction,https://books.toscrape.com/catalogue/eat-pray-...
5,Deep Under (Walker Security #1),47.09,Five,In stock (1 available),0072b94dfa30608e,Romance,https://books.toscrape.com/catalogue/deep-unde...
6,Choosing Our Religion: The Spiritual Lives of ...,28.42,Four,In stock (1 available),a812f6969ddf3e39,Religion,https://books.toscrape.com/catalogue/choosing-...
7,Charlie and the Chocolate Factory (Charlie Buc...,22.85,Three,In stock (1 available),1774749f2cee292f,Childrens,https://books.toscrape.com/catalogue/charlie-a...
8,Charity's Cross (Charles Towne Belles #4),41.24,One,In stock (1 available),a96404cb928895f3,Romance,https://books.toscrape.com/catalogue/charitys-...
9,Bright Lines,39.07,Five,In stock (1 available),230ac636ea0ea415,Fiction,https://books.toscrape.com/catalogue/bright-li...
