# Webscraping with Beautiful Soup

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import os
from urllib.parse import urljoin

In [3]:
base_url = 'http://books.toscrape.com/'
response = requests.get(base_url)

In [5]:
soup = BeautifulSoup(response.text, 'html.parser')

In [6]:
soup

<!DOCTYPE html>

<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-us"> <!--<![endif]-->
<head>
<title>
    All products | Books to Scrape - Sandbox
</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="24th Jun 2016 09:29" name="created"/>
<meta content="" name="description"/>
<meta content="width=device-width" name="viewport"/>
<meta content="NOARCHIVE,NOCACHE" name="robots"/>
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
<link href="static/oscar/favicon.ico" rel="shortcut icon"/>
<link href="static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
<link href="s

In [7]:
books = soup.select('article.product_pod')
books

[<article class="product_pod">
 <div class="image_container">
 <a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
 </div>
 <p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>
 <h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
 <div class="product_price">
 <p class="price_color">Â£51.77</p>
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>
 <form>
 <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
 </form>
 </div>
 </article>,
 <article class="product_pod">
 <div class="image_container">
 <a href="catalogue/tipping-the-velvet_999/index.html"><img alt="Tipping the Velvet" class="th

In [2]:
def scrape_books_toscrape():
    """
    Scrapes book information from books.toscrape.com, a website specifically
    designed for practicing web scraping.
    """
    # Base URL of the website
    base_url = 'http://books.toscrape.com/'

    # Send a GET request to the website
    response = requests.get(base_url)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch the webpage: Status code {response.status_code}")
        return None

    # Parse the HTML content using Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all books on the page
    books = soup.select('article.product_pod')

    # Lists to store book information
    titles = []
    prices = []
    ratings = []
    availability = []

    # Extract information from each book
    for book in books:
        # Extract title
        title = book.h3.a['title']
        titles.append(title)

        # Extract price
        price = book.select_one('div.product_price p.price_color').text
        prices.append(price)

        # Extract rating
        rating = book.select_one('p.star-rating')['class'][1]
        ratings.append(rating)

        # Extract availability
        avail = book.select_one('div.product_price p.availability').text.strip()
        availability.append(avail)

    # Create a DataFrame to store the data
    books_data = pd.DataFrame({
        'Title': titles,
        'Price': prices,
        'Rating': ratings,
        'Availability': availability
    })

    print(f"Successfully scraped {len(books_data)} books from the main page.")
    return books_data


In [8]:
def scrape_multiple_pages(max_pages=3):
    """
    Scrapes book information from multiple pages of books.toscrape.com
    """
    all_books = []
    base_url = 'http://books.toscrape.com/catalogue/page-{}.html'

    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")

        # Construct the URL for the current page
        url = base_url.format(page)

        # Send a GET request to the page
        response = requests.get(url)

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to fetch page {page}: Status code {response.status_code}")
            continue

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all books on the page
        books = soup.select('article.product_pod')

        # Extract information from each book
        for book in books:
            book_info = {}

            # Extract title
            book_info['Title'] = book.h3.a['title']

            # Extract price
            book_info['Price'] = book.select_one('div.product_price p.price_color').text

            # Extract rating
            book_info['Rating'] = book.select_one('p.star-rating')['class'][1]

            # Extract availability
            book_info['Availability'] = book.select_one('div.product_price p.availability').text.strip()

            # Add book info to the list
            all_books.append(book_info)

        # Add a delay to be polite to the server
        time.sleep(random.uniform(1, 3))

    # Create a DataFrame to store all the scraped data
    books_df = pd.DataFrame(all_books)

    print(f"Successfully scraped {len(books_df)} books from {max_pages} pages.")
    return books_df


In [10]:
def scrape_book_details(book_url):
    """
    Scrapes detailed information about a specific book.
    """
    # Send a GET request to the book page
    response = requests.get(book_url)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch the book page: Status code {response.status_code}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract book details
    book_details = {}

    # Extract title
    book_details['Title'] = soup.select_one('div.product_main h1').text

    # Extract price
    book_details['Price'] = soup.select_one('p.price_color').text

    # Extract availability
    book_details['Availability'] = soup.select_one('p.availability').text.strip()

    # Extract product description
    description_element = soup.select_one('div#product_description + p')
    book_details['Description'] = description_element.text if description_element else 'No description available'

    # Extract product information
    product_info = {}
    rows = soup.select('table.table-striped tr')
    for row in rows:
        header = row.select_one('th').text
        value = row.select_one('td').text
        product_info[header] = value

    # Add product information to book details
    book_details.update(product_info)

    return book_details


In [11]:
def scrape_by_category():
    """
    Scrapes books by category from books.toscrape.com
    """
    base_url = 'http://books.toscrape.com/'

    # Send a GET request to the website
    response = requests.get(base_url)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch the webpage: Status code {response.status_code}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all categories
    categories = soup.select('div.side_categories ul.nav-list > li > ul > li > a')

    category_data = {}

    for category in categories[:3]:  # Limit to first 3 categories for demonstration
        category_name = category.text.strip()
        category_url = urljoin(base_url, category['href'])

        print(f"Scraping category: {category_name}")

        # Send a GET request to the category page
        category_response = requests.get(category_url)

        if category_response.status_code != 200:
            print(f"Failed to fetch the category page: Status code {category_response.status_code}")
            continue

        # Parse the HTML content of the category page
        category_soup = BeautifulSoup(category_response.text, 'html.parser')

        # Find all books in this category
        books = category_soup.select('article.product_pod')

        # Lists to store book information for this category
        category_books = []

        for book in books:
            book_info = {}

            # Extract title
            book_info['Title'] = book.h3.a['title']

            # Extract price
            book_info['Price'] = book.select_one('div.product_price p.price_color').text

            # Extract rating
            book_info['Rating'] = book.select_one('p.star-rating')['class'][1]

            # Extract availability
            book_info['Availability'] = book.select_one('div.product_price p.availability').text.strip()

            # Add book info to the list
            category_books.append(book_info)

        # Store books for this category
        category_data[category_name] = pd.DataFrame(category_books)

        # Add a delay to be polite to the server
        time.sleep(random.uniform(1, 3))

    return category_data


In [12]:
def save_to_file(data, filename='scraped_data.csv'):
    """
    Saves scraped data to a CSV file.
    """
    data.to_csv(filename, index=False)
    print(f"Data saved to {filename}")


In [13]:
def main():
    """
    Main function to demonstrate different web scraping techniques
    """
    print("Web Scraping with Beautiful Soup - Demonstration")
    print("=" * 50)

    # Create a directory to store results if it doesn't exist
    if not os.path.exists('scraping_results'):
        os.makedirs('scraping_results')

    # 1. Basic scraping from the main page
    print("\n1. Basic scraping from the main page")
    basic_data = scrape_books_toscrape()
    if basic_data is not None:
        save_to_file(basic_data, 'scraping_results/basic_data.csv')

    # 2. Scraping multiple pages
    print("\n2. Scraping multiple pages")
    multiple_pages_data = scrape_multiple_pages(max_pages=2)
    if multiple_pages_data is not None:
        save_to_file(multiple_pages_data, 'scraping_results/multiple_pages_data.csv')

    # 3. Scraping book details
    print("\n3. Scraping book details")
    book_url = 'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'
    book_details = scrape_book_details(book_url)
    if book_details is not None:
        print("Book details successfully scraped:")
        for key, value in book_details.items():
            print(f"{key}: {value}")

    # 4. Scraping by category
    print("\n4. Scraping by category")
    category_data = scrape_by_category()
    if category_data is not None:
        for category_name, category_df in category_data.items():
            filename = f'scraping_results/{category_name.lower().replace(" ", "_")}_books.csv'
            save_to_file(category_df, filename)

    print("\nWeb scraping demonstration completed.")



In [14]:
if __name__ == "__main__":
    main()

Web Scraping with Beautiful Soup - Demonstration

1. Basic scraping from the main page
Successfully scraped 20 books from the main page.
Data saved to scraping_results/basic_data.csv

2. Scraping multiple pages
Scraping page 1...
Scraping page 2...
Successfully scraped 40 books from 2 pages.
Data saved to scraping_results/multiple_pages_data.csv

3. Scraping book details
Book details successfully scraped:
Title: A Light in the Attic
Price: Â£51.77
Availability: In stock (22 available)
Description: It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel S