In [None]:
pip install requests beautifulsoup4 pandas



In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# The base URL of the website we want to scrape
base_url = 'http://books.toscrape.com/catalogue/'
current_page_url = base_url + 'page-1.html'

# Lists to store all the data we collect from every page
all_books_data = []

# Keep scraping as long as there is a "next" page
while True:
    print(f"Scraping page: {current_page_url}")

    # Make the request to the current page
    response = requests.get(current_page_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all book "articles" on the current page
    books = soup.find_all('article', class_='product_pod')

    # For each book, extract the link to its detail page
    for book in books:
        # The link is relative, so we need to join it with the base URL
        book_url = base_url + book.find('a')['href']

        # Now, go to the book's specific page to get more details
        book_response = requests.get(book_url)
        book_soup = BeautifulSoup(book_response.text, 'html.parser')

        # --- Extracting the detailed data ---
        try:
            title = book_soup.find('h1').text
            price = book_soup.find('p', class_='price_color').text
            stock = book_soup.find('p', class_='instock availability').text.strip()
            rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
            rating_text = book_soup.find('p', class_='star-rating')['class'][1]
            rating = rating_map.get(rating_text, 0) # Default to 0 if not found
            category = book_soup.find('ul', class_='breadcrumb').find_all('a')[2].text
            description = book_soup.find('div', id='product_description').find_next_sibling('p').text

            # Append the dictionary of book data to our master list
            all_books_data.append({
                'Title': title,
                'Price': price,
                'Stock Availability': stock,
                'Rating': rating,
                'Category': category,
                'Description': description
            })
        except Exception as e:
            print(f"  Could not process book. Error: {e}")

    # Check if there is a "next" button on the page
    next_button = soup.find('li', class_='next')
    if next_button:
        # If there is, get the link and prepare to scrape the next page
        next_page_href = next_button.find('a')['href']
        current_page_url = base_url + next_page_href
        time.sleep(1) # Be polite and wait 1 second between page loads
    else:
        # If there's no "next" button, we're on the last page. Break the loop.
        print("No more pages found. Scraping complete.")
        break

# Create the final DataFrame from our list of dictionaries
books_df = pd.DataFrame(all_books_data)

# Save the rich dataset to a CSV file
books_df.to_csv('books_dataset.csv', index=False, encoding='utf-8-sig')

print("\n--- Scraping Finished! ---")
print(f"Total books scraped: {len(books_df)}")
print("\n--- Dataset Preview ---")
print(books_df.head())

Scraping page: http://books.toscrape.com/catalogue/page-1.html
Scraping page: http://books.toscrape.com/catalogue/page-2.html
Scraping page: http://books.toscrape.com/catalogue/page-3.html
Scraping page: http://books.toscrape.com/catalogue/page-4.html
Scraping page: http://books.toscrape.com/catalogue/page-5.html
Scraping page: http://books.toscrape.com/catalogue/page-6.html
Scraping page: http://books.toscrape.com/catalogue/page-7.html
Scraping page: http://books.toscrape.com/catalogue/page-8.html
Scraping page: http://books.toscrape.com/catalogue/page-9.html
  Could not process book. Error: 'NoneType' object has no attribute 'find_next_sibling'
Scraping page: http://books.toscrape.com/catalogue/page-10.html
Scraping page: http://books.toscrape.com/catalogue/page-11.html
Scraping page: http://books.toscrape.com/catalogue/page-12.html
Scraping page: http://books.toscrape.com/catalogue/page-13.html
Scraping page: http://books.toscrape.com/catalogue/page-14.html
Scraping page: http://boo