In [1]:
import time
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import random

# List to store DataFrames from each page
all_dfs = []

# Loop through 50 pages of book listings
for j in range(1, 51):
    # Define URL and headers to mimic a browser
    url = f'https://books.toscrape.com/catalogue/category/books_1/page-{j}.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'
    }

    # Send GET request and parse HTML content
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')

    # Find all book containers on the page
    con = soup.find_all(class_='product_pod')

    # Initialize lists for storing scraped data
    title = []
    web_address = []
    rating = []
    price_in_pound = []
    availability = []
    image_link = []

    # Loop through each book and extract relevant details
    for i in con:
        # Book title
        title.append(i.h3.text)

        # Construct full URL to the book's detail page
        base = 'https://books.toscrape.com/'
        row = i.h3.a['href']
        address = row.replace('../..', base)
        web_address.append(address)

        # Book rating (e.g., Three, Four, etc.)
        rating.append(i.p.get('class')[1])

        # Price in GBP (cleaned)
        price_in_pound.append(i.find(class_='price_color').text.replace('Â£', ''))

        # Availability status (e.g., In stock)
        availability.append(i.find(class_='instock availability').text.strip())

        # Construct full image URL
        base1 = 'https://books.toscrape.com'
        row = i.a.img['src']
        img_link = row.replace('../../..', base1)
        image_link.append(img_link)

    # Create DataFrame for the current page
    d = {
        'Title': title,
        'Web Address': web_address,
        'Rating': rating,
        'Price (GBP)': price_in_pound,
        'Availability': availability,
        'Image URL': image_link
    }

    df = pd.DataFrame(d)
    all_dfs.append(df)

    # Cooldown to prevent overloading the server (1 to 5 seconds)
    time.sleep(random.randint(1, 5))

# Combine all page DataFrames into one
final = pd.concat(all_dfs, ignore_index=True)

# Save the final DataFrame to CSV file
final.to_csv('Books_data.csv', index=False)
print("Scraping complete. Data saved to 'Books_data.csv'")

Scraping complete. Data saved to 'Books_data.csv'


In [3]:
final.head()

Unnamed: 0,Title,Web Address,Rating,Price (GBP),Availability,Image URL
0,A Light in the ...,https://books.toscrape.com//a-light-in-the-att...,Three,51.77,In stock,https://books.toscrape.com/media/cache/2c/da/2...
1,Tipping the Velvet,https://books.toscrape.com//tipping-the-velvet...,One,53.74,In stock,https://books.toscrape.com/media/cache/26/0c/2...
2,Soumission,https://books.toscrape.com//soumission_998/ind...,One,50.1,In stock,https://books.toscrape.com/media/cache/3e/ef/3...
3,Sharp Objects,https://books.toscrape.com//sharp-objects_997/...,Four,47.82,In stock,https://books.toscrape.com/media/cache/32/51/3...
4,Sapiens: A Brief History ...,https://books.toscrape.com//sapiens-a-brief-hi...,Five,54.23,In stock,https://books.toscrape.com/media/cache/be/a5/b...
