In [1]:
# Importing the libraries needed for web scraping
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Send a browser-like User-Agent so the request looks like it is coming from a real browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'
    )
}

# Send a GET request to fetch the HTML content of the Books to Scrape website.
# The timeout (in seconds) keeps the cell from hanging forever on a stalled connection.
response = requests.get('http://books.toscrape.com/', headers=headers, timeout=10)
# Fail loudly on an HTTP error status instead of silently parsing an error page.
response.raise_for_status()
# `.text` is the body decoded to str using the encoding requests infers for the response.
webpage = response.text

In [3]:
# Build a navigable parse tree from the downloaded HTML, with lxml as the backend parser.
soup = BeautifulSoup(markup=webpage, features='lxml')

In [4]:
# Each book's full title is read from the 'title' attribute of the first link inside an <h3>.
# Headings without a link, or whose link has no 'title' attribute, are skipped.
first_links = (heading.find('a') for heading in soup.find_all('h3'))
titles = [link['title'] for link in first_links if link and 'title' in link.attrs]

print(titles)




['A Light in the Attic', 'Tipping the Velvet', 'Soumission', 'Sharp Objects', 'Sapiens: A Brief History of Humankind', 'The Requiem Red', 'The Dirty Little Secrets of Getting Your Dream Job', 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'The Black Maria', 'Starving Hearts (Triangular Trade Trilogy, #1)', "Shakespeare's Sonnets", 'Set Me Free', "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", 'Rip it Up and Start Again', 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991', 'Olio', 'Mesaerion: The Best Science Fiction Stories 1800-1849', 'Libertarianism for Beginners', "It's Only the Himalayas"]


In [5]:
# Characters that may precede the number in the price text: the pound sign, plus the
# stray 'Â' that shows up in front of it when UTF-8 text has been decoded as Latin-1.
currency_prefix = '\u00c2\u00a3'  # 'Â£'

price = []
# Extract the price of each book as a string such as '51.77'.
for price_tag in soup.find_all('p', class_='price_color'):
    # Strip the currency marker by character rather than by position, so the
    # result is the same however the page text happened to be decoded.
    p = price_tag.text.strip().lstrip(currency_prefix)
    price.append(p)
    print(p)

51.77
53.74
50.10
47.82
54.23
22.65
33.34
17.93
22.60
52.15
13.99
20.66
17.46
52.29
35.02
57.25
23.88
37.59
51.33
45.17


In [6]:
# The ratings on the page are shown as stars; the star count is encoded as a word
# among the tag's CSS classes, so map each word to its numeric value.
rating_by_word = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

ratings = []
for rating_tag in soup.find_all('p', class_='star-rating'):
    class_names = rating_tag.get('class')  # the class attribute as a list

    # Always append exactly one value per tag: an unrecognised rating becomes NaN
    # instead of being skipped, so `ratings` stays aligned with the other columns.
    ratings.append(
        next(
            (value for word, value in rating_by_word.items() if word in class_names),
            np.nan,
        )
    )

# Print all ratings
print(ratings)

[3, 1, 1, 4, 5, 1, 4, 3, 4, 1, 2, 4, 5, 5, 5, 3, 1, 1, 2, 2]


In [7]:
# Combine the three first-page lists into one table, one column per list.
first_page_columns = {"Title": titles, "Price": price, "Rating": ratings}
df = pd.DataFrame(first_page_columns)


In [22]:
# Scrape every catalogue page and build one table with a row per book.

# The site has content on 50 pages.
N_PAGES = 50
PAGE_URL = 'http://books.toscrape.com/catalogue/page-{}.html'
REQUEST_TIMEOUT = 10  # seconds; keeps the loop from hanging on a stalled connection

# The star rating is encoded as a word among the rating tag's CSS classes.
RATING_BY_WORD = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

# Characters that may precede the number in the price text: the pound sign, plus the
# stray 'Â' that shows up in front of it when UTF-8 text has been decoded as Latin-1.
CURRENCY_PREFIX = '\u00c2\u00a3'  # 'Â£'


def _parse_book(pod):
    """Extract one book's fields from its 'product_pod' element.

    Parameters
    ----------
    pod : BeautifulSoup tag holding a single book's markup.

    Returns
    -------
    dict with keys 'Title', 'Price' and 'Rating'. Any field that cannot be
    found in the markup is set to np.nan, so every book yields a full row.
    """
    # Title: the 'title' attribute of the first link inside the <h3>.
    heading = pod.find('h3')
    link = heading.find('a') if heading is not None else None
    book_title = link.get('title', np.nan) if link is not None else np.nan

    # Price: kept as a string such as '51.77', with the currency marker removed
    # by character (not by position) so it does not depend on the text decoding.
    price_tag = pod.find('p', class_='price_color')
    if price_tag is not None:
        book_price = price_tag.text.strip().lstrip(CURRENCY_PREFIX)
    else:
        book_price = np.nan

    # Rating: first known rating word present in the tag's class list.
    rating_tag = pod.find('p', class_='star-rating')
    class_names = rating_tag.get('class', []) if rating_tag is not None else []
    book_rating = next(
        (value for word, value in RATING_BY_WORD.items() if word in class_names),
        np.nan,
    )

    return {'Title': book_title, 'Price': book_price, 'Rating': book_rating}


# Collect plain records first and build the DataFrame once at the end;
# concatenating a growing DataFrame inside the loop re-copies all rows each time.
book_records = []

# A session reuses the connection across pages and sends the same
# browser-like headers as the single-page request earlier in the notebook.
with requests.Session() as session:
    session.headers.update(headers)

    for n in range(1, N_PAGES + 1):
        response = session.get(PAGE_URL.format(n), timeout=REQUEST_TIMEOUT)
        # Fail loudly rather than silently contributing zero rows for a failed page.
        response.raise_for_status()

        # Pass the raw bytes so BeautifulSoup works out the character encoding from
        # the document itself instead of relying on the guess behind `response.text`.
        page_soup = BeautifulSoup(response.content, 'lxml')

        # All the information about each book is in an element with class 'product_pod'.
        book_records.extend(
            _parse_book(pod) for pod in page_soup.find_all(class_='product_pod')
        )

# Explicit columns keep the schema stable even if no books were found.
book_store_data = pd.DataFrame(book_records, columns=['Title', 'Price', 'Rating'])


In [23]:
# The final data. Ending the cell with the bare expression lets the notebook render
# the DataFrame as a rich table (pandas truncates long frames automatically).
book_store_data

Unnamed: 0,Title,Price,Rating
0,A Light in the Attic,51.77,3
1,Tipping the Velvet,53.74,1
2,Soumission,50.10,1
3,Sharp Objects,47.82,4
4,Sapiens: A Brief History of Humankind,54.23,5
...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,55.53,1
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,4
997,A Spy's Devotion (The Regency Spies of London #1),16.97,5
998,1st to Die (Women's Murder Club #1),53.98,1


In [34]:
# Persist the scraped table as a CSV file; the row index is left out of the file.
output_csv = 'book_store_data.csv'
book_store_data.to_csv(output_csv, index=False)