In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
# Fetch the first catalogue page; the module-level `soup` built here is what
# getTotalPages() and getAllCategories() read from.
URL = "https://books.toscrape.com/catalogue/page-1.html"
# Without a timeout requests can block forever on a stalled connection.
page = requests.get(URL, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [3]:
# Globals: module-level accumulators.
#   listCategories -> category names (filled by fillCategoryList)
#   listBooks      -> one dict per scraped book (filled by extractAllBooks)
listCategories, listBooks = [], []

In [4]:
# Locators: CSS selectors used by the scraping helpers.
# Selectors for a single book entry all share the same product-pod prefix.
_PRODUCT_POD = "li article.product_pod"

BOOKLOCATOR = f"{_PRODUCT_POD} h3 a"
RATINGLOCATOR = f"{_PRODUCT_POD} p.star-rating"
PRICELOCATOR = f"{_PRODUCT_POD} div.product_price p.price_color"
PICLOCATOR = f"{_PRODUCT_POD} div.image_container a img"

# Page-level selectors (applied to a whole parsed page).
NUMBERPAGESLOCATOR = (
    "div.container-fluid div.page_inner div.row div.col-sm-8 "
    "section div div ul.pager li.current"
)
CATEGORIESLOCATOR = "aside.sidebar div.side_categories ul.nav-list ul li a"
BOOKSLOCATOR = "div.page_inner div.row div section ol.row li"

In [5]:
# functions
def getProductName(book):
    """Return the ``title`` attribute of the book's heading link.

    ``book`` must offer ``select_one`` (e.g. a BeautifulSoup tag); the link
    is located with ``BOOKLOCATOR``.
    """
    titleLink = book.select_one(BOOKLOCATOR)
    return titleLink.attrs['title']

def getURLProduct(book):
    """Return the raw ``href`` attribute of the book's heading link.

    The value is returned exactly as found in the markup; no base URL is
    prepended here (presumably the href is page-relative - verify before
    requesting it).
    """
    titleLink = book.select_one(BOOKLOCATOR)
    return titleLink.attrs['href']

def ratingNumber(rating):
    """Translate a star-rating word into its numeric value.

    'One' through 'Five' (case-sensitive) map to 1 through 5; any other
    value yields None.
    """
    ratingWords = ('One', 'Two', 'Three', 'Four', 'Five')
    for value, word in enumerate(ratingWords, start=1):
        if rating == word:
            return value
    return None

def getRatingProduct(book):
    """Return the book's star rating as a number via ``ratingNumber``.

    The rating word is taken from the second entry of the ``class``
    attribute of the element matched by ``RATINGLOCATOR``.
    """
    ratingTag = book.select_one(RATINGLOCATOR)
    ratingWord = ratingTag.attrs['class'][1]
    return ratingNumber(ratingWord)

def getProductPrice(book):
    """Return the book's price as a float.

    The text of the element matched by ``PRICELOCATOR`` is searched for the
    first number, so the result does not depend on how many characters the
    currency prefix occupies (e.g. surrounding whitespace, or a mis-decoded
    pound sign that shows up as two characters).

    Raises:
        ValueError: if the price text contains no number.
    """
    import re  # local import: only this helper needs it

    priceText = book.select_one(PRICELOCATOR).text
    # Digits, optional thousands separators, optional decimal part.
    found = re.search(r"\d[\d,]*(?:\.\d+)?", priceText)
    if found is None:
        raise ValueError(f"no numeric price found in {priceText!r}")
    return float(found.group().replace(",", ""))

def getTotalPages():
    """Return the page count read from the pager of the global ``soup``.

    The text of the element matched by ``NUMBERPAGESLOCATOR`` is split on
    whitespace and its last token is converted to ``int`` (presumably the
    text reads like "Page 1 of N" - verify against the site).
    """
    pagerText = soup.select_one(NUMBERPAGESLOCATOR).text
    lastToken = pagerText.split()[-1]
    return int(lastToken)

def getAllCategories():
    """Return every element of the global ``soup`` matching ``CATEGORIESLOCATOR``."""
    categoryLinks = soup.select(CATEGORIESLOCATOR)
    return categoryLinks

def fillCategoryList():
    """Populate the global ``listCategories`` with stripped category names.

    The list is rebuilt in place on every call, so re-running the notebook
    cell does not append a second copy of every category. The resulting list
    is printed.
    """
    # Slice assignment keeps the same list object (other references stay
    # valid) while discarding whatever a previous run left behind.
    listCategories[:] = [category.text.strip() for category in getAllCategories()]
    print(listCategories)

def getAllBooks(soup):
    """Return every element of the given parsed page matching ``BOOKSLOCATOR``.

    Note that the ``soup`` parameter shadows the module-level ``soup``; only
    the argument is used here.
    """
    bookItems = soup.select(BOOKSLOCATOR)
    return bookItems

def getProductPicURL(book):
    """Return an absolute URL for the book's cover image.

    The ``src`` attribute of the element matched by ``PICLOCATOR`` is
    normalised before being appended to the site root: every leading ``../``
    segment is removed (not just a fixed three characters), a leading ``/``
    is dropped, and a ``src`` that is already absolute is returned untouched.
    This assumes image paths are rooted at the site root - TODO confirm.
    """
    domain = "https://books.toscrape.com/"
    relative = book.select_one(PICLOCATOR).attrs['src']
    if relative.startswith(("http://", "https://")):
        return relative
    # Blindly slicing [3:] would mangle a src with no "../" prefix
    # ("media/x.jpg" -> "ia/x.jpg") or with more than one.
    while relative.startswith("../"):
        relative = relative[3:]
    return domain + relative.lstrip("/")

def extractAllBooks(allBooks):
    """Append one record per book in ``allBooks`` to the global ``listBooks``.

    Each record is a dict with the keys ``bookName``, ``bookURL``,
    ``bookRating``, ``bookPrice`` and ``bookPic``, filled by the matching
    getter helper. Nothing is returned.
    """
    for bookItem in allBooks:
        # Dict values are evaluated top to bottom, so the getters run in the
        # same order as they are listed here.
        record = {
            'bookName': getProductName(bookItem),
            'bookURL': getURLProduct(bookItem),
            'bookRating': getRatingProduct(bookItem),
            'bookPrice': getProductPrice(bookItem),
            'bookPic': getProductPicURL(bookItem),
        }
        listBooks.append(record)
        

def fillListBooks(totalPages):
    """Scrape catalogue pages 1..``totalPages`` into the global ``listBooks``.

    ``listBooks`` is emptied first, so re-running the notebook cell rebuilds
    the list instead of appending a second copy of every book. Each page is
    downloaded, parsed, and handed to ``extractAllBooks``; a progress line is
    printed per page.

    Args:
        totalPages: number of the last catalogue page to fetch (inclusive).

    Raises:
        requests.HTTPError: if a page responds with an error status (4xx/5xx).
        requests.RequestException: on connection problems or timeouts.
    """
    listBooks.clear()
    # One session for the whole crawl so the connection to the host is reused.
    with requests.Session() as session:
        for pageNumber in range(1, totalPages + 1):
            print(f"Extracting from Page {pageNumber}\n")
            pageURL = f"https://books.toscrape.com/catalogue/page-{pageNumber}.html"
            # A timeout prevents one stalled request from hanging the crawl.
            response = session.get(pageURL, timeout=30)
            # Stop on an error page rather than silently extracting nothing.
            response.raise_for_status()
            pageSoup = BeautifulSoup(response.content, 'html.parser')
            extractAllBooks(getAllBooks(pageSoup))

In [6]:
# Run the scrape: read the page count from the already-parsed first page,
# collect the category names, crawl every catalogue page, then print how
# many book records ended up in listBooks.
totalPages = getTotalPages()
fillCategoryList()
fillListBooks(totalPages)
print(len(listBooks))

['Travel', 'Mystery', 'Historical Fiction', 'Sequential Art', 'Classics', 'Philosophy', 'Romance', 'Womens Fiction', 'Fiction', 'Childrens', 'Religion', 'Nonfiction', 'Music', 'Default', 'Science Fiction', 'Sports and Games', 'Add a comment', 'Fantasy', 'New Adult', 'Young Adult', 'Science', 'Poetry', 'Paranormal', 'Art', 'Psychology', 'Autobiography', 'Parenting', 'Adult Fiction', 'Humor', 'Horror', 'History', 'Food and Drink', 'Christian Fiction', 'Business', 'Biography', 'Thriller', 'Contemporary', 'Spirituality', 'Academic', 'Self Help', 'Historical', 'Christian', 'Suspense', 'Short Stories', 'Novels', 'Health', 'Politics', 'Cultural', 'Erotica', 'Crime']
Extracting from Page 1

Extracting from Page 2

Extracting from Page 3

Extracting from Page 4

Extracting from Page 5

Extracting from Page 6

Extracting from Page 7

Extracting from Page 8

Extracting from Page 9

Extracting from Page 10

Extracting from Page 11

Extracting from Page 12

Extracting from Page 13

Extracting from 