# 00 Import Libraries
We need `requests` for visiting websites, `BeautifulSoup` for scraping websites and `json` for saving the extracted information.

In [None]:
import json
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# 01 Initialize Variables and Beautiful Soup
Set the homepage of the website we want to scrape, download and parse it with Beautiful Soup, and initialize the lists that will hold the extracted information.

In [None]:
# HOMEPAGE URL TO SCRAPE
homepage = "https://books.toscrape.com/"

# INITIALIZE BEAUTIFUL SOUP
# The timeout keeps this cell from hanging forever on an unresponsive server,
# and raise_for_status() aborts on a 4xx/5xx answer instead of silently
# parsing an error page.
response = requests.get(homepage, timeout=30)
response.raise_for_status()
# Force UTF-8 decoding of response.text rather than relying on the charset
# that requests guesses from the HTTP headers.
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')

# INITIALIZE LISTS TO STORE INFORMATION
# books_info receives one dict per scraped book.
books_info = []
books_description = []

# 02 Loop To Extract and Clean Information
First we need to get all book categories and the links to those categories. We then go to each of those category pages and extract the information about every book listed there. We also follow the link of each book to get its description from the detail page. If there is pagination (more than one page per category), we also visit those pages.

In [None]:
# SECONDS TO WAIT FOR THE SERVER BEFORE A REQUEST IS ABORTED
REQUEST_TIMEOUT = 30

# EXACT class ATTRIBUTE OF THE <li> ELEMENT THAT WRAPS EACH BOOK ON A LISTING PAGE
BOOK_ITEM_CLASS = 'col-xs-6 col-sm-4 col-md-3 col-lg-3'


def _fetch_soup(session, url):
    """Download ``url`` and return the page parsed by BeautifulSoup.

    Args:
        session: ``requests.Session`` used to send the request.
        url: Absolute URL of the page to download.

    Returns:
        A ``BeautifulSoup`` tree built from the response body.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    page = session.get(url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    # Setting ``page.encoding`` would only affect ``page.text``. The raw bytes
    # are parsed here, so the UTF-8 hint is handed to BeautifulSoup directly.
    return BeautifulSoup(page.content, 'html.parser', from_encoding='utf-8')


def _extract_description(soup_detail):
    """Return the description text of a book detail page.

    The description is taken from the first ``<p>`` without a ``class``
    attribute inside ``<article class="product_page">``. When the article or
    such a paragraph is missing, a placeholder string is returned instead.
    """
    article = soup_detail.find('article', class_='product_page')
    description_tag = article.find('p', class_=False) if article is not None else None
    if description_tag is None:
        return "No description available"
    return description_tag.text


def _scrape_book(session, book, book_category, category_page_url):
    """Build the record for one book listed on a category page.

    Args:
        session: ``requests.Session`` used to download the detail page.
        book: The ``<li>`` tag of the book on the listing page.
        book_category: Name of the category the listing page belongs to.
        category_page_url: URL of the listing page; the book's relative link
            is resolved against it.

    Returns:
        A dict with the keys ``book_title``, ``book_category``,
        ``books_description``, ``book_link``, ``book_rating``, ``book_price``
        and ``book_availability``.
    """
    # EXTRACT Title, Rating, Price, Availability
    book_title_tag = book.find('h3').find('a')
    href = book_title_tag['href']
    # The visible link text may be a shortened form of the title, so prefer
    # the anchor's ``title`` attribute when present and fall back to the text.
    book_title = book_title_tag.get('title') or book_title_tag.text
    book_link = href.replace("../../../", "")
    # The second class of the rating paragraph is stored as the rating value.
    book_rating = book.find('p', class_='star-rating')['class'][1]
    book_price = book.find('p', class_='price_color').text
    book_availability = book.find('p', class_='instock availability').text.strip()

    # GET description FROM DETAIL PAGE
    soup_detail = _fetch_soup(session, urljoin(category_page_url, href))
    book_description = _extract_description(soup_detail)

    return {
        'book_title': book_title,
        'book_category': book_category,
        'books_description': book_description,
        'book_link': book_link,
        'book_rating': book_rating,
        'book_price': book_price,
        'book_availability': book_availability,
    }


# LOOP FOR CATEGORIES
# A single Session is shared by every request of the crawl so the connection
# to the site can be reused instead of being reopened for each page.
with requests.Session() as session:
    for a in soup.select('.side_categories ul ul li a'):
        book_category = a.text.strip()
        # Category links were read from the homepage, so resolve them against it.
        category_page_url = urljoin(homepage, a['href'])

        # GO TO CATEGORY PAGE AND EXTRACT ALL BOOKS FROM CATEGORY
        while category_page_url:
            soup_category = _fetch_soup(session, category_page_url)

            # LOOP THRU ALL BOOKS INSIDE <li> ELEMENTS AND ADD THEM TO THE LIST
            for book in soup_category.find_all('li', class_=BOOK_ITEM_CLASS):
                books_info.append(
                    _scrape_book(session, book, book_category, category_page_url)
                )

            # CHECK FOR NEXT-PAGE IN THIS CATEGORY
            next_page = soup_category.find('li', class_='next')
            if next_page:
                # The "next" href is relative to the current listing page.
                category_page_url = urljoin(category_page_url, next_page.find('a')['href'])
            else:
                category_page_url = None

# 03 Save Information to a File
Save all extracted information in a local JSON file.

In [None]:
# CONVERT TO JSON
# ensure_ascii=False keeps non-ASCII characters as-is instead of \uXXXX escapes.
json_data = json.dumps(books_info, ensure_ascii=False, indent=4)

# SAVE JSON
output_path = 'json/books_info.json'
# Create the target folder if needed so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Write UTF-8 explicitly: without it open() uses the platform's locale
# encoding, which may be unable to encode the unescaped non-ASCII characters.
with open(output_path, 'w', encoding='utf-8') as file:
    file.write(json_data)