# In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

# Function to extract content from a book's page
def extract_book_content(url, timeout=10):
    """Fetch a book detail page and return its title, URL and description.

    Args:
        url: Absolute URL of the book's detail page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        A ``(page_name, url, content)`` tuple. ``page_name`` is the stripped
        text of the first ``<h1>`` (``'No title'`` when there is none) and
        ``content`` is the stripped ``content`` attribute of the
        ``<meta name="description">`` tag (``'No description available'``
        when the tag or its attribute is missing). If the request fails,
        ``(None, url, None)`` is returned and the error is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return None, url, None

    # Hand the raw bytes to BeautifulSoup so it can detect the encoding from
    # the document itself. ``response.text`` relies on the HTTP header and
    # falls back to ISO-8859-1 when no charset is sent, which garbles
    # non-ASCII characters in titles and descriptions.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the book title
    heading = soup.h1
    page_name = heading.get_text().strip() if heading is not None else 'No title'

    # Extract the book description from the <meta name="description"> tag.
    # ``.get`` avoids a KeyError when the tag exists without a ``content``
    # attribute.
    description_tag = soup.find('meta', {'name': 'description'})
    raw_description = (
        description_tag.get('content') if description_tag is not None else None
    )
    if raw_description is None:
        content = 'No description available'
    else:
        content = raw_description.strip()

    return page_name, url, content

# Function to crawl all pages of Books to Scrape
def crawl_books_to_scrape(main_url, timeout=10):
    """Crawl every listing page reachable from ``main_url`` and scrape each book.

    Starting at ``main_url``, the crawler collects the book links found in
    ``<h3><a>`` elements, scrapes each linked page with
    ``extract_book_content``, then follows the ``<li class="next">`` link
    until no such link is present.

    Args:
        main_url: URL of the first listing page to crawl.
        timeout: Seconds to wait for each listing-page request made by this
            function before giving up.

    Returns:
        A list of ``[page_name, url, content]`` rows, one per book whose
        page was scraped with a non-empty title. If a listing page cannot
        be fetched, the error is printed and the rows gathered so far are
        returned.
    """
    data = []
    current_page_url = main_url

    while current_page_url:
        try:
            # Request the current listing page
            response = requests.get(current_page_url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error accessing {current_page_url}: {e}")
            break

        # Parse the raw bytes so BeautifulSoup can detect the encoding
        # instead of relying on the HTTP header's (possibly absent) charset.
        soup = BeautifulSoup(response.content, 'html.parser')

        # Book links are relative to the listing page they appear on.
        # ``a[href]`` skips anchors that carry no link at all.
        book_links = [
            urljoin(current_page_url, a['href'])
            for a in soup.select('h3 a[href]')
        ]

        # Extract data from each book's detail page
        for link in book_links:
            page_name, url, content = extract_book_content(link)
            if page_name:
                data.append([page_name, url, content])

        # Follow the pager's own "next" href rather than guessing the URL
        # pattern: the link is relative to the current page, so this also
        # works when the crawl starts on a category or a later page.
        next_link = soup.select_one('li.next a[href]')
        if next_link is not None:
            current_page_url = urljoin(current_page_url, next_link['href'])
        else:
            current_page_url = None

    return data

# Main code to execute the crawler and save to Excel
if __name__ == '__main__':
    # Entry point: crawl the whole catalogue, then dump the rows to a
    # spreadsheet with one row per scraped book.
    main_url = 'https://books.toscrape.com/'
    column_names = ['Page Name', 'URL', 'Content']

    crawled_data = crawl_books_to_scrape(main_url)

    # Build the table and write it out without the numeric index column
    df = pd.DataFrame(data=crawled_data, columns=column_names)
    df.to_excel('books_to_scrape_full_data.xlsx', index=False)

    print("Data saved to books_to_scrape_full_data.xlsx")

# Takes around 15 min to finish scraping 1000 books.

# Output: Data saved to books_to_scrape_full_data.xlsx
