In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Function to scrape title, author and genre for each book listed on the parent page
def fetch_books_urls_and_details(parent_url, max_books=None, request_delay=0):
    """Scrape title, author and genre for books on a Goodreads list page.

    The list page is downloaded once; for every book row found on it, the
    book's detail page is then requested via ``scrape_genre_from_book_page``
    to obtain the genre.

    Args:
        parent_url: URL of the list page to scrape.
        max_books: Optional cap on the number of book rows to process. When
            given, detail pages are only requested for the first
            ``max_books`` rows. ``None`` (the default) processes every row.
        request_delay: Seconds to sleep between consecutive detail-page
            requests. ``0`` (the default) disables the pause.

    Returns:
        A list of dicts with the keys ``'title'``, ``'author'`` and
        ``'genre'``. An empty list is returned when the list page cannot be
        retrieved.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Without a timeout requests may wait indefinitely on a stalled connection.
    try:
        response = requests.get(parent_url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the parent page: {parent_url} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve the parent page: {parent_url}")
        return []

    # Parse the HTML content of the parent page
    soup = BeautifulSoup(response.content, 'html.parser')

    rows = soup.find_all('tr', itemtype='http://schema.org/Book')
    if max_books is not None:
        # Trim before the loop so no detail page is fetched for a row the
        # caller is going to discard anyway.
        rows = rows[:max(max_books, 0)]

    book_data = []
    detail_requests_made = 0
    for book in rows:
        title_tag = book.find('a', class_='bookTitle')
        title = title_tag.get_text(strip=True) if title_tag else 'Unknown Title'

        author_tag = book.find('a', class_='authorName')
        author = author_tag.get_text(strip=True) if author_tag else 'Unknown Author'

        # The link to the book detail page; .get() tolerates an anchor that
        # has no href attribute instead of raising KeyError.
        href = title_tag.get('href') if title_tag else None
        book_url = 'https://www.goodreads.com' + href if href else None

        genre = 'Unknown Genre'
        if book_url:
            # Pause between (not before) detail requests when asked to.
            if request_delay > 0 and detail_requests_made > 0:
                time.sleep(request_delay)
            detail_requests_made += 1
            try:
                genre = scrape_genre_from_book_page(book_url)
            except requests.RequestException as exc:
                # One unreachable detail page should not abort the whole list.
                print(f"Failed to retrieve the book page: {book_url} ({exc})")

        book_data.append({
            'title': title,
            'author': author,
            'genre': genre
        })

    return book_data

# Function to scrape the genre from the book's detail page
def scrape_genre_from_book_page(book_url):
    """Return the first genre link text found on a book's detail page.

    Args:
        book_url: Absolute URL of the book detail page.

    Returns:
        The stripped text of the first ``<a>`` element whose class attribute
        is ``'actionLinkLite bookPageGenreLink'``, or ``'Unknown Genre'``
        when the page cannot be retrieved or no such element exists.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # A timeout keeps one stalled connection from hanging the whole scrape,
    # and network errors take the same best-effort fallback as a non-200
    # response instead of propagating to the caller.
    try:
        response = requests.get(book_url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the book page: {book_url} ({exc})")
        return 'Unknown Genre'

    if response.status_code != 200:
        print(f"Failed to retrieve the book page: {book_url}")
        return 'Unknown Genre'

    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): a multi-class string passed to class_ is matched against
    # the exact value of the class attribute - confirm this still matches the
    # markup Goodreads serves.
    genre_tag = soup.find('a', class_='actionLinkLite bookPageGenreLink')
    if genre_tag is None:
        return 'Unknown Genre'

    return genre_tag.get_text(strip=True)

# Main function to scrape data from parent and nested pages
def scrape_data_from_goodreads_list(parent_url, num_books):
    """Scrape a Goodreads list page and return its first ``num_books`` books.

    Args:
        parent_url: URL of the Goodreads list page.
        num_books: Maximum number of books to return. Values below zero are
            treated as zero.

    Returns:
        A list of at most ``num_books`` dicts, each with the keys
        ``'title'``, ``'author'`` and ``'genre'``, in page order.
    """
    # Step 1: Get the books and their details. Every HTTP request (list page
    # and detail pages) is made inside this call.
    books_data = fetch_books_urls_and_details(parent_url)

    # A negative count would otherwise slice from the END of the list
    # (e.g. [:-1] means "all but the last"), which is never what a
    # "number of books" argument means.
    limit = max(num_books, 0)

    # Step 2: Report and collect the selected books. No sleep here: the
    # requests have already completed by this point, so pausing in this loop
    # would not reduce load on the server - it would only slow the printout.
    all_books = []
    for position, book_data in enumerate(books_data[:limit], start=1):
        print(f"Scraping book {position}: {book_data['title']} by {book_data['author']}")
        all_books.append(book_data)

    return all_books

# Goodreads "Best Books Ever" list page used as the scraping entry point
parent_url = 'https://www.goodreads.com/list/show/1.Best_Books_Ever'

# Collect title/author/genre records for the first 10 books on the list
# (change num_books to scrape more or fewer)
scraped_data = scrape_data_from_goodreads_list(parent_url=parent_url, num_books=10)

# Tabulate the records and write them to CSV without the index column
df = pd.DataFrame(data=scraped_data)
df.to_csv(path_or_buf='goodreads_books_data.csv', index=False)

print("Scraping complete. Data saved to goodreads_books_data.csv")


Scraping book 1: The Hunger Games (The Hunger Games, #1) by Suzanne Collins
Scraping book 2: Harry Potter and the Order of the Phoenix (Harry Potter, #5) by J.K. Rowling
Scraping book 3: Pride and Prejudice by Jane Austen
Scraping book 4: To Kill a Mockingbird by Harper Lee
Scraping book 5: The Book Thief by Markus Zusak
Scraping book 6: Twilight (The Twilight Saga, #1) by Stephenie Meyer
Scraping book 7: Animal Farm by George Orwell
Scraping book 8: J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings by J.R.R. Tolkien
Scraping book 9: The Chronicles of Narnia (The Chronicles of Narnia, #1-7) by C.S. Lewis
Scraping book 10: The Fault in Our Stars by John Green
Scraping complete. Data saved to goodreads_books_data.csv
