In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Function to fetch the list URLs from a single page of the "Recently Active Lists"
def fetch_list_urls(url):
    """Collect the lists linked from one "Recently Active Lists" page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of ``(list_name, list_url)`` tuples, one for every
        ``<a class="listTitle">`` anchor that carries an ``href``. The URL is
        resolved against ``https://www.goodreads.com``. An empty list is
        returned when the request raises, or when the response status is
        not 200.
    """
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # A timeout keeps a stalled connection from hanging the whole scrape, and
    # network errors are reported the same way as a bad status code.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the webpage: {url} ({exc})")
        return []

    # Anything other than 200 is treated as a failed page
    if response.status_code != 200:
        print(f"Failed to retrieve the webpage: {url}")
        return []

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    list_urls = []
    for list_container in soup.find_all('a', class_='listTitle'):
        href = list_container.get('href')
        if not href:
            # An anchor without a target cannot be followed; skip it
            continue
        list_name = list_container.get_text(strip=True)
        # urljoin copes with site-relative and already-absolute hrefs alike
        list_url = urljoin('https://www.goodreads.com', href)
        list_urls.append((list_name, list_url))

    return list_urls

# Function to fetch book titles and user ratings from a given list URL
def fetch_books_from_list(list_name, list_url):
    """Scrape book titles and rating text from a single list page.

    Args:
        list_name: Display name of the list; copied into every returned row.
        list_url: Address of the list page to download.

    Returns:
        A list of ``(list_name, book_title, user_rating)`` tuples, one per
        ``<tr class="bookalike">`` row that contains an
        ``<a class="bookTitle">`` element. ``user_rating`` is the text of the
        row's ``<span class="minirating">``, or ``"No rating"`` when that
        span is absent. An empty list is returned when the request raises,
        or when the response status is not 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # A timeout keeps a stalled connection from hanging the whole scrape, and
    # network errors are reported the same way as a bad status code.
    try:
        response = requests.get(list_url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the list: {list_url} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve the list: {list_url}")
        return []

    print(f"Fetching books from list: {list_name} ({list_url})")

    soup = BeautifulSoup(response.content, 'html.parser')

    book_data = []
    for book_item in soup.find_all('tr', class_='bookalike'):
        title_tag = book_item.find('a', class_='bookTitle')
        if title_tag is None:
            # find() returns None when the row has no title link; skip the
            # row instead of failing on None.get_text()
            continue
        book_title = title_tag.get_text(strip=True)

        # The rating text is optional; fall back to a placeholder
        rating_tag = book_item.find('span', class_='minirating')
        if rating_tag is not None:
            user_rating = rating_tag.get_text(strip=True)
        else:
            user_rating = "No rating"

        book_data.append((list_name, book_title, user_rating))

    return book_data

# Function to scrape list URLs and then scrape books from those lists
def scrape_lists(base_url, total_pages, delay=2):
    """Walk the paginated list index and gather book rows from every list.

    Args:
        base_url: URL prefix to which the 1-based page number is appended.
        total_pages: Number of index pages to visit; pages 1..total_pages
            are requested. Zero or a negative value visits nothing.
        delay: Seconds to pause after each HTTP request (index page or
            individual list) so the server is not hit back-to-back.

    Returns:
        A flat list of the rows returned by ``fetch_books_from_list`` for
        every list found, in page order.
    """
    all_books = []

    for page_num in range(1, total_pages + 1):
        url = f"{base_url}{page_num}"
        print(f"Scraping page {page_num}: {url}")

        # Fetch list URLs from the current page, then pause before the next request
        list_urls = fetch_list_urls(url)
        time.sleep(delay)

        if not list_urls:
            print(f"No lists found on page {page_num}")
            continue

        for list_name, list_url in list_urls:
            # fetch_books_from_list prints its own "Fetching books" progress
            # line, so printing one here as well would duplicate it.
            all_books.extend(fetch_books_from_list(list_name, list_url))
            # Pause after every list request, not only once per index page
            time.sleep(delay)

    return all_books

# Function to save titles and ratings to a CSV file using pandas
def save_to_csv(book_data, filename):
    """Write scraped rows to a CSV file via pandas.

    Args:
        book_data: Iterable of ``(list_name, book_title, user_rating)`` rows.
        filename: Path of the CSV file to write.
    """
    column_names = ['List Name', 'Book Title', 'User Rating']
    # index=False leaves the DataFrame's row index out of the file
    pd.DataFrame(book_data, columns=column_names).to_csv(filename, index=False)

# Paginated Goodreads "Recently Active Lists" endpoint; the page number is appended
base_url = 'https://www.goodreads.com/list/recently_active_lists?page='

# How many index pages to walk (pages 1 through 4 while testing)
total_pages = 4

# Gather (list name, book title, rating) rows from pages 1..total_pages
book_data = scrape_lists(base_url, total_pages)

# Write the CSV only when at least one row was scraped
if not book_data:
    print("No books were scraped.")
else:
    save_to_csv(book_data, 'goodreads_recently_active_books_with_ratings.csv')
    print(f'Successfully saved {len(book_data)} books and ratings to CSV file.')


Scraping page 1: https://www.goodreads.com/list/recently_active_lists?page=1
Fetching books from list: Christmas Fantasy Wish List (https://www.goodreads.com/list/show/15150.Christmas_Fantasy_Wish_List)
Fetching books from list: Christmas Fantasy Wish List (https://www.goodreads.com/list/show/15150.Christmas_Fantasy_Wish_List)
b'<!DOCTYPE html>\n<html class="desktop withSiteHeaderTopFullImage\n">\n<head>\n  <title>Christmas Fantasy Wish List (3388 books)</title>\n\n<meta content="3,388 books based on 202 votes: A Wrinkle in Time by Madeleine L\'Engle, The Lion, the Witch and the Wardrobe by C.S. Lewis, In a Holidaze by Christina La..." name=\'description\'>\n<meta content=\'telephone=no\' name=\'format-detection\'>\n<link href=\'https://www.goodreads.com/list/show/15150.Christmas_Fantasy_Wish_List\' rel=\'canonical\'>\n\n\n\n    <script type="text/javascript"> var ue_t0=window.ue_t0||+new Date();\n </script>\n  <script type="text/javascript">\n    var ue_mid = "A1PQBFHBHS6YH1";\n    va