In [19]:
# import requests
# from bs4 import BeautifulSoup
# import pandas as pd

In [20]:
# headers    = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# url        = 'https://www.goodreads.com/review/list/73221135?page=1'
# response   = requests.get(url, headers=headers)
# soup       = BeautifulSoup(response.content,'html.parser')
# temph1     = soup.find_all('a',class_='bookTitle')

In [21]:
# tempa = soup.find_all('div',class_='value')
# tempa

In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Request headers sent with every page fetch. The User-Agent is a desktop
# Chrome string; the literal is split across lines purely for readability
# (adjacent string literals are concatenated by the parser).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

# Function to fetch and parse a page
def fetch_page(page_num, timeout=30):
    """Fetch one page of the Goodreads review list and extract its book links.

    Args:
        page_num: Page number substituted into the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely and a single stalled
            connection would block a worker thread forever.

    Returns:
        A list of ``{'book_id': str, 'url': str}`` dicts, one per
        ``/book/show/`` link found inside a ``<div class="value">``.
        Returns an empty list if the request fails (the error is printed).
    """
    url = f"https://www.goodreads.com/review/list/73221135?page={page_num}"

    # Retry failed connection attempts up to 5 times with backoff.
    retry = Retry(connect=5, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    # The context manager closes the session (and its connection pool) when
    # done; a session is created per call, so leaving it open would leak
    # sockets across the many pages fetched.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        try:
            response = session.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raise for 4XX/5XX HTTP status codes
        except requests.RequestException as e:
            # Best effort: a page that cannot be fetched contributes no books.
            print(f"Error fetching page {page_num}: {e}")
            return []

        # The body is fully downloaded by now, so it stays usable after the
        # session is closed.
        html = response.content

    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Extract book_id and URL from the first link of each 'value' div
    books = []
    for div in soup.find_all('div', class_='value'):
        a_tag = div.find('a', href=True)
        if not a_tag:
            continue

        book_url = a_tag['href']
        if '/book/show/' not in book_url:
            continue

        # The id is the part of the slug before the first '-' or '.'
        slug = book_url.split('/book/show/')[1]
        book_id = slug.split('-')[0].split('.')[0]
        books.append({'book_id': book_id, 'url': f"https://www.goodreads.com{book_url}"})

    return books

# Main function to fetch multiple pages concurrently
def fetch_all_pages(total_pages=103, max_workers=10,
                    output_path='books_with_ids2.csv', fetcher=None):
    """Fetch pages 1..total_pages concurrently and save the unique books as CSV.

    Args:
        total_pages: Number of pages to fetch (pages are numbered from 1).
        max_workers: Size of the thread pool used for the fetches.
        output_path: Destination of the CSV file.
        fetcher: Callable taking a page number and returning a list of
            ``{'book_id': ..., 'url': ...}`` dicts. Defaults to ``fetch_page``.

    Returns:
        A pandas DataFrame with columns ``book_id`` and ``url``, de-duplicated
        on both columns (first occurrence kept) and ordered by page number.
    """
    if fetcher is None:
        fetcher = fetch_page

    # Results are keyed by page number so the final row order does not depend
    # on the order in which the threads happen to finish.
    books_by_page = {}

    # Use ThreadPoolExecutor to fetch multiple pages concurrently
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(fetcher, page_num): page_num
            for page_num in range(1, total_pages + 1)
        }

        # As pages complete, gather the results
        for future in as_completed(futures):
            try:
                books_by_page[futures[future]] = future.result()
            except Exception as exc:
                # Best effort: a failed page is reported and skipped.
                print(f"Error processing a future: {exc}")

    all_books = [
        book
        for page_num in sorted(books_by_page)
        for book in books_by_page[page_num]
    ]

    # Passing the columns explicitly keeps the schema (and the CSV header)
    # intact even when no book was fetched at all.
    df = pd.DataFrame(all_books, columns=['book_id', 'url'])

    # Remove duplicates based on book_id and url columns
    df = df.drop_duplicates(subset=['book_id', 'url']).reset_index(drop=True)

    # Save the cleaned DataFrame to a CSV file
    df.to_csv(output_path, index=False)

    print(f"Total unique books fetched: {len(df)}")
    return df

# Entry point: crawl pages 1 through 103 when this code is run directly
# (not when it is imported as a module).
if '__main__' == __name__:
    fetch_all_pages(total_pages=103)


Total unique books fetched: 2057
