# In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Accumulator that receives one dict per scraped book.
master_list = []

# Browser-like User-Agent passed along with each page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Retry policy: at most 5 retries, backoff scaled by a factor of 1, and
# retries triggered by the listed rate-limit / server-error status codes.
retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

# One adapter carrying that policy, mounted on a shared session for both schemes.
adapter = HTTPAdapter(max_retries=retry)
session = requests.Session()
session.mount('https://', adapter)
session.mount('http://', adapter)

# Helper: pull the leading ID out of a book link
def _extract_book_id(href):
    """Return the ID at the start of the last path segment of ``href``.

    Any query string or fragment is dropped first, then the last path
    segment is cut at the first '.' or '-'. This handles both
    ``/book/show/12345.Some_Title`` and ``/book/show/12345-some-title``.
    NOTE(review): Goodreads appears to emit both slug styles -- confirm
    against live pages.
    """
    path = href.split('?', 1)[0].split('#', 1)[0]
    slug = path.rstrip('/').rsplit('/', 1)[-1]
    for separator in ('.', '-'):
        slug = slug.split(separator, 1)[0]
    return slug


# Function to scrape a single page
def scrape_page(i):
    """Scrape one page of the Goodreads "bengali" shelf.

    Args:
        i: Page number, interpolated into the ``page`` query parameter.

    Returns:
        A list of dicts with keys ``'Book_ID'`` and ``'URL'``, one per
        ``<a class="bookTitle">`` element that carries an ``href``.
        An empty list is returned (and the error printed) when the
        request fails with a ``requests`` exception.
    """
    url = f"https://www.goodreads.com/shelf/show/bengali?page={i}"

    # Only the network interaction can raise RequestException, so only that
    # part is guarded; parsing happens after the page has been fetched.
    try:
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        html = response.content
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while scraping page {i}: {e}")
        return []

    soup = BeautifulSoup(html, 'html.parser')

    page_books = []
    for link in soup.find_all('a', {"class": "bookTitle"}):
        # Skip anchors without a target instead of raising KeyError, which
        # would escape the handler above and abort the whole scrape.
        href = link.get('href')
        if not href:
            continue

        page_books.append({
            'Book_ID': _extract_book_id(href),
            'URL': 'https://www.goodreads.com' + href
        })

    return page_books

# Scrape pages 1-25 of the shelf concurrently with 5 worker threads
# (adjust the worker count and page range as needed).
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(scrape_page, i) for i in range(1, 25 + 1)]

    # Collect results in submission (page) order rather than completion
    # order, so the rows written to the CSV are in the same order every run.
    for page_number, future in enumerate(futures, start=1):
        try:
            master_list.extend(future.result())
        except Exception as e:
            # An unexpected failure on one page should not discard the
            # results already gathered from the others; report and move on.
            print(f"Error occurred while processing page {page_number}: {e}")

# Convert the master_list into a pandas DataFrame
df = pd.DataFrame(master_list)

# Save the data to a CSV file
df.to_csv('books_with_ids.csv', index=False)

# Check the number of books scraped
print(len(master_list))


# Output: 1250
