In [2]:
import pandas as pd
import time
from datetime import datetime
import json
import requests
from bs4 import BeautifulSoup
from random import randint
from tqdm import tqdm

# Function to scrape Goodreads
def scrape_goodreads(book_id):
    goodreads_url = f"https://www.goodreads.com/book/show/{book_id}"
    response = requests.get(goodreads_url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the script tag containing the JSON-LD data
        json_ld_tag = soup.find('script', {'type': 'application/ld+json'})
        if json_ld_tag:
            data = json.loads(json_ld_tag.string)
            
            # Extract the relevant data
            return {
                'rating_value': data['aggregateRating']['ratingValue'],
                'rating_count': data['aggregateRating']['ratingCount'],
                'review_count': data['aggregateRating']['reviewCount']
            }
        else:
            return {'error': 'JSON-LD tag not found'}
    else:
        return {'error': 'Failed to fetch data'}

# Load the CSV file
file_path = 'goodreads_library_export.csv'
goodreads_data = pd.read_csv(file_path)

# Iterate over the rows in the dataframe
for index, row in tqdm(goodreads_data.iterrows(), total=goodreads_data.shape[0]):
    scraped_data = scrape_goodreads(row['Book Id'])
    current_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    if 'error' not in scraped_data:
        new_row = pd.DataFrame([{
            'ISBN': row['ISBN'],
            'Title': row['Title'],
            'Scraped Rating Value': scraped_data['rating_value'],
            'Scraped Rating Count': scraped_data['rating_count'],
            'Scraped Review Count': scraped_data['review_count'],
            'Data Retrieved': current_datetime
        }])
        scraped_results = pd.concat([scraped_results, new_row], ignore_index=True)
    
    time.sleep(randint(3,7))

# Write the updated data to a new CSV file
output_file_path = 'updated_goodreads_data.csv'
scraped_results.to_csv(output_file_path, index=False)

  0%|          | 0/190 [00:00<?, ?it/s]

  scraped_results = pd.concat([scraped_results, new_row], ignore_index=True)
100%|██████████| 190/190 [34:48<00:00, 10.99s/it]
