In [None]:
import pandas as pd
from dotenv import load_dotenv
import os
import requests
import time

In [None]:
# Load variables from a local .env file (if present) into the process
# environment so later os.getenv() calls can read them.
load_dotenv()
file = "../public/top_100k_books.csv"

# Read ISBN as text: with dtype inference an all-digit column becomes numeric,
# which drops leading zeros and no longer matches the string ISBNs used for the
# API lookups and the later merge. Malformed CSV rows are skipped.
df = pd.read_csv(file, on_bad_lines='skip', dtype={"ISBN": str})
print(df.columns)

# Distinct, non-missing ISBNs to look up.
isbns = df["ISBN"].dropna().unique()

In [None]:
# Query Open Library once per ISBN and collect the fields needed downstream.
# Each successful lookup appends one dict to `results`; failures are only logged.
results = []

# Identical for every request, so build it once instead of per iteration.
request_headers = {"accept": "application/json",
                   "User-Agent": os.getenv("USER_AGENT")}

for isbn in isbns:
    isbn = str(isbn).strip()
    url = f"https://openlibrary.org/isbn/{isbn}.json"
    try:
        # requests has no default timeout; without one a single stalled
        # connection would block the whole run indefinitely.
        response = requests.get(url, headers=request_headers, timeout=30)
        if response.status_code == 200:
            data = response.json()

            # The description is either a plain value or a dict wrapping it
            # under the 'value' key.
            description = data.get('description')
            if isinstance(description, dict):
                description = description.get('value')

            # 'isbn_13' can be missing, None, or an empty list; indexing an
            # empty list would raise and silently drop the whole record.
            isbn13_values = data.get('isbn_13') or [None]

            results.append({
                "isbn": isbn,
                "isbn13": isbn13_values[0],
                "subjects": data.get("subjects", []),
                "description": description,
                "pagesNumber": data.get("number_of_pages"),
                "title": data.get("title")
            })
        else:
            # Include the status so rate limiting / server errors are not
            # mistaken for a missing book.
            print(f"❌ ISBN {isbn}: Not found (HTTP {response.status_code})")
    except Exception as e:
        # Best effort: log and continue with the next ISBN.
        print(f"⚠️ Error with ISBN {isbn}: {e}")
    # Pause between requests to avoid hammering the API.
    time.sleep(1)

In [None]:
# Reshape the API results into the columns that get merged into the book list.
# 'pagesNumber' from the API is not carried over; the CSV's own 'pagesNumber'
# column is kept below.
library_data = [
    {
        'ISBN10': item.get("isbn"),
        'ISBN13': item.get("isbn13"),
        'subjects': item.get('subjects', []),
        'description': item.get('description'),
    }
    for item in results
]

# Explicit columns keep the frame mergeable even when `results` is empty
# (an empty DataFrame would otherwise have no 'ISBN10' column at all).
# De-duplicate on the key so the left merge cannot multiply rows.
library_columns = ['ISBN10', 'ISBN13', 'subjects', 'description']
library_df = (
    pd.DataFrame(library_data, columns=library_columns)
    .drop_duplicates(subset='ISBN10')
)

columns_to_keep = [
    'Id', 'Name', 'Publisher', 'PublishYear', 'CountsOfReview', 
    'Authors', 'Rating', 'ISBN', 'pagesNumber'
]
clean_df = df[columns_to_keep]

# 'ISBN10' holds str(isbn).strip(); normalise the left key the same way so
# whitespace or a non-string dtype in the CSV column cannot break the match.
# Missing ISBNs stay missing and simply get no API data.
isbn_key = (
    clean_df['ISBN']
    .map(lambda value: str(value).strip(), na_action='ignore')
    .astype(object)
)

final_df = pd.merge(
    clean_df.assign(_isbn_key=isbn_key),
    library_df,
    left_on='_isbn_key',
    right_on='ISBN10',
    how='left'
)
# Drop the helper key columns; the original 'ISBN' column is left untouched.
final_df = final_df.drop(columns=['_isbn_key', 'ISBN10'])

output_file = "../public/test_29k.csv"
final_df.to_csv(output_file, index=False)

In [None]:
def _has_content(value):
    """Return True when `value` is neither missing (None/NaN) nor empty ('' or [])."""
    # NaN is a truthy float, so it has to be ruled out explicitly.
    if isinstance(value, float) and pd.isna(value):
        return False
    return bool(value)


print("\n=== DEBUG COUNTERS ===")

# Count descriptions and subjects in results
desc_count = sum(1 for item in results if item.get('description'))
subj_count = sum(1 for item in results if item.get('subjects'))
total_api_success = len(results)

print(f"API Success: {total_api_success}/{len(isbns)} books")
print(f"Books with descriptions: {desc_count}/{total_api_success}")
print(f"Books with subjects: {subj_count}/{total_api_success}")

if 'final_df' in locals():
    print("\nFinal DataFrame Stats:")
    print(f"Total rows: {len(final_df)}")
    # Use the same "non-empty" test as the counters above: subjects default to
    # an empty list, which notna() would wrongly count as present.
    print(f"Rows with description: {final_df['description'].map(_has_content).sum()}")
    print(f"Rows with subjects: {final_df['subjects'].map(_has_content).sum()}")
    print(f"Rows with ISBN13: {final_df['ISBN13'].notna().sum()}")
    
    # A left merge should keep exactly one output row per input row.
    if len(final_df) != len(clean_df):
        print(f"⚠️ Row count changed during merge! Before: {len(clean_df)}, After: {len(final_df)}")
else:
    print("Final DataFrame not created yet")

print("=== DEBUG END ===")