In [78]:
import pandas as pd
from dotenv import load_dotenv
import os
import requests
import time

In [79]:
load_dotenv()
file = "public/top_100k_books.csv"

df = pd.read_csv(file, on_bad_lines='skip')
df.to_dict()
print(df.columns)
isbns = df["ISBN"].dropna().unique()

Index(['Id', 'Name', 'RatingDist1', 'pagesNumber', 'RatingDist4',
       'RatingDistTotal', 'PublishMonth', 'PublishDay', 'Publisher',
       'CountsOfReview', 'PublishYear', 'Language', 'Authors', 'Rating',
       'RatingDist2', 'RatingDist5', 'ISBN', 'RatingDist3', 'Description',
       'Count of text reviews', 'PagesNumber'],
      dtype='object')


In [80]:
results = []

for isbn in isbns:
    isbn = str(isbn).strip()
    url = f"https://openlibrary.org/isbn/{isbn}.json"
    try:
        response = requests.get(url, headers={"accept": "application/json", 
                                              "User-Agent": os.getenv("USER_AGENT")})
        if response.status_code == 200:
            data = response.json()

            description = data.get('description')
            if isinstance(description, dict):
                description = description.get('value')

            results.append({
                "isbn": isbn,
                "isbn13": data.get('isbn_13', [None])[0],
                "subjects": data.get("subjects", []),
                "description": description,
                "pagesNumber": data.get("number_of_pages"),
                "title": data.get("title")
            })
        else:
            print(f"❌ ISBN {isbn}: Not found")
    except Exception as e:
        print(f"⚠️ Error with ISBN {isbn}: {e}")
    time.sleep(1)

❌ ISBN 0330449605: Not found
❌ ISBN 0152061548: Not found
❌ ISBN 0765354063: Not found
❌ ISBN 0142004413: Not found
❌ ISBN 0312349486: Not found
❌ ISBN 1416913184: Not found
❌ ISBN 1596327723: Not found
❌ ISBN 0439579287: Not found
❌ ISBN 1599981416: Not found
❌ ISBN 0061712582: Not found
❌ ISBN 061709555: Not found
❌ ISBN 0007203116: Not found
❌ ISBN 0545091063: Not found
❌ ISBN 1416928119: Not found
❌ ISBN 1591854135: Not found
❌ ISBN 1599360284: Not found
❌ ISBN 8501090425: Not found
❌ ISBN 1846165997: Not found
❌ ISBN 4757518080: Not found
❌ ISBN 9755103554: Not found
❌ ISBN 0739374664: Not found
❌ ISBN 0080727587: Not found
❌ ISBN 964936448X: Not found
❌ ISBN 0060823844: Not found
❌ ISBN 7815903881: Not found
❌ ISBN 014305693X: Not found
❌ ISBN 9705804893: Not found
❌ ISBN 2756008621: Not found
❌ ISBN 159998315X: Not found
❌ ISBN 0385664745: Not found
❌ ISBN 1419906100: Not found
❌ ISBN 084386874: Not found
❌ ISBN 9137129120: Not found
❌ ISBN 9113015303: Not found
❌ ISBN 160462993

In [None]:
library_data = []
for item in results:
    library_data.append({
        'ISBN10': item.get("isbn"),
        'ISBN13': item.get("isbn13"),
        'subjects': item.get('subjects', []),
        'description': item.get('description')
        #'pagesNumber': item.get('pagesNumber'),
    })
library_df = pd.DataFrame(library_data)
columns_to_keep = [
    'Id', 'Name', 'Publisher', 'PublishYear', 'CountsOfReview', 
    'Authors', 'Rating', 'ISBN', 'pagesNumber'
]
clean_df = df[columns_to_keep]
final_df = pd.merge(
    clean_df,
    library_df,
    left_on='ISBN',
    right_on='ISBN10',
    how='left'
)
final_df = final_df.drop(columns=['ISBN10'])

output_file = "public/test_29k.csv"
final_df.to_csv(output_file, index=False)

In [None]:
print("\n=== DEBUG COUNTERS ===")

# Count descriptions and subjects in results
desc_count = sum(1 for item in results if item.get('description'))
subj_count = sum(1 for item in results if item.get('subjects'))
total_api_success = len(results)

print(f"API Success: {total_api_success}/{len(isbns)} books")
print(f"Books with descriptions: {desc_count}/{total_api_success}")
print(f"Books with subjects: {subj_count}/{total_api_success}")

if 'final_df' in locals():
    print("\nFinal DataFrame Stats:")
    print(f"Total rows: {len(final_df)}")
    print(f"Rows with description: {final_df['description'].notna().sum()}")
    print(f"Rows with subjects: {final_df['subjects'].notna().sum()}")
    print(f"Rows with ISBN13: {final_df['ISBN13'].notna().sum()}")
    
    if len(final_df) != len(clean_df):
        print(f"⚠️ Row count changed during merge! Before: {len(clean_df)}, After: {len(final_df)}")
else:
    print("Final DataFrame not created yet")

print("=== DEBUG END ===")


=== DEBUG COUNTERS ===
API Success: 25323/25411 books
Books with descriptions: 4187/25323
Books with subjects: 17034/25323

Final DataFrame Stats:
Total rows: 25981
Rows with description: 4192
Rows with subjects: 25355
Rows with ISBN13: 18804
=== DEBUG END ===
