In [8]:
import random
import re
import time
from pathlib import Path

import pandas as pd
import requests


In [9]:


def _volume_to_record(item):
    """Flatten one Google Books API volume item into a flat dict row."""
    info = item.get("volumeInfo", {})
    identifiers = info.get("industryIdentifiers", [])
    return {
        "Title": info.get("title"),
        "Authors": ", ".join(info["authors"]) if "authors" in info else None,
        "Publisher": info.get("publisher"),
        "Published Date": info.get("publishedDate"),
        "Description": info.get("description"),
        "Page Count": info.get("pageCount"),
        "Categories": ", ".join(info["categories"]) if "categories" in info else None,
        "Average Rating": info.get("averageRating"),
        "Ratings Count": info.get("ratingsCount"),
        "Language": info.get("language"),
        "Preview Link": info.get("previewLink"),
        # Every identifier value is joined; entries lacking an "identifier"
        # key are skipped instead of raising KeyError.
        "ISBNs": ", ".join(
            entry["identifier"] for entry in identifiers if "identifier" in entry
        ),
        "Image": info.get("imageLinks", {}).get("thumbnail"),
    }


def fetch_books(query, max_results=1000, *, timeout=10, delay=0.2):
    """Fetch books for one query using pagination (up to max_results).

    Args:
        query: Search text sent as the ``q`` parameter. It is passed through
            ``params`` so characters such as ``&`` or ``#`` are URL-encoded
            rather than corrupting the request URL.
        max_results: Upper bound on the number of books returned.
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to sleep after each page (polite rate limiting).

    Returns:
        A list of dicts, one per volume, with the keys produced by
        ``_volume_to_record``. If a request fails part-way through, the
        books collected so far are returned.
    """
    api_url = "https://www.googleapis.com/books/v1/volumes"
    page_size = 40  # Google API allows 40 per page
    books = []

    for start in range(0, max_results, page_size):
        params = {
            "q": query,
            "startIndex": start,
            # Shrink the final page so the total never exceeds max_results.
            "maxResults": min(page_size, max_results - start),
        }
        try:
            r = requests.get(api_url, params=params, timeout=timeout)
            r.raise_for_status()
            data = r.json()
        except (requests.RequestException, ValueError) as exc:
            # Keep what was already collected instead of losing it to one
            # failed page (network error, HTTP error status, invalid JSON).
            print(f" Request failed for query '{query}' at index {start}: {exc}")
            break

        items = data.get("items")
        if not items:
            break

        books.extend(_volume_to_record(item) for item in items)

        time.sleep(delay)  # polite delay (avoid hitting API too hard)
    return books


def collect_books(target=10000, queries=None):
    """Keep scraping queries until we have at least target unique books.

    Args:
        target: Number of unique (Title, Authors) rows at which to stop early.
        queries: Optional iterable of search strings. When omitted, the
            built-in list below is used. Repeated queries are run only once.

    Returns:
        A pandas DataFrame of the collected books with duplicate
        (Title, Authors) rows dropped. It may hold fewer than ``target`` rows
        if the queries run out first.
    """
    if queries is None:
        # NOTE(review): "ninteen" and "margaritta" look like misspellings;
        # left unchanged because editing them changes what the API returns.
        queries = [
            "harry potter books","pride","ninteen","great","solitude","mockingbird","catcher","ignited","lord of the rings","alchemist",
            "trial","wonderland","invisible","divine","margaritta","diary","one","iliad","beloved","gray","wrath","trial","bovary","odyssey",
            "love", "life", "data", "history", "science", "fiction","nonfiction","horror","thriller","magic","socialism",
            "poetry", "art", "music", "philosophy", "children", "technology", "health","fantasy","autobiography","humor","christian","crime",
            "education", "novel", "war", "business", "adventure", "culture", "future","travel","mystery","romance","religion",
            "politics","geography","film","movie","drama","mathematics","statistics","games","classics","animals","humans","psychology",
            "medicine","sports","biography","comedy","memoir","countries"
        ]

    def unique_frame(records):
        """Build a DataFrame from records and drop duplicate (Title, Authors) rows."""
        frame = pd.DataFrame(records)
        # With no records the frame has no columns; return it as-is so the
        # subset lookup cannot fail regardless of pandas version.
        if frame.empty:
            return frame
        return frame.drop_duplicates(subset=["Title", "Authors"])

    all_books = []
    # dict.fromkeys keeps first-seen order while dropping repeated queries
    # ("trial" is listed twice), so no query is fetched more than once.
    for q in dict.fromkeys(queries):
        print(f" Fetching books for query: '{q}' ...")
        books = fetch_books(q, max_results=1000)  # try up to 1000 per query
        all_books.extend(books)

        # Convert to DataFrame & deduplicate
        df = unique_frame(all_books)
        print(f" After query '{q}': {len(df)} unique books")

        # Stop once we hit target
        if len(df) >= target:
            print(f" Reached {target} unique books!")
            return df

    return unique_frame(all_books)


# Run collector
books_df = collect_books(target=20000)

# Save to CSV. Create the output directory first so a finished scrape is not
# lost to a missing folder when to_csv opens the file.
output_path = Path("../data/raw/googleapi_books.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
# utf-8-sig prefixes the file with a BOM (presumably so spreadsheet tools
# detect the encoding; confirm with downstream consumers).
books_df.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f" Saved {len(books_df)} books to googleapi_books.csv")


 Fetching books for query: 'harry potter books' ...
 After query 'harry potter books': 306 unique books
 Fetching books for query: 'pride' ...
 After query 'pride': 616 unique books
 Fetching books for query: 'ninteen' ...
 After query 'ninteen': 877 unique books
 Fetching books for query: 'great' ...
 After query 'great': 1183 unique books
 Fetching books for query: 'solitude' ...
 After query 'solitude': 1485 unique books
 Fetching books for query: 'mockingbird' ...
 After query 'mockingbird': 1784 unique books
 Fetching books for query: 'catcher' ...
 After query 'catcher': 2086 unique books
 Fetching books for query: 'ignited' ...
 After query 'ignited': 2377 unique books
 Fetching books for query: 'lord of the rings' ...
 After query 'lord of the rings': 2659 unique books
 Fetching books for query: 'alchemist' ...
 After query 'alchemist': 2944 unique books
 Fetching books for query: 'trial' ...
 After query 'trial': 3232 unique books
 Fetching books for query: 'wonderland' ...
 A