In [1]:
import requests
from bs4 import BeautifulSoup
import time
import csv

# URL template for one page of the Goodreads "Best Books Ever" list; the page
# number is substituted with str.format().
BASE_URL = "https://www.goodreads.com/list/show/1.Best_Books_Ever?page={}"

# Browser-like User-Agent sent with every request, assembled from its three
# product tokens.
_USER_AGENT = "".join((
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) ",
    "Chrome/115.0 Safari/537.36",
))
HEADERS = {"User-Agent": _USER_AGENT}

# Scraper state: rows collected so far, the cap on how many to collect, and
# the list page to request next.
max_books = 1000
books, page = [], 1

# Upper bound (seconds) on how long a single page request may block.
REQUEST_TIMEOUT = 20


def _cell_text(row, tag, css_class):
    """Return the stripped text of the first matching element in `row`.

    Returns an empty string when the element is absent, so one malformed row
    cannot abort the whole scrape with an AttributeError.
    """
    node = row.find(tag, class_=css_class)
    return node.get_text(strip=True) if node is not None else ""


# Walk the list page by page until `max_books` rows have been collected or the
# site stops giving us usable pages.
while len(books) < max_books:
    url = BASE_URL.format(page)
    print(f"Scraping page {page} → {url}")

    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Stop cleanly so the rows gathered so far are still written out below.
        print("Request failed:", exc)
        break
    if response.status_code != 200:
        print("Blocked or error:", response.status_code)
        break

    soup = BeautifulSoup(response.text, "html.parser")

    rows = soup.select("tr[itemtype='http://schema.org/Book']")
    if not rows:
        # A 200 response with no book rows (end of the list, or a changed
        # layout) would otherwise keep this loop requesting pages forever.
        print("No book rows found; stopping.")
        break

    for row in rows:
        title = _cell_text(row, "a", "bookTitle")
        if not title:
            # Without a title the row is not a usable record.
            continue
        author = _cell_text(row, "a", "authorName")
        rating = _cell_text(row, "span", "minirating")

        books.append([title, author, rating])

        if len(books) >= max_books:
            break

    page += 1
    time.sleep(2)  # polite delay

# Persist the scraped rows: the header line first, then one CSV row per book.
header = ["Title", "Author", "Rating"]
with open("goodreads_books.csv", "w", newline="", encoding="utf-8") as csv_file:
    csv.writer(csv_file).writerows([header, *books])

print(f"✅ Scraped {len(books)} books into goodreads_books.csv")


Scraping page 1 → https://www.goodreads.com/list/show/1.Best_Books_Ever?page=1
Scraping page 2 → https://www.goodreads.com/list/show/1.Best_Books_Ever?page=2
Scraping page 3 → https://www.goodreads.com/list/show/1.Best_Books_Ever?page=3
Scraping page 4 → https://www.goodreads.com/list/show/1.Best_Books_Ever?page=4
Scraping page 5 → https://www.goodreads.com/list/show/1.Best_Books_Ever?page=5
Scraping page 6 → https://www.goodreads.com/list/show/1.Best_Books_Ever?page=6
Scraping page 7 → https://www.goodreads.com/list/show/1.Best_Books_Ever?page=7
Scraping page 8 → https://www.goodreads.com/list/show/1.Best_Books_Ever?page=8
Scraping page 9 → https://www.goodreads.com/list/show/1.Best_Books_Ever?page=9
Scraping page 10 → https://www.goodreads.com/list/show/1.Best_Books_Ever?page=10
✅ Scraped 1000 books into goodreads_books.csv


In [2]:
import csv
import os
import time

import requests

# The Google Books API key is read from the environment so the secret never
# lives in the notebook. When GOOGLE_BOOKS_API_KEY is unset this is None, and
# requests leaves None-valued query parameters out of the URL, so calls are
# then made without a key.
# NOTE(review): a literal key was previously committed here; treat it as
# compromised and revoke it in the Google Cloud console.
API_KEY = os.environ.get("GOOGLE_BOOKS_API_KEY")
BASE_URL = "https://www.googleapis.com/books/v1/volumes"

TARGET = 1000      # number of unique books to collect
MAX_RESULTS = 40   # API allows max 40 results per query
DELAY = 0.5        # seconds between requests
OUT_FILE = "google_books_1000.csv"

# Collected row dicts, and the volume ids already stored (used to skip
# duplicates that show up under more than one query term).
books = []
seen_ids = set()

# Query terms to cover different topics (broad coverage)
queries = [
    "fiction", "science", "history", "fantasy", "romance",
    "philosophy", "technology", "poetry", "mystery", "art",
    "medicine", "children", "education", "religion", "travel",
    "politics", "economics", "psychology", "biography", "culture"
]

def fetch_books(query, start_index):
    """Fetch one page of Google Books search results.

    Args:
        query: Search term sent as the ``q`` parameter.
        start_index: Offset of the first result to return, sent as
            ``startIndex``.

    Returns:
        The list stored under ``items`` in the JSON response. An empty list is
        returned when the request fails, the status is not 200, the body is
        not valid JSON, or the response has no ``items`` key.
    """
    params = {
        "q": query,
        "startIndex": start_index,
        "maxResults": MAX_RESULTS,
        "printType": "books",
        "key": API_KEY
    }
    try:
        r = requests.get(BASE_URL, params=params, timeout=20)
    except requests.RequestException as exc:
        # Report only the exception type: the message can embed the full
        # request URL, which includes the API key.
        print("Error:", type(exc).__name__)
        return []
    if r.status_code != 200:
        print("Error:", r.status_code, r.text[:200])
        return []
    try:
        data = r.json()
    except ValueError:
        # Raised when the body is not valid JSON.
        print("Error: response body was not valid JSON")
        return []
    return data.get("items", [])

def _to_record(item):
    """Flatten one API result item into the row dict stored in `books`.

    Fields that are missing, or present but null, become None (scalar fields)
    or an empty string (authors, categories, description) instead of raising.
    The description is cut to its first 500 characters.
    """
    volume = item.get("volumeInfo") or {}
    return {
        "id": item.get("id"),
        "title": volume.get("title"),
        "authors": ", ".join(volume.get("authors") or []),
        "publisher": volume.get("publisher"),
        "publishedDate": volume.get("publishedDate"),
        "categories": ", ".join(volume.get("categories") or []),
        "averageRating": volume.get("averageRating"),
        "ratingsCount": volume.get("ratingsCount"),
        "pageCount": volume.get("pageCount"),
        "language": volume.get("language"),
        "description": (volume.get("description") or "")[:500]  # shorten
    }


# Page through each query term in turn until TARGET unique books are stored.
for q in queries:
    start = 0
    while len(books) < TARGET:
        items = fetch_books(q, start)
        if not items:
            # No (more) results or a failed request: move on to the next term.
            break
        for item in items:
            book_id = item.get("id")
            # Skip items without an id and volumes already collected.
            if not book_id or book_id in seen_ids:
                continue
            books.append(_to_record(item))
            seen_ids.add(book_id)
            if len(books) >= TARGET:
                break
        start += MAX_RESULTS
        time.sleep(DELAY)
    if len(books) >= TARGET:
        break

# Save to CSV
# The column order is spelled out rather than taken from books[0], so an empty
# result still produces a header-only file instead of raising IndexError.
CSV_FIELDS = [
    "id", "title", "authors", "publisher", "publishedDate", "categories",
    "averageRating", "ratingsCount", "pageCount", "language", "description",
]
with open(OUT_FILE, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
    writer.writeheader()
    writer.writerows(books)

print(f"✅ Collected {len(books)} unique books → {OUT_FILE}")


✅ Collected 1000 unique books → google_books_1000.csv
