# 📚 Goodreads Book Scraper (100 Pages)

A Python-based web scraper that collects book data from Goodreads' "Best Books Ever" list across 100 pages.  
Extracted information includes **Title**, **Author**, and **Average Rating**, saved to a clean CSV for analysis or portfolio use.

## 🔍 Features
- Scrapes 100 pages from Goodreads' book list
- Extracts title, author, and rating info
- Cleans and structures the data
- Saves results to CSV
- Performs basic analysis (top-rated books, top authors)


### Install Required Libraries

In [None]:
!pip install requests beautifulsoup4 pandas

### Import Libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

### Define the Scraper Function

In [None]:
def _parse_book_row(book):
    """Return the Title/Author/Rating Info dict for one list row, or None if any field is missing."""
    title_tag = book.find("a", class_="bookTitle")
    author_tag = book.find("a", class_="authorName")
    rating_tag = book.find("span", class_="minirating")
    if title_tag is None or author_tag is None or rating_tag is None:
        return None
    return {
        "Title": title_tag.text.strip(),
        "Author": author_tag.text.strip(),
        "Rating Info": rating_tag.text.strip(),
    }


def scrape_goodreads_books(pages=100):
    """Scrape the Goodreads "Best Books Ever" list and return it as a DataFrame.

    Pages 1..``pages`` are fetched one at a time. A page that fails to
    download is reported and skipped after a 5 second back-off; a row that
    lacks a title, author, or rating element is skipped and counted.

    Args:
        pages: Number of list pages to fetch, starting at page 1.

    Returns:
        A pandas DataFrame with the columns "Title", "Author" and
        "Rating Info" (the raw rating text). The columns are present even
        when no book could be scraped.
    """
    base_url = "https://www.goodreads.com/list/show/1.Best_Books_Ever?page="
    headers = {"User-Agent": "Mozilla/5.0"}
    columns = ["Title", "Author", "Rating Info"]

    all_books = []

    for page in range(1, pages + 1):
        url = base_url + str(page)
        print(f"Scraping page {page}...")

        try:
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f" Failed to fetch page {page}: {e}")
            time.sleep(5)
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        book_rows = soup.find_all("tr", itemtype="http://schema.org/Book")

        skipped = 0
        for book in book_rows:
            record = _parse_book_row(book)
            if record is None:
                skipped += 1
                continue
            all_books.append(record)

        if skipped:
            # Surface incomplete rows instead of dropping them silently.
            print(f" Skipped {skipped} incomplete row(s) on page {page}")

        if page < pages:
            time.sleep(2)  # Be kind to the server; no need to wait after the last page

    # Fixing the columns keeps downstream column lookups valid on an empty result.
    return pd.DataFrame(all_books, columns=columns)


### Run the Scraper for All 100 Pages

In [None]:
# Fetch all 100 list pages; the scraper pauses between requests, so this takes a few minutes.
df_books = scrape_goodreads_books(pages=100)
# Preview the first rows of the scraped table.
df_books.head()

### Clean the Rating Info Column

In [None]:
def extract_rating(text):
    """Extract the numeric average rating from a raw rating string.

    The number directly before "avg rating" is preferred, so strings with
    leading words before the number are still parsed. If that phrase is
    absent, the first space-separated token is tried as a number.

    Args:
        text: Raw rating text, e.g. "4.27 avg rating — 1,234 ratings".

    Returns:
        The rating as a float, or None when ``text`` is not a string or
        contains no parsable number.
    """
    import re

    if not isinstance(text, str):
        return None

    match = re.search(r"(\d+(?:\.\d+)?)\s+avg rating", text)
    if match:
        return float(match.group(1))

    try:
        return float(text.split(" ")[0])
    except ValueError:
        return None

# Parse the raw rating text into a numeric "Rating" column (missing where parsing fails).
df_books["Rating"] = df_books["Rating Info"].apply(extract_rating)
# Drop the raw text column. Re-running this cell raises KeyError because
# "Rating Info" no longer exists after the first run.
df_books = df_books.drop(columns=["Rating Info"])
df_books.head()

### Analyze the Data: Top 10 Authors by Number of Books

In [None]:
# Count how many scraped books each author has and show the 10 most frequent.
df_books["Author"].value_counts().head(10)

### Plot Top-Rated Books

In [None]:
import matplotlib.pyplot as plt

# Ten rows with the highest parsed rating, best first.
top_books = df_books.sort_values(by="Rating", ascending=False).head(10)

# Draw the horizontal bar chart through the Axes object rather than pyplot state.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_books["Title"], top_books["Rating"], color='skyblue')
ax.set_xlabel("Rating")
ax.set_title("Top 10 Highest Rated Books")
ax.invert_yaxis()  # put the first (highest-rated) book at the top
plt.show()


### Save the scraped data

In [None]:
# Write the cleaned table to a CSV file (relative path), omitting the DataFrame index.
df_books.to_csv("cleaned_goodreads_books.csv", index=False)