
# 📚 From Raw Books to Insights — Goodreads Data Analysis

**Analysis by Athanasios-Marios Marougkas**

This notebook presents an end-to-end exploratory data analysis (EDA) on the cleaned Goodreads books dataset prepared by the project pipeline.  
We will examine the structure of the data, summarize key statistics, and visualize patterns across ratings, authors, and publication trends.

---


In [None]:

import os
import pandas as pd
import matplotlib.pyplot as plt

# Pandas display settings: never truncate the list of columns, and let
# printed frames use up to 120 characters per line.
pd.options.display.max_columns = None
pd.options.display.width = 120

# Location of the cleaned CSV, written relative to the notebooks/ folder
# this notebook lives in (one level up, then data/processed/).
DATA_PATH = os.path.join(
    "..",
    "data",
    "processed",
    "books_clean.csv",
)

def load_dataset(path: str) -> pd.DataFrame:
    """Load the cleaned Goodreads CSV and normalize its column names.

    Column names are normalized so the rest of the notebook can rely on
    predictable identifiers: surrounding whitespace is stripped, every run of
    whitespace and every ``/`` becomes ``_``, and the result is lower-cased.
    The header exactly as read from disk is preserved in
    ``df.attrs["original_columns"]``.

    Args:
        path: Location of the CSV file; it is read as UTF-8.

    Returns:
        The loaded frame with normalized column names.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        RuntimeError: If reading or normalizing fails for any other reason.
            The original exception is attached as ``__cause__``.
    """
    try:
        df = pd.read_csv(path, encoding="utf-8")
        # Remember the untouched header before renaming anything.
        df.attrs["original_columns"] = list(df.columns)
        df.columns = (
            df.columns
              .str.strip()
              # Raw string: "\s" in a plain literal is an invalid escape sequence.
              .str.replace(r"\s+", "_", regex=True)
              .str.replace("/", "_", regex=False)
              .str.lower()
        )
    except FileNotFoundError as e:
        # Report the path that was actually requested, and keep the cause.
        raise FileNotFoundError(
            f"Could not find the cleaned dataset. Make sure '{path}' exists."
        ) from e
    except Exception as e:
        raise RuntimeError(f"Failed to load dataset: {e}") from e

    print(f"‚úÖ Loaded dataset: {path}\nShape: {df.shape}")
    return df

# Read the cleaned dataset into the frame every later cell works on.
df = load_dataset(DATA_PATH)

# Render the first five rows as a quick sanity check.
display(df.head(n=5))



## 1) Dataset Overview
We begin with a structural overview and missing-value inspection.


In [None]:

# Structural summary of the frame.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\n--- INFO ---")
df.info()

print("\n--- SHAPE ---")
print(df.shape)

# Count of missing values per column, largest first; show the top 20.
print("\n--- MISSING VALUES (top 20 by count) ---")
missing = df.isna().sum().sort_values(ascending=False)
display(missing.head(20).to_frame("missing_count"))


In [None]:

def pick_col(candidates, frame=None):
    """Return the first candidate that names an existing column, else ``None``.

    Each candidate is normalized with the same rules ``load_dataset`` applies
    to the header: surrounding whitespace is stripped, every run of
    whitespace collapses to a single ``_``, ``/`` becomes ``_``, and the
    result is lower-cased. The normalized name is then looked up among the
    frame's columns.

    Args:
        candidates: Iterable of possible column names, in priority order.
        frame: DataFrame whose columns are searched. Defaults to the
            notebook-level ``df`` when omitted.

    Returns:
        The normalized name of the first match, or ``None`` when no candidate
        matches (including when ``candidates`` is empty).
    """
    if frame is None:
        frame = df
    for candidate in candidates:
        # str.split() with no argument drops leading/trailing whitespace and
        # splits on whitespace runs, so joining with "_" collapses each run.
        normalized = "_".join(str(candidate).split()).replace("/", "_").lower()
        if normalized in frame.columns:
            return normalized
    return None

# Resolve the dataset's actual column names once. Each variable holds the
# matching column name, or None when no candidate exists in the frame.
col_avg_rating = pick_col(("average_rating", "avg_rating", "rating"))
col_ratings_count = pick_col(("ratings_count", "ratings"))
col_text_reviews = pick_col(("text_reviews_count", "text_reviews"))
col_authors = pick_col(("authors", "author", "primary_author"))
col_title = pick_col(("title", "book_title"))
# The raw header sometimes carries leading spaces on this column.
col_num_pages = pick_col(("num_pages", "pages", "  num_pages"))
# Publication info may be absent, or present as a date and/or a year.
col_pub_date = pick_col(("publication_date",))
col_pub_year = pick_col(("original_publication_year", "publication_year", "year"))

# Report what was resolved so missing columns are visible up front.
print(
    "Resolved columns:\n",
    {
        "average_rating": col_avg_rating,
        "ratings_count": col_ratings_count,
        "text_reviews_count": col_text_reviews,
        "authors": col_authors,
        "title": col_title,
        "num_pages": col_num_pages,
        "publication_year": col_pub_year,
        "publication_date": col_pub_date,
    },
)



## 2) Ratings Distribution
How are average ratings distributed across books?


In [None]:

# Guard first: without a rating column there is nothing to plot.
if col_avg_rating is None or col_avg_rating not in df.columns:
    print("‚ö†Ô∏è Skipping: average rating column not found.")
else:
    # Histogram of the non-missing average ratings, split into 30 bins.
    plt.figure(figsize=(8, 5))
    plt.hist(df[col_avg_rating].dropna(), bins=30)
    plt.title("Distribution of Average Ratings")
    plt.xlabel("Average Rating")
    plt.ylabel("Number of Books")
    plt.show()



## 3) Most Frequent Authors
Who are the most prolific authors in this dataset?


In [None]:

# Guard first: the chart needs the authors column.
if col_authors is None or col_authors not in df.columns:
    print("‚ö†Ô∏è Skipping: authors column not found.")
else:
    # Ten most frequent values of the authors column.
    top_authors = df[col_authors].value_counts().head(10)

    plt.figure(figsize=(10, 6))
    # Both sequences are reversed so the most frequent entry is drawn last.
    plt.barh(
        top_authors.index[::-1],
        top_authors.values[::-1],
    )
    plt.title("Top 10 Authors by Number of Books")
    plt.xlabel("Number of Books")
    plt.ylabel("Author")
    plt.tight_layout()
    plt.show()

    # Same data as a table.
    display(top_authors.to_frame("count"))



## 4) Most Rated Books
Which titles gathered the most ratings from readers?


In [None]:

# All three columns are required; skip the cell if any was not resolved.
if col_ratings_count is None or col_title is None or col_authors is None:
    print("‚ö†Ô∏è Skipping: required columns for 'Most Rated Books' not found.")
else:
    # Ten rows with the largest ratings count (rows without a count are dropped).
    top_rated = (
        df[[col_title, col_authors, col_ratings_count]]
        .dropna(subset=[col_ratings_count])
        .sort_values(col_ratings_count, ascending=False)
        .head(10)
    )

    plt.figure(figsize=(10, 6))
    # Reversed so the row with the highest count is drawn last.
    plt.barh(
        top_rated[col_title][::-1],
        top_rated[col_ratings_count][::-1],
    )
    plt.title("Top 10 Books by Ratings Count")
    plt.xlabel("Ratings Count")
    plt.ylabel("Book Title")
    plt.tight_layout()
    plt.show()

    # Same rows as a table with a fresh 0..9 index.
    display(top_rated.reset_index(drop=True))



## 5) Relationship Between Book Length and Ratings
Do longer books receive higher or lower ratings on average?


In [None]:

# Both the page-count and the rating column are needed for this comparison.
if col_num_pages is None or col_avg_rating is None:
    print("‚ö†Ô∏è Skipping: pages and/or average rating column not found.")
else:
    # Scatter of page count against average rating; alpha softens overplotting.
    plt.figure(figsize=(8, 5))
    plt.scatter(df[col_num_pages], df[col_avg_rating], alpha=0.4)
    plt.title("Number of Pages vs Average Rating")
    plt.xlabel("Number of Pages")
    plt.ylabel("Average Rating")
    plt.tight_layout()
    plt.show()

    # Off-diagonal entry of the 2x2 correlation matrix of the two columns.
    corr_val = df[[col_num_pages, col_avg_rating]].corr().iloc[0, 1]
    print(f"Correlation (pages vs average rating): {corr_val:.3f}")



## 6) Publication Year Trends
How have average ratings evolved across publication years?


In [None]:

import numpy as np

# Prefer an explicit year column; otherwise derive the year from the
# publication date. Values that cannot be parsed are coerced to missing and
# dropped before averaging.
year_series = None
if col_pub_year is not None and col_pub_year in df.columns:
    # Use existing year column
    year_series = pd.to_numeric(df[col_pub_year], errors='coerce')
elif col_pub_date is not None and col_pub_date in df.columns:
    # Derive year from date
    year_series = pd.to_datetime(df[col_pub_date], errors='coerce').dt.year

if col_avg_rating is None or col_avg_rating not in df.columns:
    # Without a rating column there is nothing to average; passing None as a
    # column key to dropna/groupby below would raise instead of skipping.
    print("‚ö†Ô∏è Skipping: average rating column not found.")
elif year_series is None:
    print("‚ö†Ô∏è Skipping: no publication year/date information found.")
else:
    # Only the rating column is needed, so copy that instead of the whole frame.
    tmp = df[[col_avg_rating]].copy()
    tmp["_pub_year_"] = year_series
    # Mean rating per publication year, in chronological order.
    yearly_avg = (
        tmp.dropna(subset=["_pub_year_", col_avg_rating])
           .groupby("_pub_year_")[col_avg_rating]
           .mean()
           .sort_index()
    )
    if yearly_avg.empty:
        print("‚ö†Ô∏è No usable yearly data to plot.")
    else:
        plt.figure(figsize=(10,5))
        plt.plot(yearly_avg.index, yearly_avg.values)
        plt.title("Average Rating by Publication Year")
        plt.xlabel("Publication Year")
        plt.ylabel("Average Rating")
        plt.tight_layout()
        plt.show()
        # Last ten years present in the data, as a table.
        display(yearly_avg.tail(10).to_frame("avg_rating"))



## 7) Correlation Snapshot
A quick correlation matrix among numeric metrics (if present).


In [None]:

# Keep only the metric columns that were actually resolved in this dataset.
candidates = [col_avg_rating, col_ratings_count, col_text_reviews, col_num_pages]
use_cols = [name for name in candidates if name is not None and name in df.columns]

# A correlation matrix needs at least two columns.
if len(use_cols) < 2:
    print("‚ö†Ô∏è Skipping: not enough numeric columns available for correlation.")
else:
    corr = df[use_cols].corr()
    print(corr)

    # Heatmap drawn with plain matplotlib.
    plt.figure(figsize=(6, 4))
    im = plt.imshow(corr, interpolation='nearest')
    plt.colorbar(im)

    # One tick per column on both axes, labelled with the column names.
    tick_positions = range(len(use_cols))
    plt.xticks(tick_positions, use_cols, rotation=45, ha='right')
    plt.yticks(tick_positions, use_cols)

    # Write each coefficient into its cell (x is the column, y is the row).
    for row_idx in tick_positions:
        for col_idx in tick_positions:
            plt.text(
                col_idx,
                row_idx,
                f"{corr.iloc[row_idx, col_idx]:.2f}",
                ha='center',
                va='center',
            )

    plt.title("Correlation Matrix (numeric subset)")
    plt.tight_layout()
    plt.show()



## 8) Conclusions

- Most books tend to receive average ratings in a relatively high band (often between 3.5 and 4.5).  
- A handful of popular titles dominate the ratings count distribution.  
- Author frequency does not necessarily imply higher average ratings.  
- We generally observe weak correlation between page count and average rating.  
- Publication year trends can be inspected to see whether readers’ preferences shift across time.

**Next Steps (Ideas):**
- Genre-level comparisons and rating distributions per genre  
- Text mining on descriptions for sentiment or topic modeling  
- A lightweight recommendation demo using author–rating/nearest-neighbor heuristics  

---

_This notebook is part of the project **“From Raw Books to Insights — Goodreads Data Pipeline”**, combining data engineering and analytics for portfolio-quality results._
