# BN XP 12957 — Goodreads Recommender Systems

This notebook implements:
- **Popularity-based** recommender using IMDb-style weighted rating
- **Content-based** recommender using **TF-IDF (authors)** + **cosine similarity**

**Expected Output:** Multiple recommendations from both recommenders.


In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

CSV_PATH = "../data/books.csv"  # adjust if needed
TOP_N = 10
MIN_PERCENTILE = 0.90  # IMDb-style threshold for ratings_count


In [2]:
def load_books(csv_path: str) -> pd.DataFrame:
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"File not found: {csv_path}")

    try:
        df = pd.read_csv(csv_path)
    except pd.errors.ParserError:
        print("[WARN] ParserError with default engine. Retrying with python engine and robust options...")
        try:
            df = pd.read_csv(
                csv_path,
                engine="python",
                sep=",",
                quotechar='"',
                escapechar="\\",
                on_bad_lines="skip",
            )
        except UnicodeDecodeError:
            df = pd.read_csv(
                csv_path,
                engine="python",
                sep=",",
                quotechar='"',
                escapechar="\\",
                on_bad_lines="skip",
                encoding="latin1",
            )

    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
    df.rename(columns={"isbn13": "isbn_13", "  num_pages": "num_pages"}, inplace=True)

    expected = ["bookid","title","authors","average_rating","isbn","isbn_13",
                "language_code","num_pages","ratings_count","text_reviews_count"]
    present = [c for c in expected if c in df.columns]
    df = df[present].copy()

    if "average_rating" in df.columns:
        df["average_rating"] = pd.to_numeric(df["average_rating"], errors="coerce")
    if "ratings_count" in df.columns:
        df["ratings_count"] = pd.to_numeric(df["ratings_count"], errors="coerce").fillna(0).astype(int)

    crit = [c for c in ["title","authors","average_rating","ratings_count"] if c in df.columns]
    df = df.dropna(subset=crit)

    if all(c in df.columns for c in ["title","ratings_count"]):
        df = (df.sort_values(["title","ratings_count"], ascending=[True, False])
                .drop_duplicates(subset=["title"], keep="first")
                .reset_index(drop=True))

    print(f"[INFO] Loaded {len(df):,} rows after cleaning.")
    return df

df = load_books(CSV_PATH)
df.head(3)


[WARN] ParserError with default engine. Retrying with python engine and robust options...
[INFO] Loaded 10,344 rows after cleaning.


Unnamed: 0,bookid,title,authors,average_rating,isbn,isbn_13,language_code,num_pages,ratings_count,text_reviews_count
0,6549,said the shotgun to the head.,Saul Williams,4.22,743470796,9780743470797,en-US,192,2762,214
1,14490,$30 Film School: How to Write Direct Produce...,Michael W. Dean,3.49,1592000673,9781592000678,eng,528,30,4
2,5413,'Salem's Lot,Stephen King/Jerry N. Uelsmann,4.25,385516487,9780385516488,eng,594,84123,571


In [3]:
def popularity_rank(df, min_percentile=0.90, top_n=10):
    C = df["average_rating"].mean()
    m = float(np.percentile(df["ratings_count"], min_percentile * 100))

    def weighted_score(row):
        v = float(row["ratings_count"])
        R = float(row["average_rating"])
        denom = v + m
        return ((v/denom)*R + (m/denom)*C) if denom > 0 else 0.0

    qualified = df[df["ratings_count"] >= m].copy()
    if qualified.empty:
        qualified = df.copy()

    qualified["popularity_score"] = qualified.apply(weighted_score, axis=1)
    ranked = qualified.sort_values("popularity_score", ascending=False)
    return ranked[["title","authors","average_rating","ratings_count","popularity_score"]].head(top_n).reset_index(drop=True)

pop_recs = popularity_rank(df, MIN_PERCENTILE, TOP_N)
pop_recs


Unnamed: 0,title,authors,average_rating,ratings_count,popularity_score
0,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,2095690,4.561868
1,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,2339585,4.552821
2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,2153167,4.483077
3,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,J.R.R. Tolkien,4.59,101233,4.451277
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,41428,4.445075
5,The Complete Maus,Art Spiegelman,4.55,111475,4.429354
6,The Two Towers (The Lord of the Rings #2),J.R.R. Tolkien/Peter S. Beagle,4.44,593467,4.417863
7,The Complete Calvin and Hobbes,Bill Watterson,4.82,32213,4.41477
8,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling/Mary GrandPré,4.42,2293963,4.414315
9,A Clash of Kings (A Song of Ice and Fire #2),George R.R. Martin,4.41,638766,4.390591


In [4]:
def build_tfidf(df):
    corpus = df["authors"].fillna("").astype(str).values
    vec = TfidfVectorizer(analyzer="word", token_pattern=r"(?u)\b\w+\b", lowercase=True)
    X = vec.fit_transform(corpus)
    return vec, X

def make_title_index(df):
    return {t.lower().strip(): i for i, t in enumerate(df["title"].astype(str).values)}

def recommend_similar(df, tfidf_matrix, title_map, title, top_n=10):
    q = title.lower().strip()
    if q not in title_map:
        # small helper to avoid hard failures
        starts = [t for t in title_map.keys() if t.startswith(q)]
        if starts:
            q = starts[0]
        else:
            raise ValueError(f"Title not found: {title}")

    idx = title_map[q]
    sims = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).ravel()
    order = np.argsort(-sims)
    order = [i for i in order if i != idx][:top_n]
    out = df.iloc[order][["title","authors","average_rating","ratings_count"]].copy()
    out["similarity"] = sims[order]
    return out.reset_index(drop=True)

vec, X = build_tfidf(df)
title_map = make_title_index(df)

seed_title = pop_recs.iloc[0]["title"] if len(pop_recs) else df.iloc[0]["title"]
seed_title


'Harry Potter and the Half-Blood Prince (Harry Potter  #6)'

In [5]:
content_recs = recommend_similar(df, X, title_map, seed_title, top_n=TOP_N)
content_recs


Unnamed: 0,title,authors,average_rating,ratings_count,similarity
0,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling/Mary GrandPré,4.47,147,1.0
1,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling/Mary GrandPré,4.42,2293963,1.0
2,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,2339585,1.0
3,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,2153167,1.0
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,41428,1.0
5,Harry Potter Y La Piedra Filosofal (Harry Pott...,J.K. Rowling,4.47,142,0.70778
6,Harry Potter Schoolbooks Box Set: Two Classic ...,J.K. Rowling,4.4,11515,0.70778
7,Harry Potter and the Philosopher's Stone (Harr...,J.K. Rowling,4.47,11,0.70778
8,Harry Potter and the Goblet of Fire (Harry Pot...,J.K. Rowling,4.56,18754,0.70778
9,Harry Potter y la Orden del Fénix (Harry Potte...,J.K. Rowling,4.49,5637,0.70778


In [6]:
out_dir = "../outputs"
os.makedirs(out_dir, exist_ok=True)
pop_recs.to_csv(os.path.join(out_dir, "popularity_recommendations.csv"), index=False)
content_recs.to_csv(os.path.join(out_dir, "content_recommendations.csv"), index=False)
print("Saved to:", out_dir)


Saved to: ../outputs


In [7]:
# Example: replace with any title present in your dataset:
try_title = seed_title  # or e.g., "The Hobbit"
recommend_similar(df, X, title_map, try_title, top_n=TOP_N)


Unnamed: 0,title,authors,average_rating,ratings_count,similarity
0,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling/Mary GrandPré,4.47,147,1.0
1,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling/Mary GrandPré,4.42,2293963,1.0
2,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,2339585,1.0
3,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,2153167,1.0
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,41428,1.0
5,Harry Potter Y La Piedra Filosofal (Harry Pott...,J.K. Rowling,4.47,142,0.70778
6,Harry Potter Schoolbooks Box Set: Two Classic ...,J.K. Rowling,4.4,11515,0.70778
7,Harry Potter and the Philosopher's Stone (Harr...,J.K. Rowling,4.47,11,0.70778
8,Harry Potter and the Goblet of Fire (Harry Pot...,J.K. Rowling,4.56,18754,0.70778
9,Harry Potter y la Orden del Fénix (Harry Potte...,J.K. Rowling,4.49,5637,0.70778
