*source: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews*

In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [7]:
# Load the raw book metadata, the cleaned book catalog and the raw ratings,
# lower-casing every column name so later cells can use one consistent spelling.
raw_books_df = pd.read_csv("../data/raw/books_data.csv").rename(columns=str.lower)
books_df = pd.read_csv("../data/clean/cleaned_books_data.csv").rename(columns=str.lower)
ratings_df = pd.read_csv("../data/raw/Books_rating.csv").rename(columns=str.lower)

In [None]:
# Structural overview of the ratings table: (rows, columns), the column names,
# then per-column dtypes and non-null counts via DataFrame.info().
print(ratings_df.shape)
print(list(ratings_df.columns))
ratings_df.info()

In [None]:
# Peek at the first rows of the ratings table to see what the values look like.
ratings_df.head()

In [None]:
# How many unique users?
# nunique() ignores missing values, so rows without a user_id are not counted
# as one extra "user" (len(Series.unique()) would include NaN as a value).
print(f"unique users: {ratings_df['user_id'].nunique()}")

# How many unique books? (same NaN handling as above)
print(f"unique books: {ratings_df['title'].nunique()}")

# How many books in raw data?
print(f"books in raw data: {len(raw_books_df)}")

# How many books in clean data?
print(f"books in clean data: {len(books_df)}")

In [None]:
# Share of missing values per column, in percent.
# The mean of the boolean NaN mask is the fraction of missing rows.
ratings_df.isna().mean() * 100

#### `ratings_df` `review/score`

In [None]:
# Summary statistics and per-score counts for the rating column.
# Only the last expression of a cell is rendered automatically, so the
# describe() result has to be displayed explicitly or it is silently dropped.
display(ratings_df["review/score"].describe())
ratings_df["review/score"].value_counts().sort_index()

In [None]:
# Bar chart of how many reviews gave each score, ordered by score value.
score_counts = ratings_df['review/score'].value_counts().sort_index()
ax = score_counts.plot(kind='bar')
ax.set_xlabel('Review score')
ax.set_ylabel('Count')
plt.show()

#### `ratings_df` `user_id`

In [None]:
# Number of distinct reviewers (missing user_id values are not counted by nunique()).
ratings_df['user_id'].nunique()

In [None]:
# Number of review rows per user_id; value_counts() lists the most active users first.
reviews_per_user = ratings_df['user_id'].value_counts()
reviews_per_user

In [None]:
# Histogram of user activity; the y-axis is logarithmic because the counts
# per bin span several orders of magnitude.
ax = reviews_per_user.hist(bins=50, log=True)
ax.set_xlabel('Reviews per user')
ax.set_ylabel('Number of users (log scale)')
plt.show()

#### `ratings_df` `title`

In [None]:
# Number of review rows per book title; value_counts() lists the most reviewed titles first.
reviews_per_title = ratings_df['title'].value_counts()
reviews_per_title

In [None]:
# Histogram of title popularity, again with a logarithmic y-axis.
ax = reviews_per_title.hist(bins=50, log=True)
ax.set_xlabel('Reviews per title')
ax.set_ylabel('Number of titles (log scale)')
plt.show()

In [None]:
# Absolute number of missing values in the three columns that define an interaction
# (score, user, title).
ratings_df[['review/score', 'user_id', 'title']].isna().sum()

In [None]:
# Per-title mean score and review count, drawn as a scatter plot with a
# logarithmic x-axis; low alpha keeps dense regions readable.
title_stats = ratings_df.groupby('title')['review/score'].agg(['mean', 'count'])

fig, ax = plt.subplots()
ax.scatter(title_stats['count'], title_stats['mean'], alpha=0.1)
ax.set_xscale('log')
ax.set_xlabel('Number of reviews')
ax.set_ylabel('Average score')
plt.show()

#### **duplicates**

In [None]:
# Flag rows whose (title, user_id) pair has already appeared earlier in the table.
# With the default keep="first", the first occurrence of each pair is NOT flagged,
# so `duplicates` holds only the repeated rows.
duplicate_mask = ratings_df.duplicated(subset=["title", "user_id"])
duplicates = ratings_df[duplicate_mask]
duplicates

In [None]:
# Look at two specific rows by index label.
# NOTE(review): 162 and 164 are hard-coded labels — presumably one duplicated
# (user_id, title) pair picked from the cell above; confirm they still exist
# if the raw file changes.
ratings_df.loc[[162, 164]]

In [None]:
# Quantify and inspect repeated (user_id, title) pairs.
# The any()/sum() results are printed explicitly: a notebook cell only renders
# its last expression, so bare expressions higher up produce no output.
pair_keys = ["user_id", "title"]
repeat_mask = ratings_df.duplicated(pair_keys)
print(f"any duplicate (user_id, title) pairs: {repeat_mask.any()}")
print(f"rows repeating an earlier (user_id, title) pair: {repeat_mask.sum()}")

# keep=False flags every member of a duplicated group (including the first),
# so complete groups can be viewed side by side in chronological order.
duplicates = ratings_df[ratings_df.duplicated(pair_keys, keep=False)]
duplicates = duplicates.sort_values(["user_id", "title", "review/time"])
duplicates.head(20)

In [None]:
# Distribution of how many rows each (user_id, title) pair has:
# the index is the number of rows per pair, the value is how many pairs have that many rows.
ratings_df.groupby(["user_id", "title"]).size().value_counts().sort_index()

#### **manipulation strategies**

In [None]:
import html
def normalize_text_field(cleaned_col):
    """Return a cleaned, single-spaced string version of a text column.

    Steps, in order:
      1. missing values become "" and everything is cast to ``str``;
      2. HTML tags (``<...>``) are removed;
      3. HTML entities (``&amp;`` etc.) are decoded;
      4. every run of whitespace (newlines, tabs, non-breaking spaces, ...)
         becomes one ASCII space;
      5. remaining non-printable characters are dropped;
      6. spaces are collapsed again and the ends are trimmed.

    Step 6 is needed because dropping a non-printable character that sat
    between two spaces (or at the start of the string) would otherwise leave a
    double or leading space, so the result would not be fully normalized.

    NOTE(review): strings containing such characters now normalize differently
    than before. If another dataset is joined on this column, it presumably
    has to be normalized with the same function — confirm.

    Parameters
    ----------
    cleaned_col : pandas.Series
        Column of text values; may contain missing values.

    Returns
    -------
    pandas.Series
        Series of ``str`` with the same index as the input.
    """
    # Ensure string type
    cleaned_col = cleaned_col.fillna("").astype(str)
    # Remove HTML tags
    cleaned_col = cleaned_col.str.replace(r"<[^>]+>", "", regex=True)
    # Decode HTML entities
    cleaned_col = cleaned_col.apply(html.unescape)
    # Turn every whitespace run (this already covers \n, \t and \r) into one space.
    # Must happen BEFORE the non-printable filter: characters such as U+00A0 are
    # whitespace but not printable, and should become a space rather than vanish.
    cleaned_col = cleaned_col.str.replace(r"\s+", " ", regex=True)
    # Remove control/non-printable characters
    cleaned_col = cleaned_col.apply(lambda s: "".join(ch for ch in s if ch.isprintable()))
    # Removing characters can bring spaces together or expose one at an edge.
    cleaned_col = cleaned_col.str.replace(r" {2,}", " ", regex=True).str.strip()
    return cleaned_col

In [54]:
# Rebuild the cleaned ratings table from the files on disk, so this cell gives
# the same result no matter what earlier cells did to the in-memory frames.
raw_books_df = pd.read_csv("../data/raw/books_data.csv")
books_df = pd.read_csv("../data/clean/cleaned_books_data.csv")
ratings_df = pd.read_csv("../data/raw/Books_rating.csv")

raw_books_df.columns = raw_books_df.columns.str.lower()
books_df.columns = books_df.columns.str.lower()
ratings_df.columns = ratings_df.columns.str.lower()

# Drop rows with missing title, user_id, review/score.
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a slice of the raw table (SettingWithCopyWarning).
ratings_df = ratings_df.dropna(subset=["title", "user_id", "review/score"]).copy()
print(f"data size after dropping rows with no title or user_id or review/score: {ratings_df.shape[0]}")

# Normalize titles the same way before matching them against the clean catalog.
ratings_df["title"] = normalize_text_field(ratings_df["title"])
# Keep one catalog row per title so the inner join can only filter ratings,
# never multiply them when a title appears more than once in the catalog.
books_ratings_df = pd.merge(
    ratings_df,
    books_df.drop_duplicates(subset="title"),
    on="title",
    how="inner",
)
ratings_df = books_ratings_df[ratings_df.columns].copy()
print(f"data size after inner joining on clean book titles: {ratings_df.shape[0]}")

# Force UTC awareness during conversion
ratings_df["datetime"] = pd.to_datetime(ratings_df["review/time"], unit="s", utc=True)
# Keep the most recent review per (user_id, title). A stable sort makes the
# choice well-defined when several reviews share a timestamp: the row that
# came last in the table wins.
ratings_df = (
    ratings_df
    .sort_values("review/time", kind="stable")
    .drop_duplicates(subset=["user_id", "title"], keep="last")
)
print(f"data size after removing duplicate (user_id, title) pairs: {ratings_df.shape[0]}")

# Map scores to a confidence weight: everything up to 3 -> 0, 4 -> 1, 5 -> 2.
ratings_df["confidence"] = ratings_df["review/score"].clip(lower=3) - 3

print(f"final shape now: {ratings_df.shape}")

data size after dropping rows with no title or user_id or review/score: 2438018
data size after inner joining on clean book titles: 1801602
data size after removing duplicate (user_id, title) pairs: 1569596
final shape now: (1569596, 12)


In [55]:
# Summary statistics of the user-book interaction table before any
# activity-threshold filtering.
interactions_df = ratings_df

n_interactions = len(interactions_df)
n_users = interactions_df['user_id'].nunique()
n_books = interactions_df['title'].nunique()

print(f"Total interactions: {n_interactions}")
print(f"Unique users: {n_users}")
print(f"Unique books: {n_books}")

# User–item matrix sparsity: share of (user, book) cells with no interaction.
sparsity_pct = 100 * (1 - n_interactions / (n_users * n_books))
print(f"Sparsity: {sparsity_pct:.4f}%")

mean_per_user = n_interactions / n_users
mean_per_book = n_interactions / n_books
print(f"Avg interactions per user: {mean_per_user:.2f}")
print(f"Avg interactions per book: {mean_per_book:.2f}")

interactions_per_user = interactions_df.groupby('user_id').size()
print(f"Median interactions per user: {interactions_per_user.median()}")

confidence_counts = interactions_df['confidence'].value_counts().sort_index()
print(f"Confidence distribution:\n{confidence_counts}")

Total interactions: 1569596
Unique users: 822137
Unique books: 132539
Sparsity: 99.9986%
Avg interactions per user: 1.91
Avg interactions per book: 11.84
Median interactions per user: 1.0
Confidence distribution:
confidence
0.0    318904
1.0    316353
2.0    934339
Name: count, dtype: int64


In [57]:
# Single-pass activity filtering followed by the same summary statistics as above.
# Thresholds: users need at least MIN_USER_INTERACTIONS and at most
# MAX_USER_INTERACTIONS rows; books need at least MIN_BOOK_INTERACTIONS rows.
MIN_USER_INTERACTIONS = 2
MAX_USER_INTERACTIONS = 500
MIN_BOOK_INTERACTIONS = 5

# Group sizes are looked up per row with value_counts() + map() and turned into
# a boolean mask. This keeps the same rows, in the same order, as
# groupby(...).filter(lambda ...), without one Python call per group.
rows_per_user = ratings_df["user_id"].map(ratings_df["user_id"].value_counts())
ratings_filtered = ratings_df[rows_per_user >= MIN_USER_INTERACTIONS]
print(f"data size after dropping user interactions < {MIN_USER_INTERACTIONS}: {ratings_filtered.shape}")

rows_per_user = ratings_filtered["user_id"].map(ratings_filtered["user_id"].value_counts())
ratings_filtered = ratings_filtered[rows_per_user <= MAX_USER_INTERACTIONS]
print(f"data size after dropping user interactions > {MAX_USER_INTERACTIONS}: {ratings_filtered.shape}")

# Not iterated: removing books here can push some users back below
# MIN_USER_INTERACTIONS.
rows_per_book = ratings_filtered["title"].map(ratings_filtered["title"].value_counts())
ratings_filtered = ratings_filtered[rows_per_book >= MIN_BOOK_INTERACTIONS]
print(f"data size after dropping book interactions < {MIN_BOOK_INTERACTIONS}: {ratings_filtered.shape}")

interactions_df = ratings_filtered

n_users = interactions_df['user_id'].nunique()
n_books = interactions_df['title'].nunique()
n_interactions = len(interactions_df)

print(f"Total interactions: {n_interactions}")
print(f"Unique users: {n_users}")
print(f"Unique books: {n_books}")
# User–item matrix sparsity
print(f"Sparsity: {100 * (1 - n_interactions / (n_users * n_books)):.4f}%")
print(f"Avg interactions per user: {n_interactions / n_users:.2f}")
print(f"Avg interactions per book: {n_interactions / n_books:.2f}")
print(f"Median interactions per user: {interactions_df.groupby('user_id').size().median()}")
print(f"Confidence distribution:\n{interactions_df['confidence'].value_counts().sort_index()}")

data size after dropping user interactions < 2: (956240, 12)
data size after dropping user interactions < 500: (941783, 12)
data size after dropping book interactions < 5: (813068, 12)
Total interactions: 813068
Unique users: 199492
Unique books: 28104
Sparsity: 99.9855%
Avg interactions per user: 4.08
Avg interactions per book: 28.93
Median interactions per user: 2.0
Confidence distribution:
confidence
0.0    169159
1.0    180416
2.0    463493
Name: count, dtype: int64


In [44]:
# Show every row that belongs to a user with at least 500 interactions.
# NOTE(review): run top to bottom, `interactions_df` has already been limited to
# users with at most MAX_USER_INTERACTIONS (500) rows by the filtering cell above,
# so this can only match users with exactly 500 rows. The saved output appears to
# come from an earlier execution without that cap — confirm and re-run.
interactions_df.groupby("user_id").filter(lambda x: len(x) >= 500)

Unnamed: 0,id,title,price,user_id,profilename,review/helpfulness,review/score,review/time,review/summary,review/text,datetime,confidence
1279,087584782X,The Living Company,,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",8/8,5.0,917136000,This book is must reading for any leader or as...,Arie de Geus is probably the most unique busin...,1999-01-24 00:00:00+00:00,2.0
1220777,0375407723,The Brand You 50 : Or : Fifty Ways to Transfor...,13.04,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",11/11,5.0,938736000,Guide for the Free Agent Employee (Contractor)...,There has been a lot written and said about ho...,1999-10-01 00:00:00+00:00,2.0
972152,B000IN8DHQ,O is for Outlaw,,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",3/3,5.0,941500800,An Authentic Period Piece from the 60s and 80s,Much of the charm of many detective series rel...,1999-11-02 00:00:00+00:00,2.0
1030705,B00005JH2U,O is for Outlaw: A Kinsey Millhone Mystery,,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",3/3,5.0,941500800,An Authentic Period Piece from the 60s and 80s,Much of the charm of many detective series rel...,1999-11-02 00:00:00+00:00,2.0
298703,B000MK50EE,Goodnight Moon,,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",83/88,5.0,941846400,Read this Book to Create Happy Memories and Pl...,This is a book that I read every night to my c...,1999-11-06 00:00:00+00:00,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
182890,B000HQVVDW,GAUDY NIGHT (A LORD PETER WIMSEY MYSTERY),,AFVQZQ8PW0L,Harriet Klausner,1/1,5.0,1350345600,engaging historical mystery,Mystery writer Harriet Vane returns to Oxford'...,2012-10-16 00:00:00+00:00,2.0
1642821,B0006IU3BM,The Shadow Rising: Book Four of The Wheel of Time,,AFVQZQ8PW0L,Harriet Klausner,0/0,4.0,1351814400,enjoyable entry,"The Shadow RisingRobert JordanTor, Oct 2 2012,...",2012-11-02 00:00:00+00:00,1.0
791326,B00087P9HY,The decline and fall of the Roman empire (Ever...,,AHD101501WCN1,"Shalom Freedman ""Shalom Freedman""",0/0,5.0,1358380800,A masterful abridgment of a masterpiece,This is a mastefully done abridgment of a grea...,2013-01-17 00:00:00+00:00,2.0
776802,B000JK56M8,The Decline and Fall of the Roman Empire 7 Vol...,,AHD101501WCN1,"Shalom Freedman ""Shalom Freedman""",0/0,5.0,1358380800,A masterful abridgment of a masterpiece,This is a mastefully done abridgment of a grea...,2013-01-17 00:00:00+00:00,2.0


In [45]:
# All interactions of one specific reviewer, selected by a hard-coded user_id.
# NOTE(review): presumably one of the heavy users surfaced by the previous cell — confirm.
interactions_df[interactions_df["user_id"] == "A1K1JW1C5CUSUZ"]

Unnamed: 0,id,title,price,user_id,profilename,review/helpfulness,review/score,review/time,review/summary,review/text,datetime,confidence
1279,087584782X,The Living Company,,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",8/8,5.0,917136000,This book is must reading for any leader or as...,Arie de Geus is probably the most unique busin...,1999-01-24 00:00:00+00:00,2.0
1220777,0375407723,The Brand You 50 : Or : Fifty Ways to Transfor...,13.04,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",11/11,5.0,938736000,Guide for the Free Agent Employee (Contractor)...,There has been a lot written and said about ho...,1999-10-01 00:00:00+00:00,2.0
972152,B000IN8DHQ,O is for Outlaw,,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",3/3,5.0,941500800,An Authentic Period Piece from the 60s and 80s,Much of the charm of many detective series rel...,1999-11-02 00:00:00+00:00,2.0
1030705,B00005JH2U,O is for Outlaw: A Kinsey Millhone Mystery,,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",3/3,5.0,941500800,An Authentic Period Piece from the 60s and 80s,Much of the charm of many detective series rel...,1999-11-02 00:00:00+00:00,2.0
298703,B000MK50EE,Goodnight Moon,,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",83/88,5.0,941846400,Read this Book to Create Happy Memories and Pl...,This is a book that I read every night to my c...,1999-11-06 00:00:00+00:00,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1213494,078626988X,Dark Voyage,,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",0/0,5.0,1348099200,This Is My Favorite of the Furst Spy Novels,"""Your tackle is loosed,They could not strength...",2012-09-20 00:00:00+00:00,2.0
766457,B0006P1ZAS,"The great divorce,",,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",2/2,5.0,1348531200,A Metaphor for Gaining Salvation You Won't Soo...,"""And those who heard it said, 'Who then can be...",2012-09-25 00:00:00+00:00,2.0
1164647,0025705504,The Great Divorce,,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",2/2,5.0,1348531200,A Metaphor for Gaining Salvation You Won't Soo...,"""And those who heard it said, 'Who then can be...",2012-09-25 00:00:00+00:00,2.0
1012637,B000736QWG,The great divorce,,A1K1JW1C5CUSUZ,"Donald Mitchell ""Jesus Loves You!""",2/2,5.0,1348531200,A Metaphor for Gaining Salvation You Won't Soo...,"""And those who heard it said, 'Who then can be...",2012-09-25 00:00:00+00:00,2.0


In [53]:
# Number of users whose interaction count reaches the heavy-user cap.
# Sorting is unnecessary before a boolean count, and the threshold comes from
# MAX_USER_INTERACTIONS instead of a repeated literal.
(interactions_df.groupby("user_id").size() >= MAX_USER_INTERACTIONS).sum()

6

#### **building the recommender**

In [32]:
# Load the cleaned catalog and cleaned ratings from their Feather files,
# lower-casing the column names on the way in.
catalog_books_df = pd.read_feather("../data/clean/cleaned_books_data.ftr").rename(columns=str.lower)
cf_ratings_df = pd.read_feather("../data/clean/cleaned_ratings_data.ftr").rename(columns=str.lower)

In [33]:
# Sanity-check the (rows, columns) of both cleaned tables right after loading.
print(catalog_books_df.shape)
print(cf_ratings_df.shape)
# Shapes recorded from a previous run, kept for reference:
# catalog (143815, 6)
# ratings (1360308, 7)

(143815, 6)
(1360308, 7)


In [34]:
# Activity thresholds for the collaborative-filtering training set.
MIN_USER_INTERACTIONS = 5
MAX_USER_INTERACTIONS = 500
MIN_BOOK_INTERACTIONS = 5

# Alternate user-side and book-side pruning until a full pass removes no rows.
# Repetition is needed because dropping rare books can push a user below the
# minimum, and dropping such users can push a book below its minimum.
prev_len = -1
while len(cf_ratings_df) != prev_len:
    prev_len = len(cf_ratings_df)

    # Keep users whose row count lies within [MIN, MAX] (both bounds inclusive).
    user_counts = cf_ratings_df["user_id"].value_counts()
    users_in_range = user_counts[
        user_counts.between(MIN_USER_INTERACTIONS, MAX_USER_INTERACTIONS)
    ].index
    cf_ratings_df = cf_ratings_df[cf_ratings_df["user_id"].isin(users_in_range)]

    # Keep books with at least MIN_BOOK_INTERACTIONS remaining rows.
    book_counts = cf_ratings_df["book_id"].value_counts()
    books_in_range = book_counts[book_counts.ge(MIN_BOOK_INTERACTIONS)].index
    cf_ratings_df = cf_ratings_df[cf_ratings_df["book_id"].isin(books_in_range)]

# Contiguous integer positions for the surviving users and books,
# numbered in order of first appearance.
unique_users = cf_ratings_df["user_id"].unique()
unique_books = cf_ratings_df["book_id"].unique()

user_to_idx = dict(zip(unique_users, range(len(unique_users))))
book_to_idx = dict(zip(unique_books, range(len(unique_books))))

In [39]:
# Check what container Series.unique() returns for the user_id column.
type(cf_ratings_df["user_id"].unique())

numpy.ndarray

In [14]:
# Row counts by confidence level: any positive confidence, then exactly 1, then exactly 2.
print(cf_ratings_df[cf_ratings_df["confidence"] > 0].shape)
print(cf_ratings_df[cf_ratings_df["confidence"] == 1].shape)
print(cf_ratings_df[cf_ratings_df["confidence"] == 2].shape)

(1360308, 7)
(342021, 7)
(1018287, 7)


In [15]:
# Users universe
U = cf_ratings_df['user_id'].unique()
n_users = len(U)

# Items universe (Full Catalog)
I_full = catalog_books_df['book_id'].unique()
n_items_full = len(I_full)

# CF-trainable items (Books that actually appear in ratings)
I_cf = cf_ratings_df['book_id'].unique()
n_items_cf = len(I_cf)

print(f"Universe size: {n_users:,} users x {n_items_full:,} items")
print(f"CF-trainable items: {n_items_cf:,} ({100*n_items_cf/n_items_full:.1f}% of catalog)")
print(f"Cold-start items: {n_items_full - n_items_cf:,}")

Universe size: 701,771 users x 143,815 items
CF-trainable items: 135,693 (94.4% of catalog)
Cold-start items: 8,122


In [None]:
# Create user ID to integer index mapping
user_to_idx = {user_id: idx for idx, user_id in enumerate(U)}
idx_to_user = {idx: user_id for user_id, idx in user_to_idx.items()}

# Create book ID to integer index mapping (only CF-trainable books)
book_to_idx = {book_id: idx for idx, book_id in enumerate(I_cf)}
idx_to_book = {idx: book_id for book_id, idx in book_to_idx.items()}

In [8]:
# Read the cleaned ratings Feather file back from disk.
# NOTE(review): this rebinds `ratings_df`, which earlier cells use for the raw-CSV-based
# table — presumably a round-trip check of the saved file; confirm the name reuse is intended.
OUTPUT_RATINGS = "../data/clean/cleaned_ratings_data.ftr"
ratings_df = pd.read_feather(OUTPUT_RATINGS)

In [9]:
# (rows, columns) of the ratings table just read back from disk.
ratings_df.shape

(1360308, 7)