In [14]:
import nltk
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
!pip install kaggle



In [8]:
!pip install fuzzy

Collecting fuzzy
  Downloading Fuzzy-1.2.2.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fuzzy
  Building wheel for fuzzy (setup.py) ... [?25l[?25hdone
  Created wheel for fuzzy: filename=Fuzzy-1.2.2-cp311-cp311-linux_x86_64.whl size=220704 sha256=b1a826002649a5147fab870dff3e67e95ae507bf20564b496e3ddbfc0ed008f9
  Stored in directory: /root/.cache/pip/wheels/c7/1c/77/28af87176ebf6eb6208c17e64a45a8e48eda4194bd8f605096
Successfully built fuzzy
Installing collected packages: fuzzy
Successfully installed fuzzy-1.2.2


In [9]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [2]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [None]:
"""
 Collaborative Book Recommendation System
 ---------------------------------------
 Cleaned and modular version of the original notebook code.

 * Downloads the Amazon Books Reviews dataset via KaggleHub.
 * Performs data reduction, cleaning & preprocessing (title normalization, text cleaning).
 * Builds a TF–IDF + Truncated SVD content representation and a K‑NN model
   for item‑based recommendations.
 * Builds a user‑item matrix and a K‑NN model for user‑based CF and generates
   recommendations with a distance‑weighted score.

 Usage
 -----
 $ python collaborative_recommender.py
 """

import os
import re
import warnings
from pathlib import Path
from typing import Tuple

import kagglehub
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from rapidfuzz import fuzz, process
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

# ----------------------------------------------------------------------------
# Globals & constants
# ----------------------------------------------------------------------------
DATASET = "mohamedbakhet/amazon-books-reviews"
DATA_DIR = Path("data")
SAMPLE_FRACTION = 0.10  # 10 % sample for rapid iteration
MIN_RATINGS = 5         # Filter threshold for active users / popular books
RANDOM_STATE = 42

warnings.filterwarnings("ignore", category=FutureWarning)
plt.style.use("ggplot")

# ----------------------------------------------------------------------------
# Utility functions
# ----------------------------------------------------------------------------

def download_dataset() -> Path:
    path = Path(kagglehub.dataset_download(DATASET))
    print(f"✔ Downloaded dataset to {path}")
    return path


def find_csv(root: Path) -> Path:
    for fpath in root.rglob("*.csv"):
        if fpath.suffix == ".csv" and fpath.name.lower().endswith("rating.csv"):
            print(f"✔ Found CSV file: {fpath}")
            return fpath
    raise FileNotFoundError("No rating CSV found in dataset directory.")


def reduce_and_sample_dataset(csv_path: Path, out_path: Path, sample_frac=0.1, random_state=42) -> pd.DataFrame:
    df = pd.read_csv(csv_path)
    df = df.dropna(subset=['User_id'])
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    sampled_df = df.sample(frac=sample_frac, random_state=random_state).reset_index(drop=True)
    sampled_df.to_csv(out_path, index=False)
    print(f"✔ Saved {sample_frac*100:.0f}% sample to {out_path}")
    return sampled_df


# ---------------------------------------------------------------------------
# Text preprocessing helpers
# ---------------------------------------------------------------------------
STOP_WORDS = set(stopwords.words("english"))
LEMMATIZER = WordNetLemmatizer()
SIA = SentimentIntensityAnalyzer()

def preprocess_summary(text: str) -> str:
    text = text or ""
    tokens = nltk.word_tokenize(text)
    tokens = [w.lower() for w in tokens if w.isalpha() and w.lower() not in STOP_WORDS]
    lemmas = [LEMMATIZER.lemmatize(t) for t in tokens]
    return " ".join(lemmas)


def clean_title(text) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"\s*\(.*?\)\s*", "", text)
    text = text.replace("&", "and")
    text = re.sub(r"[^\w\s]", "", text)
    return text.strip()


def lotr_title(text: str) -> str:
    return "the lord of the rings" if isinstance(text, str) and re.match(r"the lord of the rings.*", text) else text


def _best_match(title: str, choices: np.ndarray, threshold: int = 80) -> str:
    """Return the best fuzzy match exceeding *threshold* or the original title."""
    result = process.extractOne(title, choices, scorer=fuzz.token_sort_ratio)
    if result is None:
        return title  # no candidates
    match = result[0]
    score = result[1]
    return match if score >= threshold else title


def fuzzy_dedupe_titles(chunk: pd.Series) -> pd.Series:
    unique_titles = chunk.unique()
    return chunk.apply(lambda x: _best_match(x, unique_titles))



In [15]:

# ---------------------------------------------------------------------------
# Main ETL pipeline
# ---------------------------------------------------------------------------

def basic_clean(df: pd.DataFrame) -> pd.DataFrame:
    df = df.drop(columns=[
        "Price", "profileName", "review/helpfulness", "review/time", "review/text",
    ], errors="ignore")
    df["Title"] = df["Title"].apply(clean_title).apply(lotr_title)
    df = df[df["Title"].astype(bool)]
    chunks = np.array_split(df, 10)
    processed = []
    for c in chunks:
        c["Title"] = fuzzy_dedupe_titles(c["Title"])
        processed.append(c)
    df = pd.concat(processed)
    df["Title"] = df["Title"].str.title()
    return df.drop_duplicates()

def enrich_text_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray]:
    df["cleaned_summary"] = df["review/summary"].fillna("").apply(preprocess_summary)
    vectorizer = TfidfVectorizer(max_features=1000)
    tfidf = vectorizer.fit_transform(df["cleaned_summary"])
    svd = TruncatedSVD(n_components=100, random_state=RANDOM_STATE)
    svd_matrix = svd.fit_transform(tfidf)
    df["sentiment"] = df["review/summary"].fillna("").apply(lambda x: SIA.polarity_scores(x)["compound"])
    return df, svd_matrix


def filter_active(df: pd.DataFrame) -> pd.DataFrame:
    active_users = df["User_id"].value_counts()
    active_users = active_users[active_users >= MIN_RATINGS].index
    popular_books = df["Title"].value_counts()
    popular_books = popular_books[popular_books >= MIN_RATINGS].index
    return df[df["User_id"].isin(active_users) & df["Title"].isin(popular_books)]


def build_item_knn(svd_matrix: np.ndarray) -> NearestNeighbors:
    knn = NearestNeighbors(n_neighbors=6, metric="cosine")
    knn.fit(svd_matrix)
    return knn


def build_user_knn(user_item: pd.DataFrame) -> NearestNeighbors:
    model = NearestNeighbors(n_neighbors=6, metric="cosine")
    model.fit(user_item)
    return model


def user_item_matrix(df: pd.DataFrame) -> pd.DataFrame:
    ui = df.pivot_table(index="User_id", columns="Title", values="review/score")
    return ui.fillna(0)

# ---------------------------------------------------------------------------
# Demo / script execution
# ---------------------------------------------------------------------------

def main():
    for pkg in ("punkt", "wordnet", "stopwords", "vader_lexicon"):
        nltk.download(pkg, quiet=True)

    DATA_DIR.mkdir(exist_ok=True)
    csv_path = find_csv(download_dataset())

    subset_path = DATA_DIR / "amazonCollaborativeSubset.csv"
    sampled = reduce_and_sample_dataset(csv_path, subset_path, SAMPLE_FRACTION, RANDOM_STATE)

    cleaned = basic_clean(sampled)
    enriched, svd_matrix = enrich_text_features(cleaned)
    filtered = filter_active(enriched)

    out_csv = DATA_DIR / "ratings2_processed.csv"
    filtered.to_csv(out_csv, index=False)
    print(f"✔ Saved cleaned sample to {out_csv}")

    item_knn = build_item_knn(svd_matrix)
    distances, idxs = item_knn.kneighbors(svd_matrix[0].reshape(1, -1))
    print("\nTop‑5 similar books to first item:")
    for rank, idx in enumerate(idxs[0][1:], 1):
        print(f" {rank}. {cleaned.iloc[idx]['Title']} (dist={distances[0][rank]:.3f})")

    ui = user_item_matrix(filtered)
    user_knn = build_user_knn(ui)
    dists, user_idxs = user_knn.kneighbors(ui.iloc[0].values.reshape(1, -1))
    target_uid = ui.index[0]
    neighbors = ui.index[user_idxs[0][1:]]
    neighbor_df = filtered[filtered["User_id"].isin(neighbors)]
    already_read = filtered[filtered["User_id"] == target_uid]["Title"].unique()
    recs = (neighbor_df[~neighbor_df["Title"].isin(already_read)]
            .groupby("Title")["review/score"].mean()
            .sort_values(ascending=False)[:10])

    print(f"\nTop recommendations for user {target_uid}:")
    for title, score in recs.items():
        print(f" - {title}: {score:.2f}")


if __name__ == "__main__":
    main()


✔ Downloaded dataset to /kaggle/input/amazon-books-reviews
✔ Found CSV file: /kaggle/input/amazon-books-reviews/Books_rating.csv
✔ Saved 10% sample to data/amazonCollaborativeSubset.csv
✔ Saved cleaned sample to data/ratings2_processed.csv

Top‑5 similar books to first item:
 1. Death In Hyde Park (dist=0.000)
 2. Poison Mind (dist=0.000)
 3. Imitation In Death (dist=0.000)
 4. Death Match (dist=0.000)
 5. Seduction In Death (dist=0.000)





Top recommendations for user A100V1W0C8BWOL:
 - Alfred Hitchcock Presents 12 Stories For Late At Night: 5.00
 - Pride And Prejudice: 5.00
 - Lord Of The Flies: 5.00
 - Their Eyes Were Watching God: 5.00
 - You Cannot Be Serious: 5.00
 - To Kill A Mocking Bird: 5.00
 - The Great Gatsby: 4.00
 - Huckleberry Finn: 4.00
 - Inherit The Wind: 4.00
 - Spark Notes Our Town: 4.00
