In [6]:
import nltk
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
!pip install kaggle



In [3]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/3.1 MB[0m [31m36.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [9]:
"""
 Test Script for Collaborative Book Recommender
 ---------------------------------------------
 Loads the pre-sampled & pre-cleaned CSV file from Google Drive
 and runs the full collaborative filtering + KNN logic.
"""

import os
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Constants
CSV_PATH = "SampleData.csv"

# Download NLTK resources
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("vader_lexicon")

# Setup
STOP_WORDS = set(stopwords.words("english"))
LEMMATIZER = WordNetLemmatizer()
SIA = SentimentIntensityAnalyzer()

# Load data
df = pd.read_csv(CSV_PATH)
print("\u2714 Loaded dataset:", df.shape)

# Preprocess summaries
def preprocess_summary(text):
    if pd.isna(text): return ""
    tokens = nltk.word_tokenize(text)
    tokens = [t.lower() for t in tokens if t.isalpha() and t.lower() not in STOP_WORDS]
    lemmas = [LEMMATIZER.lemmatize(t) for t in tokens]
    return " ".join(lemmas)

df["cleaned_summary"] = df["review/summary"].fillna("").apply(preprocess_summary)

# TF-IDF + SVD
vectorizer = TfidfVectorizer(max_features=1000)
tfidf = vectorizer.fit_transform(df["cleaned_summary"])
svd = TruncatedSVD(n_components=100, random_state=42)
svd_matrix = svd.fit_transform(tfidf)

# Sentiment
df["sentiment"] = df["review/summary"].fillna("").apply(lambda x: SIA.polarity_scores(x)["compound"])

# Filter active users & popular books
MIN_RATINGS = 5
user_counts = df["User_id"].value_counts()
book_counts = df["Title"].value_counts()
df = df[df["User_id"].isin(user_counts[user_counts >= MIN_RATINGS].index)]
df = df[df["Title"].isin(book_counts[book_counts >= MIN_RATINGS].index)]

# Item KNN
item_knn = NearestNeighbors(n_neighbors=6, metric="cosine")
item_knn.fit(svd_matrix)

# Recommend similar items
distances, indices = item_knn.kneighbors(svd_matrix[0].reshape(1, -1))
print("\nTop –5 similar books to first item:")
for rank, idx in enumerate(indices[0][1:], 1):
    if idx < len(df):
        print(f" {rank}. {df.iloc[idx]['Title']} (dist={distances[0][rank]:.3f})")

# User KNN
user_item_matrix = df.pivot_table(index="User_id", columns="Title", values="review/score").fillna(0)
user_knn = NearestNeighbors(n_neighbors=6, metric="cosine")
user_knn.fit(user_item_matrix)

target_uid = user_item_matrix.index[0]
dists, user_idxs = user_knn.kneighbors(user_item_matrix.iloc[0].values.reshape(1, -1))
neighbors = user_item_matrix.index[user_idxs[0][1:]]
neigh_df = df[df["User_id"].isin(neighbors)]
already_read = df[df["User_id"] == target_uid]["Title"].unique()

recs = (
    neigh_df[~neigh_df["Title"].isin(already_read)]
    .groupby("Title")["review/score"]
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

print(f"\nTop recommendations for user {target_uid}:")
for title, score in recs.items():
    print(f" - {title}: {score:.2f}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


✔ Loaded dataset from Google Drive: (243793, 5)

Top –5 similar books to first item:

Top recommendations for user A100V1W0C8BWOL:
 - Alfred Hitchcock Presents 12 Stories For Late At Night: 5.00
 - Lord Of The Flies: 5.00
 - Their Eyes Were Watching God: 5.00
 - You Cannot Be Serious: 5.00
 - To Kill A Mocking Bird: 5.00
 - The Scarlet Letter: 4.75
 - The Great Gatsby: 4.00
 - Huckleberry Finn: 4.00
 - Inherit The Wind: 4.00
 - Special Delivery: 4.00


