In [20]:
import pandas as pd
import numpy as np
import joblib
from scipy import sparse
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import ndcg_score
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv(r"..\data\processed\gutendex-cleaned-dataset_v3.csv")

tfidf_sparse = sparse.load_npz(r"../models/tfidf_matrix.npz")
feature_names = np.load(r"../models/tfidf_feature_names.npy", allow_pickle=True)

print("Dataset shape:", df.shape)
print("TF-IDF shape:", tfidf_sparse.shape)


Dataset shape: (9149, 7)
TF-IDF shape: (9149, 11742)


In [21]:
knn_tfidf = NearestNeighbors(n_neighbors=11, metric='cosine')
knn_tfidf.fit(tfidf_sparse)

distances_a, indices_a = knn_tfidf.kneighbors(tfidf_sparse)

similarities_a = 1 - distances_a[:, 1:]
mean_similarity_a = np.mean(similarities_a)

print("Model A Mean Top-10 Similarity:", mean_similarity_a)


Model A Mean Top-10 Similarity: 0.3228167287569576


In [22]:
svd_200 = TruncatedSVD(n_components=200, random_state=42)
tfidf_reduced = svd_200.fit_transform(tfidf_sparse)

knn_svd = NearestNeighbors(n_neighbors=11, metric='cosine')
knn_svd.fit(tfidf_reduced)

distances_b, indices_b = knn_svd.kneighbors(tfidf_reduced)

similarities_b = 1 - distances_b[:, 1:]
mean_similarity_b = np.mean(similarities_b)

print("Model B Mean Top-10 Similarity:", mean_similarity_b)


Model B Mean Top-10 Similarity: 0.7293997731445321


In [23]:
ndcg_scores_a = []
ndcg_scores_b = []

for i in range(len(df)):
    true_relevance = similarities_a[i].reshape(1, -1)
    predicted_scores_a = similarities_a[i].reshape(1, -1)
    predicted_scores_b = similarities_b[i].reshape(1, -1)
    
    ndcg_scores_a.append(ndcg_score(true_relevance, predicted_scores_a))
    ndcg_scores_b.append(ndcg_score(true_relevance, predicted_scores_b))

print("Model A Mean NDCG@10:", np.mean(ndcg_scores_a))
print("Model B Mean NDCG@10:", np.mean(ndcg_scores_b))


Model A Mean NDCG@10: 1.0
Model B Mean NDCG@10: 0.9999995939442156


In [24]:
popularity_a = []
popularity_b = []

for i in range(len(df)):
    rec_indices_a = indices_a[i][1:]
    rec_indices_b = indices_b[i][1:]
    
    popularity_a.append(df.iloc[rec_indices_a]['download_count'].mean())
    popularity_b.append(df.iloc[rec_indices_b]['download_count'].mean())

print("Model A Avg Recommended Popularity:", np.mean(popularity_a))
print("Model B Avg Recommended Popularity:", np.mean(popularity_b))


Model A Avg Recommended Popularity: 2037.6629139796698
Model B Avg Recommended Popularity: 1992.5407804131596


In [25]:
print("Model A Similarity Std:", np.std(similarities_a))
print("Model B Similarity Std:", np.std(similarities_b))


Model A Similarity Std: 0.1403320799728983
Model B Similarity Std: 0.11590120017997045


In [26]:
def parse_subjects(s):
    if pd.isna(s):
        return set()
    return set([x.strip().lower() for x in s.split(",")])

subject_sets = df["subjects"].apply(parse_subjects).tolist()

def compute_ndcg(similarities, indices, k=10):
    ndcg_scores = []
    for i in range(len(df)):
        rec_indices = indices[i][1:k+1]
        true_relevance = []
        predicted_scores = similarities[i][:k]
        base_subjects = subject_sets[i]
        for idx in rec_indices:
            overlap = len(base_subjects.intersection(subject_sets[idx]))
            true_relevance.append(1 if overlap > 0 else 0)
        if sum(true_relevance) == 0:
            continue
        ndcg = ndcg_score([true_relevance], [predicted_scores])
        ndcg_scores.append(ndcg)
    return np.mean(ndcg_scores)

ndcg_a = compute_ndcg(similarities_a, indices_a, k=10)
ndcg_b = compute_ndcg(similarities_b, indices_b, k=10)

print("Model A Proper NDCG@10:", ndcg_a)
print("Model B Proper NDCG@10:", ndcg_b)


Model A Proper NDCG@10: 0.8523426806829285
Model B Proper NDCG@10: 0.8533672185333697


In [27]:
def compute_random_ndcg(k=10):
    ndcg_scores = []
    all_indices = np.arange(len(df))
    
    for i in range(len(df)):
        base_subjects = subject_sets[i]
        random_indices = np.random.choice(all_indices[all_indices != i], size=k, replace=False)
        
        true_relevance = []
        for idx in random_indices:
            overlap = len(base_subjects.intersection(subject_sets[idx]))
            true_relevance.append(1 if overlap > 0 else 0)
        
        if sum(true_relevance) == 0:
            continue
        
        random_scores = np.random.rand(k)
        ndcg = ndcg_score([true_relevance], [random_scores])
        ndcg_scores.append(ndcg)
    
    return np.mean(ndcg_scores)

random_ndcg = compute_random_ndcg(k=10)
print("Random Baseline NDCG@10:", random_ndcg)


Random Baseline NDCG@10: 0.4518251620569098


In [28]:
def compute_holdout_ndcg(similarities, indices, k=10):
    ndcg_scores = []
    
    for i in range(len(df)):
        subjects = list(subject_sets[i])
        if len(subjects) < 2:
            continue
        
        held_out = subjects[0]
        remaining = set(subjects[1:])
        
        rec_indices = indices[i][1:k+1]
        predicted_scores = similarities[i][:k]
        
        true_relevance = []
        for idx in rec_indices:
            overlap = held_out in subject_sets[idx]
            true_relevance.append(1 if overlap else 0)
        
        if sum(true_relevance) == 0:
            continue
        
        ndcg = ndcg_score([true_relevance], [predicted_scores])
        ndcg_scores.append(ndcg)
    
    return np.mean(ndcg_scores)

holdout_a = compute_holdout_ndcg(similarities_a, indices_a, k=10)
holdout_b = compute_holdout_ndcg(similarities_b, indices_b, k=10)

print("Model A Hold-out NDCG@10:", holdout_a)
print("Model B Hold-out NDCG@10:", holdout_b)


Model A Hold-out NDCG@10: 0.8613488644808529
Model B Hold-out NDCG@10: 0.8611998388150808


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

df_=pd.read_csv(r"../data/processed/transformed_dataset_v1.csv")
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.8)

tfidf_sparse = tfidf_vectorizer.fit_transform(df_["combined_text"])

knn_tfidf = NearestNeighbors(n_neighbors=11, metric='cosine')
knn_tfidf.fit(tfidf_sparse)

joblib.dump(tfidf_vectorizer, r"../models/tfidf_vectorizer.pkl")
joblib.dump(knn_tfidf, r"../models/knn_tfidf.pkl")
sparse.save_npz(r"../models/tfidf_matrix.npz", tfidf_sparse)


In [33]:
#Testing

def recommend_books(title, k=10):
    matches = df[df["title"].str.lower().str.contains(title.lower(), na=False)]
    
    if matches.empty:
        print("No matching title found.")
        return
    
    idx = matches.index[0]
    
    print("\nSelected Book:")
    print(df.iloc[idx]["title"])
    print("-" * 60)
    
    print("\nModel A (TF-IDF + KNN) Recommendations:\n")
    rec_indices_a = indices_a[idx][1:k+1]
    for i, rec_idx in enumerate(rec_indices_a):
        print(f"{i+1}. {df.iloc[rec_idx]['title']}  |  Similarity: {similarities_a[idx][i]:.4f}")
    
    print("\nModel B (SVD + KNN) Recommendations:\n")
    rec_indices_b = indices_b[idx][1:k+1]
    for i, rec_idx in enumerate(rec_indices_b):
        print(f"{i+1}. {df.iloc[rec_idx]['title']}  |  Similarity: {similarities_b[idx][i]:.4f}")


title=input("Enter a book name: ")
recommend_books(title)


Selected Book:
Frankenstein; or, the modern prometheus
------------------------------------------------------------

Model A (TF-IDF + KNN) Recommendations:

1. Frankenstein; Or, The Modern Prometheus  |  Similarity: 0.8686
2. The Last Man  |  Similarity: 0.3620
3. The Vampyre; a Tale  |  Similarity: 0.3292
4. Black magic : $b A tale of the rise and fall of Antichrist  |  Similarity: 0.3136
5. Dracula  |  Similarity: 0.3081
6. The Beetle: A Mystery  |  Similarity: 0.3030
7. Northanger Abbey  |  Similarity: 0.2782
8. The Life and Letters of Mary Wollstonecraft Shelley, Volume 1 (of 2)  |  Similarity: 0.2664
9. A Princess of Mars  |  Similarity: 0.2553
10. Wuthering Heights  |  Similarity: 0.2527

Model B (SVD + KNN) Recommendations:

1. Frankenstein; Or, The Modern Prometheus  |  Similarity: 0.9810
2. The Last Man  |  Similarity: 0.8346
3. Dracula  |  Similarity: 0.7924
4. The colour out of space  |  Similarity: 0.7559
5. At the mountains of madness  |  Similarity: 0.7249
6. The shadow

In [31]:
joblib.dump(knn_tfidf, r"..\models\knn_tfidf.pkl")
joblib.dump(tfidf_vectorizer, r"..\models\tfidf_vectorizer.pkl")


['..\\models\\tfidf_vectorizer.pkl']

In [32]:
def find_book(title_input):
    title_input = title_input.lower().strip()
    
    exact_match = df[df["title"].str.lower() == title_input]
    if not exact_match.empty:
        return exact_match.index.tolist()
    
    partial_match = df[df["title"].str.lower().str.contains(title_input, na=False)]
    if not partial_match.empty:
        return partial_match.index.tolist()
    
    return []

def recommend_existing(idx, k=10):
    distances, indices = knn_tfidf.kneighbors(tfidf_sparse[idx], n_neighbors=k+1)

    similarities = 1 - distances[0][1:]
    rec_indices = indices[0][1:]
    
    print("\nRecommendations:\n")
    for i, rec_idx in enumerate(rec_indices):
        print(f"{i+1}. {df.iloc[rec_idx]['title']}  | Similarity: {similarities[i]:.4f}")

def recommend_new(title, summary, k=10):
    words = summary.split()
    if len(words) > 300:
        summary = " ".join(words[:300])
    
    combined_text = (title + " " + summary).lower()
    new_vector = vectorizer.transform([combined_text])
    
    distances, indices = knn_model.kneighbors(new_vector, n_neighbors=k)
    similarities = 1 - distances[0]
    
    print("\nRecommendations:\n")
    for i, rec_idx in enumerate(indices[0]):
        print(f"{i+1}. {df.iloc[rec_idx]['title']}  | Similarity: {similarities[i]:.4f}")

def run_recommender():
    title_input = input("Enter book title: ").strip()
    
    matches = find_book(title_input)
    
    if len(matches) == 1:
        print("\nBook found in dataset:")
        print(df.iloc[matches[0]]["title"])
        recommend_existing(matches[0])
    
    elif len(matches) > 1:
        print("\nMultiple matches found:\n")
        for i, idx in enumerate(matches[:10]):
            print(f"{i+1}. {df.iloc[idx]['title']}")
        
        choice = input("\nSelect book number: ").strip()
        
        if choice.isdigit() and 1 <= int(choice) <= len(matches[:10]):
            selected_idx = matches[int(choice)-1]
            recommend_existing(selected_idx)
        else:
            print("Invalid selection.")
    
    else:
        print("\nBook not found in dataset.")
        summary = input("Enter short summary (max 300 words): ").strip()
        recommend_new(title_input, summary)

run_recommender()



Book found in dataset:
Pride and Prejudice

Recommendations:

1. Pride and Prejudice, a play founded on Jane Austen's novel  | Similarity: 0.3819
2. Sense and Sensibility  | Similarity: 0.3582
3. Persuasion  | Similarity: 0.3480
4. The Complete Project Gutenberg Works of Jane Austen: A Linked Index of all PG Editions of Jane Austen  | Similarity: 0.3370
5. Northanger Abbey  | Similarity: 0.3235
6. Jane Eyre  | Similarity: 0.2876
7. Jane Eyre: An Autobiography  | Similarity: 0.2869
8. Jane Austen and her works  | Similarity: 0.2840
9. Love and Freindship [sic]  | Similarity: 0.2753
10. Emma  | Similarity: 0.2748
