In [None]:
import pandas as pd
import numpy as np
import re
import html
from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, pairwise
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

# -------------------- A. Evaluation Metrics --------------------

def calculate_rmse(actual_ratings, predicted_ratings):
    """Return the root-mean-squared error between two sets of ratings.

    Args:
        actual_ratings: Ground-truth ratings, forwarded as the first argument
            of ``mean_squared_error``.
        predicted_ratings: Predicted ratings, forwarded as the second argument.

    Returns:
        The square root of the mean squared error.
    """
    mse = mean_squared_error(actual_ratings, predicted_ratings)
    return np.sqrt(mse)

def calculate_rmse_surprise(predictions):
    """Return the RMSE of Surprise predictions without printing it.

    Args:
        predictions: Predictions to score, handed unchanged to
            ``accuracy.rmse``.

    Returns:
        The value returned by ``accuracy.rmse`` with ``verbose=False``.
    """
    rmse = accuracy.rmse(predictions, verbose=False)
    return rmse

def precision_at_k(actual, predicted, k=10, threshold=4):
    """Compute precision@k for a single user.

    Args:
        actual (dict): Maps each item to the rating the user gave it.
        predicted (list): Recommended items, best first.
        k (int, optional): Number of leading recommendations to score.
            Defaults to 10.
        threshold (number, optional): Minimum rating for an item in
            ``actual`` to count as relevant. Defaults to 4.

    Returns:
        The number of relevant items among the first ``k`` recommendations
        divided by ``k`` (not by the number of recommendations actually
        supplied). Returns 0 when ``predicted`` is empty or ``k`` is not
        positive.
    """
    # A non-positive k used to divide by zero (k == 0) or produce a negative
    # "precision" (k < 0); there is nothing to score in either case.
    if k <= 0 or not predicted:
        return 0

    relevant = {item for item, rating in actual.items() if rating >= threshold}
    hits = relevant & set(predicted[:k])
    return len(hits) / k

def recall_at_k(actual, predicted, k=10, threshold=4):
    """Compute recall@k for a single user.

    Args:
        actual (dict): Maps each item to the rating the user gave it.
        predicted (list): Recommended items, best first.
        k (int, optional): Number of leading recommendations to score.
            Defaults to 10.
        threshold (number, optional): Minimum rating for an item in
            ``actual`` to count as relevant. Defaults to 4.

    Returns:
        The fraction of relevant items that appear among the first ``k``
        recommendations, or 0 when no item is relevant.
    """
    liked = [item for item, rating in actual.items() if rating >= threshold]
    shortlist = set(predicted[:k])
    if not liked:
        return 0
    found = sum(1 for item in liked if item in shortlist)
    return found / len(liked)

def precision_recall_at_k_surprise(predictions, k=10, threshold=4):
    """Compute per-user precision@k and recall@k from a list of predictions.

    Args:
        predictions: Iterable of 5-element records unpacked as
            ``(uid, _, true_r, est, _)``; only the user id, the true rating
            and the estimated rating are used.
        k (int, optional): Number of top-estimated items considered per user.
            Defaults to 10.
        threshold (number, optional): Rating at or above which an item counts
            as relevant (true rating) or recommended (estimate). Defaults to 4.

    Returns:
        tuple: ``(precisions, recalls)``, two dicts keyed by user id. A user
        with no recommended item in the top k gets precision 0; a user with no
        relevant item gets recall 0.
    """
    ratings_by_user = defaultdict(list)
    for record in predictions:
        uid, _, actual, guess, _ = record
        ratings_by_user[uid].append((guess, actual))

    precisions = {}
    recalls = {}
    for uid, pairs in ratings_by_user.items():
        # Highest estimate first; ties keep their original relative order.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        relevant_total = sum(1 for _, actual in ranked if actual >= threshold)

        recommended = [pair for pair in ranked[:k] if pair[0] >= threshold]
        relevant_recommended = sum(1 for _, actual in recommended if actual >= threshold)

        if recommended:
            precisions[uid] = relevant_recommended / len(recommended)
        else:
            precisions[uid] = 0

        if relevant_total:
            recalls[uid] = relevant_recommended / relevant_total
        else:
            recalls[uid] = 0

    return precisions, recalls

# -------------------- B. Collaborative Filtering --------------------

def load_and_prepare_cf_data(file="D:/ratings_cleaned.csv", rating_scale=(1, 10), test_size=0.2,
                             random_state=None):
    """Load the ratings CSV and split it into Surprise train and test sets.

    Args:
        file (str, optional): Path of the ratings CSV. It must contain the
            columns 'User-ID', 'ISBN' and 'Book-Rating'. The default is an
            absolute, machine-specific path, so callers will normally pass
            their own.
        rating_scale (tuple, optional): ``(min, max)`` rating scale given to
            the Surprise ``Reader``. Defaults to (1, 10).
        test_size (float, optional): Share of ratings placed in the test set.
            Defaults to 0.2.
        random_state (optional): Seed forwarded to Surprise's
            ``train_test_split`` so the split (and every metric computed from
            it) can be reproduced. Defaults to None, which keeps the previous
            unseeded behaviour.

    Returns:
        tuple: ``(data, trainset, testset)`` where ``data`` is the full
        Surprise dataset.
    """
    ratings = pd.read_csv(file)
    reader = Reader(rating_scale=rating_scale)
    data = Dataset.load_from_df(ratings[['User-ID', 'ISBN', 'Book-Rating']], reader)
    trainset, testset = train_test_split(data, test_size=test_size, random_state=random_state)
    return data, trainset, testset

def train_svd_model(trainset, n_factors=100):
    """Fit an SVD model on ``trainset`` with a fixed seed.

    Args:
        trainset: Training set passed to the model's ``fit`` method.
        n_factors (int, optional): Number of latent factors. Defaults to 100.

    Returns:
        The fitted ``SVD`` instance (created with ``random_state=42``).
    """
    model = SVD(random_state=42, n_factors=n_factors)
    model.fit(trainset)
    return model

def evaluate_cf_model(algo, testset, k=10, threshold=4):
    """Score a fitted collaborative-filtering model on a test set.

    Args:
        algo: Fitted model exposing a ``test(testset)`` method.
        testset: Held-out ratings handed to ``algo.test``.
        k (int, optional): Cut-off for precision/recall. Defaults to 10.
        threshold (number, optional): Relevance threshold for
            precision/recall. Defaults to 4.

    Returns:
        tuple: ``(rmse, avg_precision, avg_recall, predictions)`` where the
        averages are taken over users.
    """
    predictions = algo.test(testset)
    rmse = calculate_rmse_surprise(predictions)

    precision_by_user, recall_by_user = precision_recall_at_k_surprise(
        predictions, k=k, threshold=threshold
    )
    mean_precision = np.mean([*precision_by_user.values()])
    mean_recall = np.mean([*recall_by_user.values()])

    return rmse, mean_precision, mean_recall, predictions

# -------------------- C. Content-Based Filtering --------------------

def clean_text(text):
    """Normalise a text cell for TF-IDF: unescape HTML, lowercase, strip punctuation.

    Args:
        text: The cell value. Missing values (``None``/NaN) are accepted, and
            non-string values such as numbers are converted with ``str``
            instead of raising inside ``html.unescape``.

    Returns:
        str: The lower-cased text with every character other than ``a-z``,
        ``0-9`` and whitespace removed; ``''`` for missing values. Removed
        characters are dropped, not replaced, so whitespace around them is
        left as is.
    """
    if pd.isnull(text):
        return ''
    text = html.unescape(str(text)).lower()
    return re.sub(r'[^a-z0-9\s]', '', text)

def create_book_soup(books):
    """Clean the text columns of ``books`` and add a combined 'soup' column.

    The 'Book-Title', 'Book-Author' and 'Publisher' columns are overwritten in
    place with their cleaned form. 'soup' is the title, the author repeated
    three times, and the publisher, separated by spaces.

    Args:
        books (pd.DataFrame): Book table with the three text columns above.
            It is modified in place.

    Returns:
        pd.DataFrame: The same ``books`` object, for convenience.
    """
    for column in ('Book-Title', 'Book-Author', 'Publisher'):
        books[column] = books[column].apply(clean_text)

    # Repeating the author gives it more weight than title or publisher.
    author_block = (books['Book-Author'] + ' ') * 3
    books['soup'] = books['Book-Title'] + ' ' + author_block + books['Publisher']
    return books

def get_content_similarity(books):
    """Build the book-to-book cosine similarity matrix from the 'soup' column.

    Args:
        books (pd.DataFrame): Book table containing a 'soup' text column.

    Returns:
        The pairwise cosine similarity of the TF-IDF vectors of every row's
        soup (English stop words removed), one row and one column per book.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    soup_vectors = vectorizer.fit_transform(books['soup'])
    similarity = pairwise.cosine_similarity(soup_vectors, soup_vectors)
    return similarity

def recommend_content_based(title, indices, cosine_sim, books, top_n=10):
    """Return the titles of the books most similar to ``title``.

    Args:
        title (str): Title of the query book.
        indices: Mapping from title to the book's row position in
            ``cosine_sim`` / ``books``.
        cosine_sim: Square similarity matrix, indexed by row position.
        books (pd.DataFrame): Book table with a 'Book-Title' column.
        top_n (int, optional): Maximum number of titles to return.
            Defaults to 10.

    Returns:
        list: Up to ``top_n`` titles, most similar first. Empty when ``title``
        is not in ``indices``.
    """
    try:
        idx = indices[title]
    except KeyError:
        return []

    # Drop the query row explicitly. Slicing off the first sorted entry is
    # only correct when the query is the unique maximum; with tied
    # similarities (e.g. duplicate entries for the same book) it would discard
    # a genuine neighbour and return the query book itself.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]
    positions = [pos for pos, _ in ranked]
    return books['Book-Title'].iloc[positions].tolist()

def diversity(recommendations, indices, cosine_sim):
    """Return the average pairwise dissimilarity of a recommendation list.

    Args:
        recommendations (list): Recommended titles.
        indices: Mapping from title to row position in ``cosine_sim``.
        cosine_sim: Square similarity matrix, indexed by row position.

    Returns:
        The mean of ``1 - similarity`` over every unordered pair of
        recommendations whose titles are both present in ``indices``. Returns
        1.0 when there are fewer than two recommendations or no pair can be
        scored.
    """
    if len(recommendations) <= 1:
        return 1.0

    total_dissimilarity = 0
    scored_pairs = 0
    for position, first in enumerate(recommendations):
        for second in recommendations[position + 1:]:
            try:
                row, col = indices[first], indices[second]
                total_dissimilarity += (1 - cosine_sim[row][col])
                scored_pairs += 1
            except KeyError:
                # Pairs with an unknown title are left out of the average.
                continue

    if not scored_pairs:
        return 1.0
    return total_dissimilarity / scored_pairs

def evaluate_cbf_model(books, indices, cosine_sim, user_interactions, top_n=10):
    """Print average precision, recall and diversity of the content-based recommender.

    For every user, the first interacted book is used as the query and the
    resulting recommendations are scored against all of that user's
    interactions.

    Args:
        books (pd.DataFrame): Book table with a 'Book-Title' column.
        indices: Mapping from title to row position in ``cosine_sim``.
        cosine_sim: Square similarity matrix, indexed by row position.
        user_interactions (dict): Maps each user to a list of the titles they
            interacted with.
        top_n (int, optional): Number of recommendations per user.
            Defaults to 10.

    Returns:
        None. The three averages are printed; ``nan`` is printed when no user
        produced any recommendation.
    """
    # Interactions are implicit feedback, stored below with a rating of 1.
    # The metric helpers default to threshold=4, which would classify every
    # interaction as irrelevant and force precision and recall to 0.
    implicit_threshold = 1

    precisions, recalls, diversities = [], [], []

    for interactions in user_interactions.values():
        # A user without interactions has no query book to start from.
        if len(interactions) == 0:
            continue

        actual_items = {book: 1 for book in interactions}
        recommendations = recommend_content_based(interactions[0], indices, cosine_sim, books, top_n)
        if not recommendations:
            continue

        precisions.append(
            precision_at_k(actual_items, recommendations, top_n, threshold=implicit_threshold)
        )
        recalls.append(
            recall_at_k(actual_items, recommendations, top_n, threshold=implicit_threshold)
        )
        diversities.append(diversity(recommendations, indices, cosine_sim))

    def _mean_or_nan(values):
        # np.mean([]) also yields nan but emits a RuntimeWarning first.
        return np.mean(values) if values else float('nan')

    print(f"CBF Avg Precision@{top_n}: {_mean_or_nan(precisions):.4f}")
    print(f"CBF Avg Recall@{top_n}: {_mean_or_nan(recalls):.4f}")
    print(f"CBF Avg Diversity@{top_n}: {_mean_or_nan(diversities):.4f}")

# -------------------- D. Hybrid Recommendation System --------------------

def get_book_index(book_title, books):
    """Return the row position of the first book whose title equals ``book_title``.

    The result is a 0-based position, not an index label, because the caller
    uses it to address a row of the similarity matrix and to select rows with
    ``.iloc``. For a DataFrame with the default 0..n-1 index the two are the
    same value.

    Args:
        book_title (str): Title to look up (exact match).
        books (pd.DataFrame): Book table with a 'Book-Title' column.

    Returns:
        int: Position of the first matching row, or -1 if no row matches.
    """
    matches = np.flatnonzero((books['Book-Title'] == book_title).to_numpy())
    return int(matches[0]) if matches.size else -1

def _resolve_raw_user_id(user_id, user_ratings, svd_model):
    """Pick the form of ``user_id`` that the trained model actually knows."""
    # The string form is tried first because that is what this function
    # always used before; the other forms cover models trained on a DataFrame
    # whose 'User-ID' column is numeric.
    candidates = [str(user_id), user_id]
    if not user_ratings.empty:
        candidates.append(user_ratings['User-ID'].iloc[0])

    trainset = getattr(svd_model, 'trainset', None)
    if trainset is not None:
        for candidate in candidates:
            try:
                trainset.to_inner_uid(candidate)
            except ValueError:
                continue
            return candidate
    return str(user_id)


def get_top_n_recommendations(user_id, ratings_df, svd_model, books_df, top_n=10):
    """
    Get the top N book recommendations for a user using a trained SVD model.

    Args:
        user_id (str or int): The ID of the user for whom to generate recommendations.
            It is matched against ``ratings_df`` both as given and by its
            string form, so '276725' and 276725 select the same user.
        ratings_df (pd.DataFrame): The DataFrame containing user-book ratings
            (columns 'User-ID' and 'ISBN' are used).
        svd_model (surprise.prediction_algorithms.matrix_factorization.SVD):
            The trained SVD model from Surprise.
        books_df (pd.DataFrame): The DataFrame containing book details, including ISBN.
        top_n (int, optional): The number of top recommendations to return. Defaults to 10.

    Returns:
        pd.DataFrame: The top N recommended books, best first, with columns
                      'Book-Title', 'Book-Author', and 'Year-Of-Publication'.
                      An empty DataFrame with those same columns if no
                      recommendations can be made.
    """
    output_columns = ['Book-Title', 'Book-Author', 'Year-Of-Publication']

    # 1. Books the user has already rated are excluded from the candidates.
    #    Comparing string forms as well keeps the filter working when the
    #    caller passes a str id and the column holds integers (or vice versa).
    user_id_column = ratings_df['User-ID']
    is_user = (user_id_column == user_id) | (user_id_column.astype(str) == str(user_id))
    user_ratings = ratings_df[is_user]
    rated_book_isbns = set(user_ratings['ISBN'])

    # 2. Predictions are only personalised when the id has the same type the
    #    model was trained with, so resolve that form once up front.
    raw_user_id = _resolve_raw_user_id(user_id, user_ratings, svd_model)

    # 3. Candidates are the unrated books, kept in books_df order so that
    #    ties between equal predictions are broken the same way on every run.
    eligible_book_isbns = [
        isbn for isbn in books_df['ISBN'].drop_duplicates()
        if isbn not in rated_book_isbns
    ]

    # 4. Predict a rating for every candidate.
    predicted_ratings = []
    for isbn in eligible_book_isbns:
        try:
            predicted_rating = svd_model.predict(raw_user_id, isbn).est
        except ValueError:
            # Kept from the original as a best-effort guard for models whose
            # predict() rejects ids they were not trained on.
            print(f"Skipping prediction for ISBN: {isbn} (Not in training data)")
            continue
        predicted_ratings.append((isbn, predicted_rating))

    # 5. Best predictions first (stable sort, so ties keep books_df order).
    predicted_ratings.sort(key=lambda pair: pair[1], reverse=True)
    top_n_isbn_recommendations = [isbn for isbn, _ in predicted_ratings[:top_n]]
    if not top_n_isbn_recommendations:
        return pd.DataFrame(columns=output_columns)

    # 6. Fetch the book details and restore the ranking order, which a plain
    #    isin() filter would replace with books_df order.
    rank = {isbn: position for position, isbn in enumerate(top_n_isbn_recommendations)}
    recommended_books = books_df[books_df['ISBN'].isin(top_n_isbn_recommendations)]
    recommended_books = recommended_books.drop_duplicates(subset='ISBN')
    order = recommended_books['ISBN'].map(rank).to_numpy().argsort(kind='stable')

    return recommended_books.iloc[order][output_columns]


def hybrid_recommendations(user_id, book_title, ratings_df, books_df, cosine_sim, svd_model, top_n=10, alpha=0.5):
    """
    Generate hybrid book recommendations for a user based on a given book title,
    combining content-based and collaborative filtering approaches.

    Args:
        user_id (str or int): The ID of the user.
        book_title (str): The title of the book for which to find similar books.
        ratings_df (pd.DataFrame): DataFrame containing user-book ratings.
        books_df (pd.DataFrame): DataFrame containing book details.
        cosine_sim (np.ndarray): Cosine similarity matrix for books.
        svd_model (surprise.prediction_algorithms.matrix_factorization.SVD): Trained SVD model.
        top_n (int, optional): Number of recommendations to return. Defaults to 10.
        alpha (float, optional): Weighting factor between content-based and collaborative
            recommendations (0 to 1).  1.0 means use only content-based, 0.0 is only CF.
            Values outside that range are clamped. ``round(alpha * top_n)`` slots go to
            content-based titles and the rest to collaborative-filtering titles; if one
            source runs short, the other fills the remaining slots.
            Defaults to 0.5.

    Returns:
        list: At most ``top_n`` distinct book titles, content-based picks first.
              The query title itself is never included.
              Returns an empty list if the book title is not found.
    """
    idx = get_book_index(book_title, books_df)
    if idx == -1:
        print(f"Book '{book_title}' not found in the dataset.")
        return []

    # Content-based candidates: most similar rows, with the query row removed
    # explicitly (it is not guaranteed to sort first when similarities tie).
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]
    content_titles = books_df['Book-Title'].iloc[[pos for pos, _ in ranked]].tolist()

    # Collaborative-filtering candidates. The helper may hand back an empty
    # DataFrame without any columns, so check before selecting the titles.
    cf_recommendations_df = get_top_n_recommendations(
        user_id, ratings_df, svd_model, books_df, top_n
    )
    if 'Book-Title' in cf_recommendations_df.columns:
        cf_titles = cf_recommendations_df['Book-Title'].tolist()
    else:
        cf_titles = []

    # alpha decides how many of the top_n slots the content-based list gets.
    content_share = min(max(alpha, 0.0), 1.0)
    content_quota = int(round(content_share * top_n))

    merged_titles = []
    seen_titles = {book_title}  # never recommend the query book back

    def _take(titles, limit):
        # Append unseen titles, in order, until `limit` titles are collected.
        for title in titles:
            if len(merged_titles) >= limit:
                break
            if title not in seen_titles:
                seen_titles.add(title)
                merged_titles.append(title)

    _take(content_titles, content_quota)  # content-based share
    _take(cf_titles, top_n)               # collaborative-filtering share
    _take(content_titles, top_n)          # backfill if CF came up short
    return merged_titles
# -------------------- E. Main Program --------------------

if __name__ == "__main__":
    # Demo settings. The same ratings file feeds both the raw DataFrame used
    # to look up a user's history and the Surprise dataset the model is
    # trained on, so the two cannot drift apart.
    ratings_path = "ratings_cleaned.csv"
    books_path = "books_cleaned.csv"
    demo_top_n = 10
    demo_user_id = '276725'  # assumed to be a valid user ID in the ratings data
    demo_title = 'black water'  # titles are lower-cased by create_book_soup

    # Load datasets (only the first 5000 books, which bounds the size of the
    # dense book-by-book similarity matrix).
    ratings = pd.read_csv(ratings_path)
    books = pd.read_csv(books_path).head(5000)

    # --- Collaborative Filtering ---
    print("\n--- Collaborative Filtering ---")
    data, trainset, testset = load_and_prepare_cf_data(file=ratings_path)
    algo = train_svd_model(trainset)
    rmse, precision, recall, _ = evaluate_cf_model(algo, testset, k=demo_top_n)
    print(f"CF RMSE: {rmse:.4f}")
    print(f"CF Avg Precision@{demo_top_n}: {precision:.4f}")
    print(f"CF Avg Recall@{demo_top_n}: {recall:.4f}")

    # --- Content-Based Filtering ---
    print("\n--- Content-Based Filtering ---")
    books = create_book_soup(books)
    cosine_sim = get_content_similarity(books)
    indices = pd.Series(books.index, index=books['Book-Title']).to_dict()

    # Dummy user interactions (simulate a few users)
    user_interactions = {
        'User1': [books['Book-Title'].iloc[10], books['Book-Title'].iloc[20]],
        'User2': [books['Book-Title'].iloc[5], books['Book-Title'].iloc[15]],
        'User3': [books['Book-Title'].iloc[3], books['Book-Title'].iloc[25]],
    }

    evaluate_cbf_model(books, indices, cosine_sim, user_interactions, top_n=demo_top_n)

    # --- Hybrid Recommendation ---
    print("\n--- Hybrid Recommendation ---")
    hybrid_recs = hybrid_recommendations(
        user_id=demo_user_id,
        book_title=demo_title,
        ratings_df=ratings,
        books_df=books,
        cosine_sim=cosine_sim,
        svd_model=algo,
        top_n=demo_top_n,
        alpha=0.6,  # 0.6 favors content-based slightly
    )
    if hybrid_recs:
        print(f"Hybrid recommendations for user {demo_user_id} based on '{demo_title}':")
        for title in hybrid_recs:
            print(f"- {title}")
    else:
        print("No hybrid recommendations found.")



--- Collaborative Filtering ---
CF RMSE: 3.4933
CF Avg Precision@10: 0.1972
CF Avg Recall@10: 0.1780

--- Content-Based Filtering ---
CBF Avg Precision@10: 0.0000
CBF Avg Recall@10: 0.0000
CBF Avg Diversity@10: 0.6727

--- Hybrid Recommendation ---
Hybrid recommendations for user 276725 based on 'black water':
- potshot
- god save the child
- all our yesterdays large print
- suspicion of deceit
- sudden mischief spenser mysteries hardcover
- a tally of types with additions by several hands  and with a new introduction by mike parker
- tithe
- the broken hearts club
- franklin delano roosevelt champion of freedom
- the great pretender
- the little prince
