# In [None]:
# Book Recommendation System (Content-based, Collaborative, and Hybrid)

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import warnings
warnings.filterwarnings("ignore")

# ---------------------- Step 1: Read CSV ----------------------
books = pd.read_csv("updated_books.csv")
ratings = pd.read_csv("synthetic_ratings.csv")

# ---------------------- Step 2: EDA ----------------------
print("Books shape:", books.shape)
print("Ratings shape:", ratings.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
books.info()
ratings.info()
print(books.isnull().sum())
print(ratings.isnull().sum())

# Drop any books with missing title or author (if any remain), then renumber
# the rows. The similarity matrix below is positional, so index labels must
# equal row positions for `indices` -> `cosine_sim` -> `.iloc` to line up.
books = books.dropna(subset=["title", "authors"]).reset_index(drop=True)

# ---------------------- Step 3: Merge Data ----------------------
ratings = ratings.merge(books[['book_id', 'title', 'authors']], on='book_id', how='left')

# ---------------------- Step 4: Content-Based Filtering ----------------------
# One text document per book: title and authors joined by a space.
books['content'] = books['title'] + ' ' + books['authors']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books['content'])
# Row i / column j of cosine_sim refer to the i-th / j-th row of `books`.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Title -> row position lookup. Series.drop_duplicates() would de-duplicate
# the *values* (already unique row numbers), so duplicated titles are removed
# on the index instead, keeping the first occurrence of each title.
indices = pd.Series(books.index, index=books['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_content_recommendations(title, top_n=5):
    """Return the books most similar to ``title`` by TF-IDF cosine similarity.

    Args:
        title: Exact title to look up in the module-level ``indices`` Series.
        top_n: Maximum number of similar books to return. Values <= 0 give an
            empty result.

    Returns:
        DataFrame with the ``title`` and ``authors`` columns of the selected
        rows of ``books`` (original index labels kept), most similar first.
        The queried book itself is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    try:
        match = indices[title]
    except KeyError:
        raise KeyError(f"Title not found: {title!r}") from None

    # A title that occurs more than once yields a Series; use its first entry.
    label = match.iloc[0] if isinstance(match, pd.Series) else match
    # `indices` stores index labels while cosine_sim / .iloc are positional,
    # so translate the label to a row position explicitly.
    pos = books.index.get_loc(label)

    scores = np.asarray(cosine_sim[pos])
    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the query row by position rather than assuming it sorts first;
    # another book with similarity 1.0 could otherwise take its place.
    ranked = ranked[ranked != pos][:max(top_n, 0)]
    return books[['title', 'authors']].iloc[ranked]

# ---------------------- Step 5: Collaborative Filtering (SVD) ----------------------
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Seed the factorisation as well as the split so the fitted model (and the
# RMSE below) is reproducible from run to run.
model = SVD(random_state=42)
model.fit(trainset)
predictions = model.test(testset)

# accuracy.rmse() prints the score itself unless verbose=False; silence it so
# the value is reported exactly once.
print("RMSE:", accuracy.rmse(predictions, verbose=False))

def get_collab_recommendations(user_id, top_n=5):
    """Recommend unrated books for a user, ranked by predicted rating.

    Every book id that appears in ``ratings``, exists in ``books`` and has not
    been rated by ``user_id`` is scored with ``model.predict``; the best
    ``top_n`` are returned.

    Args:
        user_id: Raw user id as it appears in ``ratings['user_id']``.
        top_n: Maximum number of books to return.

    Returns:
        DataFrame with the ``title`` and ``authors`` columns of the matching
        rows of ``books``, ordered from highest to lowest predicted rating.
    """
    # Sets give O(1) membership tests instead of scanning an array per book.
    already_rated = set(ratings.loc[ratings['user_id'] == user_id, 'book_id'])
    # Only score books that can actually be returned; ids without a row in
    # `books` would otherwise occupy top-N slots and then silently vanish.
    catalogue = set(books['book_id'])
    candidates = [
        bid for bid in ratings['book_id'].unique()
        if bid in catalogue and bid not in already_rated
    ]

    scored = [(bid, model.predict(user_id, bid).est) for bid in candidates]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_ids = [bid for bid, _ in scored[:max(top_n, 0)]]

    # isin() returns rows in catalogue order, so re-sort them by rank to keep
    # the best prediction first.
    rank_of = {bid: rank for rank, bid in enumerate(top_ids)}
    picked = books[books['book_id'].isin(top_ids)]
    order = picked['book_id'].map(rank_of).to_numpy().argsort(kind='stable')
    return picked.iloc[order][['title', 'authors']]

# ---------------------- Step 6: Hybrid Filtering ----------------------
def hybrid_recommendation(user_id, title, top_n=5, n_candidates=20):
    """Re-rank content-based neighbours of ``title`` by predicted rating.

    The content-based recommender supplies a candidate pool, and each
    candidate is then scored for ``user_id`` with ``model.predict``.

    Args:
        user_id: Raw user id passed to ``model.predict``.
        title: Seed title passed to ``get_content_recommendations``.
        top_n: Maximum number of books to return.
        n_candidates: Size of the content-based candidate pool (default 20,
            the previously hard-coded value). The pool is never smaller than
            ``top_n``.

    Returns:
        DataFrame with the ``title`` and ``authors`` columns of the selected
        rows of ``books``, ordered from highest to lowest predicted rating.

    Raises:
        KeyError: Propagated from ``get_content_recommendations`` when
            ``title`` is unknown.
    """
    content_recs = get_content_recommendations(title, top_n=max(n_candidates, top_n))
    # The content recommender returns rows of `books` with their index labels
    # intact; use those labels instead of matching on title, which would also
    # pull in unrelated books that merely share a title.
    candidate_ids = books.loc[content_recs.index, 'book_id']

    scored = [
        (label, model.predict(user_id, bid).est)
        for label, bid in candidate_ids.items()
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_labels = [label for label, _ in scored[:max(top_n, 0)]]
    # .loc with an ordered label list keeps the ranking in the result.
    return books.loc[top_labels, ['title', 'authors']]

# ---------------------- Step 7: Example Calls ----------------------
# Each demo is a (heading, zero-argument callable) pair. The callable is only
# invoked after its heading has been printed, one demo at a time.
_demos = (
    ("\n--- Content-Based ---",
     lambda: get_content_recommendations("Twilight", 5)),
    ("\n--- Collaborative Filtering ---",
     lambda: get_collab_recommendations(user_id=10, top_n=5)),
    ("\n--- Hybrid Recommendation ---",
     lambda: hybrid_recommendation(user_id=10, title="Twilight", top_n=5)),
)
for _heading, _run_demo in _demos:
    print(_heading)
    print(_run_demo())
