In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# -----------------------------
# 1️⃣ Load and preprocess data
# -----------------------------
ratings = pd.read_csv(r'D:\Braindead-Team-Egghead\data\ratings.csv')
movies = pd.read_csv(r'D:\Braindead-Team-Egghead\data\movies.csv')
movie_genre = pd.read_csv(r'D:\Braindead-Team-Egghead\data\movie_genre.csv')
genre = pd.read_csv(r'D:\Braindead-Team-Egghead\data\genre.csv')

# Normalize column names so every later lookup can use lower-case keys
ratings.columns = ratings.columns.str.lower()
movies.columns = movies.columns.str.lower()
movie_genre.columns = movie_genre.columns.str.lower()
genre.columns = genre.columns.str.lower()

# Merge genre info: attach genre rows to each (movie, genre id) link, then
# collapse to one row per movie whose 'genre' is a space-separated string.
movie_content = movie_genre.merge(genre, on='genreid', how='left')
# Presumably both tables carry a 'genre' column, in which case the merge
# suffixes the right-hand one as 'genre_y'; the rename is a no-op otherwise.
movie_content = movie_content.rename(columns={'genre_y': 'genre'})
movie_content = movie_content.groupby('movieid')['genre'].apply(lambda x: ' '.join(x.dropna())).reset_index()
# Left join: movies without any genre link keep a NaN 'genre'.
movies = movies.merge(movie_content, on='movieid', how='left')

# -----------------------------
# 2️⃣ Content-based filtering
# -----------------------------
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genre'].fillna(''))
# NOTE(review): dense (n_movies x n_movies) matrix; nothing below reads it
# except a commented-out pickle dump, so it may be worth dropping on large
# catalogues.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Map movie id -> row position in `movies` / `tfidf_matrix`.
movie_indices = pd.Series(movies.index, index=movies['movieid'])
# Series.drop_duplicates() compares the *values* (row positions, which are
# always unique) and therefore never removed a repeated movie id. Filter on
# the index instead so movie_indices[mid] is always a scalar.
movie_indices = movie_indices[~movie_indices.index.duplicated(keep='first')]

# -----------------------------
# 3️⃣ Collaborative filtering (SVD)
# -----------------------------
# The rating scale is taken from the observed data rather than hard-coded.
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
data = Dataset.load_from_df(ratings[['userid', 'movieid', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

svd = SVD(n_factors=50, n_epochs=20, random_state=42)
svd.fit(trainset)

# -----------------------------
# 4️⃣ Precompute user profiles for content
# -----------------------------
def user_profile_vector(user_id):
    """Build a user's genre-preference vector in TF-IDF space.

    The profile is the rating-weighted average of the TF-IDF rows of every
    movie the user has rated.

    Args:
        user_id: Value matched against ``ratings['userid']``.

    Returns:
        A 1-D array with ``tfidf_matrix.shape[1]`` entries. It is all zeros
        when the user has no usable ratings (no ratings at all, none for a
        movie present in ``movie_indices``, or ratings that sum to zero).

    NOTE(review): the profile is built from the full ``ratings`` frame, which
    also contains the rows held out in ``testset``; the evaluation further
    down therefore sees test items inside the profile.
    """
    user_rated = ratings[ratings['userid'] == user_id]

    # Keep only ratings whose movie exists in the catalogue index; a rating
    # for an unknown movie used to abort the whole call with a KeyError.
    rated_indices, weights = [], []
    for mid, rating in zip(user_rated['movieid'], user_rated['rating']):
        if mid in movie_indices:
            rated_indices.append(movie_indices[mid])
            weights.append(rating)

    # np.average raises ZeroDivisionError when the weights sum to zero.
    if not rated_indices or np.sum(weights) == 0:
        return np.zeros(tfidf_matrix.shape[1])

    return np.average(tfidf_matrix[rated_indices].toarray(), axis=0, weights=weights)

# -----------------------------
# 5️⃣ Hybrid scoring
# -----------------------------
def hybrid_score_user(user_id, movie_ids, alpha=0.5):
    """Blend content-based and collaborative scores for one user.

    Args:
        user_id: User to score for.
        movie_ids: Movie ids to score; each must be a key of
            ``movie_indices``.
        alpha: Weight of the content score; ``1 - alpha`` goes to the SVD
            estimate.

    Returns:
        A 1-D array of scores aligned with ``movie_ids`` (empty when
        ``movie_ids`` is empty).

    NOTE(review): the content term is a cosine similarity while the CF term
    is an SVD rating estimate, so the two are blended on different scales.
    """
    movie_ids = list(movie_ids)
    # cosine_similarity rejects an input with zero rows, so answer the empty
    # request up front instead of raising.
    if not movie_ids:
        return np.empty(0)

    profile = user_profile_vector(user_id)
    idxs = [movie_indices[mid] for mid in movie_ids]

    # Content scores: similarity of every candidate's genre vector to the profile
    content_scores = cosine_similarity(tfidf_matrix[idxs], profile.reshape(1, -1)).flatten()

    # CF scores: SVD rating estimate per candidate
    cf_scores = np.array([svd.predict(user_id, mid).est for mid in movie_ids])

    # Hybrid
    return alpha * content_scores + (1 - alpha) * cf_scores

# -----------------------------
# 6️⃣ Top-K recommendation over test set
# -----------------------------
def recommend_top_k_test(user_id, test_ratings_user, k=10, alpha=0.5):
    """Rank a user's held-out movies by hybrid score and keep the best ``k``.

    Args:
        user_id: User the ranking is produced for.
        test_ratings_user: ``(user, movie, rating)`` triples belonging to
            this user; only the movie ids are used as candidates.
        k: Maximum number of rows returned.
        alpha: Content weight forwarded to ``hybrid_score_user``.

    Returns:
        DataFrame with columns ``movieid`` and ``score``, sorted by
        descending score; empty when there are no candidates.
    """
    candidate_ids = [movie for _user, movie, _rating in test_ratings_user]
    if len(candidate_ids) == 0:
        return pd.DataFrame(columns=['movieid', 'score'])

    ranked = pd.DataFrame({
        'movieid': candidate_ids,
        'score': hybrid_score_user(user_id, candidate_ids, alpha),
    })
    ranked = ranked.sort_values('score', ascending=False)
    return ranked.head(k)

# -----------------------------
# 7️⃣ Evaluation Metrics
# -----------------------------
# 7a. Rating prediction: score the SVD model alone on the held-out ratings
predictions = [svd.predict(uid, iid) for (uid, iid, r) in testset]
y_true = [r for (_, _, r) in testset]
y_pred = [pred.est for pred in predictions]

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)
print(f"Rating Prediction -> RMSE: {rmse:.4f}, MAE: {mae:.4f}")

# 7b. Top-K metrics (Precision@K, Recall@K)
precision_list, recall_list = [], []

# Group test set by user
test_user_group = {}
for (uid, iid, r) in testset:
    test_user_group.setdefault(uid, []).append((uid, iid, r))

# Every movie that appears in at least one user's top-K list. It is filled
# in the same pass as precision/recall so each user's ranking is computed
# once instead of once per metric.
recommended_items = set()

for u, test_ratings_user in test_user_group.items():
    top_k = recommend_top_k_test(u, test_ratings_user, k=10, alpha=0.5)
    # A held-out item counts as relevant when the user rated it 4 or higher.
    actual = set(iid for (uid, iid, r) in test_ratings_user if r >= 4)
    recommended = set(top_k['movieid'])
    recommended_items.update(recommended)
    if recommended:
        precision = len(actual & recommended) / len(recommended)
        # Users without any relevant held-out item contribute a recall of 0.
        recall = len(actual & recommended) / len(actual) if len(actual) > 0 else 0
        precision_list.append(precision)
        recall_list.append(recall)

print(f"Top-K Metrics -> Precision@10: {np.mean(precision_list):.4f}, Recall@10: {np.mean(recall_list):.4f}")


# -----------------------------
# 8️⃣ Diversity & Novelty Metrics
# -----------------------------
# Catalog Coverage: share of the movie catalogue recommended to anyone
catalog_coverage = len(recommended_items) / len(movies)
print(f"Catalog Coverage: {catalog_coverage:.4f}")

# Intra-List Diversity (based on genre similarity)
def intra_list_diversity(movie_ids):
    """Return the mean pairwise genre dissimilarity of a recommendation list.

    Dissimilarity of a pair is ``1 - cosine similarity`` of the two movies'
    TF-IDF rows; the result averages it over every unordered pair.

    Args:
        movie_ids: Movie ids in the list; each must be a key of
            ``movie_indices``.

    Returns:
        The average dissimilarity, or ``0`` for lists with fewer than two
        movies.
    """
    n_items = len(movie_ids)
    if n_items < 2:
        return 0

    rows = [movie_indices[movie] for movie in movie_ids]
    similarities = cosine_similarity(tfidf_matrix[rows])

    # Upper triangle only: each unordered pair is visited exactly once.
    pairs = [(a, b) for a in range(n_items) for b in range(a + 1, n_items)]
    total = sum(1 - similarities[a, b] for a, b in pairs)
    return total / len(pairs) if len(pairs) > 0 else 0

# List length used for both the ranking request and the hit normalization.
TOP_K = 10

# Popularity = number of ratings per movie
movie_popularity = ratings.groupby('movieid')['rating'].count().to_dict()

diversity_list = []
pop_norm_hits = []

# One pass per user: the ranking is computed once and feeds both metrics.
for u, test_ratings_user in test_user_group.items():
    top_k = recommend_top_k_test(u, test_ratings_user, k=TOP_K, alpha=0.5)
    recommended_ids = top_k['movieid'].tolist()

    # Intra-List Diversity (based on genre similarity)
    diversity_list.append(intra_list_diversity(recommended_ids))

    # Popularity-Normalized Hits: a hit is a recommended movie the user
    # rated 4 or higher; each hit is discounted by log(1 + popularity) so
    # rarely rated movies count for more.
    liked = {iid for (_, iid, r) in test_ratings_user if r >= 4}
    hits = 0
    for iid in recommended_ids:
        if iid in liked:
            hits += 1 / np.log(1 + movie_popularity.get(iid, 1))
    pop_norm_hits.append(hits / TOP_K)  # normalized by K

avg_intra_list_diversity = np.mean(diversity_list)
print(f"Intra-List Diversity: {avg_intra_list_diversity:.4f}")

avg_pop_norm_hits = np.mean(pop_norm_hits)
print(f"Popularity-Normalized Hits: {avg_pop_norm_hits:.4f}")



In [None]:
import pickle

# -----------------------------
# Save models and precomputed data
# -----------------------------

# 1️⃣ TF-IDF vectorizer
# with open('tfidf_vectorizer.pkl', 'wb') as f:
#     pickle.dump(tfidf, f)

# # 2️⃣ Cosine similarity matrix
# with open('cosine_sim.pkl', 'wb') as f:
#     pickle.dump(cosine_sim, f)

# # 3️⃣ SVD model (collaborative filtering)
# with open('svd_model.pkl', 'wb') as f:
#     pickle.dump(svd, f)

# # 4️⃣ Movies dataframe with genre info
# with open('movies.pkl', 'wb') as f:
#     pickle.dump(movies, f)

# # 5️⃣ Movie indices mapping (movie_id → row index in movies DataFrame)
# with open('movie_indices.pkl', 'wb') as f:
#     pickle.dump(movie_indices, f)

# print("✅ All models and data saved as .pkl files successfully!")


In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

class FastHybridRecommender:
    """Serve hybrid (genre content + SVD) top-K movie recommendations.

    Four fitted artifacts are required: the ``movies`` frame (columns
    ``movieid``, ``title``, ``genre``), the ``movie_indices`` mapping from
    movie id to row position in that frame, the fitted ``tfidf`` vectorizer
    and the trained ``svd`` model. Each one is either handed to the
    constructor or unpickled from ``artifact_dir``.
    """

    # Pickle file read for each artifact that is not passed to __init__.
    _ARTIFACT_FILES = {
        'movies': 'movies.pkl',
        'movie_indices': 'movie_indices.pkl',
        'tfidf': 'tfidf_vectorizer.pkl',
        'svd': 'svd_model.pkl',
    }

    def __init__(self, ratings_path, artifact_dir='.', *, movies=None,
                 movie_indices=None, tfidf=None, svd=None):
        """Load the ratings and the fitted artifacts.

        Args:
            ratings_path: CSV with ``userid``, ``movieid`` and ``rating``
                columns (column names are lower-cased after loading).
            artifact_dir: Directory searched for the pickle files of any
                artifact not supplied directly.
            movies, movie_indices, tfidf, svd: Optional in-memory artifacts;
                anything left as ``None`` is unpickled from ``artifact_dir``.

        Raises:
            FileNotFoundError: If a required pickle file is missing.
        """
        # Previously none of these attributes was ever assigned, so the
        # TF-IDF step below failed with AttributeError on every construction.
        supplied = {
            'movies': movies,
            'movie_indices': movie_indices,
            'tfidf': tfidf,
            'svd': svd,
        }
        for attr, obj in supplied.items():
            if obj is None:
                obj = self._load_artifact(artifact_dir, self._ARTIFACT_FILES[attr])
            setattr(self, attr, obj)

        # Load ratings
        self.ratings = pd.read_csv(ratings_path)
        self.ratings.columns = self.ratings.columns.str.lower()

        # Precompute TF-IDF matrix (row i describes self.movies row i)
        self.tfidf_matrix = self.tfidf.transform(self.movies['genre'].fillna(''))

    @staticmethod
    def _load_artifact(artifact_dir, filename):
        """Unpickle one saved artifact from ``artifact_dir``."""
        import os  # local import: the surrounding file does not import os

        # NOTE: unpickling can execute arbitrary code; only point this at
        # artifact files you produced yourself.
        with open(os.path.join(artifact_dir, filename), 'rb') as f:
            return pickle.load(f)

    # Compute user profile vector
    def user_profile_vector(self, user_id):
        """Return the rating-weighted average TF-IDF vector of a user's movies.

        Ratings for movies absent from ``movie_indices`` are ignored. The
        result is all zeros when no usable rating remains or the ratings sum
        to zero.
        """
        user_rated = self.ratings[self.ratings['userid'] == user_id]

        rated_indices, weights = [], []
        for mid, rating in zip(user_rated['movieid'], user_rated['rating']):
            # Skipping unknown ids avoids a KeyError for ratings of movies
            # that are not part of the catalogue.
            if mid in self.movie_indices:
                rated_indices.append(self.movie_indices[mid])
                weights.append(rating)

        # np.average raises ZeroDivisionError when the weights sum to zero.
        if not rated_indices or np.sum(weights) == 0:
            return np.zeros(self.tfidf_matrix.shape[1])

        return np.average(self.tfidf_matrix[rated_indices].toarray(), axis=0, weights=weights)

    # Hybrid scoring using batch computation
    def hybrid_scores(self, user_id, movie_ids, alpha=0.5):
        """Blend content and collaborative scores for the given movies.

        Args:
            user_id: User to score for.
            movie_ids: Movie ids to score; each must be a key of
                ``movie_indices``.
            alpha: Weight of the content score; ``1 - alpha`` goes to the
                SVD estimate.

        Returns:
            A 1-D array aligned with ``movie_ids`` (empty for no candidates).

        NOTE(review): the content term is a cosine similarity while the CF
        term is an SVD rating estimate, so the blend mixes two scales.
        """
        movie_ids = list(movie_ids)
        # cosine_similarity rejects an input with zero rows.
        if not movie_ids:
            return np.empty(0)

        profile = self.user_profile_vector(user_id)
        idxs = [self.movie_indices[mid] for mid in movie_ids]

        # Content scores
        content_scores = cosine_similarity(self.tfidf_matrix[idxs], profile.reshape(1, -1)).flatten()

        # CF scores (batch)
        testset = [(user_id, mid, 0) for mid in movie_ids]  # dummy rating
        cf_preds = self.svd.test(testset)
        cf_scores = np.array([pred.est for pred in cf_preds])

        # Hybrid
        return alpha * content_scores + (1 - alpha) * cf_scores

    # Generate explanations
    def explain_recommendation(self, user_id, movie_id, top_n_sources=3):
        """Return human-readable reasons for recommending ``movie_id``.

        Args:
            user_id: User the recommendation was made for.
            movie_id: Recommended movie.
            top_n_sources: Maximum number of "because you liked" sentences.

        Returns:
            A list of sentences: up to ``top_n_sources`` genre-overlap
            reasons (largest overlap first) followed by at most one
            collaborative sentence. Empty when ``movie_id`` is not in the
            catalogue.
        """
        rated_ids = self.ratings.loc[self.ratings['userid'] == user_id, 'movieid'].tolist()

        # Look up titles and genres once for the movies involved instead of
        # filtering the whole catalogue again for every rated movie.
        wanted = set(rated_ids)
        wanted.add(movie_id)
        catalogue = self.movies[self.movies['movieid'].isin(wanted)].drop_duplicates('movieid')
        titles = dict(zip(catalogue['movieid'], catalogue['title']))
        # A missing genre (NaN after the left join) becomes an empty set
        # rather than crashing on float.split().
        genre_sets = {
            mid: set(text.split()) if isinstance(text, str) else set()
            for mid, text in zip(catalogue['movieid'], catalogue['genre'])
        }

        if movie_id not in genre_sets:
            return []

        explanations = []
        movie_genres = genre_sets[movie_id]

        # Content-based overlap
        overlap_genres = {}
        for rated_movie in rated_ids:
            common = movie_genres & genre_sets.get(rated_movie, set())
            if common:
                overlap_genres[rated_movie] = common

        ranked_overlap = sorted(overlap_genres.items(), key=lambda item: len(item[1]), reverse=True)
        for rated_movie, common_genre in ranked_overlap[:top_n_sources]:
            # Sorted so the sentence does not depend on set iteration order.
            explanations.append(
                f"Because you liked '{titles[rated_movie]}' "
                f"which shares genres {', '.join(sorted(common_genre))}"
            )

        # Collaborative signal
        # NOTE(review): this fires whenever any other user rated the movie;
        # neither user similarity nor the rating value is checked.
        raters = self.ratings.loc[self.ratings['movieid'] == movie_id, 'userid']
        if (raters != user_id).any():
            explanations.append(
                f"Users similar to you also rated '{titles[movie_id]}' highly"
            )

        return explanations

    # Recommend top-K movies for any user
    def recommend_top_k(self, user_id, top_k=10, alpha=0.5):
        """Return the ``top_k`` best-scoring movies the user has not rated.

        Returns:
            A copy of the matching ``movies`` rows with added ``score`` and
            ``explanations`` columns, sorted by descending score.
        """
        seen = set(self.ratings.loc[self.ratings['userid'] == user_id, 'movieid'])
        unseen_movies = self.movies[~self.movies['movieid'].isin(seen)].copy()
        movie_ids = unseen_movies['movieid'].tolist()

        unseen_movies['score'] = self.hybrid_scores(user_id, movie_ids, alpha)
        # Copy so that adding a column does not write into a slice of
        # unseen_movies.
        top_movies = unseen_movies.sort_values('score', ascending=False).head(top_k).copy()
        top_movies['explanations'] = top_movies['movieid'].apply(lambda m: self.explain_recommendation(user_id, m))

        return top_movies

# -----------------------------
# Example usage
# -----------------------------
if __name__ == "__main__":
    recommender = FastHybridRecommender(r'D:\Braindead-Team-Egghead\data\ratings.csv')

    # One value drives both the request and the heading so they cannot drift.
    n_recommendations = 10

    # Accept any user ID parametrically; reject non-numeric input with a
    # short message instead of a raw ValueError traceback.
    raw_user_id = input("Enter user ID: ")
    try:
        user_id = int(raw_user_id)
    except ValueError:
        raise SystemExit(f"Invalid user ID {raw_user_id!r}: expected an integer.") from None

    top_movies = recommender.recommend_top_k(user_id=user_id, top_k=n_recommendations, alpha=0.5)

    print(f"\nTop {n_recommendations} recommended movies for user {user_id}:\n")
    for _, row in top_movies.iterrows():
        print(f"{row['title']} | Genre: {row['genre']} | Score: {row['score']:.3f}")
        for exp in row['explanations']:
            print("  -", exp)
        print()
