In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import pickle
import numpy as np

# -----------------------------
# Load CSVs
# -----------------------------
# All input tables live in one directory; change DATA_DIR to relocate the data set.
DATA_DIR = r'D:\Braindead-Team-Egghead\data'

ratings = pd.read_csv(rf'{DATA_DIR}\ratings.csv')
movies = pd.read_csv(rf'{DATA_DIR}\movies.csv')
movie_genre = pd.read_csv(rf'{DATA_DIR}\movie_genre.csv')
genre = pd.read_csv(rf'{DATA_DIR}\genre.csv')
links = pd.read_csv(rf'{DATA_DIR}\links.csv')

# Normalize column names (trim whitespace, lowercase) so the rest of the
# notebook can rely on 'userid', 'movieid', 'genreid', ...
for _frame in (ratings, movies, movie_genre, genre, links):
    _frame.columns = _frame.columns.str.strip().str.lower()
del _frame

# -----------------------------
# Build one genre string per movie
# -----------------------------
# Attach the genre name to every (movie, genre id) pair.
movie_content = movie_genre.merge(genre, on='genreid', how='left')

# If both tables carry a 'genre' column the merge suffixes them (_x / _y);
# keep the one coming from `genre`. This is a no-op when there is no clash.
movie_content = movie_content.rename(columns={'genre_y': 'genre'})

# Combine all genres of a movie into a single space-separated string
movie_content = movie_content.groupby('movieid')['genre'].apply(lambda x: ' '.join(x.dropna())).reset_index()

# Left merge: movies without any genre row end up with NaN in 'genre'
movies = movies.merge(movie_content, on='movieid', how='left')

# -----------------------------
# Content-based filtering (TF-IDF)
# -----------------------------
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genre'].fillna(''))

# Pairwise cosine similarity between all movies (dense n_movies x n_movies array)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Map movieId -> row position in `movies` / `tfidf_matrix`.
# Duplicates must be removed on the *index* (the movie ids): calling
# Series.drop_duplicates() would compare the row positions, which are always
# unique, and therefore never drop anything. A duplicated id would make
# `movie_indices[mid]` return a Series instead of a single position.
movie_indices = pd.Series(movies.index, index=movies['movieid'])
movie_indices = movie_indices[~movie_indices.index.duplicated(keep='first')]

# -----------------------------
# Collaborative filtering (SVD)
# -----------------------------
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

# Use lowercase column names (order required by Surprise: user, item, rating)
data = Dataset.load_from_df(ratings[['userid', 'movieid', 'rating']], reader)

# 80/20 split; `testset` is held out and not seen by the SVD model below
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train SVD collaborative filtering model
svd = SVD(n_factors=50, n_epochs=20, random_state=42)
svd.fit(trainset)








In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader
import pickle

# -----------------------------
# Load pickled models (disabled: the objects built in the previous cell are reused)
# -----------------------------
# with open('tfidf_vectorizer.pkl', 'rb') as f:
#     tfidf = pickle.load(f)

# with open('cosine_sim.pkl', 'rb') as f:
#     cosine_sim = pickle.load(f)

# with open('svd_model.pkl', 'rb') as f:
#     svd = pickle.load(f)

# with open('movies.pkl', 'rb') as f:
#     movies = pickle.load(f)

# with open('movie_indices.pkl', 'rb') as f:
#     movie_indices = pickle.load(f)



# -----------------------------
# Fast Hybrid Recommender Class
# -----------------------------
class FastHybridRecommender:
    def __init__(self, ratings, movies, movie_indices, tfidf_matrix, cosine_sim, svd):
        self.ratings = ratings
        self.movies = movies
        self.movie_indices = movie_indices
        self.tfidf_matrix = tfidf_matrix
        self.cosine_sim = cosine_sim
        self.svd = svd

    # -----------------------------
    # Vectorized content score
    # -----------------------------
    def content_score(self, user_id):
        user_rated = self.ratings[self.ratings['userid'] == user_id]
        if user_rated.empty:
            return np.zeros(len(self.movies))
        
        rated_indices = [self.movie_indices[mid] for mid in user_rated['movieid']]
        weights = user_rated['rating'].values

        # Weighted average of TF-IDF vectors
        user_profile = np.average(self.tfidf_matrix[rated_indices].toarray(), axis=0, weights=weights)
        return cosine_similarity(self.tfidf_matrix, user_profile.reshape(1, -1)).flatten()

    # -----------------------------
    # Collaborative filtering scores
    # -----------------------------
    def cf_score(self, user_id, candidate_movie_ids):
        return np.array([self.svd.predict(user_id, mid).est for mid in candidate_movie_ids])

    # -----------------------------
    # Top-N recommendations
    # -----------------------------
    def recommend(self, user_id, top_n=10, alpha=0.5):
        user_rated_movies = set(self.ratings[self.ratings['userid']==user_id]['movieid'])
        candidate_movies = self.movies[~self.movies['movieid'].isin(user_rated_movies)].copy()

        # Vectorized content score
        content_scores = self.content_score(user_id)

        # CF score for candidate movies only
        candidate_ids = candidate_movies['movieid'].values
        cf_scores = self.cf_score(user_id, candidate_ids)

        # Blend
        blended_scores = alpha * content_scores[[self.movie_indices[mid] for mid in candidate_ids]] + (1-alpha) * cf_scores
        candidate_movies['score'] = blended_scores

        return candidate_movies.sort_values('score', ascending=False).head(top_n)[['movieid','title','genre','score']]

    # -----------------------------
    # Explanations for each movie
    # -----------------------------
    def explain(self, user_id, movie_id, top_n_sources=3):
        explanations = []

        # 1️⃣ Genre overlap
        user_rated_movies = self.ratings[self.ratings['userid']==user_id]['movieid'].tolist()
        movie_genres = set(self.movies.loc[self.movies['movieid']==movie_id, 'genre'].values[0].split())

        overlap_genres = {}
        for rated_movie in user_rated_movies:
            rated_genres = set(self.movies.loc[self.movies['movieid']==rated_movie, 'genre'].values[0].split())
            common = movie_genres & rated_genres
            if common:
                overlap_genres[rated_movie] = common

        sorted_overlap = sorted(overlap_genres.items(), key=lambda x: len(x[1]), reverse=True)[:top_n_sources]
        for rated_movie, common_genre in sorted_overlap:
            explanations.append(
                f"Because you liked '{self.movies.loc[self.movies['movieid']==rated_movie, 'title'].values[0]}' "
                f"which shares genres {', '.join(common_genre)}"
            )

        # 2️⃣ Collaborative signal
        similar_users = self.ratings[self.ratings['movieid']==movie_id].sort_values('rating', ascending=False)['userid'].tolist()
        similar_users = [u for u in similar_users if u != user_id]
        if similar_users:
            explanations.append(
                f"Users similar to you also rated '{self.movies.loc[self.movies['movieid']==movie_id, 'title'].values[0]}' highly"
            )

        return explanations

    # -----------------------------
    # Full pipeline: top-N + explanations
    # -----------------------------
    def pipeline(self, user_id, top_n=10, alpha=0.5):
        top_movies = self.recommend(user_id, top_n, alpha)
        top_movies['explanations'] = top_movies['movieid'].apply(lambda m: self.explain(user_id, m))
        return top_movies

# -----------------------------
# Example Usage
# -----------------------------
# Build the recommender from the objects prepared earlier; the TF-IDF matrix
# is re-derived from the fitted vectorizer rather than passed in directly.
recommender = FastHybridRecommender(
    ratings,
    movies,
    movie_indices,
    tfidf_matrix=tfidf.transform(movies['genre'].fillna('')),
    cosine_sim=cosine_sim,
    svd=svd,
)

# Top 5 recommendations (with explanations) for user 1, equal blend weights
top_movies = recommender.pipeline(user_id=1, top_n=5, alpha=0.5)

# Print each recommendation followed by its bullet-listed reasons
for row_label, rec_row in top_movies.iterrows():
    print(f"Movie: {rec_row['title']} (Score: {rec_row['score']:.2f})")
    for reason in rec_row['explanations']:
        print(" -", reason)
    print()


In [None]:
def get_hybrid_scores(user_id, alpha=0.5):
    """Return one blended score per row of ``movies`` for ``user_id``.

    The content score (cosine similarity between each movie's TF-IDF row and
    the user's rating-weighted profile) and the SVD rating estimate are each
    min-max scaled to [0, 1] and mixed as
    ``alpha * content + (1 - alpha) * cf``.

    Reads the module-level ``ratings``, ``movies``, ``movie_indices``,
    ``tfidf_matrix`` and ``svd`` objects.
    """
    def _min_max(values):
        # Scale to [0, 1]. Constant (or empty) input has no spread to divide
        # by; the plain formula would yield 0/0 = NaN for every entry and
        # poison the whole blend, so return zeros instead.
        if values.size == 0:
            return values
        low, high = values.min(), values.max()
        if high == low:
            return np.zeros(values.shape, dtype=float)
        return (values - low) / (high - low)

    user_ratings = ratings[ratings['userid'] == user_id]

    # Ignore ratings for movies missing from the catalogue: they have no
    # TF-IDF row and would raise KeyError on lookup.
    known = np.fromiter(
        (m in movie_indices for m in user_ratings['movieid']),
        dtype=bool,
        count=len(user_ratings),
    )
    user_ratings = user_ratings[known]

    # Content scores
    if not user_ratings.empty:
        rated_indices = [movie_indices[m] for m in user_ratings['movieid']]
        weights = user_ratings['rating'].to_numpy(dtype=float)
        if weights.sum() == 0:
            # np.average raises ZeroDivisionError for zero-sum weights;
            # fall back to an unweighted mean.
            weights = None
        user_profile = np.average(tfidf_matrix[rated_indices].toarray(), axis=0, weights=weights)
        content_scores = cosine_similarity(tfidf_matrix, user_profile.reshape(1, -1)).flatten()
        # Normalize content scores
        content_scores = _min_max(content_scores)
    else:
        content_scores = np.zeros(len(movies))

    # CF scores
    cf_scores = np.array([svd.predict(user_id, m).est for m in movies['movieid']], dtype=float)
    cf_scores = _min_max(cf_scores)

    # Hybrid blend
    return alpha * content_scores + (1 - alpha) * cf_scores


def top_k_recommend(user_id, top_n=10, alpha=0.5, threshold=4):
    """Return up to ``top_n`` ``(movieid, score)`` pairs, best score first.

    Scores come from ``get_hybrid_scores(user_id, alpha)``; movies the user
    has already rated (per the module-level ``ratings``) are left out.
    ``threshold`` is accepted but not used by this function.
    """
    scores = get_hybrid_scores(user_id, alpha)

    # Movies the user has already rated are not eligible
    seen = set(ratings.loc[ratings['userid'] == user_id, 'movieid'])

    unseen = []
    for movie_id, score in zip(movies['movieid'], scores):
        if movie_id in seen:
            continue
        unseen.append((movie_id, score))

    # Highest score first, then keep the leading top_n entries
    unseen.sort(key=lambda pair: pair[1], reverse=True)
    return unseen[:top_n]


def evaluate_top_k(user_ids, top_n=10, threshold=4):
    """Print mean Precision@top_n and Recall@top_n over ``user_ids``.

    A held-out item is relevant when its rating in the module-level
    ``testset`` (the Surprise test split) is ``>= threshold``. For each user
    the catalogue is ranked with ``get_hybrid_scores``; only movies the user
    rated *outside* the test split are removed from the ranking. Users with
    no relevant held-out item are skipped.

    The previous version took the relevant items from ``ratings`` while the
    ranking excluded every movie found in ``ratings`` for that user, so the
    two sets could never intersect and both metrics were always 0.
    """
    # Group the held-out split by user: every test item, and the liked ones.
    held_out = {}
    relevant = {}
    for uid, iid, rating in testset:
        held_out.setdefault(uid, set()).add(iid)
        if rating >= threshold:
            relevant.setdefault(uid, set()).add(iid)

    precisions, recalls = [], []

    for u in user_ids:
        test_items = relevant.get(u)
        if not test_items:
            continue

        # Exclude only what the user rated in the training portion, so that
        # held-out items stay eligible for recommendation.
        train_items = set(ratings.loc[ratings['userid'] == u, 'movieid']) - held_out[u]

        # NOTE(review): get_hybrid_scores builds its content profile from all
        # of `ratings`, held-out rows included, so the content part still
        # sees test items -- confirm whether that leakage is acceptable.
        scores = get_hybrid_scores(u)
        candidates = [(m, s) for m, s in zip(movies['movieid'], scores) if m not in train_items]
        ranked = sorted(candidates, key=lambda x: x[1], reverse=True)[:top_n]
        top_ids = {m for m, _ in ranked}

        hits = len(top_ids & test_items)
        precisions.append(hits / top_n)
        recalls.append(hits / len(test_items))

    if not precisions:
        # np.mean([]) would print nan and emit a RuntimeWarning.
        print("No users with relevant held-out items; nothing to evaluate.")
        return

    print(f"Precision@{top_n}: {np.mean(precisions):.4f}")
    print(f"Recall@{top_n}: {np.mean(recalls):.4f}")

