In [2]:
# Import basic libraries
import pandas as pd
import numpy as np

In [3]:
# Load the processed movie metadata and the user ratings, in that order
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/movies_enriched.csv",
        "../data/processed/ratings.csv",
    )
)

In [4]:

# def clean_text(x):
#     if isinstance(x, str):
#         return x.replace(" ", "").lower()
#     return ""

# Build one space-separated text field per movie for vectorization.
# Missing values become empty strings so the concatenation never yields NaN.
text_columns = ["genres", "keywords", "overview", "directors", "actors"]

combined_text = movies[text_columns[0]].fillna("")
for column_name in text_columns[1:]:
    combined_text = combined_text + " " + movies[column_name].fillna("")

movies["combined"] = combined_text

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

from scipy import sparse

# Fit TF-IDF on the combined text: English stop words are dropped and the
# vocabulary is capped at 5000 terms. The result is a sparse matrix with one
# row per movie.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = tfidf.fit_transform(movies["combined"])

# Persist the fitted vectorizer
with open("../model/tfidf.pkl", "wb") as fh:
    pickle.dump(tfidf, fh)

# Persist the sparse TF-IDF matrix
sparse.save_npz("../model/tfidf_matrix.npz", tfidf_matrix)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Create a mapping from movie title to its row index in the TF-IDF matrix.
# Nothing guarantees that titles are unique; with a repeated title,
# `indices[title]` would return a Series instead of a single integer and the
# `tfidf_matrix[idx]` lookups that rely on this mapping would select several
# rows at once. Only the first occurrence of each title is kept so every
# lookup yields exactly one index.
indices = pd.Series(movies.index, index=movies["title"])
indices = indices[~indices.index.duplicated(keep="first")]

def recommend_content(title, top_n=10):
    """Return the movies whose combined text is most similar to `title`.

    Similarity is the cosine similarity between the TF-IDF row of the query
    movie and every row of `tfidf_matrix`, computed on demand for this one
    movie only.

    Args:
        title: Movie title to look up in the module-level `indices` mapping.
        top_n: Maximum number of recommendations to return. Values of zero
            or below yield an empty result.

    Returns:
        A DataFrame with a single "title" column ordered from most to least
        similar, or the string "Movie not found" when `title` is not a key
        of `indices`.
    """
    if title not in indices:
        return "Movie not found"

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence so exactly one matrix row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Compute cosine similarity only for this movie
    sim_scores = cosine_similarity(
        tfidf_matrix[idx],
        tfidf_matrix
    ).flatten()

    # Rank every movie from most to least similar, then drop the query movie
    # explicitly. Skipping the first position is not reliable because ties
    # (e.g. movies with identical text) can place another movie first.
    # NOTE(review): assumes `idx` is also the row position in `movies`
    # (default 0..n-1 index) — confirm `movies` is never re-indexed.
    ranked = np.argsort(sim_scores)[::-1]
    similar_indices = ranked[ranked != idx][:max(top_n, 0)]

    return movies.iloc[similar_indices][["title"]]

In [7]:
from surprise import SVD, Dataset, Reader

# Declare the rating scale (0.5 to 5) and wrap the ratings frame for Surprise
reader = Reader(rating_scale=(0.5, 5))
rating_columns = ["userId", "movieId", "rating"]
data = Dataset.load_from_df(ratings[rating_columns], reader)

# Train on every available rating (no hold-out split)
trainset = data.build_full_trainset()

# Fixed random_state so repeated runs start from the same seed
svd_params = {"n_factors": 100, "n_epochs": 20, "random_state": 42}
model = SVD(**svd_params)
model.fit(trainset)

KeyboardInterrupt: 

In [None]:
# model.predict(user_id, movie_id)

In [None]:
def recommend_hybrid(user_id, title, top_n=10, alpha=0.5, rating_scale=(0.5, 5)):
    """Recommend movies by blending content similarity with predicted ratings.

    The `top_n * 5` movies most similar to `title` (by cosine similarity of
    their TF-IDF rows) are taken as candidates. Each candidate is scored as
    `alpha * content + (1 - alpha) * collaborative`, where the collaborative
    part is the rating that `model` predicts for `user_id`, rescaled to
    [0, 1] so that both terms share the same range and `alpha` acts as a
    real blending weight.

    Args:
        user_id: Raw user id passed to `model.predict`.
        title: Movie title to look up in the module-level `indices` mapping.
        top_n: Maximum number of recommendations to return. Values of zero
            or below yield an empty result.
        alpha: Weight of the content score; `1 - alpha` is the weight of the
            collaborative score.
        rating_scale: `(lowest, highest)` rating used to rescale the
            predicted rating. The default matches the scale given to the
            `Reader` that built the training data.

    Returns:
        A DataFrame with a single "title" column ordered from highest to
        lowest blended score, or the string "Movie not found" when `title`
        is not a key of `indices`.

    Raises:
        ValueError: If `rating_scale` does not describe a positive range.
    """
    if title not in indices:
        return "Movie not found"

    lowest, highest = rating_scale
    span = highest - lowest
    if span <= 0:
        raise ValueError("rating_scale must be (lowest, highest) with lowest < highest")

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence so exactly one matrix row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Content similarity (on demand)
    sim_scores = cosine_similarity(
        tfidf_matrix[idx],
        tfidf_matrix
    ).flatten()

    # Get candidate movies: most similar first, with the query movie removed
    # explicitly because ties can keep it from sorting into first place.
    ranked = np.argsort(sim_scores)[::-1]
    candidate_indices = ranked[ranked != idx][:max(top_n, 0) * 5]

    # Look up all candidate movie ids in one step instead of materialising a
    # full row per candidate; tolist() yields plain Python scalars.
    candidate_movie_ids = movies["movieId"].to_numpy()[candidate_indices].tolist()

    scores = []

    for i, movie_id in zip(candidate_indices, candidate_movie_ids):
        predicted_rating = model.predict(user_id, movie_id).est
        # Rescale the predicted rating to [0, 1] to match cosine similarity
        collab_score = (predicted_rating - lowest) / span
        content_score = sim_scores[i]

        final_score = alpha * content_score + (1 - alpha) * collab_score
        scores.append((i, final_score))

    scores = sorted(scores, key=lambda x: x[1], reverse=True)[:max(top_n, 0)]

    movie_indices = [i[0] for i in scores]

    return movies.iloc[movie_indices][["title"]]

In [None]:
# Persist the fitted artifacts. Each file is opened in a `with` block so the
# handle is flushed and closed as soon as the dump finishes, instead of being
# left open until garbage collection.

# Save TF-IDF
with open("../model/tfidf.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# Save SVD model
with open("../model/svd_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save movie index mapping
with open("../model/indices.pkl", "wb") as f:
    pickle.dump(indices, f)


NameError: name 'tfidf' is not defined

In [None]:
# from surprise.model_selection import cross_validate

# cross_validate(model, data, measures=["RMSE"], cv=5)