In [None]:
import pandas as pd

In [None]:
# Read the two processed CSV inputs, in order: the enriched movie metadata
# first, then the user ratings.
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/movies_enriched.csv",
        "../data/processed/ratings.csv",
    )
)

In [None]:
def clean_text(x):
    """Collapse a string into a single lowercase token with no spaces.

    Args:
        x: Any value. Only ``str`` instances are transformed.

    Returns:
        ``x`` with every space character removed and then lowercased, or the
        empty string when ``x`` is not a ``str`` (e.g. ``None`` or ``NaN``).
    """
    if not isinstance(x, str):
        return ""
    # Splitting on the literal space and re-joining drops exactly the " " characters.
    without_spaces = "".join(x.split(" "))
    return without_spaces.lower()

# Build one free-text field per movie by joining the descriptive columns with
# single spaces; missing values contribute an empty string.
combined_text = movies["genres"].fillna("")
for text_column in ("keywords", "overview", "directors", "actors"):
    combined_text = combined_text + " " + movies[text_column].fillna("")
movies["combined"] = combined_text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the combined text of every movie into a TF-IDF vector. English stop
# words are dropped and the vocabulary is capped at 5000 features.
corpus = movies["combined"]
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = tfidf.fit_transform(corpus)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity of every movie against every other movie; with a single
# argument cosine_similarity compares the matrix with itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Lookup table: movie title -> row label in `movies`.
# Series.drop_duplicates() would compare the *values* (the row labels, which are
# already unique) and therefore keep repeated titles. De-duplicate on the index
# instead so that every title maps to exactly one row (its first occurrence).
indices = pd.Series(movies.index, index=movies["title"])
indices = indices[~indices.index.duplicated(keep="first")]

def recommend_content(title, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Similarity is read from the precomputed ``cosine_sim`` matrix; the query
    movie itself is never part of the result.

    Args:
        title: Movie title to look up in ``indices``.
        top_n: Maximum number of recommendations. Values <= 0 give an empty
            result.

    Returns:
        A ``pandas.Series`` of titles taken from ``movies["title"]``, ordered
        from most to least similar (ties keep their original row order).

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of rows; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the query movie by position instead of assuming it sorts first:
    # an identical twin (similarity 1.0) or an all-zero vector (self-similarity
    # 0.0) would otherwise let the movie recommend itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Clamp so that a negative top_n cannot turn into a "from the end" slice.
    movie_indices = [position for position, _ in sim_scores[:max(top_n, 0)]]
    return movies["title"].iloc[movie_indices]

In [None]:
from scipy.sparse import csr_matrix

# Re-encode the raw user and movie ids as category codes so they can be used
# as row / column coordinates, then place each rating at (user, movie).
user_ids, movie_ids = (
    ratings[id_column].astype("category").cat.codes
    for id_column in ("userId", "movieId")
)

coordinates = (user_ids, movie_ids)
sparse_matrix = csr_matrix((ratings["rating"], coordinates))

In [None]:
from surprise import SVD, Dataset, Reader

# Declare the rating bounds (0.5 to 5) and hand Surprise the three columns it
# reads from the frame: user id, item id, rating.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

# Train on every available rating; no hold-out split is made here.
trainset = data.build_full_trainset()

# SVD initialises and updates its factors stochastically. Fixing the seed makes
# the fitted model, and every prediction derived from it, reproducible when the
# notebook is re-run from a fresh kernel.
model = SVD(random_state=42)
model.fit(trainset)

In [None]:
# Sanity-check the trained model on a (user, movie) pair taken from the data.
# The ids are defined here so the cell works on a fresh kernel instead of
# depending on variables left over from an earlier session.
example_user_id, example_movie_id = ratings[["userId", "movieId"]].iloc[0].tolist()
model.predict(example_user_id, example_movie_id)

In [None]:
def hybrid_score(user_id, movie_id, content_score, alpha=0.5, rating_scale=None):
    """Blend a content-based score with the SVD rating estimate.

    Args:
        user_id: Raw user id passed to ``model.predict``.
        movie_id: Raw movie id passed to ``model.predict``.
        content_score: Content-based score for the movie (e.g. a similarity).
        alpha: Weight of ``content_score``; the collaborative part gets
            ``1 - alpha``.
        rating_scale: Optional ``(low, high)`` rating bounds. When given, the
            estimated rating is min-max normalised to ``[0, 1]`` before
            blending, so both terms share a scale and ``alpha`` acts as a real
            weight. When ``None`` (default) the raw estimate is used, exactly
            as before.

    Returns:
        ``alpha * content_score + (1 - alpha) * collab_score``.

    Raises:
        ValueError: If ``rating_scale`` has equal lower and upper bounds.
    """
    collab_score = model.predict(user_id, movie_id).est

    if rating_scale is not None:
        low, high = rating_scale
        if high == low:
            raise ValueError("rating_scale bounds must differ")
        collab_score = (collab_score - low) / (high - low)

    return alpha * content_score + (1 - alpha) * collab_score

In [None]:
import pickle

# Persist the fitted artefacts next to the notebook. Each file is written
# inside a `with` block so its handle is flushed and closed deterministically
# instead of being left open until garbage collection.
artifacts = {
    "tfidf.pkl": tfidf,            # Save TF-IDF
    "cosine_sim.pkl": cosine_sim,  # Save similarity matrix (optional)
    "svd_model.pkl": model,        # Save SVD model
    "indices.pkl": indices,        # Save movie index mapping
}

for filename, artifact in artifacts.items():
    with open(filename, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

In [None]:
from surprise.model_selection import cross_validate

# Evaluate with a fresh estimator: cross_validate refits the algorithm it is
# given on every fold, so passing `model` would leave the already-trained
# global model fitted on only the last fold's training split.
cross_validate(SVD(), data, measures=["RMSE"], cv=5)