In [5]:
import os
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Dataset, Reader, SVD

# ------------------------
# Load datasets
# ------------------------
# Read the four CSV inputs in one pass; the tuple order matches the
# order of the names on the left-hand side.
ratings, movies, movie_genre, genre = map(
    pd.read_csv,
    (
        "../data/ratings.csv",
        "../data/movies.csv",
        "../data/movie_genre.csv",
        "../data/genre.csv",
    ),
)


In [6]:
# Normalize column names to lower case so the joins below can rely on
# 'movieid' / 'genreid' / 'genre' regardless of the CSV header casing.
for df in [ratings, movies, movie_genre, genre]:
    df.columns = df.columns.str.lower()

# ------------------------
# Merge genres into movies
# ------------------------
# Attach the genre lookup row to every (movie, genre) link row.
movie_content = movie_genre.merge(genre, on='genreid', how='left')
# If both frames carry a 'genre' column the merge suffixes them with _x/_y;
# keep the one from the lookup table. This is a no-op when there was no clash.
movie_content = movie_content.rename(columns={'genre_y': 'genre'})
# Collapse to one row per movie: its genres joined into a single
# space-separated string, skipping missing values.
movie_content = movie_content.groupby('movieid')['genre'].apply(
    lambda x: ' '.join(x.dropna().astype(str))
).reset_index()

# Drop any 'genre' column left behind by a previous run of this cell before
# merging. Without this, re-running the cell produces 'genre_x'/'genre_y'
# and movies['genre'] no longer exists for the cells that follow.
movies = movies.drop(columns=['genre'], errors='ignore').merge(
    movie_content, on='movieid', how='left'
)

In [7]:
# ------------------------
# Train TF-IDF
# ------------------------
# Each movie's genre string is one document; missing genres become an
# empty document so the matrix keeps one row per row of `movies`.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies['genre'].fillna(''))

# Lookup from movieid to the row label of that movie in `movies`.
movie_indices = pd.Series(movies.index, index=movies['movieid'])
# Series.drop_duplicates() compares the *values* (the row labels), so it
# never removed repeated movieids. Deduplicate on the index instead, keeping
# the first occurrence, so movie_indices[movieid] is always a single value.
movie_indices = movie_indices[~movie_indices.index.duplicated(keep='first')]

In [8]:
# ------------------------
# Train SVD (Collaborative Filtering)
# ------------------------
# The rating scale is taken from the observed minimum and maximum rating.
reader = Reader(
    rating_scale=(ratings["rating"].min(), ratings["rating"].max())
)
# Only the (user, item, rating) columns are handed to surprise, in that order.
data = Dataset.load_from_df(
    ratings.loc[:, ["userid", "movieid", "rating"]],
    reader,
)
# Fit on the full trainset built from `data`; no rows are held out here.
trainset = data.build_full_trainset()

svd = SVD(random_state=42, n_epochs=20, n_factors=50)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2302a0c70d0>

In [9]:
# ------------------------
# Bundle everything into a single dictionary
# ------------------------
# One object holding the data frames and the fitted artefacts produced
# above, so they can be saved and loaded together.
hybrid_bundle = dict(
    movies=movies,
    ratings=ratings,
    tfidf=tfidf,
    tfidf_matrix=tfidf_matrix,
    movie_indices=movie_indices,
    svd=svd,
)

In [12]:
# ------------------------
# Save the bundle in models folder
# ------------------------
model_path = "../models/hybrid_bundle.pkl"
# Create the target folder if it is missing (e.g. on a fresh checkout);
# open() would otherwise raise FileNotFoundError.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# NOTE: pickle files can execute arbitrary code when loaded, so only load
# this bundle from a location you trust.
with open(model_path, "wb") as f:
    pickle.dump(hybrid_bundle, f)

print(f"✅ Training complete and hybrid bundle saved at {model_path}")

✅ Training complete and hybrid bundle saved at ../models/hybrid_bundle.pkl
