In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

# Read the merged ratings/metadata table; the first CSV column is used as the index.
ratings_df = pd.read_csv("../../data/hybrid_model_data.csv", index_col=0)

# Keep only rows that carry an explicit user rating. Rows whose metadata
# columns are empty are retained as long as rating_x is present.
has_explicit_rating = ratings_df["rating_x"].notna()
explicit_ratings = ratings_df[has_explicit_rating]

explicit_ratings

Unnamed: 0,timestamp,user_id,movie_title,rating_x,movie_id,movie_title_id,title,year,rating_y,genres,plot
0,2025-02-28T03:36:48,265143,rare exports a christmas tale 2010,4,,,,,,,
1,2025-02-28T03:36:49,284982,far away 2001,2,,,,,,,
2,2025-02-28T03:36:49,301905,the princess and the frog 2009,4,780521.0,the+princess+and+the+frog+2009,The Princess and the Frog,2009.0,7.2,"Animation, Adventure, Comedy, Family, Fantasy,...",A waitress desperate to fulfill her dreams as ...
3,2025-02-28T03:36:50,104416,soul assassin 2001,3,,,,,,,
4,2025-02-28T03:36:51,251315,civil brand 2003,3,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
462959,2025-03-03T00:35:35,318940,the vanishing 1993,4,108473.0,the+vanishing+1993,The Vanishing,1993.0,6.3,"Drama, Horror, Mystery, Thriller",The boyfriend of an abducted woman never gives...
462960,2025-03-03T00:35:35,20815,up in arms 1944,4,,,,,,,
462961,2025-03-03T00:35:35,290958,lucky 7 2003,3,370904.0,lucky+7+2003,Lucky 7,2003.0,6.6,"Comedy, Romance",Instead of really living her life since losing...
462962,2025-03-03T00:35:35,220777,the corruptor 1999,2,142192.0,the+corruptor+1999,The Corruptor,1999.0,6.1,"Action, Crime, Thriller","With the aid from a New York City policeman, a..."


In [5]:
import pandas as pd
from scipy.sparse import csr_matrix

# Requires `explicit_ratings` from the data-loading cell.
# Collapse repeated (user, movie) ratings into one mean rating so that every
# pair contributes exactly one cell to the matrix.
grouped = (
    explicit_ratings
    .groupby(['user_id', 'movie_title'])['rating_x']
    .mean()
    .reset_index()
)

# Encode ids as integer category codes: these become the row (user) and
# column (movie) positions of the matrix.
user_ids = grouped['user_id'].astype('category').cat.codes
movie_titles = grouped['movie_title'].astype('category').cat.codes

# Assemble the sparse users x movies rating matrix; pairs without a rating
# are simply not stored (implicit zeros).
matrix_shape = (grouped['user_id'].nunique(), grouped['movie_title'].nunique())
user_item_matrix = csr_matrix(
    (grouped['rating_x'], (user_ids, movie_titles)),
    shape=matrix_shape,
)

In [7]:
# Method 1: Matrix Factorization with TruncatedSVD
# Hyperparameters are collected in one place; random_state pins the
# randomized solver so reruns give the same factors.
svd_params = {'n_components': 50, 'random_state': 42}
svd = TruncatedSVD(**svd_params)

# One latent vector per row of user_item_matrix (i.e. per user).
user_factors = svd.fit_transform(user_item_matrix)
# components_ is laid out as (n_components, n_movies): one row per latent
# component, so it is already "transposed" relative to user_factors.
item_factors = svd.components_



In [11]:
# Inspect the fitted item-side factors: one row per latent component,
# one column per movie.
svd.components_

array([[ 2.99696938e-05,  9.88670229e-06,  2.69434269e-04, ...,
         3.56634327e-05,  2.38774549e-06,  8.36298787e-06],
       [ 5.42593473e-05,  1.17403967e-05, -2.35246314e-05, ...,
         3.70782843e-04,  3.39939955e-06,  1.45280332e-05],
       [ 2.18537986e-04,  1.87685801e-05,  2.60364443e-06, ...,
         3.29146189e-04,  7.68685089e-06,  2.33518977e-05],
       ...,
       [-6.96673792e-05, -3.57852820e-04,  6.07891305e-04, ...,
        -1.44844294e-07,  5.45801519e-06, -2.55396220e-05],
       [ 4.07084149e-05,  2.78906541e-04,  2.36554354e-04, ...,
        -2.13583210e-05, -4.02151387e-06, -1.78905535e-05],
       [ 6.50230845e-05,  2.05918543e-04, -1.25852690e-05, ...,
         2.45101560e-05,  1.09208131e-05, -8.61769097e-06]],
      shape=(50, 26548))

In [8]:
# Method 2: User-User Collaborative Filtering with KNN
# Each row of user_item_matrix is one user's rating vector, so the fitted
# index answers "which users rated movies most similarly?" under cosine
# distance, using an exhaustive (brute-force) search.
knn_params = {'metric': 'cosine', 'algorithm': 'brute'}
knn_model = NearestNeighbors(**knn_params)
knn_model.fit(user_item_matrix)



In [9]:
# Save components for prediction
# The rows and columns of the matrices below are positional category codes,
# not raw ids. The code -> id lookups must travel with the model; without
# them a consumer cannot translate a user_id or movie_title into a row or
# column. Position i in each list is the id that was encoded as code i.
# The categories are re-derived from `grouped`, so they match the codes
# used when the matrix was built.
user_index = grouped['user_id'].astype('category').cat.categories.tolist()
movie_index = grouped['movie_title'].astype('category').cat.categories.tolist()

model_artifacts = {
    'svd': svd,                            # fitted TruncatedSVD estimator
    'user_item_matrix': user_item_matrix,  # sparse users x movies ratings
    'user_factors': user_factors,          # one latent vector per user row
    'item_factors': item_factors,          # (n_components, n_movies)
    'knn_model': knn_model,                # cosine nearest-neighbour index over users
    'user_index': user_index,              # row position -> user_id
    'movie_index': movie_index,            # column position -> movie_title
}



In [12]:
# Persist the artifact bundle to disk.
# The target directory is created first so a fresh checkout (no `models/`
# folder yet) does not fail with FileNotFoundError.
# NOTE: this is a pickle file; only load it back from trusted locations,
# since unpickling can execute arbitrary code.
MODEL_PATH = "models/knn_cf_model.pkl"
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

with open(MODEL_PATH, "wb") as f:
    pickle.dump(model_artifacts, f)

print("Collaborative Filtering model saved using scikit-learn.")

Collaborative Filtering model saved using scikit-learn.
