In [12]:
import pandas as pd

# Read the train and test ratings CSVs, in that order, from ../data/processed.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/train_ratings.csv",
        "../data/processed/test_ratings.csv",
    )
)

# Print the column index, then leave the first rows as the cell's displayed value.
print(train_df.columns)
train_df.head()


Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='str')


Unnamed: 0,userId,movieId,rating,timestamp
0,1,804,4.0,2000-07-30 18:08:19
1,1,1210,5.0,2000-07-30 18:08:19
2,1,2018,5.0,2000-07-30 18:08:43
3,1,2628,4.0,2000-07-30 18:08:43
4,1,2826,4.0,2000-07-30 18:08:43


In [14]:
from surprise import Dataset, Reader

# Declare the rating scale as 0.5 (lowest) to 5.0 (highest).
reader = Reader(rating_scale=(0.5, 5.0))

# Hand Surprise only the user, item and rating columns, in that order.
data = Dataset.load_from_df(
    df=train_df.loc[:, ["userId", "movieId", "rating"]],
    reader=reader,
)

# Build the trainset from every row of the loaded data.
trainset = data.build_full_trainset()

print("Surprise trainset ready ")


Surprise trainset ready 


In [15]:
from surprise import SVD

# Configure the factorization model: 100 latent factors, 20 epochs,
# one shared learning rate and regularization term, and a fixed seed.
svd = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)

# Fit on the trainset built in the previous cell.
svd.fit(trainset)

print("SVD training completed ")


SVD training completed 


In [16]:
from surprise import accuracy

# Zip the three test columns row-by-row into (userId, movieId, rating) tuples.
testset = list(zip(*(test_df[col] for col in ("userId", "movieId", "rating"))))

# Score every test tuple with the fitted model.
predictions = svd.test(testset)

# Each accuracy helper prints its metric and returns the value; RMSE runs first.
rmse, mae = accuracy.rmse(predictions), accuracy.mae(predictions)

rmse, mae


RMSE: 0.9623
MAE:  0.7443


(0.9623420728495634, 0.744337438715399)

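Matrix Factorization (SVD):
A latent-factor collaborative filtering model was trained with Surprise's `SVD` algorithm (Singular Value Decomposition-style matrix factorization) on the training portion of a temporal split. The model was then evaluated on the held-out test set, scoring an RMSE of 0.9623 and an MAE of 0.7443. Because the test ratings were not seen during training, these metrics give a realistic estimate of performance.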

In [17]:
import pickle
import os

# Create the output directory if needed; an existing one is not an error.
os.makedirs("../models", exist_ok=True)

# Pickle the fitted model; the file is closed when the with-block exits.
with open("../models/svd_model.pkl", mode="wb") as model_file:
    pickle.dump(svd, model_file)

print("SVD model saved ")


SVD model saved 
