In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
# Load the ratings from CSV (columns: userId, movieId, rating, timestamp).
ratings_df = pd.read_csv('../../data/lens_tmdb/ratings_small.csv')

# User x movie rating matrix; movies a user has not rated are filled with 0.
movie_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# pairwise_distances(metric='cosine') returns the cosine *distance*
# (1 - cosine similarity), so convert it back to a similarity before it is
# used as neighbour weights; otherwise the most similar users would get the
# smallest weights.
user_similarity = 1 - pairwise_distances(movie_matrix, metric='cosine')

def predict(ratings, similarity):
    """Predict ratings with mean-centred, similarity-weighted averaging.

    Each row's prediction is that row's mean rating plus the
    similarity-weighted average of every row's deviation from its own mean.

    Parameters
    ----------
    ratings : array-like, shape (n_rows, n_cols)
        Rating matrix; one row per user in the caller below.
    similarity : array-like, shape (n_rows, n_rows)
        Pairwise weights between the rows of ``ratings``.

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_cols)
        Predicted rating for every (row, column) pair.
    """
    # Coerce to ndarrays so lists and DataFrames are accepted as well.
    ratings = np.asarray(ratings, dtype=float)
    similarity = np.asarray(similarity, dtype=float)

    mean_user_rating = ratings.mean(axis=1, keepdims=True)
    ratings_diff = ratings - mean_user_rating

    weighted_diff = similarity.dot(ratings_diff)
    weight_totals = np.abs(similarity).sum(axis=1, keepdims=True)

    # A row whose weights are all zero has nothing to average over; fall back
    # to the row's own mean instead of dividing 0 by 0 (which yields NaN).
    adjustment = np.divide(
        weighted_diff,
        weight_totals,
        out=np.zeros_like(weighted_diff),
        where=weight_totals != 0,
    )
    return mean_user_rating + adjustment

# Predicted rating for every (user, movie) cell of the rating matrix.
user_prediction = predict(movie_matrix.to_numpy(), user_similarity)


In [8]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate


# Fixed seed so the cross-validation numbers can be reproduced on re-run.
RANDOM_SEED = 42

# Tell Surprise the rating scale, then wrap the (user, item, rating) columns.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Per the Surprise FAQ, seeding NumPy's global RNG makes the fold split
# reproducible; the SVD initialisation is seeded explicitly as well.
np.random.seed(RANDOM_SEED)

# Use the SVD algorithm.
svd = SVD(random_state=RANDOM_SEED)

# Compute the RMSE of the SVD algorithm with 5-fold cross-validation.
cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)


Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8901  0.9010  0.9009  0.8967  0.8931  0.8964  0.0043  
Fit time          0.78    0.75    0.73    0.71    0.72    0.74    0.02    
Test time         0.07    0.08    0.07    0.07    0.07    0.07    0.00    


{'test_rmse': array([0.89011192, 0.90099057, 0.90087766, 0.89674713, 0.89305157]),
 'fit_time': (0.7760848999023438,
  0.7513971328735352,
  0.7261896133422852,
  0.7066209316253662,
  0.722975492477417),
 'test_time': (0.06829524040222168,
  0.07726073265075684,
  0.06988143920898438,
  0.07341194152832031,
  0.06951284408569336)}

In [7]:
# Show the column labels of the ratings frame.
ratings_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')