In [10]:
%pip install pandas numpy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [11]:
import pandas as pd
import numpy as np

# MovieLens data set
This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files links.csv, movies.csv, ratings.csv and tags.csv.

In [12]:
from zipfile import ZipFile
import os

# Upload and extract the ml-latest-small.zip file
!wget -q https://files.grouplens.org/datasets/movielens/ml-latest-small.zip

with ZipFile('ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall()

# Load the dataset
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Check basic data
print("Movies dataset shape:", movies.shape)
display(movies.head())
print("Ratings dataset shape:", ratings.shape)
display(ratings.head())

Movies dataset shape: (9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Ratings dataset shape: (100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [13]:
# Check for missing values
print("Missing values in ratings:", ratings.isnull().sum())
print("Missing values in Moviesd data:", movies.isnull().sum())

Missing values in ratings: userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Missing values in Moviesd data: movieId    0
title      0
genres     0
dtype: int64


In [14]:
# identify duplicates if any
print("Number of duplicate records in ratings:", ratings.duplicated().sum())
print("Number of duplicate records in movies:", movies.duplicated().sum())

Number of duplicate records in ratings: 0
Number of duplicate records in movies: 0


In [15]:
# Create a user-item interaction matrix
user_item_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)
display(user_item_matrix.head())

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Define the custom RMSE scoring function
def rmse_scorer(model, X, y):
    # Predict ratings by multiplying user-item matrix with SVD components
    predicted_ratings = model.transform(X).dot(model.components_)
    # Compute RMSE
    return np.sqrt(mean_squared_error(y, predicted_ratings))

In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds

# Load the datasets
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Create a user-item interaction matrix
user_item_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')

# Create a mask for missing values
missing_mask = user_item_matrix.isna()

# Fill missing values with 0 for SVD
user_item_matrix_filled = user_item_matrix.fillna(0)

# Normalize the ratings
user_ratings_mean = np.mean(user_item_matrix_filled, axis=1)
user_item_matrix_normalized = user_item_matrix_filled - user_ratings_mean.values.reshape(-1, 1)

# Apply SVD
U, sigma, Vt = svds(user_item_matrix_normalized, k=50)
sigma = np.diag(sigma)

# Predict ratings
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.values.reshape(-1, 1)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_item_matrix.columns)

# Extract the predicted values for the missing entries
predicted_missing_values = predicted_ratings_df[missing_mask]

# Display the predicted missing values
print(predicted_missing_values)

# Define the custom RMSE scoring function
def rmse_scorer(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Evaluate the model
actual_ratings = user_item_matrix.values.flatten()
predicted_ratings_flat = predicted_ratings.flatten()
rmse = rmse_scorer(actual_ratings[~np.isnan(actual_ratings)], predicted_ratings_flat[~np.isnan(actual_ratings)])
print(f'RMSE: {rmse}')

TypeError: type not understood