In [81]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import scipy.sparse as sps
from eals import ElementwiseAlternatingLeastSquares, load_model

from pathlib import Path

In [50]:
# Load the three tab-separated, latin-1 encoded tables, keeping only the listed columns.
ratings = pd.read_csv(
    'ratings.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'movie_id', 'rating', 'timestamp'],
)
users = pd.read_csv(
    'users.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'],
)
movies = pd.read_csv(
    'movies.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['movie_id', 'title', 'genres'],
)

In [41]:
# Preview the first rows of the ratings table (one row per user/movie rating event).
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [None]:
# Dense user x movie table: one row per user_id, one column per movie_id,
# cell value = rating, with 0 standing in for "no rating".
df = (
    ratings
    .pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)
num_users, num_movies = df.shape
df.shape

(6040, 3706)

In [43]:
# CSR copy of the full pivot table (rows/columns follow the pivot's index and column order).
df_sparse = sps.csr_matrix(df.to_numpy(), dtype=np.float32)

In [69]:
# Number of distinct movies that have at least one rating.
ratings['movie_id'].nunique()

3706

In [None]:
# Hold out 10% of the rating events for evaluation (fixed seed for a reproducible split).
train_df, test_df = train_test_split(ratings, test_size=0.1, random_state=42)

# Get the unique user IDs and movie IDs (in order of first appearance in `ratings`)
unique_user_ids = ratings['user_id'].unique()
unique_movie_ids = ratings['movie_id'].unique()

# Create a mapping from the actual user and movie IDs to new continuous indices
user_id_mapping = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
movie_id_mapping = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}

# Get the dimensions for the rating matrices
num_users = len(unique_user_ids)
num_movies = len(unique_movie_ids)


def _ratings_to_dense(split_df, user_index, movie_index, shape):
    """Scatter the ratings of `split_df` into a dense float32 matrix of the given shape.

    Row/column positions are looked up through `user_index` / `movie_index`.
    The assignment is vectorized instead of looping over rows in Python; if a
    (user, movie) pair occurs more than once, the last occurrence wins, which is
    what a row-by-row loop would also produce.
    """
    dense = np.zeros(shape, dtype=np.float32)
    rows = split_df['user_id'].map(user_index).to_numpy()
    cols = split_df['movie_id'].map(movie_index).to_numpy()
    dense[rows, cols] = split_df['rating'].to_numpy()
    return dense


# Dense train / test matrices that share the same index space
train_data = _ratings_to_dense(train_df, user_id_mapping, movie_id_mapping, (num_users, num_movies))
test_data = _ratings_to_dense(test_df, user_id_mapping, movie_id_mapping, (num_users, num_movies))

# Convert the numpy arrays into sparse matrices
train_sparse = sps.csr_matrix(train_data)
test_sparse = sps.csr_matrix(test_data)

# Print the resulting shapes
print(train_sparse.shape, test_sparse.shape)

(6040, 3706) (6040, 3706)


In [71]:
# Sanity check: both splits share one (users, movies) shape.
train_sparse.shape, test_sparse.shape

((6040, 3706), (6040, 3706))

In [72]:
# Fit eALS on the training split with 20 latent factors and 20 iterations;
# every other hyperparameter is left at the library default.
# NOTE(review): no seed is passed here, so the fitted factors may differ between runs -
# check whether the constructor offers a seeding option.
model = ElementwiseAlternatingLeastSquares(factors=20, num_iter=20)
model.fit(train_sparse, show_loss=True)

iter=1 update_user loss=13.2122 (0.0390 sec)
iter=1 update_item loss=5.9660 (0.1272 sec)
iter=2 update_user loss=2.3586 (0.0850 sec)
iter=2 update_item loss=1.6455 (0.0686 sec)
iter=3 update_user loss=1.4333 (0.0570 sec)
iter=3 update_item loss=1.3770 (0.0732 sec)
iter=4 update_user loss=1.3202 (0.0878 sec)
iter=4 update_item loss=1.2913 (0.0990 sec)
iter=5 update_user loss=1.2590 (0.0761 sec)
iter=5 update_item loss=1.2393 (0.0948 sec)
iter=6 update_user loss=1.2166 (0.0747 sec)
iter=6 update_item loss=1.2020 (0.1139 sec)
iter=7 update_user loss=1.1847 (0.0873 sec)
iter=7 update_item loss=1.1733 (0.0930 sec)
iter=8 update_user loss=1.1594 (0.0701 sec)
iter=8 update_item loss=1.1504 (0.1381 sec)
iter=9 update_user loss=1.1390 (0.0920 sec)
iter=9 update_item loss=1.1317 (0.0718 sec)
iter=10 update_user loss=1.1221 (0.0540 sec)
iter=10 update_item loss=1.1161 (0.0730 sec)
iter=11 update_user loss=1.1078 (0.1002 sec)
iter=11 update_item loss=1.1028 (0.1047 sec)
iter=12 update_user loss=1.

In [73]:
def predict_ratings(model, df_sparse):
    """
    Generate predictions only for the non-zero entries of `df_sparse`.

    Parameters:
    - model: object exposing `user_factors` (users x factors) and `item_factors`
      (items x factors) arrays.
    - df_sparse: rating matrix (sparse, or anything `scipy.sparse.csr_matrix` accepts)
      whose non-zero positions select the user/item pairs to score.

    Returns:
    - sps.csr_matrix: float32 matrix of the same shape holding, at every non-zero
      position of `df_sparse`, the dot product of the matching user and item factors.
    """
    user_factors = np.asarray(model.user_factors)
    item_factors = np.asarray(model.item_factors)

    # Work on a private canonical copy so duplicate stored entries cannot yield
    # repeated (row, col) pairs and the caller's matrix is left untouched.
    pattern = sps.csr_matrix(df_sparse, copy=True)
    pattern.sum_duplicates()
    rows, cols = pattern.nonzero()

    # Row-wise dot products for all selected pairs in one vectorized call,
    # instead of one Python-level assignment per rating.
    values = np.einsum('ij,ij->i', user_factors[rows], item_factors[cols])
    values = values.astype(np.float32, copy=False)

    return sps.csr_matrix((values, (rows, cols)), shape=pattern.shape, dtype=np.float32)


def calculate_rmse(df_sparse, predictions_sparse):
    """
    Root Mean Squared Error between actual and predicted ratings.

    Only positions that are non-zero in `df_sparse` take part in the comparison;
    predictions are read at exactly those positions.
    """
    # Both operands are viewed as CSR so they can be indexed by (rows, cols)
    truth = sps.csr_matrix(df_sparse)
    predicted = sps.csr_matrix(predictions_sparse)

    # Coordinates of the observed ratings
    rows, cols = truth.nonzero()

    # Pull both value vectors out as flat 1-D arrays
    actual_ratings = np.asarray(truth[rows, cols]).ravel()
    predicted_ratings = np.asarray(predicted[rows, cols]).ravel()

    return np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

In [74]:
# Score the held-out ratings: predict at the test positions, then compare with the true values.
predictions_sparse = predict_ratings(model, test_sparse)
rmse = calculate_rmse(test_sparse, predictions_sparse)
print(f"RMSE on test set: {rmse:.4f}")

RMSE on test set: 0.9804


In [None]:
class eAlsPredictor:
    """
    Generates movie recommendations from a fitted ElementwiseAlternatingLeastSquares model.

    All `user_id` / `item_id` arguments of this class are row / column indices of the
    rating matrix (the same positions used by the model's factor arrays).

    Parameters:
    - model (ElementwiseAlternatingLeastSquares): A pre-trained model exposing `user_factors`,
      `item_factors` and `update_model`.
    - train_sparse (sps.csr_matrix): User x item rating matrix; its non-zero entries mark
      items a user has already rated and are excluded from recommendations.
    - movies (pd.DataFrame): Movie details; must contain a 'movie_id' column.
    - item_ids (array-like, optional): `item_ids[j]` is the 'movie_id' of matrix column `j`.
      When omitted, column indices are matched against 'movie_id' directly.
    """
    def __init__(self, model: "ElementwiseAlternatingLeastSquares", train_sparse: sps.csr_matrix,
                 movies: pd.DataFrame, item_ids=None):
        """
        Stores the model, the rating matrix, the movie table and the optional index -> movie_id map.

        Raises:
        - ValueError: If `item_ids` is given but is not one-dimensional.
        """
        self._model = model
        # Row slicing is used later on, so keep the matrix in CSR form.
        self._train_sparse = sps.csr_matrix(train_sparse)
        self._movies = movies

        if item_ids is None:
            self._item_ids = None
        else:
            self._item_ids = np.asarray(item_ids)
            if self._item_ids.ndim != 1:
                raise ValueError("item_ids must be a one-dimensional sequence.")


    def _update_model(self, user_id: int, rated_items: dict[int, int]) -> None:
        """
        Pushes each (user, item, rating) triple to the model's online update.

        Parameters:
        - user_id (int): Row index of the user.
        - rated_items (dict[int, int]): Column index -> rating.
        """
        # NOTE(review): the rating is forwarded as the third positional argument of
        # `update_model`. Confirm against the eals API that this slot is a rating and
        # not an option such as a verbosity flag.
        for item_id, rating in rated_items.items():
            self._model.update_model(user_id, item_id, rating)


    def _get_recommend_items_ids(self, user_id: int, num_recommendations: int, num_random_items: int=0):
        """
        Returns recommended item indices for a user, best first.

        Parameters:
        - user_id (int): Row index of the user.
        - num_recommendations (int): Maximum number of top-scoring unrated items to return.
        - num_random_items (int, optional): Number of extra unrated items drawn at random and
          appended after the top items (default is 0).

        Returns:
        - np.ndarray: Item indices; never contains an item the user has already rated.

        Raises:
        - ValueError: If `user_id` is outside the model's user range, if
          `num_recommendations` is negative, or if more random items are requested
          than there are remaining unrated items.
        """
        user_factors = self._model.user_factors
        item_factors = self._model.item_factors

        if not 0 <= user_id < user_factors.shape[0]:
            raise ValueError(f"User ID {user_id} is out of range.")
        if num_recommendations < 0:
            raise ValueError("num_recommendations must be non-negative.")

        # Score every item for this user
        scores = user_factors[user_id] @ item_factors.T

        # Mark items the user has already rated (a user unknown to the matrix has none)
        unseen = np.ones(scores.shape[0], dtype=bool)
        if user_id < self._train_sparse.shape[0]:
            unseen[self._train_sparse[user_id].nonzero()[1]] = False
        scores[~unseen] = -np.inf

        # Never return more items than are actually unrated; slicing with an explicit
        # count also makes a request for 0 items yield an empty result.
        top_count = min(num_recommendations, int(unseen.sum()))
        top_items_indices = np.argsort(scores)[::-1][:top_count]

        # Optionally append random unrated items that are not already in the top list
        if num_random_items > 0:
            candidates = np.setdiff1d(np.flatnonzero(unseen), top_items_indices)
            random_items = np.random.choice(candidates, num_random_items, replace=False)
            top_items_indices = np.concatenate([top_items_indices, random_items])

        return top_items_indices


    def add_user(self, user_id: int, rated_items: dict[int, int]) -> None:
        """
        Registers ratings for a user and updates the model with them.

        The ratings are written to row `user_id` of the rating matrix. If that row does
        not exist yet, the matrix is extended with empty rows up to and including it, so
        matrix rows stay aligned with the user indices passed to the model.

        Parameters:
        - user_id (int): Row index of the user.
        - rated_items (dict[int, int]): Column index -> rating.

        Raises:
        - ValueError: If `user_id` is negative.
        - IndexError: If an item index lies outside the matrix columns.
        """
        if user_id < 0:
            raise ValueError(f"User ID {user_id} is out of range.")

        num_rows, num_cols = self._train_sparse.shape
        if user_id >= num_rows:
            padding = sps.csr_matrix((user_id + 1 - num_rows, num_cols), dtype=self._train_sparse.dtype)
            ratings_lil = sps.vstack([self._train_sparse, padding], format='lil')
        else:
            ratings_lil = self._train_sparse.tolil()

        for item_id, rating in rated_items.items():
            ratings_lil[user_id, item_id] = rating

        # Only replace the stored matrix once every rating was written successfully
        self._train_sparse = ratings_lil.tocsr()

        self._update_model(user_id, rated_items)


    def add_rating_for_user(self, user_id: int, item_id: int, rating: int) -> None:
        """
        Records a single rating of an existing user and updates the model with it.

        Parameters:
        - user_id (int): Row index of the user.
        - item_id (int): Column index of the item.
        - rating (int): Rating value.

        Raises:
        - ValueError: If `user_id` is not a row of the rating matrix.
        """
        if not 0 <= user_id < self._train_sparse.shape[0]:
            raise ValueError(f"User ID {user_id} is out of range. You may need to add the user first.")

        train_sparse_lil = self._train_sparse.tolil()
        train_sparse_lil[user_id, item_id] = rating

        self._train_sparse = train_sparse_lil.tocsr()
        self._update_model(user_id, {item_id: rating})


    def recommend_items(self, user_id: int, num_recommendations: int, num_random_items: int=0) -> pd.DataFrame:
        """
        Returns the movie rows for a user's recommendations, best recommendation first.

        Item indices are translated to 'movie_id' values through `item_ids` when it was
        supplied; otherwise the indices themselves are matched against 'movie_id'.
        Recommendations without a matching movie row are omitted.

        Parameters:
        - user_id (int): Row index of the user.
        - num_recommendations (int): Maximum number of top recommendations.
        - num_random_items (int, optional): Number of random unrated items to append.

        Raises:
        - ValueError: If `user_id` is outside the model's user range.
        """
        if not 0 <= user_id < self._model.user_factors.shape[0]:
            raise ValueError(f"User ID {user_id} is out of range. You may need to add the user first.")
        recommended_items = np.asarray(
            self._get_recommend_items_ids(user_id, num_recommendations, num_random_items)
        )

        if self._item_ids is None:
            movie_ids = recommended_items
        else:
            # Skip indices the mapping does not cover
            known = recommended_items[recommended_items < len(self._item_ids)]
            movie_ids = self._item_ids[known]

        matches = self._movies[self._movies['movie_id'].isin(movie_ids)]
        if matches.empty:
            return matches

        # Reorder the matching rows by recommendation rank
        rank_of = {movie_id: rank for rank, movie_id in enumerate(movie_ids.tolist())}
        ranks = matches['movie_id'].map(rank_of).to_numpy()
        return matches.iloc[np.argsort(ranks, kind='stable')]
        

In [76]:
# The predictor masks a user's already-rated items by column index, so it must receive a
# matrix indexed exactly like the one the model was fitted on. `train_sparse` uses the
# first-appearance index mapping the model was trained with, whereas `df_sparse` comes from
# a pivot with its own column order and also contains the held-out test ratings.
# NOTE(review): recommend_items matches the returned column indices against
# movies['movie_id']; verify that this lookup lines up with movie_id_mapping.
pred = eAlsPredictor(model, train_sparse, movies)

In [78]:
# Record one more rating for an existing user, then compute that user's top-5 list.
pred.add_rating_for_user(1, 1, 5)
recommendations = pred.recommend_items(1, 5)

# Register a brand-new user at the next free row index. Reusing an existing index here
# would leave the rating matrix and the model disagreeing on which row belongs to whom.
pred.add_user(num_users, {1: 5, 2: 4, 3: 3})

# Only the last expression of a cell is rendered, so show the recommendations here.
recommendations

iter=1 update_model loss=1.0338 (0.0005 sec)
iter=1 update_model loss=1.0338 (0.0010 sec)
iter=1 update_model loss=1.0338 (0.0000 sec)
iter=1 update_model loss=1.0338 (0.0010 sec)


In [82]:
# Timestamp (day-month-year_hour-minute) used to give the saved file a unique name
from datetime import datetime
now = datetime.now()
dt_string = now.strftime("%d-%m-%Y_%H-%M")

# Save the model; create the target directory first so the write cannot fail on a missing folder
model_path = Path("models") / f"model_eALS_{dt_string}.joblib"
model_path.parent.mkdir(parents=True, exist_ok=True)
model.save(model_path)