Imports

In [5]:
import pandas as pd
import os
from joblib import Parallel, delayed, load
import time

Load the data, the pickled k-NN model and the pickled utility matrices

In [6]:
# Raw inputs: the (user, anime) pairs to score and the anime metadata table.
test_data = pd.read_csv(r"test.csv")
anime_data = pd.read_csv(r"anime.csv")

# Pickled artefacts. Paths are handed straight to joblib's `load` so that it
# opens AND closes each file itself; wrapping them in a bare `open(...)` left
# three file handles dangling for the lifetime of the kernel.
# NOTE(security): unpickling executes arbitrary code - only load pickles that
# were produced by a trusted run of the training notebook.
model_knn = load(os.path.join("pickles", "model_knn.pkl"))
util_matrix_norm = load(os.path.join("pickles", "util_matrix_norm.pkl"))
util_matrix_filtered = load(os.path.join("pickles", "util_matrix_filtered.pkl"))

Define a function that predicts a user's rating for a particular anime from the ratings given by the users most similar to that user

In [7]:
def _fallback_rating(anime_id):
    """Return the `rating` stored in `anime_data` for `anime_id`, or NaN if the anime is not listed."""
    matches = anime_data.loc[anime_data['anime_id'] == anime_id, 'rating']
    if matches.empty:
        # Unknown anime: report a missing prediction instead of raising
        # IndexError, which would abort a whole batch of predictions.
        return float('nan')
    return matches.values[0]


def predict_rating(user_id, anime_id, sim_threshold=0.0, n_similar_users=20):
    """Predict the rating `user_id` would give `anime_id` from similar users' ratings.

    The prediction is the similarity-weighted mean of the ratings that the
    nearest neighbours of `user_id` (according to `model_knn`) gave to
    `anime_id` in `util_matrix_filtered`.

    Parameters
    ----------
    user_id : hashable
        Row label of the user in `util_matrix_norm`.
    anime_id : hashable
        Column label of the anime in `util_matrix_filtered`.
    sim_threshold : float, default 0.0
        A neighbour contributes only if its similarity score is strictly
        greater than this value.
    n_similar_users : int, default 20
        Number of neighbours (excluding the user themself) to consider.

    Returns
    -------
    float
        The weighted-mean rating. Falls back to the anime's `rating` in
        `anime_data` when the user or anime is unknown (KeyError) or when no
        neighbour contributes, and to NaN when the anime is not in
        `anime_data` either.

    Notes
    -----
    Relies on the notebook-level objects `util_matrix_norm`, `util_matrix_filtered`,
    `model_knn` and `anime_data`. Assumes the row order of `util_matrix_norm`
    matches the data `model_knn` was fitted on - TODO confirm against the
    training notebook.
    """
    try:
        # Normalised rating vector of the user of interest, shaped (1, n_features)
        # as required by `kneighbors`.
        user_vector = util_matrix_norm.loc[user_id].to_numpy().reshape(1, -1)

        # Ask for one extra neighbour because the user is normally returned as
        # their own nearest neighbour.
        distances, positions = model_knn.kneighbors(
            user_vector, n_neighbors=n_similar_users + 1
        )

        # similarity = 1 - distance (presumably cosine distance - confirm
        # against how model_knn was fitted).
        sim_scores = (1 - distances)[0].tolist()

        # Translate row positions into user ids straight from the index; this
        # avoids copying the whole matrix with reset_index() on every call.
        sim_user_ids = util_matrix_norm.index[positions[0]].tolist()

        # Drop the user themself by id rather than by position: with tied
        # distances the user is not guaranteed to come first.
        neighbours = [
            (sim_user_id, sim_score)
            for sim_user_id, sim_score in zip(sim_user_ids, sim_scores)
            if sim_user_id != user_id
        ][:n_similar_users]

        weighted_sum = 0.0
        total_weight = 0.0
        for sim_user_id, sim_score in neighbours:
            # The similar user's rating for the anime of interest (NaN if unrated).
            rating = util_matrix_filtered.loc[sim_user_id, anime_id]

            # Keep the neighbour only if they rated the anime and their
            # similarity is ABOVE the threshold.
            if not pd.isnull(rating) and sim_score > sim_threshold:
                weighted_sum += rating * sim_score
                total_weight += sim_score

    except KeyError:
        # The user id or anime id was not present in the training data.
        return _fallback_rating(anime_id)

    if total_weight == 0:
        # No usable neighbour ratings (or the weights cancel out under a
        # negative threshold): use the anime's overall rating instead.
        return _fallback_rating(anime_id)

    return weighted_sum / total_weight

In [10]:
# Work on a copy so `test_data` itself is left untouched; the copy's 'rating'
# column is filled with the predictions below.
collab_knn_predicted_ratings = test_data.copy()

# Pair the ids column-wise instead of using iterrows(): iterrows() builds a
# Series per row and upcasts mixed dtypes (integer ids can silently become floats).
id_pairs = zip(
    collab_knn_predicted_ratings['user_id'],
    collab_knn_predicted_ratings['anime_id'],
)

# Parallel processing using joblib; n_jobs=-3 uses all CPUs but two.
# perf_counter() is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments.
start = time.perf_counter()
collab_knn_predicted_ratings['rating'] = Parallel(n_jobs=-3)(
    delayed(predict_rating)(user_id, anime_id, sim_threshold=0.0)
    for user_id, anime_id in id_pairs
)
end = time.perf_counter()
total_time = end - start
print(total_time)

# Save output to a CSV file
collab_knn_predicted_ratings.to_csv('collab_knn_predicted_ratings.csv', header=True, index=False)