In [1]:
import pandas as pd
import numpy as np

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [3]:
netflix_df = pd.read_parquet('cleaned/netflix_parquet')
movielens_df = pd.read_parquet('cleaned/movielens_parquet')

In [4]:
# netflix_df = netflix_df[netflix_df['review_data'].apply(lambda x: len(x) if x is not None else 0) > 500]
netflix_df = netflix_df[netflix_df['review_data'].apply(lambda x: 30 <= len(x) <= 375 if x is not None else False)]
movielens_df = movielens_df[movielens_df['review_data'].apply(lambda x: 30 <= len(x) <= 375 if x is not None else False)]

In [5]:
n_rows = 10

In [6]:
df = (netflix_df.sample(n=n_rows,random_state=42))[['movieId','review_data','genres']]
df
df2 = (movielens_df.sample(n=n_rows,random_state=42))[['movieId','review_data','genres']]
df2

Unnamed: 0,movieId,review_data,genres
1471,1472,"[{'date': 2005-07-19, 'rating': 5.0, 'userId':...",[Documentary]
427,428,"[{'date': 2005-01-20, 'rating': 4.0, 'userId':...",[Family]
145,146,"[{'date': 2001-05-13, 'rating': 3.0, 'userId':...",
401,402,"[{'date': 2005-05-18, 'rating': 3.0, 'userId':...","[Drama, History, Romance, War]"
687,688,"[{'date': 2004-05-13, 'rating': 2.0, 'userId':...",[Documentary]
827,828,"[{'date': 2004-03-27, 'rating': 5.0, 'userId':...","[Documentary, Music]"
1939,1940,"[{'date': 2005-07-11, 'rating': 5.0, 'userId':...","[Drama, Film-Noir]"
185,186,"[{'date': 2005-08-17, 'rating': 3.0, 'userId':...","[Crime, Drama, Film-Noir, Thriller]"
1610,1611,"[{'date': 2003-08-25, 'rating': 4.0, 'userId':...",
1719,1720,"[{'date': 2004-06-08, 'rating': 4.0, 'userId':...",


Unnamed: 0,movieId,review_data,genres
4611,4736,"[{'date': 2003-02-06, 'rating': 1.0, 'userId':...","[Comedy, Drama, Romance]"
5373,5505,"[{'date': 2005-06-27, 'rating': 3.5, 'userId':...","[Comedy, Drama]"
7056,7235,"[{'date': 2010-12-29, 'rating': 4.5, 'userId':...","[Action, Comedy, Crime, Drama, Horror, Thriller]"
878,901,"[{'date': 2007-06-15, 'rating': 5.0, 'userId':...","[Comedy, Musical]"
23737,130636,"[{'date': 2016-08-03, 'rating': 3.5, 'userId':...","[Horror, Mystery, Thriller]"
8029,8815,"[{'date': 2007-03-08, 'rating': 1.5, 'userId':...","[Horror, Thriller]"
4644,4769,"[{'date': 2003-04-14, 'rating': 3.0, 'userId':...",[Documentary]
3147,3249,"[{'date': 2007-02-03, 'rating': 2.0, 'userId':...","[Drama, Thriller]"
6842,7019,"[{'date': 2004-08-09, 'rating': 0.5, 'userId':...","[Comedy, Drama]"
8997,27156,"[{'date': 2022-01-20, 'rating': 4.0, 'userId':...","[Action, Animation, Drama, Fantasy, Sci-Fi]"


In [7]:
# netflix
review_data = df['review_data'].values
user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(df['movieId'], review_data)])
print(f"{len(user_ids)} unique userIds are handled from the Netflix dataset.")
print(f"{len(np.unique(movieIds))} unique movieIds are handled from the Netflix dataset.")
print()

# movielens
review_data2 = df2['review_data'].values
user_ids2 = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data2])
ratings2 = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data2])
movieIds2 = np.concatenate([[movieId] * len(row) for movieId, row in zip(df['movieId'], review_data2)])
print(f"{len(user_ids2)} unique userIds are handled from the Movielens dataset.")
print(f"{len(np.unique(movieIds2))} unique movieIds are handled from the Movielens dataset.")

2289 unique userIds are handled from the Netflix dataset.
10 unique movieIds are handled from the Netflix dataset.

1627 unique userIds are handled from the Movielens dataset.
10 unique movieIds are handled from the Movielens dataset.


### Train,validation and test split:

### Function Explanation

`train_val_test_split`

1. **Shuffle the Data**:
   - The input data is shuffled using `data.sample(frac=1, random_state=42)` to ensure randomness. `random_state=42` ensures reproducibility.

2. **Calculate Set Sizes**:
   - The sizes of each set (training, validation, and test) are calculated based on the provided ratios and the total number of samples in the data.

3. **Split the Data**:
   - The shuffled data is split into three sets: training, validation, and test.
   - The training data contains the first `num_train` samples.
   - The validation data contains the next `num_val` samples, starting from the index immediately following the last training sample.
   - The test data contains the remaining samples, starting from the index immediately following the last validation sample.

4. **Reset Index**:
   - The index of each set is reset to ensure that it starts from 0 and increases incrementally.

5. **Return Sets**:
   - The function returns the training, validation, and test sets as pandas DataFrames.

In [8]:
def train_val_test_split(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Splits the data into training, validation, and test sets, simultaneously ensuring no training data flows into validation or test data.

    Parameters:
    - data: pandas DataFrame containing the data to be split.
    - train_ratio: float, ratio of the training set size to the total data size (default: 0.8).
    - val_ratio: float, ratio of the validation set size to the total data size (default: 0.1).
    - test_ratio: float, ratio of the test set size to the total data size (default: 0.1).

    Returns:
    - train_data: pandas DataFrame, training set.
    - val_data: pandas DataFrame, validation set.
    - test_data: pandas DataFrame, test set.
    """
    # Shuffle the data
    data_shuffled = data.sample(frac=1, random_state=42)

    # Calculate the sizes of each set
    num_samples = len(data_shuffled)
    num_train = int(train_ratio * num_samples)
    num_val = int(val_ratio * num_samples)
    num_test = num_samples - num_train - num_val

    # Split the data into train, validation, and test sets
    train_data = data_shuffled[:num_train]
    # Below is ensured the validation data and the test data starts after the indices which are already in the training data, ensuring that no training data will flow into validation of test data.
    val_data = data_shuffled[num_train:num_train+num_val]
    test_data = data_shuffled[num_train+num_val:]

    # Reset index for each set
    train_data.reset_index(drop=True, inplace=True)
    val_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    return train_data, val_data, test_data

### Set-up user-item matrix
First we will create a user-item matrix which records all the user-item interactions.


### `create_user_item_matrix` Function Explanation

### Steps:
1. **Extract Review Data**:
   - Extract the review data from the provided DataFrame, which contains user IDs, ratings, and movie IDs.

2. **Create User and Movie IDs Arrays**:
   - Extract user IDs, ratings, and movie IDs from the review data and concatenate them into separate arrays.
   - Generate dictionaries to map user IDs and movie IDs to unique indices in the user-item matrix.

3. **Initialize User-Item Matrix**:
   - Determine the dimensions of the user-item matrix based on the number of unique users and movies.
   - Initialize an empty user-item matrix filled with NaN values.

4. **Populate User-Item Matrix**:
   - Iterate through the review data and populate the user-item matrix with ratings.
   - Map user and movie IDs to their corresponding indices in the matrix and insert the ratings.

5. **Return Results**:
   - Return the user-item matrix along with dictionaries mapping user and movie IDs to indices, and arrays containing user and movie IDs.
  
### Functions Used and Purpose:

- **`np.concatenate()`**: Used to concatenate arrays containing user IDs, ratings, and movie IDs extracted from the review data.
- **`enumerate()`**: Used to iterate over the unique user IDs and movie IDs and generate indices for mapping.
- **`np.unique()`**: Used to find the unique user IDs and movie IDs in the review data.
- **`np.full()`**: Used to initialize an empty user-item matrix filled with NaN values.
- **`zip()`**: Used to iterate over multiple iterables simultaneously (user IDs, movie IDs, ratings).
- **`enumerate()`**: Used to iterate over the indices and elements of an iterable (user IDs, movie IDs) simultaneously.
- **Indexing and Slicing**: Used to access and modify elements in arrays and matrices.

In [9]:
def create_user_item_matrix(train_test_val_set):
    """
    Creates a user-item matrix from the provided dataset containing review data.

    Parameters:
    train_test_val_set (DataFrame): DataFrame containing review data with columns 'review_data',
                                    which is a list of dictionaries with keys 'userId', 'rating',
                                    and 'movieId'.

    Returns:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies), the matrix is an NumPy array which contains lists of user-item interactions, meaning a user and their corresponding ratings to the movieIds.    
    
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    
    user_ids (numpy.ndarray): Array containing user IDs corresponding to each rating in the matrix.
    
    movie_ids (numpy.ndarray): Array containing movie IDs corresponding to each rating in the matrix.

    """
    train_test_val_set = train_test_val_set.drop(['genres'],axis=1)
    review_data = train_test_val_set['review_data'].values
    user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
    ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
    movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_test_val_set['movieId'], review_data)])

    # create dictionaries to map user IDs and movie IDs to unique indices to map over
    user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
    movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

    # initialize an empty user-item matrix
    user_count = len(user_id_dict)
    movie_count = len(movie_id_dict)
    user_item_matrix = np.full((user_count, movie_count), np.nan)

    # populate the user-item matrix with ratings from the dataset
    for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
        user_index = user_id_dict[user_id]
        movie_index = movie_id_dict[movie_id]
        user_item_matrix[user_index, movie_index] = rating

    return user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds

In the previous two notebooks about userKNN we only included the similarties of rating patterns, so to compute similarity only the regular user-item matrix was needed. Now we want to include genre preference as well. Before we can include it in the similarity calculation, we first need to initiate a user-genre matrix with 0s and 1s indicating a user has watched a movie with that genre.

### Function explanation:

`create_user_genre_matrix`

1. **Extracting User IDs and Unique Genres**:
   - It collects user IDs from the `review_data` column of the DataFrame `df` and gathers unique genres from the `genres` column.

2. **Creating Indexing Dictionaries**:
   - Two dictionaries are created:
     - `genre_to_index`: Maps genres to their respective indices.
     - `movieid_to_genres`: Maps movie IDs to their associated genres.

3. **Initializing User-Genre Matrix**:
   - It initializes a matrix, `user_genre_matrix`, with dimensions `(num_users, num_genres)` to represent users' genre preferences.

4. **Mapping User IDs to Indices**:
   - A dictionary, `user_id_to_index`, is created to map each user ID to a unique index in the `user_genre_matrix`.

5. **Iterating Through Users**:
   - The function iterates through each user in the dataset and extracts movie IDs reviewed by that user.

6. **Updating User-Genre Matrix**:
   - For each user, it finds the genres associated with the reviewed movies and updates the corresponding entries in the `user_genre_matrix` to indicate the user's genre preferences.

7. **Returning User-Genre Matrix**:
   - Finally, it returns the populated `user_genre_matrix`, which can be used in collaborative filtering recommendation systems to incorporate genre information into the recommendation process.

In [10]:
def create_user_genre_matrix(df):
    """
    Create a user-genre matrix based on movie reviews in the DataFrame.

    Parameters:
    df (pandas.DataFrame): DataFrame containing movie review data.

    Returns:
    user_genre_matrix (numpy.ndarray): Matrix representing users' genre preferences.
    
    This function extracts user IDs and unique genres from the DataFrame and constructs a user-genre matrix based on the movies reviewed by each user.
    It iterates through each user in the dataset, extracts the movie IDs reviewed by that user, and finds the genres associated with those movie IDs.
    Then, it updates the user-genre matrix based on the genre interactions, where each row represents a user and each column represents a genre.
    The values in the matrix indicate whether a user has reviewed a movie belonging to a particular genre (1 if yes, 0 if no).
    """

    user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in df['review_data'].values])
    unique_genres = set(genre for sublist in df['genres'] if sublist is not None for genre in sublist)
    
    genre_to_index = {genre: index for index, genre in enumerate(unique_genres)}
    movieid_to_genres = dict(zip(df['movieId'], df['genres']))
    
    num_users = len(np.unique(user_ids))
    num_genres = len(unique_genres)
    user_genre_matrix = np.zeros((num_users, num_genres))
    
    user_id_to_index = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
    
    for i, user_id in enumerate(np.unique(user_ids)):
        user_reviews = df[df['review_data'].apply(lambda x: any(entry['userId'] == user_id for entry in x))]
        reviewed_movie_ids = user_reviews['movieId'].values
        
        user_genres = []
        for movie_id in reviewed_movie_ids:
            genres = movieid_to_genres.get(movie_id)
            if genres is not None:
                user_genres.extend(genres)
        
        for genre in user_genres:
            if genre in genre_to_index:
                genre_index = genre_to_index[genre]
                user_genre_matrix[user_id_to_index[user_id], genre_index] = 1
    
    return user_genre_matrix

Okay now we will compute the similarity between ratings patterns AND genre preferences of users:

### Compute similarity:
Regularly, cosine similarity is often used to measure the similarity between users based on their preferences or ratings for items (in this case, movies). Cosine similarity ranges from -1 to 1, where:

- 1 indicates perfect similarity,
- 0 indicates no similarity, and
- -1 indicates perfect dissimilarity.

### Interpretation:

- **Positive Cosine Similarity**: Users are similar in their preferences or ratings for movies.
- **Zero Cosine Similarity**: Users have no similarity in their preferences.
- **Negative Cosine Similarity**: Users are dissimilar in their preferences, tending towards opposite ratings for movies.

### Practical Implication:

If one user likes certain types of movies, the other user tends to dislike them, or vice versa. In other words, users with negative cosine similarities have contrasting preferences, making them less suitable for recommending movies to each other.

_____ 
In the function below we will include genre preference as a binary vector when computing the similarity between users. The dot product of row factors of users essentially is added up by one more row vector in terms of genre preference per user. As long as the rows of the shape of the matrix stay the same, we could think of adding more user features to compute user similarity.

### Function explanation

`calculate_similarity_users_genres`

1. **Filling Missing Data**: 
   - The function fills missing values in the `user_ratings_matrix` and `user_genre_matrix` with zeros. This is necessary to ensure consistency in subsequent calculations.

2. **Computing Dot Products**:
   - It computes the dot product of each pair of row vectors in the `user_ratings_matrix` to capture the similarity between users based on their ratings for items. This is done considering only values above a specified threshold, indicating the relevance of ratings.
   - Additionally, it computes the dot product of `user_genre_matrix` to incorporate user genre preferences. This captures the similarity between users based on their genre preferences.

3. **Calculating Norms**:
   - The function calculates the norms of each row vector in the `user_ratings_matrix` and `user_genre_matrix`. This represents the magnitude of each user's ratings and genre preferences, respectively.
   - For the ratings matrix, it uses the Manhattan norm, which is the sum of absolute distances.

4. **Handling Zero Norms**:
   - It replaces zero norms with a small value to avoid division by zero in subsequent calculations. If a user has no ratings for any item or no preference for any genre, their norm would be zero. Replacing it with a small value ensures stability in the calculations.

5. **Computing Similarity Matrix**:
   - Using broadcasting, the function calculates the similarity matrix. It computes the similarity between users based on both their ratings for items and their genre preferences.
   - The similarity between users i and j is given by the dot product of their ratings (divided by the product of their norms) plus the dot product of their genre preferences (divided by the product of their genre norms).

6. **Setting Diagonal Elements to 0**:
   - Finally, the function sets diagonal elements of the similarity matrix to 0 to avoid self-similarity. This ensures that each user's similarity with themselves is not considered in the recommendation process.

In [11]:
def calculate_similarity_users_genres(user_ratings_matrix, user_genre_matrix, threshold):
    """
    Calculate user similarity using Manhattan distance-based similarity measure.

    Parameters:
    user_ratings_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_genre_matrix (numpy.ndarray): Matrix representing users' genre preferences.
    threshold (float): Threshold value for considering ratings in the similarity calculation.

    Returns:
    similarity_matrix (numpy.ndarray): Matrix representing similarity between users based on the Manhattan distance.

    The Manhattan distance-based similarity measure is calculated as follows:
    1. Compute the dot product of each pair of row vectors in the user_ratings_matrix, considering only values above the threshold.
    2. Calculate the norms of each row vector, considering only values above the threshold.
    3. Replace zero norms with a small value to avoid division by zero.
    4. Compute the dot product of user_genre_matrix to include genre preferences.
    5. Calculate the similarity matrix using broadcasting, where the similarity between users i and j is given by the dot product
       divided by the product of their norms and genre preferences.
    6. Set diagonal elements to 0 to avoid self-similarity.

    """
    # fill in the missing data with 0s
    user_ratings_matrix = np.nan_to_num(user_ratings_matrix, nan=0)
    
    # fill in the missing data with 0s for user_genre_matrix
    user_genre_matrix = np.nan_to_num(user_genre_matrix, nan=0)
    
    # compute dot product of user_ratings_matrix, in other words: similarity between users and genres by perform dot product of each user row vector containing their ratings
    dot_products_ratings = np.dot(np.where(user_ratings_matrix >= threshold, user_ratings_matrix, 0), user_ratings_matrix.T)
    
    # compute dot product of user_genre_matrix, in other words: similarity between users and genres by perform dot product of each user row vector containing their genre preferences
    dot_products_genre = np.dot(user_genre_matrix, user_genre_matrix.T)
    
    # calculate norms of user_ratings_matrix, using the manhatten norm, which is the sum of the absolute distances
    norms_ratings = np.sum(np.abs(np.where(user_ratings_matrix >= threshold, user_ratings_matrix, 0)), axis=1)
    
    # replace zero norms with a small value to avoid division by zero
    norms_ratings[norms_ratings == 0] = 1e-8
    
    # calculate norms of user_genre_matrix
    norms_genre = np.sum(user_genre_matrix, axis=1)
    
    # Replace zero norms with a small value to avoid division by zero
    norms_genre[norms_genre == 0] = 1e-8
    
    # Compute similarity matrix using broadcasting
    similarity_matrix = (dot_products_ratings / norms_ratings[:, None]) + (dot_products_genre / norms_genre[:, None])
    
    # Set diagonal elements to 0 to avoid self-similarity
    np.fill_diagonal(similarity_matrix, 0)
    
    return similarity_matrix

## Perform User-Based KNN - rating prediction

The top five recommendations of a user are based on the five highest predicted ratings. The highest rating is found by finding the nearest neighbour based on cosine similarity, followed by computing the weighted average of the rating of the nearest neighbors of the specific item. The items with the highest weighted average will be the five recommendations to the user.

### Explanation `generate_user_knn_regressor` function:

### Function Workflow
1. **Validation**: Checks whether the provided user ID exists in the user ID dictionary. If not found, it returns an empty list.

2. **Finding Similar Users**:
   - Retrieves the index of the user in the user-item matrix using the user ID.
   - Computes the similarity scores between the target user and all other users.
   - Selects the top-k most similar users based on similarity scores.

3. **Finding Rated Movies by Similar Users**:
   - Identifies movies that the similar users have rated.

4. **Calculating Weighted Average Ratings**:
   - For each movie rated by similar users:
     - Computes the weighted sum of ratings, where weights are similarity scores between the target user and similar users.
     - Accumulates the sum of similarities.
     - Calculates the weighted average rating for each movie.

5. **Sorting Recommendations**:
   - Sorts movies by their weighted average ratings in descending order.

6. **Conversion and Return**:
   - Converts movie indices back to movie IDs using the movie ID dictionary.
   - Returns a list of recommended movie IDs along with their predicted ratings for the given user.

### Explanation numpy functions used:

1. `np.argsort()`
- **Usage**: `np.argsort(array)`
- **Explanation**: Returns the indices that would sort an array in ascending order.
- **Example**: `np.argsort([30, 10, 20])` returns `[1, 2, 0]`, indicating that the smallest element is at index 1, the second smallest at index 2, and the largest at index 0.

2. `np.where()`
- **Usage**: `np.where(condition)`
- **Explanation**: Returns the indices where a specified condition is true.
- **Example**: `np.where([True, False, True])` returns `(array([0, 2]),)`, indicating that the condition is true at indices 0 and 2.

3. `np.isnan()`
- **Usage**: `np.isnan(array)`
- **Explanation**: Returns a boolean array indicating whether each element is NaN (Not a Number).
- **Example**: `np.isnan([1, np.nan, 3])` returns `[False, True, False]`, indicating that the second element is NaN.

4. `np.zeros_like()`
- **Usage**: `np.zeros_like(array)`
- **Explanation**: Returns an array of zeros with the same shape and type as the input array.
- **Example**: `np.zeros_like([1, 2, 3])` returns `[0, 0, 0]`, creating an array of zeros with the same shape as `[1, 2, 3]`.

5. `np.sum()`
- **Usage**: `np.sum(array)`
- **Explanation**: Computes the sum of array elements over a specified axis or the entire array.
- **Example**: `np.sum([1, 2, 3])` returns `6`, summing all elements in the array.

In [12]:
def generate_user_knn_regressor(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN) collaborative filtering with weighted average.

    Parameters:
    user_id (int): ID of the user for whom recommendations are to be generated.
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    recommendations (list): List of tuples containing recommended movie IDs and their predicted ratings for the given user.
    """
    # Ensure user ID exists in the dictionary
    if user_id not in user_id_dict:
        print(f"User with ID {user_id} not found.")
        return []

    # Find the index of the user in the user-item matrix
    user_index = user_id_dict[user_id]

    # calculate the similarity scores between the user and the k nearest neighbours by performing dot product of row vectors of each user with target user
    similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

    # find the rated movies by filtering NOT nan values
    rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

    # intialize 0 matrix before weighted average computation
    weighted_avg_ratings = np.zeros_like(user_item_matrix[0])
    similarity_sum = 0
    
    for movie in rated_movies:
        # this loop iterates over each movie rated within the k similar users with the target user and computes the weighted sum of similarities to then compute the weighted average
        weighted_sum = np.sum(user_item_matrix[similar_users_indices, movie] * user_similarity_matrix[user_index, similar_users_indices])
        similarity_sum += np.sum(user_similarity_matrix[user_index, similar_users_indices])
        weighted_avg_ratings[movie] = weighted_sum / similarity_sum if similarity_sum != 0 else 0

    # the result is sorted descendingly to find out which movies would be the best recommendations
    sorted_indices = np.argsort(weighted_avg_ratings)[::-1]

    # convert movie indices back to movie IDs and return recommendations
    recommendations_regressor = [(list(movie_id_dict.keys())[list(movie_id_dict.values()).index(movie_index)], weighted_avg_ratings[movie_index])
                       for movie_index in sorted_indices[:5]]
    return recommendations_regressor

### See a first batch of recommendations:

By using the functions above to recommend movies above the following results are generated for each dataset:

In [13]:
train_data_netflix, val_data_netflix, test_data_netflix = train_val_test_split(df)
train_data_movielens, val_data_movielens, test_data_movielens = train_val_test_split(df2)

# set up paramters and threshold for similarity
k=1
threshold=3

### `Netflix:`

Training data:

In [14]:
# set up
user_item_matrix_train1, user_id_dict_train1, movie_id_dict_train1, user_ids_train1, movieIds_train1 = create_user_item_matrix(train_data_netflix)
# _, user_ratings_matrix_classified_train1 = computing_neutral_scores(user_item_matrix_train1)
netflix_user_genre_matrix_train1 = create_user_genre_matrix(train_data_netflix)
similarity_matrix_genres_train1 = calculate_similarity_users_genres(user_item_matrix_train1, netflix_user_genre_matrix_train1, threshold)
# user_similarity_matrix_manhattan_train1 = calculate_user_similarity_manhattan(user_item_matrix_train1, threshold)

# generate recommendations
user_id_train1 = user_ids_train1[1]
top5_pred_train1 = generate_user_knn_regressor(user_id_train1, user_item_matrix_train1, similarity_matrix_genres_train1, user_id_dict_train1, movie_id_dict_train1, k)

# print result
print(f"UserKNN regressor Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_train1}:")
for movie_id, predicted_rating in top5_pred_train1:
    print(f"Movie ID: {movie_id}, Predicted Rating: {predicted_rating}")

UserKNN regressor Recommendations: 
Top 5 recommended movies with predicted ratings for user 1174811:
Movie ID: 1720, Predicted Rating: 0.0
Movie ID: 1611, Predicted Rating: 0.0
Movie ID: 1472, Predicted Rating: 0.0
Movie ID: 828, Predicted Rating: 0.0
Movie ID: 688, Predicted Rating: 0.0


Validation data

In [15]:
# set up
user_item_matrix_val1, user_id_dict_val1, movie_id_dict_val1, user_ids_val1, movieIds_val1 = create_user_item_matrix(val_data_netflix)
netflix_user_genre_matrix_val1 = create_user_genre_matrix(val_data_netflix)
# _, user_ratings_matrix_classified_val1 = computing_neutral_scores(user_item_matrix_val1)
# user_similarity_matrix_manhattan_val1 = calculate_user_similarity_manhattan(user_item_matrix_val1, threshold)
similarity_matrix_genres_val1 = calculate_similarity_users_genres(user_item_matrix_val1, netflix_user_genre_matrix_val1, threshold)

# generate recommendations
user_id_val1 = user_ids_val1[1]
top5_pred_val1 = generate_user_knn_regressor(user_id_val1, user_item_matrix_val1, similarity_matrix_genres_val1, user_id_dict_val1, movie_id_dict_val1, k)

# print result
print(f"UserKNN regressor Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_val1}:")
for movie_id, rating in top5_pred_val1:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN regressor Recommendations: 
Top 5 recommended movies with predicted ratings for user 1998800:
Movie ID: 402, Predicted Rating: 5.0


### `Movielens`

Training data

In [16]:
# set up
user_item_matrix_train2, user_id_dict_train2, movie_id_dict_train2, user_ids_train2, movieIds_train2 = create_user_item_matrix(train_data_movielens)
movielens_user_genre_matrix_train2 = create_user_genre_matrix(train_data_movielens)
# _, user_ratings_matrix_classified_train2 = computing_neutral_scores(user_item_matrix_train2)
# user_similarity_matrix_manhattan_train2 = calculate_user_similarity_manhattan(user_item_matrix_train2, threshold)
similarity_matrix_genres_train2 = calculate_similarity_users_genres(user_item_matrix_train2, movielens_user_genre_matrix_train2, threshold)

# generate recommendations
user_id_train2 = user_ids_train2[1]
top5_pred_train2 = generate_user_knn_regressor(user_id_train2, user_item_matrix_train2, similarity_matrix_genres_train2, user_id_dict_train2, movie_id_dict_train2, k)

# print result
print(f"UserKNN regressor Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_train2}:")
for movie_id, rating in top5_pred_train2:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN regressor Recommendations: 
Top 5 recommended movies with predicted ratings for user 306139:
Movie ID: 7235, Predicted Rating: 2.0
Movie ID: 130636, Predicted Rating: 0.0
Movie ID: 27156, Predicted Rating: 0.0
Movie ID: 8815, Predicted Rating: 0.0
Movie ID: 7019, Predicted Rating: 0.0


Validation data

In [17]:
# set up
user_item_matrix_val2, user_id_dict_val2, movie_id_dict_val2, user_ids_val2, movieIds_val2 = create_user_item_matrix(val_data_movielens)
movielens_user_genre_matrix_val2 = create_user_genre_matrix(val_data_movielens)
# _, user_ratings_matrix_classified_val2 = computing_neutral_scores(user_item_matrix_val2)
# user_similarity_matrix_manhattan_val2 = calculate_user_similarity_manhattan(user_item_matrix_val2, threshold)
similarity_matrix_genres_val2 = calculate_similarity_users_genres(user_item_matrix_val2, movielens_user_genre_matrix_val2, threshold)

# generate recommendations
user_id_val2 = user_ids_val2[1]
top5_pred_val2 = generate_user_knn_regressor(user_id_val2, user_item_matrix_val2, similarity_matrix_genres_val2, user_id_dict_val2, movie_id_dict_val2, k)

# print result
print(f"UserKNN regressor Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_val2}:")
for movie_id, rating in top5_pred_val2:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN regressor Recommendations: 
Top 5 recommended movies with predicted ratings for user 39104:
Movie ID: 901, Predicted Rating: 5.0


Even though the movieIds which are recommended are the same, the predicted rating differs somewhat, already indicating a difference between the two userKNN models.

## Baseline performance

To assess performance, we are going to compare the original ratings matrix with the predicted one after the userKnn model. In order to do so, we will generate a predicted rating matrix with the two functions below. Essentially an array of predicted ratings is generated in contrast with the tuple in the previous function `generate_user_knn_regressor` with the top 5 results. The secon functions will append these in a new matrix. 

`generate_array_of_pred_ratings`

1. **Check User Existence**:
   - Ensure that the given `user_id` exists in the `user_id_dict`.

2. **Get User Index**:
   - Retrieve the index of the user in the user-item matrix based on the `user_id`.

3. **Find Similar Users**:
   - Calculate the similarity scores between the target user and other users, sort these scores in descending order, and select the top `k` most similar users.

4. **Find Rated Movies by Similar Users**:
   - Identify the movies that have been rated by the selected similar users.

5. **Calculate Weighted Average Ratings**:
   - For each movie rated by the similar users, calculate the weighted sum of ratings and the sum of similarities.

6. **Calculate Predicted Ratings**:
   - Divide the weighted sum of ratings by the sum of similarities to compute the predicted ratings for every movie.

7. **Return Predicted Ratings Array**:
   - Return the array containing predicted ratings for every movie in `movie_id_dict`.

`generate_pred_rating_matrix`

1. **Initialize Predicted Ratings Matrix**:
   - Initialize a matrix to store predicted ratings for every user and movie.

2. **Iterate Over Users**:
   - For each user, generate predicted ratings using the `generate_array_of_pred_ratings` function and fill the corresponding row in the predicted ratings matrix.

3. **Return Predicted Ratings Matrix**:
   - Return the matrix containing predicted ratings for every user and movie.

In [18]:
def generate_array_of_pred_ratings(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN) collaborative filtering with weighted average.

    Parameters:
    user_id (int): ID of the user for whom recommendations are to be generated.
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    predicted_ratings (numpy.ndarray): Array containing predicted ratings for every movie in movie_id_dict.
    """
    # Ensure user ID exists in the dictionary
    if user_id not in user_id_dict:
        print(f"User with ID {user_id} not found.")
        return []

    # Find the index of the user in the user-item matrix
    user_index = user_id_dict[user_id]

    # Get similarity scores of the user with other users and sort indices
    similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

    # Find movies rated by similar users
    rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

    # Calculate weighted average ratings for each movie
    weighted_avg_ratings = np.zeros_like(user_item_matrix[0])
    similarity_sum = np.zeros_like(user_item_matrix[0])
    
    for movie in rated_movies:
        # Calculate weighted sum of ratings and sum of similarities
        weighted_sum = np.sum(user_item_matrix[similar_users_indices, movie] * user_similarity_matrix[user_index, similar_users_indices])
        similarity_sum[movie] += np.sum(user_similarity_matrix[user_index, similar_users_indices])
        weighted_avg_ratings[movie] += weighted_sum

    # Calculate predicted ratings
    predicted_ratings_array = np.divide(weighted_avg_ratings, similarity_sum, out=np.zeros_like(weighted_avg_ratings), where=(similarity_sum != 0))

    return predicted_ratings_array

def generate_pred_rating_matrix(user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates predicted rating matrix for all users.

    Parameters:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    predicted_ratings_matrix (numpy.ndarray): Matrix containing predicted ratings for every user and movie.
    """
    num_users = len(user_id_dict)
    num_movies = len(movie_id_dict)
    predicted_ratings_matrix = np.zeros((num_users, num_movies))

    for user_id in user_id_dict:
        user_index = user_id_dict[user_id]
        predicted_ratings = generate_array_of_pred_ratings(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k)
        predicted_ratings_matrix[user_index] = predicted_ratings

    return predicted_ratings_matrix

**We will use RMSE as performance metric, using the function below to compute it:**

### Function explanation

`compute_rmse`
1. **Handle Implicit Ratings**: 
   - Convert `NaN` values in both `original_ratings` and `predicted_ratings` arrays to 0s. This is done using `np.nan_to_num()` function to ensure that non-rated items are treated as having a rating of 0 for comparison.
   
2. **Flatten Arrays**:
   - Flatten both `original_ratings` and `predicted_ratings` arrays into 1D arrays to facilitate making masks.

3. **Remove Unrated Items**:
   - Create a mask to filter out entries where the original rating is 0 (unrated items). Only ratings for rated items are considered for RMSE calculation.

4. **Compute Squared Differences**:
   - Calculate the squared differences between original and predicted ratings for the rated items.

5. **Compute Mean Squared Error (MSE)**:
   - Compute the mean squared error (MSE) by averaging the squared differences.

6. **Compute RMSE**:
   - Compute the square root of the mean squared error to obtain the RMSE value, which indicates the average difference between the original and predicted ratings.

7. **Return RMSE**:
   - Return the computed RMSE value as the output of the function.

In [19]:
def compute_rmse(original_ratings, predicted_ratings):
    """
    Computes the Root Mean Square Error (RMSE) between the original ratings and the predicted ratings. MovieIds a user has not interacted with is turned into 0 for now.

    Parameters:
    original_ratings (numpy.ndarray): Array containing the original ratings.
    predicted_ratings (numpy.ndarray): Array containing the predicted ratings.

    Returns:
    float: The RMSE value.
    
    """
    # handle implicit ratings with 0s for now
    original_ratings = np.nan_to_num(original_ratings, nan=0, posinf=0, neginf=0)
    predicted_ratings = np.nan_to_num(predicted_ratings, nan=0, posinf=0, neginf=0)

    # make 1d arrays by flattening them to be able to make masks
    original_ratings_flat = original_ratings.flatten()
    predicted_ratings_flat = predicted_ratings.flatten()
    
    # remove entries with no original rating (unrated items)
    mask = original_ratings_flat != 0
    original_ratings_flat = original_ratings_flat[mask]
    predicted_ratings_flat = predicted_ratings_flat[mask]
    
    # Compute the squared differences
    squared_diff = np.square(original_ratings_flat - predicted_ratings_flat)
    
    # Compute the mean squared error
    mse = np.mean(squared_diff)
    
    # Compute the square root of the mean squared error to get RMSE
    rmse = np.sqrt(mse)
    
    return rmse

`Netflix`

In [20]:
predicted_ratings_matrix_train1 = generate_pred_rating_matrix(user_item_matrix_train1, similarity_matrix_genres_train1, user_id_dict_train1, movie_id_dict_train1, k=1)
predicted_ratings_matrix_val1 = generate_pred_rating_matrix(user_item_matrix_val1, similarity_matrix_genres_val1, user_id_dict_val1, movie_id_dict_val1, k=1)

In [21]:
train1_rmse = compute_rmse(user_item_matrix_train1, predicted_ratings_matrix_train1)
print("RMSE on training set:", train1_rmse)
val1_rmse = compute_rmse(user_item_matrix_val1, predicted_ratings_matrix_val1)
print("RMSE on validation set:", val1_rmse)

RMSE on training set: 1.6593534200351532
RMSE on validation set: 1.4598450400585696


`Movielens`

In [22]:
predicted_ratings_matrix_train2 = generate_pred_rating_matrix(user_item_matrix_train2, similarity_matrix_genres_train2, user_id_dict_train2, movie_id_dict_train2, k=1)
predicted_ratings_matrix_val2 = generate_pred_rating_matrix(user_item_matrix_val2, similarity_matrix_genres_val2, user_id_dict_val2, movie_id_dict_val2, k=1)

In [23]:
train2_rmse = compute_rmse(user_item_matrix_train2, predicted_ratings_matrix_train2)
print("RMSE on training set:", train2_rmse)
val2_rmse = compute_rmse(user_item_matrix_val2, predicted_ratings_matrix_val2)
print("RMSE on validation set:", val2_rmse)

RMSE on training set: 1.4995171803206377
RMSE on validation set: 1.2193039521216908


## Hyperparameter tuning

Now we have recorded some baseline performance let's find the optimal value for K by loping over different k values while generating the predicted rating matrix:

`Netflix`

In [24]:
k_list = [1, 4, 10, 15]
rmse_list = []
best_k_train1 = None
best_rmse = float('inf')

for k in k_list:
    predicted_item_matrix = generate_pred_rating_matrix(user_item_matrix_train1, similarity_matrix_genres_train1, user_id_dict_train1, movie_id_dict_train1, k=k)
    
    # Compute Root Mean Square Error (RMSE)
    rmse = compute_rmse(user_item_matrix_train1, predicted_item_matrix)
    
    # Append the RMSE value to the list
    rmse_list.append(rmse)
    
    # Check if current k gives the best RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_k_train1 = k

# Print the result descendingly
for i, k_value in enumerate(k_list):
    print(f"K-value: {k_value} | RMSE: {rmse_list[i]}")

print(f"\nBest K-value: {best_k_train1} | Best RMSE: {best_rmse}")

K-value: 1 | RMSE: 1.6593534200351532
K-value: 4 | RMSE: 1.5902267018848553
K-value: 10 | RMSE: 1.6083235470793062
K-value: 15 | RMSE: 1.6003897186980665

Best K-value: 4 | Best RMSE: 1.5902267018848553


`Movielens`

In [25]:
k_list = [1, 4, 10, 15]
rmse_list = []
best_k_train2 = None
best_rmse = float('inf')

for k in k_list:
    predicted_item_matrix = generate_pred_rating_matrix(user_item_matrix_train2, similarity_matrix_genres_train2, user_id_dict_train2, movie_id_dict_train2, k=k)
    
    # Compute Root Mean Square Error (RMSE)
    rmse = compute_rmse(user_item_matrix_train2, predicted_item_matrix)
    
    # Append the RMSE value to the list
    rmse_list.append(rmse)
    
    # Check if current k gives the best RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_k_train2 = k

# Print the result descendingly
for i, k_value in enumerate(k_list):
    print(f"K-value: {k_value} | RMSE: {rmse_list[i]}")

print(f"\nBest K-value: {best_k_train2} | Best RMSE: {best_rmse}")

K-value: 1 | RMSE: 1.4995171803206377
K-value: 4 | RMSE: 1.4890477827048683
K-value: 10 | RMSE: 1.4880920911260074
K-value: 15 | RMSE: 1.4657350551873696

Best K-value: 15 | Best RMSE: 1.4657350551873696


## Final predictions on test set:

`Netflix`

In [26]:
# set up
user_item_matrix_test1, user_id_dict_test1, movie_id_dict_test1, user_ids_test1, movieIds_test1 = create_user_item_matrix(test_data_netflix)
netflix_user_genre_matrix_test1 = create_user_genre_matrix(test_data_netflix)
# _, user_ratings_matrix_classified_test1 = computing_neutral_scores(user_item_matrix_test1)
# user_similarity_matrix_manhattan_test1 = calculate_user_similarity_manhattan(user_item_matrix_test1, threshold)
similarity_matrix_genres_test1 = calculate_similarity_users_genres(user_item_matrix_test1, netflix_user_genre_matrix_test1, threshold)

# set up predictions matrix
predicted_item_matrix_test1 = generate_pred_rating_matrix(user_item_matrix_test1, similarity_matrix_genres_test1, user_id_dict_test1, movie_id_dict_test1, k=best_k_train1)

# compute Root Mean Square Error (RMSE)
rmse_test1 = compute_rmse(user_item_matrix_test1, predicted_item_matrix_test1)
# print result on test set
print("RMSE on test set:", rmse_test1)

RMSE on test set: 1.4820564186528715


`Movielens`

In [27]:
# set up
user_item_matrix_test2, user_id_dict_test2, movie_id_dict_test2, user_ids_test2, movieIds_test2 = create_user_item_matrix(test_data_movielens)
movielens_user_genre_matrix_test2 = create_user_genre_matrix(test_data_movielens)
# _, user_ratings_matrix_classified_test2 = computing_neutral_scores(user_item_matrix_test2)
# user_similarity_matrix_manhattan_test2 = calculate_user_similarity_manhattan(user_item_matrix_test2, threshold)
similarity_matrix_genres_test2 = calculate_similarity_users_genres(user_item_matrix_test2, movielens_user_genre_matrix_test2, threshold)

# set up predictions matrix
predicted_item_matrix_test2 = generate_pred_rating_matrix(user_item_matrix_test2, similarity_matrix_genres_test2, user_id_dict_test2, movie_id_dict_test2, k=best_k_train2)

# compute Root Mean Square Error (RMSE)
rmse_test2 = compute_rmse(user_item_matrix_test2, predicted_item_matrix_test2)
# print result on test set
print("RMSE on test set:", rmse_test2)

RMSE on test set: 1.0886557170871796


# Fit on UserKNN regression

Go to UserKNN CLASS modelling-Stijn.ipynb for function explanations.

In [31]:
def generate_user_knn_classifier_with_movies(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN) collaborative filtering with neighborhood-based classification.

    Parameters:
    user_id (int): ID of the user for whom recommendations are to be generated.
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    recommendations (list): List of tuples containing recommended movie IDs and their predicted ratings for the given user.
    """
    # Ensure user ID exists in the dictionary
    if user_id not in user_id_dict:
        print(f"User with ID {user_id} not found.")
        return []

    # find the index of the user in the user-item matrix
    user_index = user_id_dict[user_id]

    # this line calculates the similarity score between the target and other users and sorts it descendingly
    similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

    # the indices of users in the previous line are then used here to find the ratings of those users
    rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

    # this line is selecting the ratings of the similar users which are the similar users indices
    neighbor_ratings = user_item_matrix[similar_users_indices][:, rated_movies]
    # thie line is computing similarity weights
    similarity_weights = user_similarity_matrix[user_index, similar_users_indices][:, np.newaxis] # by converting to column vector the matrix multiplication can be performed because the shape is now the same as neighbor_ratings
    # this line is performing a dot product of the ratings and the weights to compute the predicted rating
    votes = np.dot(neighbor_ratings.T, similarity_weights).flatten()

    # Find the top 5 ratings with the maximum votes
    top_indices = np.argsort(votes)[-5:][::-1]
    top_recommendations = [(list(movie_id_dict.keys())[list(movie_id_dict.values()).index(rated_movies[idx])], votes[idx]) for idx in top_indices]

    return top_recommendations

def generate_predictions_array(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates movie ratings predictions for a given user using user-based k-nearest neighbors (KNN) collaborative filtering with neighborhood-based classification.

    Parameters:
    user_id (int): ID of the user for whom ratings are to be predicted.
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    predicted_ratings (numpy.ndarray): Array containing predicted ratings for the given user and all movies in movie_id_dict.
    """
    # Ensure user ID exists in the dictionary
    if user_id not in user_id_dict:
        print(f"User with ID {user_id} not found.")
        return np.array([])

    # Find the index of the user in the user-item matrix
    user_index = user_id_dict[user_id]

    # Get similarity scores of the user with other users and sort indices
    similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

    # Find movies rated by similar users
    rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

    # Calculate votes from neighbors
    neighbor_ratings = user_item_matrix[similar_users_indices][:, rated_movies]
    similarity_weights = user_similarity_matrix[user_index, similar_users_indices][:, np.newaxis]
    votes = np.dot(neighbor_ratings.T, similarity_weights).flatten()

    # Find the predicted ratings for the given user and all movies
    predicted_ratings = np.full(len(movie_id_dict), np.nan)  # Initialize with NaN for unrated movies
    for movie_id, movie_index in movie_id_dict.items():
        if movie_index in rated_movies:
            idx = np.where(rated_movies == movie_index)[0][0]
            predicted_ratings[movie_index] = votes[idx]

    return predicted_ratings

def generate_predicted_user_item_matrix(user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates the predicted user-item matrix using user-based k-nearest neighbors (KNN) collaborative filtering with neighborhood-based classification for all users.

    Parameters:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    predicted_matrix (numpy.ndarray): Predicted user-item matrix containing ratings for all users and movies.
    """
    num_users = user_item_matrix.shape[0]
    num_movies = len(movie_id_dict)
    predicted_matrix = np.zeros((num_users, num_movies))

    # Iterate over each user
    for user_id in user_id_dict:
        predicted_ratings = generate_predictions_array(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k)
        predicted_matrix[user_id_dict[user_id]] = predicted_ratings

    return predicted_matrix

`Netflix`

In [33]:
predicted_ratings_matrix_train1_class = generate_pred_rating_matrix(user_item_matrix_train1, similarity_matrix_genres_train1, user_id_dict_train1, movie_id_dict_train1, k=1)
predicted_ratings_matrix_val1_class = generate_pred_rating_matrix(user_item_matrix_val1, similarity_matrix_genres_val1, user_id_dict_val1, movie_id_dict_val1, k=1)

In [34]:
train1_rmse_class = compute_rmse(user_item_matrix_train1, predicted_ratings_matrix_train1_class)
print("RMSE on training set:", train1_rmse)
val1_rmse_class = compute_rmse(user_item_matrix_val1, predicted_ratings_matrix_val1_class)
print("RMSE on validation set:", val1_rmse)

RMSE on training set: 1.6593534200351532
RMSE on validation set: 1.4598450400585696


`Movielens`

In [37]:
predicted_ratings_matrix_train2_class = generate_pred_rating_matrix(user_item_matrix_train2, similarity_matrix_genres_train2, user_id_dict_train2, movie_id_dict_train2, k=1)
predicted_ratings_matrix_val2_class = generate_pred_rating_matrix(user_item_matrix_val2, similarity_matrix_genres_val2, user_id_dict_val2, movie_id_dict_val2, k=1)

In [38]:
train2_rmse_class = compute_rmse(user_item_matrix_train2, predicted_ratings_matrix_train2_class)
print("RMSE on training set:", train2_rmse)
val2_rmse_class = compute_rmse(user_item_matrix_val2, predicted_ratings_matrix_val2_class)
print("RMSE on validation set:", val2_rmse)

RMSE on training set: 1.4995171803206377
RMSE on validation set: 1.2193039521216908


## Hyperparameter tuning (knn classification)

Now we have recorded some baseline performance let's find the optimal value for K by loping over different k values while generating the predicted rating matrix:

`Netflix`

In [None]:
k_list = [1, 4, 10, 15]
rmse_list = []
best_k_train1_class = None
best_rmse = float('inf')

for k in k_list:
    predicted_item_matrix = generate_pred_rating_matrix(user_item_matrix_train1, similarity_matrix_genres_train1, user_id_dict_train1, movie_id_dict_train1, k=k)
    
    # Compute Root Mean Square Error (RMSE)
    rmse = compute_rmse(user_item_matrix_train1, predicted_item_matrix)
    
    # Append the RMSE value to the list
    rmse_list.append(rmse)
    
    # Check if current k gives the best RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_k_train1_class = k

# Print the result descendingly
for i, k_value in enumerate(k_list):
    print(f"K-value: {k_value} | RMSE: {rmse_list[i]}")

print(f"\nBest K-value: {best_k_train1_class} | Best RMSE: {best_rmse}")

K-value: 1 | RMSE: 1.4958319704233833
K-value: 4 | RMSE: 1.2760958283885289
K-value: 10 | RMSE: 1.205793224704754
K-value: 15 | RMSE: 1.2003227023096785

Best K-value: 15 | Best RMSE: 1.2003227023096785


`Movielens`

In [None]:
k_list = [1, 4, 10, 15]
rmse_list = []
best_k_train2_class = None
best_rmse = float('inf')

for k in k_list:
    predicted_item_matrix = generate_pred_rating_matrix(user_item_matrix_train2, similarity_matrix_genres_train2, user_id_dict_train2, movie_id_dict_train2, k=k)
    
    # Compute Root Mean Square Error (RMSE)
    rmse = compute_rmse(user_item_matrix_train2, predicted_item_matrix)
    
    # Append the RMSE value to the list
    rmse_list.append(rmse)
    
    # Check if current k gives the best RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_k_train2_class = k

# Print the result descendingly
for i, k_value in enumerate(k_list):
    print(f"K-value: {k_value} | RMSE: {rmse_list[i]}")

print(f"\nBest K-value: {best_k_train2_class} | Best RMSE: {best_rmse}")

K-value: 1 | RMSE: 1.3946208857308595
K-value: 4 | RMSE: 1.1699329944236059
K-value: 10 | RMSE: 1.0826256066618114
K-value: 15 | RMSE: 1.0864868970475208

Best K-value: 10 | Best RMSE: 1.0826256066618114


## Final predictions on test set (knn classification):

`Netflix`

In [39]:
# set up
user_item_matrix_test1, user_id_dict_test1, movie_id_dict_test1, user_ids_test1, movieIds_test1 = create_user_item_matrix(test_data_netflix)
# _, user_ratings_matrix_classified_test1 = computing_neutral_scores(user_item_matrix_test1)
similarity_matrix_genres_test1_class = calculate_similarity_users_genres(user_item_matrix_test1, user_item_matrix_test1, threshold)

# set up predictions matrix
predicted_item_matrix_test1_class = generate_pred_rating_matrix(user_item_matrix_test1, similarity_matrix_genres_test1_class, user_id_dict_test1, movie_id_dict_test1, k=best_k_train1_class)

# compute Root Mean Square Error (RMSE)
rmse_test1_class = compute_rmse(user_item_matrix_test1, predicted_item_matrix_test1_class)
# print result on test set
print("RMSE on test set:", rmse_test1_class)

RMSE on test set: 1.6469003314573671


`Movielens`

In [40]:
# set up
user_item_matrix_test2, user_id_dict_test2, movie_id_dict_test2, user_ids_test2, movieIds_test2 = create_user_item_matrix(test_data_movielens)
# _, user_ratings_matrix_classified_test2 = computing_neutral_scores(user_item_matrix_test2)
similarity_matrix_genres_test2_class = calculate_similarity_users_genres(user_item_matrix_test2, user_item_matrix_test2, threshold)

# set up predictions matrix
predicted_item_matrix_test2_class = generate_pred_rating_matrix(user_item_matrix_test2, similarity_matrix_genres_test2_class, user_id_dict_test2, movie_id_dict_test2, k=best_k_train2_class)

# compute Root Mean Square Error (RMSE)
rmse_test2_class = compute_rmse(user_item_matrix_test2, predicted_item_matrix_test2_class)
# print result on test set
print("RMSE on test set:", rmse_test2_class)

RMSE on test set: 1.1992819268729882


# OLD

old function to normalize ratings:

### Preprocessing of ratings in user-item matrix:
We might suggest filling the empty values with 0s, but that can create issues with recommendation engines. 

If we were to fill this NaN with a 0, we would be incorrectly implying they greatly disliked! We are going to center each user’s ratings around 0 by deducting the row average and then fill in the missing values with 0. This means the missing data is replaced with neutral scores.

### `computing_neutral_scores` Function Explanation

### Functions Used and Purpose:
- **`np.nanmean()`**: Used to calculate the average rating for each user while handling NaN (missing) values.
  - **`axis=1`**: Specifies that the calculation is done along the rows (i.e., for each user).
- **`np.nan_to_num()`**: Used to fill in missing data (NaN) with zeros while preserving non-NaN values.
- **`np.reshape(-1, 1)`**: Used to reshape the array to ensure proper broadcasting during subtraction.
- **Indexing and Slicing**: Used to access elements in arrays and matrices.

### Steps:
1. **Calculate Average Ratings**:
   - Use `np.nanmean()` to compute the average rating for each user along the rows of the user-item matrix. This handles missing ratings (NaN) gracefully, computing the mean while ignoring NaN values.

2. **Center Ratings Around 0**:
   - Subtract the average ratings from each user's ratings in the user-item matrix. This centers each user's ratings around 0, effectively removing the user bias from the ratings.

3. **Fill Missing Data with Zeros**:
   - Use `np.nan_to_num()` to replace missing data (NaN) with zeros while preserving the existing non-NaN values. This ensures that missing ratings are treated neutrally (i.e., as if the user has not rated the item).

4. **Return Normalized User Ratings**:
   - Return the resulting normalized user ratings matrix, where missing ratings have been replaced with zeros and each user's ratings are centered around 0.

In [None]:
def computing_neutral_scores(user_item_matrix):
    """
    Compute neutral scores for user-item interactions in a user-item matrix.

    Parameters:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).

    Returns:
    user_ratings_matrix_normed (numpy.ndarray): Matrix representing users' ratings normalized with neutral scores.
    """
    # Calculate the average rating for each user
    avg_ratings = np.nanmean(user_item_matrix, axis=1)

    # Center each user's ratings around 0
    user_ratings_matrix_centered = user_item_matrix - avg_ratings.reshape(-1, 1)

    # Fill in the missing data with 0s
    user_ratings_matrix_normed = np.nan_to_num(user_ratings_matrix_centered, nan=0)

    return user_ratings_matrix_normed