In [218]:
import pandas as pd
import numpy as np

In [219]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [220]:
netflix_df = pd.read_parquet('cleaned/netflix_parquet')
movielens_df = pd.read_parquet('cleaned/movielens_parquet')

In [221]:
# netflix_df = netflix_df[netflix_df['review_data'].apply(lambda x: len(x) if x is not None else 0) > 500]
netflix_df = netflix_df[netflix_df['review_data'].apply(lambda x: 30 <= len(x) <= 150 if x is not None else False)]
movielens_df = movielens_df[movielens_df['review_data'].apply(lambda x: 30 <= len(x) <= 150 if x is not None else False)]

In [222]:
n_rows = 10

In [223]:
df = (netflix_df.sample(n=n_rows,random_state=42))[['movieId','review_data','genres']]
df
df2 = (movielens_df.sample(n=n_rows,random_state=42))[['movieId','review_data','genres']]
df2

Unnamed: 0,movieId,review_data,genres
1310,1311,"[{'date': 2005-09-17, 'rating': 1.0, 'userId':...",
235,236,"[{'date': 2002-09-12, 'rating': 2.0, 'userId':...",
1448,1449,"[{'date': 2003-07-14, 'rating': 1.0, 'userId':...","[Fantasy, Romance, Action]"
1697,1698,"[{'date': 2004-10-04, 'rating': 4.0, 'userId':...","[Action, Thriller]"
314,315,"[{'date': 2005-09-17, 'rating': 1.0, 'userId':...",
1679,1680,"[{'date': 2004-11-06, 'rating': 2.0, 'userId':...",
68,69,"[{'date': 2005-08-12, 'rating': 3.0, 'userId':...","[Action, Sport]"
316,317,"[{'date': 2005-07-28, 'rating': 1.0, 'userId':...",[Drama]
1403,1404,"[{'date': 2005-09-17, 'rating': 1.0, 'userId':...","[Drama, Fantasy, Horror, Mystery, Sci-Fi, Thri..."
739,740,"[{'date': 2003-06-27, 'rating': 4.0, 'userId':...",[Talk-Show]


Unnamed: 0,movieId,review_data,genres
4326,4450,"[{'date': 2012-08-17, 'rating': 4.0, 'userId':...","[Crime, Drama, Thriller]"
5861,6005,"[{'date': 2006-05-11, 'rating': 3.0, 'userId':...","[Comedy, Documentary]"
13331,70994,"[{'date': 2021-02-27, 'rating': 1.0, 'userId':...","[Horror, Thriller]"
43540,209497,"[{'date': 2020-07-16, 'rating': 3.0, 'userId':...","[Fantasy, Mystery, Sci-Fi]"
13683,73000,"[{'date': 2010-04-08, 'rating': 3.0, 'userId':...","[Drama, Musical, Romance]"
1372,1415,"[{'date': 1996-12-28, 'rating': 4.0, 'userId':...","[Crime, Drama, Romance]"
17462,97328,"[{'date': 2013-01-02, 'rating': 4.0, 'userId':...","[Comedy, Drama]"
8274,25793,"[{'date': 2019-04-27, 'rating': 4.0, 'userId':...","[Fantasy, Horror]"
9474,31698,"[{'date': 2010-07-06, 'rating': 3.0, 'userId':...","[Adventure, Children, Comedy, Fantasy]"
3645,3761,"[{'date': 2001-02-18, 'rating': 3.0, 'userId':...","[Action, Crime, Drama, Thriller]"


In [224]:
# netflix
review_data = df['review_data'].values
user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(df['movieId'], review_data)])
print(f"{len(user_ids)} unique userIds are handled from the Netflix dataset.")
print(f"{len(np.unique(movieIds))} unique movieIds are handled from the Netflix dataset.")
print()

# movielens
review_data2 = df2['review_data'].values
user_ids2 = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data2])
ratings2 = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data2])
movieIds2 = np.concatenate([[movieId] * len(row) for movieId, row in zip(df['movieId'], review_data2)])
print(f"{len(user_ids2)} unique userIds are handled from the Netflix dataset.")
print(f"{len(np.unique(movieIds2))} unique movieIds are handled from the Netflix dataset.")

1228 unique userIds are handled from the Netflix dataset.
10 unique movieIds are handled from the Netflix dataset.

678 unique userIds are handled from the Netflix dataset.
10 unique movieIds are handled from the Netflix dataset.


### Function Explanation

`train_val_test_split`

1. **Shuffle the Data**:
   - The input data is shuffled using `data.sample(frac=1, random_state=42)` to ensure randomness. `random_state=42` ensures reproducibility.

2. **Calculate Set Sizes**:
   - The sizes of each set (training, validation, and test) are calculated based on the provided ratios and the total number of samples in the data.

3. **Split the Data**:
   - The shuffled data is split into three sets: training, validation, and test.
   - The training data contains the first `num_train` samples.
   - The validation data contains the next `num_val` samples, starting from the index immediately following the last training sample.
   - The test data contains the remaining samples, starting from the index immediately following the last validation sample.

4. **Reset Index**:
   - The index of each set is reset to ensure that it starts from 0 and increases incrementally.

5. **Return Sets**:
   - The function returns the training, validation, and test sets as pandas DataFrames.

In [225]:
def train_val_test_split(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Splits the data into training, validation, and test sets, simultaneously ensuring no training data flows into validation or test data.

    Parameters:
    - data: pandas DataFrame containing the data to be split.
    - train_ratio: float, ratio of the training set size to the total data size (default: 0.8).
    - val_ratio: float, ratio of the validation set size to the total data size (default: 0.1).
    - test_ratio: float, ratio of the test set size to the total data size (default: 0.1).

    Returns:
    - train_data: pandas DataFrame, training set.
    - val_data: pandas DataFrame, validation set.
    - test_data: pandas DataFrame, test set.
    """
    # Shuffle the data
    data_shuffled = data.sample(frac=1, random_state=42)

    # Calculate the sizes of each set
    num_samples = len(data_shuffled)
    num_train = int(train_ratio * num_samples)
    num_val = int(val_ratio * num_samples)
    num_test = num_samples - num_train - num_val

    # Split the data into train, validation, and test sets
    train_data = data_shuffled[:num_train]
    # Below is ensured the validation data and the test data starts after the indices which are already in the training data, ensuring that no training data will flow into validation of test data.
    val_data = data_shuffled[num_train:num_train+num_val]
    test_data = data_shuffled[num_train+num_val:]

    # Reset index for each set
    train_data.reset_index(drop=True, inplace=True)
    val_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    return train_data, val_data, test_data

### Set-up user-item matrix
First we will create a user-item matrix which records all the user-item interactions.


### `create_user_item_matrix` Function Explanation

### Steps:
1. **Extract Review Data**:
   - Extract the review data from the provided DataFrame, which contains user IDs, ratings, and movie IDs.

2. **Create User and Movie IDs Arrays**:
   - Extract user IDs, ratings, and movie IDs from the review data and concatenate them into separate arrays.
   - Generate dictionaries to map user IDs and movie IDs to unique indices in the user-item matrix.

3. **Initialize User-Item Matrix**:
   - Determine the dimensions of the user-item matrix based on the number of unique users and movies.
   - Initialize an empty user-item matrix filled with NaN values.

4. **Populate User-Item Matrix**:
   - Iterate through the review data and populate the user-item matrix with ratings.
   - Map user and movie IDs to their corresponding indices in the matrix and insert the ratings.

5. **Return Results**:
   - Return the user-item matrix along with dictionaries mapping user and movie IDs to indices, and arrays containing user and movie IDs.
  
### Functions Used and Purpose:

- **`np.concatenate()`**: Used to concatenate arrays containing user IDs, ratings, and movie IDs extracted from the review data.
- **`enumerate()`**: Used to iterate over the unique user IDs and movie IDs and generate indices for mapping.
- **`np.unique()`**: Used to find the unique user IDs and movie IDs in the review data.
- **`np.full()`**: Used to initialize an empty user-item matrix filled with NaN values.
- **`zip()`**: Used to iterate over multiple iterables simultaneously (user IDs, movie IDs, ratings).
- **`enumerate()`**: Used to iterate over the indices and elements of an iterable (user IDs, movie IDs) simultaneously.
- **Indexing and Slicing**: Used to access and modify elements in arrays and matrices.

In [226]:
def create_user_item_matrix(train_test_val_set):
    """
    Creates a user-item matrix from the provided dataset containing review data.

    Parameters:
    train_test_val_set (DataFrame): DataFrame containing review data with columns 'review_data',
                                    which is a list of dictionaries with keys 'userId', 'rating',
                                    and 'movieId'.

    Returns:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies), the matrix is an NumPy array which contains lists of user-item interactions, meaning a user and their corresponding ratings to the movieIds.    
    
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    
    user_ids (numpy.ndarray): Array containing user IDs corresponding to each rating in the matrix.
    
    movie_ids (numpy.ndarray): Array containing movie IDs corresponding to each rating in the matrix.

    """
    train_test_val_set = train_test_val_set.drop(['genres'],axis=1)
    review_data = train_test_val_set['review_data'].values
    user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
    ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
    movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_test_val_set['movieId'], review_data)])

    # create dictionaries to map user IDs and movie IDs to unique indices to map over
    user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
    movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

    # initialize an empty user-item matrix
    user_count = len(user_id_dict)
    movie_count = len(movie_id_dict)
    user_item_matrix = np.full((user_count, movie_count), np.nan)

    # populate the user-item matrix with ratings from the dataset
    for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
        user_index = user_id_dict[user_id]
        movie_index = movie_id_dict[movie_id]
        user_item_matrix[user_index, movie_index] = rating

    return user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds

### Compute similarity:
Regularly, cosine similarity is often used to measure the similarity between users based on their preferences or ratings for items (in this case, movies). Cosine similarity ranges from -1 to 1, where:

- 1 indicates perfect similarity,
- 0 indicates no similarity, and
- -1 indicates perfect dissimilarity.

### Interpretation:

- **Positive Cosine Similarity**: Users are similar in their preferences or ratings for movies.
- **Zero Cosine Similarity**: Users have no similarity in their preferences.
- **Negative Cosine Similarity**: Users are dissimilar in their preferences, tending towards opposite ratings for movies.

### Practical Implication:

If one user likes certain types of movies, the other user tends to dislike them, or vice versa. In other words, users with negative cosine similarities have contrasting preferences, making them less suitable for recommending movies to each other.

___

To see how similar users are we will compute the similarity between them. I will use cosine similarity as distance measure. The manhatten norm will be used to decrease computational weight instead of euclidian norm.

### Explanation `calculate_user_similarity_manhattan` Function

This function calculates the cosine similarity matrix between users based on their ratings using the Manhattan norm.

1. **Threshold**: First, the function applies a threshold to the user ratings matrix. Ratings below the threshold are set to 0, ensuring that only significant ratings are considered as well as decreasing computational weight.

2. **Dot Product Calculation**: It then computes the dot product of each pair of row vectors (users) in the thresholded matrix. This represents the similarity between users based on their common rated items.

3. **Norm Calculation**: Next, it calculates the norms (magnitude) of each row vector, considering only values above the threshold. This step prepares for the normalization process.

4. **Normalization**: The dot products are divided by the norms of the corresponding row vectors, effectively normalizing the similarity values. This step ensures that users with a large number of ratings are not favored over users with fewer ratings.

5. **Setting Diagonal to 0**: Finally, the diagonal elements of the similarity matrix are set to 0 to avoid self-similarity, as a user's rating should not be compared to itself.

### Explanation of NumPy Functions

- **np.dot**: Computes the dot product of arrays. Here, it calculates the dot product of the thresholded user ratings matrix with its transpose, resulting in the similarity matrix.
  
- **np.where**: Returns indices where a condition is true. It's used here to apply thresholding to the user ratings matrix.
  
- **np.sum**: Computes the sum of array elements. It calculates the norms of each row vector after thresholding, which are then used for normalization.
  
- **np.abs**: Computes the absolute value element-wise. Used to ensure positive values for norms calculation.
  
- **np.fill_diagonal**: Fills the diagonal of an array with a specified value. It's used to set diagonal elements of the similarity matrix to 0 to avoid self-similarity.

In [227]:
def calculate_user_similarity_manhattan(user_ratings_matrix, threshold):
    """
    Calculate user similarity using Manhattan distance-based similarity measure.

    Parameters:
    user_ratings_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    threshold (float): Threshold value for considering ratings in the similarity calculation.

    Returns:
    similarity_matrix (numpy.ndarray): Matrix representing similarity between users based on the Manhattan distance.

    The Manhattan distance-based similarity measure is calculated as follows:
    1. Compute the dot product of each pair of row vectors in the user_ratings_matrix, considering only values above the threshold.
    2. Calculate the norms of each row vector, considering only values above the threshold.
    3. Replace zero norms with a small value to avoid division by zero.
    4. Calculate the similarity matrix using broadcasting, where the similarity between users i and j is given by the dot product
       divided by the product of their norms.
    5. Set diagonal elements to 0 to avoid self-similarity.

    """
    # Fill in the missing data with 0s
    user_ratings_matrix = np.nan_to_num(user_ratings_matrix, nan=0)
    
    # this line calculates the dot product of each pair of row vectors, therefore how similar they are, considering only values above the threshold to decrease computational weight
    dot_products = np.dot(np.where(user_ratings_matrix >= threshold, user_ratings_matrix, 0), user_ratings_matrix.T)
    
    # this line calculates the distance using the manhatten norm, because the absolute values of the elements are summed up
    norms = np.sum(np.abs(np.where(user_ratings_matrix >= threshold, user_ratings_matrix, 0)), axis=1)
    
    # Replace zero norms with a small value to avoid division by zero
    norms[norms == 0] = 1e-8
    
    # compute similarity matrix using broadcasting
    similarity_matrix = dot_products / (norms[:, None] * norms)
    
    # Set diagonal elements to 0 to avoid self-similarity
    np.fill_diagonal(similarity_matrix, 0)
    
    return similarity_matrix

## Perform User-Based KNN - rating prediction

The top five recommendations of a user are based on the five highest predicted ratings. The highest rating is found by finding the nearest neighbour based on cosine similarity, followed by computing the weighted average of the rating of the nearest neighbors of the specific item. The items with the highest weighted average will be the five recommendations to the user.

### Explanation `generate_user_knn_regressor` function:

### Function Workflow
1. **Validation**: Checks whether the provided user ID exists in the user ID dictionary. If not found, it returns an empty list.

2. **Finding Similar Users**:
   - Retrieves the index of the user in the user-item matrix using the user ID.
   - Computes the similarity scores between the target user and all other users.
   - Selects the top-k most similar users based on similarity scores.

3. **Finding Rated Movies by Similar Users**:
   - Identifies movies that the similar users have rated.

4. **Calculating Weighted Average Ratings**:
   - For each movie rated by similar users:
     - Computes the weighted sum of ratings, where weights are similarity scores between the target user and similar users.
     - Accumulates the sum of similarities.
     - Calculates the weighted average rating for each movie.

5. **Sorting Recommendations**:
   - Sorts movies by their weighted average ratings in descending order.

6. **Conversion and Return**:
   - Converts movie indices back to movie IDs using the movie ID dictionary.
   - Returns a list of recommended movie IDs along with their predicted ratings for the given user.

### Explanation numpy functions used:

1. `np.argsort()`
- **Usage**: `np.argsort(array)`
- **Explanation**: Returns the indices that would sort an array in ascending order.
- **Example**: `np.argsort([30, 10, 20])` returns `[1, 2, 0]`, indicating that the smallest element is at index 1, the second smallest at index 2, and the largest at index 0.

2. `np.where()`
- **Usage**: `np.where(condition)`
- **Explanation**: Returns the indices where a specified condition is true.
- **Example**: `np.where([True, False, True])` returns `(array([0, 2]),)`, indicating that the condition is true at indices 0 and 2.

3. `np.isnan()`
- **Usage**: `np.isnan(array)`
- **Explanation**: Returns a boolean array indicating whether each element is NaN (Not a Number).
- **Example**: `np.isnan([1, np.nan, 3])` returns `[False, True, False]`, indicating that the second element is NaN.

4. `np.zeros_like()`
- **Usage**: `np.zeros_like(array)`
- **Explanation**: Returns an array of zeros with the same shape and type as the input array.
- **Example**: `np.zeros_like([1, 2, 3])` returns `[0, 0, 0]`, creating an array of zeros with the same shape as `[1, 2, 3]`.

5. `np.sum()`
- **Usage**: `np.sum(array)`
- **Explanation**: Computes the sum of array elements over a specified axis or the entire array.
- **Example**: `np.sum([1, 2, 3])` returns `6`, summing all elements in the array.

In [228]:
def generate_user_knn_regressor(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN) collaborative filtering with weighted average.

    Parameters:
    user_id (int): ID of the user for whom recommendations are to be generated.
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    recommendations (list): List of tuples containing recommended movie IDs and their predicted ratings for the given user.
    """
    # Ensure user ID exists in the dictionary
    if user_id not in user_id_dict:
        print(f"User with ID {user_id} not found.")
        return []

    # Find the index of the user in the user-item matrix
    user_index = user_id_dict[user_id]

    # calculate the similarity scores between the user and the k nearest neighbours by performing dot product of row vectors of each user with target user
    similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

    # find the rated movies by filtering NOT nan values
    rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

    # intialize 0 matrix before weighted average computation
    weighted_avg_ratings = np.zeros_like(user_item_matrix[0])
    similarity_sum = 0
    
    for movie in rated_movies:
        # this loop iterates over each movie rated within the k similar users with the target user and computes the weighted sum of similarities to then compute the weighted average
        weighted_sum = np.sum(user_item_matrix[similar_users_indices, movie] * user_similarity_matrix[user_index, similar_users_indices])
        similarity_sum += np.sum(user_similarity_matrix[user_index, similar_users_indices])
        weighted_avg_ratings[movie] = weighted_sum / similarity_sum if similarity_sum != 0 else 0

    # the result is sorted descendingly to find out which movies would be the best recommendations
    sorted_indices = np.argsort(weighted_avg_ratings)[::-1]

    # convert movie indices back to movie IDs and return recommendations
    recommendations_regressor = [(list(movie_id_dict.keys())[list(movie_id_dict.values()).index(movie_index)], weighted_avg_ratings[movie_index])
                       for movie_index in sorted_indices[:5]]
    return recommendations_regressor

### See a first batch of recommendations:

By using the functions above to recommend movies above the following results are generated for each dataset:

In [229]:
train_data_netflix, val_data_netflix, test_data_netflix = train_val_test_split(df)
train_data_movielens, val_data_movielens, test_data_movielens = train_val_test_split(df2)

# set up paramters and threshold for similarity
k=1
threshold=3

In [230]:
def calculate_user_similarity_manhattanTEST(user_ratings_matrix, user_genre_matrix, threshold):
    """
    Calculate user similarity using Manhattan distance-based similarity measure.

    Parameters:
    user_ratings_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_genre_matrix (numpy.ndarray): Matrix representing users' genre preferences.
    threshold (float): Threshold value for considering ratings in the similarity calculation.

    Returns:
    similarity_matrix (numpy.ndarray): Matrix representing similarity between users based on the Manhattan distance.

    The Manhattan distance-based similarity measure is calculated as follows:
    1. Compute the dot product of each pair of row vectors in the user_ratings_matrix, considering only values above the threshold.
    2. Calculate the norms of each row vector, considering only values above the threshold.
    3. Replace zero norms with a small value to avoid division by zero.
    4. Compute the dot product of user_genre_matrix to include genre preferences.
    5. Calculate the similarity matrix using broadcasting, where the similarity between users i and j is given by the dot product
       divided by the product of their norms and genre preferences.
    6. Set diagonal elements to 0 to avoid self-similarity.

    """
    # Fill in the missing data with 0s
    user_ratings_matrix = np.nan_to_num(user_ratings_matrix, nan=0)
    
    # Fill in the missing data with 0s for user_genre_matrix
    user_genre_matrix = np.nan_to_num(user_genre_matrix, nan=0)
    
    # Compute dot product of user_ratings_matrix
    dot_products_ratings = np.dot(np.where(user_ratings_matrix >= threshold, user_ratings_matrix, 0), user_ratings_matrix.T)
    
    # Compute dot product of user_genre_matrix
    dot_products_genre = np.dot(user_genre_matrix, user_genre_matrix.T)
    
    # Calculate norms of user_ratings_matrix
    norms_ratings = np.sum(np.abs(np.where(user_ratings_matrix >= threshold, user_ratings_matrix, 0)), axis=1)
    
    # Replace zero norms with a small value to avoid division by zero
    norms_ratings[norms_ratings == 0] = 1e-8
    
    # Calculate norms of user_genre_matrix
    norms_genre = np.sum(user_genre_matrix, axis=1)
    
    # Replace zero norms with a small value to avoid division by zero
    norms_genre[norms_genre == 0] = 1e-8
    
    # Compute similarity matrix using broadcasting
    similarity_matrix = (dot_products_ratings / norms_ratings[:, None]) + (dot_products_genre / norms_genre[:, None])
    
    # Set diagonal elements to 0 to avoid self-similarity
    np.fill_diagonal(similarity_matrix, 0)
    
    return similarity_matrix

In [239]:
review_data = train_data_netflix['review_data'].values
user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
len(user_ids)

# Extract unique genres
unique_genres = set(genre for sublist in train_data_netflix['genres'] if sublist is not None for genre in sublist)

# Create a dictionary to map each genre to a unique index
genre_to_index = {genre: index for index, genre in enumerate(unique_genres)}

# Create a dictionary to map each movieId to its genres
movieid_to_genres = dict(zip(train_data_netflix['movieId'], train_data_netflix['genres']))

# Initialize user-genre matrix
num_users = len(np.unique(user_ids))  # Count unique user IDs
num_genres = len(unique_genres)
netflix_user_genre_matrix = np.zeros((num_users, num_genres))

# Create a dictionary to map each user ID to a unique index
user_id_to_index = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}

# Iterate through each user in the dataset
for i, user_id in enumerate(np.unique(user_ids)):
    # Extract movie IDs reviewed by the user
    user_reviews = train_data_netflix[train_data_netflix['review_data'].apply(lambda x: any(entry['userId'] == user_id for entry in x))]
    reviewed_movie_ids = user_reviews['movieId'].values
    
    # Find genres associated with the reviewed movie IDs
    user_genres = []
    for movie_id in reviewed_movie_ids:
        genres = movieid_to_genres.get(movie_id)
        if genres is not None:
            user_genres.extend(genres)
    
    # Update user-genre matrix based on genre interactions
    for genre in user_genres:
        if genre in genre_to_index:  # Ensure the genre is in the mapping dictionary
            genre_index = genre_to_index[genre]
            netflix_user_genre_matrix[user_id_to_index[user_id], genre_index] = 1

netflix_user_genre_matrix.shape
netflix_user_genre_matrix

user_item_matrix_test, user_id_dict_test, movie_id_dict_test, user_ids_test, movieIds_test = create_user_item_matrix(train_data_netflix)
user_item_matrix_test
user_item_matrix_test.shape


similarity_matrix_2 = calculate_user_similarity_manhattanTEST(user_item_matrix_test,netflix_user_genre_matrix,3)

1003

(895, 9)

array([[0., 0., 1., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

array([[nan, nan, nan, ..., nan,  1., nan],
       [nan, nan, nan, ..., nan, nan,  3.],
       [nan, nan,  2., ...,  2.,  2., nan],
       ...,
       [ 3., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ...,  3., nan, nan],
       [nan, nan, nan, ..., nan, nan,  1.]])

(895, 8)

In [None]:
calculate_user_similarity_manhattanTEST(user_item_matrix_test,netflix_user_genre_matrix,3)

array([[0.        , 0.        , 1.        , ..., 0.        , 0.33333333,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.375     , 0.        , 0.        , ..., 0.        , 0.75      ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.16666667, 0.        , 3.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

### `Netflix:`

Training data:

In [237]:
# set up
user_item_matrix_train1, user_id_dict_train1, movie_id_dict_train1, user_ids_train1, movieIds_train1 = create_user_item_matrix(train_data_netflix)
# _, user_ratings_matrix_classified_train1 = computing_neutral_scores(user_item_matrix_train1)
user_similarity_matrix_manhattan_train1 = calculate_user_similarity_manhattan(user_item_matrix_train1, threshold)

# generate recommendations
user_id_train1 = user_ids_train1[1]
top5_pred_train1 = generate_user_knn_regressor(user_id_train1, user_item_matrix_train1, user_similarity_matrix_manhattan_train1, user_id_dict_train1, movie_id_dict_train1, k)

# print result
print(f"UserKNN regressor Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_train1}:")
for movie_id, predicted_rating in top5_pred_train1:
    print(f"Movie ID: {movie_id}, Predicted Rating: {predicted_rating}")

UserKNN regressor Recommendations: 
Top 5 recommended movies with predicted ratings for user 2201062:
Movie ID: 1404, Predicted Rating: 2.0
Movie ID: 1680, Predicted Rating: 0.0
Movie ID: 1449, Predicted Rating: 0.0
Movie ID: 1311, Predicted Rating: 0.0
Movie ID: 740, Predicted Rating: 0.0


Validation data

In [None]:
# set up
user_item_matrix_val1, user_id_dict_val1, movie_id_dict_val1, user_ids_val1, movieIds_val1 = create_user_item_matrix(val_data_netflix)
# _, user_ratings_matrix_classified_val1 = computing_neutral_scores(user_item_matrix_val1)
user_similarity_matrix_manhattan_val1 = calculate_user_similarity_manhattan(user_item_matrix_val1, threshold)

# generate recommendations
user_id_val1 = user_ids_val1[1]
top5_pred_val1 = generate_user_knn_regressor(user_id_val1, user_item_matrix_val1, user_similarity_matrix_manhattan_val1, user_id_dict_val1, movie_id_dict_val1, k)

# print result
print(f"UserKNN regressor Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_val1}:")
for movie_id, rating in top5_pred_val1:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN regressor Recommendations: 
Top 5 recommended movies with predicted ratings for user 1189241:
Movie ID: 1698, Predicted Rating: 0.0


### `Movielens`

Training data

In [None]:
# set up
user_item_matrix_train2, user_id_dict_train2, movie_id_dict_train2, user_ids_train2, movieIds_train2 = create_user_item_matrix(train_data_movielens)
# _, user_ratings_matrix_classified_train2 = computing_neutral_scores(user_item_matrix_train2)
user_similarity_matrix_manhattan_train2 = calculate_user_similarity_manhattan(user_item_matrix_train2, threshold)

# generate recommendations
user_id_train2 = user_ids_train2[1]
top5_pred_train2 = generate_user_knn_regressor(user_id_train2, user_item_matrix_train2, user_similarity_matrix_manhattan_train2, user_id_dict_train2, movie_id_dict_train2, k)

# print result
print(f"UserKNN regressor Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_train2}:")
for movie_id, rating in top5_pred_train2:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN regressor Recommendations: 
Top 5 recommended movies with predicted ratings for user 240267:
Movie ID: 73000, Predicted Rating: 0.0
Movie ID: 70994, Predicted Rating: 0.0
Movie ID: 31698, Predicted Rating: 0.0
Movie ID: 25793, Predicted Rating: 0.0
Movie ID: 6005, Predicted Rating: 0.0


Validation data

In [None]:
# set up
user_item_matrix_val2, user_id_dict_val2, movie_id_dict_val2, user_ids_val2, movieIds_val2 = create_user_item_matrix(val_data_movielens)
# _, user_ratings_matrix_classified_val2 = computing_neutral_scores(user_item_matrix_val2)
user_similarity_matrix_manhattan_val2 = calculate_user_similarity_manhattan(user_item_matrix_val2, threshold)

# generate recommendations
user_id_val2 = user_ids_val2[1]
top5_pred_val2 = generate_user_knn_regressor(user_id_val2, user_item_matrix_val2, user_similarity_matrix_manhattan_val2, user_id_dict_val2, movie_id_dict_val2, k)

# print result
print(f"UserKNN regressor Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_val2}:")
for movie_id, rating in top5_pred_val2:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN regressor Recommendations: 
Top 5 recommended movies with predicted ratings for user 21507:
Movie ID: 209497, Predicted Rating: 2.5


Even though the movieIds which are recommended are the same, the predicted rating differs somewhat, already indicating a difference between the two userKNN models.

## Baseline performance

To assess performance, we are going to compare the original ratings matrix with the predicted one after the userKnn model. In order to do so, we will generate a predicted rating matrix with the two functions below. Essentially an array of predicted ratings is generated in contrast with the tuple in the previous function `generate_user_knn_regressor` with the top 5 results. The secon functions will append these in a new matrix. 

`generate_array_of_pred_ratings`

1. **Check User Existence**:
   - Ensure that the given `user_id` exists in the `user_id_dict`.

2. **Get User Index**:
   - Retrieve the index of the user in the user-item matrix based on the `user_id`.

3. **Find Similar Users**:
   - Calculate the similarity scores between the target user and other users, sort these scores in descending order, and select the top `k` most similar users.

4. **Find Rated Movies by Similar Users**:
   - Identify the movies that have been rated by the selected similar users.

5. **Calculate Weighted Average Ratings**:
   - For each movie rated by the similar users, calculate the weighted sum of ratings and the sum of similarities.

6. **Calculate Predicted Ratings**:
   - Divide the weighted sum of ratings by the sum of similarities to compute the predicted ratings for every movie.

7. **Return Predicted Ratings Array**:
   - Return the array containing predicted ratings for every movie in `movie_id_dict`.

`generate_pred_rating_matrix`

1. **Initialize Predicted Ratings Matrix**:
   - Initialize a matrix to store predicted ratings for every user and movie.

2. **Iterate Over Users**:
   - For each user, generate predicted ratings using the `generate_array_of_pred_ratings` function and fill the corresponding row in the predicted ratings matrix.

3. **Return Predicted Ratings Matrix**:
   - Return the matrix containing predicted ratings for every user and movie.

In [None]:
def generate_array_of_pred_ratings(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN) collaborative filtering with weighted average.

    Parameters:
    user_id (int): ID of the user for whom recommendations are to be generated.
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    predicted_ratings (numpy.ndarray): Array containing predicted ratings for every movie in movie_id_dict.
    """
    # Ensure user ID exists in the dictionary
    if user_id not in user_id_dict:
        print(f"User with ID {user_id} not found.")
        return []

    # Find the index of the user in the user-item matrix
    user_index = user_id_dict[user_id]

    # Get similarity scores of the user with other users and sort indices
    similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

    # Find movies rated by similar users
    rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

    # Calculate weighted average ratings for each movie
    weighted_avg_ratings = np.zeros_like(user_item_matrix[0])
    similarity_sum = np.zeros_like(user_item_matrix[0])
    
    for movie in rated_movies:
        # Calculate weighted sum of ratings and sum of similarities
        weighted_sum = np.sum(user_item_matrix[similar_users_indices, movie] * user_similarity_matrix[user_index, similar_users_indices])
        similarity_sum[movie] += np.sum(user_similarity_matrix[user_index, similar_users_indices])
        weighted_avg_ratings[movie] += weighted_sum

    # Calculate predicted ratings
    predicted_ratings_array = np.divide(weighted_avg_ratings, similarity_sum, out=np.zeros_like(weighted_avg_ratings), where=(similarity_sum != 0))

    return predicted_ratings_array

def generate_pred_rating_matrix(user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates predicted rating matrix for all users.

    Parameters:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    predicted_ratings_matrix (numpy.ndarray): Matrix containing predicted ratings for every user and movie.
    """
    num_users = len(user_id_dict)
    num_movies = len(movie_id_dict)
    predicted_ratings_matrix = np.zeros((num_users, num_movies))

    for user_id in user_id_dict:
        user_index = user_id_dict[user_id]
        predicted_ratings = generate_array_of_pred_ratings(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k)
        predicted_ratings_matrix[user_index] = predicted_ratings

    return predicted_ratings_matrix

**We will use RMSE as performance metric, using the function below to compute it:**

### Function explanation

`compute_rmse`
1. **Handle Implicit Ratings**: 
   - Convert `NaN` values in both `original_ratings` and `predicted_ratings` arrays to 0s. This is done using `np.nan_to_num()` function to ensure that non-rated items are treated as having a rating of 0 for comparison.
   
2. **Flatten Arrays**:
   - Flatten both `original_ratings` and `predicted_ratings` arrays into 1D arrays to facilitate making masks.

3. **Remove Unrated Items**:
   - Create a mask to filter out entries where the original rating is 0 (unrated items). Only ratings for rated items are considered for RMSE calculation.

4. **Compute Squared Differences**:
   - Calculate the squared differences between original and predicted ratings for the rated items.

5. **Compute Mean Squared Error (MSE)**:
   - Compute the mean squared error (MSE) by averaging the squared differences.

6. **Compute RMSE**:
   - Compute the square root of the mean squared error to obtain the RMSE value, which indicates the average difference between the original and predicted ratings.

7. **Return RMSE**:
   - Return the computed RMSE value as the output of the function.

In [None]:
def compute_rmse(original_ratings, predicted_ratings):
    """
    Computes the Root Mean Square Error (RMSE) between the original ratings and the predicted ratings. MovieIds a user has not interacted with is turned into 0 for now.

    Parameters:
    original_ratings (numpy.ndarray): Array containing the original ratings.
    predicted_ratings (numpy.ndarray): Array containing the predicted ratings.

    Returns:
    float: The RMSE value.
    
    """
    # handle implicit ratings with 0s for now
    original_ratings = np.nan_to_num(original_ratings, nan=0, posinf=0, neginf=0)
    predicted_ratings = np.nan_to_num(predicted_ratings, nan=0, posinf=0, neginf=0)

    # make 1d arrays by flattening them to be able to make masks
    original_ratings_flat = original_ratings.flatten()
    predicted_ratings_flat = predicted_ratings.flatten()
    
    # remove entries with no original rating (unrated items)
    mask = original_ratings_flat != 0
    original_ratings_flat = original_ratings_flat[mask]
    predicted_ratings_flat = predicted_ratings_flat[mask]
    
    # Compute the squared differences
    squared_diff = np.square(original_ratings_flat - predicted_ratings_flat)
    
    # Compute the mean squared error
    mse = np.mean(squared_diff)
    
    # Compute the square root of the mean squared error to get RMSE
    rmse = np.sqrt(mse)
    
    return rmse

`Netflix`

In [None]:
predicted_ratings_matrix_train1 = generate_pred_rating_matrix(user_item_matrix_train1, user_similarity_matrix_manhattan_train1, user_id_dict_train1, movie_id_dict_train1, k=1)
predicted_ratings_matrix_val1 = generate_pred_rating_matrix(user_item_matrix_val1, user_similarity_matrix_manhattan_val1, user_id_dict_val1, movie_id_dict_val1, k=1)

In [None]:
train1_rmse = compute_rmse(user_item_matrix_train1, predicted_ratings_matrix_train1)
print("RMSE on training set:", train1_rmse)
val1_rmse = compute_rmse(user_item_matrix_val1, predicted_ratings_matrix_val1)
print("RMSE on validation set:", val1_rmse)

RMSE on training set: 1.7228162156368902
RMSE on validation set: 1.7504914120001385


`Movielens`

In [None]:
predicted_ratings_matrix_train2 = generate_pred_rating_matrix(user_item_matrix_train2, user_similarity_matrix_manhattan_train2, user_id_dict_train2, movie_id_dict_train2, k=1)
predicted_ratings_matrix_val2 = generate_pred_rating_matrix(user_item_matrix_val2, user_similarity_matrix_manhattan_val2, user_id_dict_val2, movie_id_dict_val2, k=1)

In [None]:
train2_rmse = compute_rmse(user_item_matrix_train2, predicted_ratings_matrix_train2)
print("RMSE on training set:", train2_rmse)
val2_rmse = compute_rmse(user_item_matrix_val2, predicted_ratings_matrix_val2)
print("RMSE on validation set:", val2_rmse)

RMSE on training set: 1.6395933031242038
RMSE on validation set: 1.5042313201122957


## Hyperparameter tuning

Now we have recorded some baseline performance let's find the optimal value for K by loping over different k values while generating the predicted rating matrix:

`Netflix`

In [None]:
k_list = [1, 4, 10, 15]
rmse_list = []
best_k_train1 = None
best_rmse = float('inf')

for k in k_list:
    predicted_item_matrix = generate_pred_rating_matrix(user_item_matrix_train1, user_similarity_matrix_manhattan_train1, user_id_dict_train1, movie_id_dict_train1, k=k)
    
    # Compute Root Mean Square Error (RMSE)
    rmse = compute_rmse(user_item_matrix_train1, predicted_item_matrix)
    
    # Append the RMSE value to the list
    rmse_list.append(rmse)
    
    # Check if current k gives the best RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_k_train1 = k

# Print the result descendingly
for i, k_value in enumerate(k_list):
    print(f"K-value: {k_value} | RMSE: {rmse_list[i]}")

print(f"\nBest K-value: {best_k_train1} | Best RMSE: {best_rmse}")

K-value: 1 | RMSE: 1.7228162156368902
K-value: 4 | RMSE: 1.8080383793500867
K-value: 10 | RMSE: 1.8735397483548166
K-value: 15 | RMSE: 1.8994989192078393

Best K-value: 1 | Best RMSE: 1.7228162156368902


`Movielens`

In [None]:
k_list = [1, 4, 10, 15]
rmse_list = []
best_k_train2 = None
best_rmse = float('inf')

for k in k_list:
    predicted_item_matrix = generate_pred_rating_matrix(user_item_matrix_train2, user_similarity_matrix_manhattan_train2, user_id_dict_train2, movie_id_dict_train2, k=k)
    
    # Compute Root Mean Square Error (RMSE)
    rmse = compute_rmse(user_item_matrix_train2, predicted_item_matrix)
    
    # Append the RMSE value to the list
    rmse_list.append(rmse)
    
    # Check if current k gives the best RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_k_train2 = k

# Print the result descendingly
for i, k_value in enumerate(k_list):
    print(f"K-value: {k_value} | RMSE: {rmse_list[i]}")

print(f"\nBest K-value: {best_k_train2} | Best RMSE: {best_rmse}")

K-value: 1 | RMSE: 1.6395933031242038
K-value: 4 | RMSE: 1.6897712885959526
K-value: 10 | RMSE: 1.7753867847492926
K-value: 15 | RMSE: 1.823134411473603

Best K-value: 1 | Best RMSE: 1.6395933031242038


## Final predictions on test set:

`Netflix`

In [None]:
# set up
user_item_matrix_test1, user_id_dict_test1, movie_id_dict_test1, user_ids_test1, movieIds_test1 = create_user_item_matrix(test_data_netflix)
# _, user_ratings_matrix_classified_test1 = computing_neutral_scores(user_item_matrix_test1)
user_similarity_matrix_manhattan_test1 = calculate_user_similarity_manhattan(user_item_matrix_test1, threshold)

# set up predictions matrix
predicted_item_matrix_test1 = generate_pred_rating_matrix(user_item_matrix_test1, user_similarity_matrix_manhattan_test1, user_id_dict_test1, movie_id_dict_test1, k=best_k_train1)

# compute Root Mean Square Error (RMSE)
rmse_test1 = compute_rmse(user_item_matrix_test1, predicted_item_matrix_test1)
# print result on test set
print("RMSE on test set:", rmse_test1)

RMSE on test set: 2.0086021902359583


`Movielens`

In [None]:
# set up
user_item_matrix_test2, user_id_dict_test2, movie_id_dict_test2, user_ids_test2, movieIds_test2 = create_user_item_matrix(test_data_movielens)
# _, user_ratings_matrix_classified_test2 = computing_neutral_scores(user_item_matrix_test2)
user_similarity_matrix_manhattan_test2 = calculate_user_similarity_manhattan(user_item_matrix_test2, threshold)

# set up predictions matrix
predicted_item_matrix_test2 = generate_pred_rating_matrix(user_item_matrix_test2, user_similarity_matrix_manhattan_test2, user_id_dict_test2, movie_id_dict_test2, k=best_k_train2)

# compute Root Mean Square Error (RMSE)
rmse_test2 = compute_rmse(user_item_matrix_test2, predicted_item_matrix_test2)
# print result on test set
print("RMSE on test set:", rmse_test2)

RMSE on test set: 1.807721533549109


OLD

In [None]:
# def generate_user_knn_regressor(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
#     """
#     Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN) collaborative filtering.

#     Parameters:
#     user_id (int): ID of the user for whom recommendations are to be generated.
#     user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
#     user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
#     user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
#     movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
#     k (int): Number of nearest neighbors to consider for recommendations.

#     Returns:
#     recommendations (list): List of tuples containing recommended movie IDs and their predicted or expected ratings for the given user.
#     """
#     # Ensure user ID exists in the dictionary
#     if user_id not in user_id_dict:
#         print(f"User with ID {user_id} not found.")
#         return []

#     # Find the index of the user in the user-item matrix
#     user_index = user_id_dict[user_id]

#     # Get similarity scores of the user with other users and sort indices
#     similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

#     # Find movies rated by similar users
#     rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

#     # Calculate average ratings for each movie
#     movie_ratings = np.zeros_like(user_item_matrix[0])
#     movie_counts = np.zeros_like(user_item_matrix[0], dtype=int)
    
#     # Aggregate ratings and counts for each movie
#     for movie in rated_movies:
#         movie_ratings[movie] += np.sum(user_item_matrix[similar_users_indices, movie])
#         movie_counts[movie] += np.sum(~np.isnan(user_item_matrix[similar_users_indices, movie]))
    
#     # Calculate average ratings
#     average_ratings = np.divide(movie_ratings, movie_counts, out=np.zeros_like(movie_ratings), where=movie_counts!=0)

#     # Sort movies by average ratings in descending order
#     sorted_indices = np.argsort(average_ratings)[::-1]

#     # Convert movie indices back to movie IDs and return top 5 recommendations
#     top_recommendations = [(list(movie_id_dict.keys())[list(movie_id_dict.values()).index(movie_index)], average_ratings[movie_index])
#                            for movie_index in sorted_indices[:5]]
#     return top_recommendations

old function to normalize ratings:

### Preprocessing of ratings in user-item matrix:
We might suggest filling the empty values with 0s, but that can create issues with recommendation engines. 

If we were to fill this NaN with a 0, we would be incorrectly implying they greatly disliked! We are going to center each user’s ratings around 0 by deducting the row average and then fill in the missing values with 0. This means the missing data is replaced with neutral scores.

### `computing_neutral_scores` Function Explanation

### Functions Used and Purpose:
- **`np.nanmean()`**: Used to calculate the average rating for each user while handling NaN (missing) values.
  - **`axis=1`**: Specifies that the calculation is done along the rows (i.e., for each user).
- **`np.nan_to_num()`**: Used to fill in missing data (NaN) with zeros while preserving non-NaN values.
- **`np.reshape(-1, 1)`**: Used to reshape the array to ensure proper broadcasting during subtraction.
- **Indexing and Slicing**: Used to access elements in arrays and matrices.

### Steps:
1. **Calculate Average Ratings**:
   - Use `np.nanmean()` to compute the average rating for each user along the rows of the user-item matrix. This handles missing ratings (NaN) gracefully, computing the mean while ignoring NaN values.

2. **Center Ratings Around 0**:
   - Subtract the average ratings from each user's ratings in the user-item matrix. This centers each user's ratings around 0, effectively removing the user bias from the ratings.

3. **Fill Missing Data with Zeros**:
   - Use `np.nan_to_num()` to replace missing data (NaN) with zeros while preserving the existing non-NaN values. This ensures that missing ratings are treated neutrally (i.e., as if the user has not rated the item).

4. **Return Normalized User Ratings**:
   - Return the resulting normalized user ratings matrix, where missing ratings have been replaced with zeros and each user's ratings are centered around 0.

In [None]:
def computing_neutral_scores(user_item_matrix):
    """
    Compute neutral scores for user-item interactions in a user-item matrix.

    Parameters:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).

    Returns:
    user_ratings_matrix_normed (numpy.ndarray): Matrix representing users' ratings normalized with neutral scores.
    """
    # Calculate the average rating for each user
    avg_ratings = np.nanmean(user_item_matrix, axis=1)

    # Center each user's ratings around 0
    user_ratings_matrix_centered = user_item_matrix - avg_ratings.reshape(-1, 1)

    # Fill in the missing data with 0s
    user_ratings_matrix_normed = np.nan_to_num(user_ratings_matrix_centered, nan=0)

    return user_ratings_matrix_normed