In [143]:
import pandas as pd
import numpy as np

In [144]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [145]:
netflix_df = pd.read_parquet('cleaned/netflix_parquet')
movielens_df = pd.read_parquet('cleaned/movielens_parquet')

In [None]:
# netflix_df = netflix_df[netflix_df['review_data'].apply(lambda x: len(x) if x is not None else 0) > 500]
netflix_df = netflix_df[netflix_df['review_data'].apply(lambda x: 30 <= len(x) <= 375 if x is not None else False)]
movielens_df = movielens_df[movielens_df['review_data'].apply(lambda x: 30 <= len(x) <= 375 if x is not None else False)]

In [None]:
n_rows = 10

In [None]:
df = (netflix_df.sample(n=n_rows,random_state=42))[['movieId','review_data']]
df
df2 = (movielens_df.sample(n=n_rows,random_state=42))[['movieId','review_data']]
df2

Unnamed: 0,movieId,review_data
1471,1472,"[{'date': 2005-07-19, 'rating': 5.0, 'userId':..."
427,428,"[{'date': 2005-01-20, 'rating': 4.0, 'userId':..."
145,146,"[{'date': 2001-05-13, 'rating': 3.0, 'userId':..."
401,402,"[{'date': 2005-05-18, 'rating': 3.0, 'userId':..."
687,688,"[{'date': 2004-05-13, 'rating': 2.0, 'userId':..."
827,828,"[{'date': 2004-03-27, 'rating': 5.0, 'userId':..."
1939,1940,"[{'date': 2005-07-11, 'rating': 5.0, 'userId':..."
185,186,"[{'date': 2005-08-17, 'rating': 3.0, 'userId':..."
1610,1611,"[{'date': 2003-08-25, 'rating': 4.0, 'userId':..."
1719,1720,"[{'date': 2004-06-08, 'rating': 4.0, 'userId':..."


Unnamed: 0,movieId,review_data
4611,4736,"[{'date': 2003-02-06, 'rating': 1.0, 'userId':..."
5373,5505,"[{'date': 2005-06-27, 'rating': 3.5, 'userId':..."
7056,7235,"[{'date': 2010-12-29, 'rating': 4.5, 'userId':..."
878,901,"[{'date': 2007-06-15, 'rating': 5.0, 'userId':..."
23737,130636,"[{'date': 2016-08-03, 'rating': 3.5, 'userId':..."
8029,8815,"[{'date': 2007-03-08, 'rating': 1.5, 'userId':..."
4644,4769,"[{'date': 2003-04-14, 'rating': 3.0, 'userId':..."
3147,3249,"[{'date': 2007-02-03, 'rating': 2.0, 'userId':..."
6842,7019,"[{'date': 2004-08-09, 'rating': 0.5, 'userId':..."
8997,27156,"[{'date': 2022-01-20, 'rating': 4.0, 'userId':..."


In [None]:
# netflix
review_data = df['review_data'].values
user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(df['movieId'], review_data)])
print(f"{len(user_ids)} unique movieIds are handled from the Netflix dataset.")
print(f"{len(np.unique(movieIds))} unique userIds are handled from the Netflix dataset.")
print()

# movielens
review_data2 = df2['review_data'].values
user_ids2 = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data2])
ratings2 = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data2])
movieIds2 = np.concatenate([[movieId] * len(row) for movieId, row in zip(df['movieId'], review_data2)])
print(f"{len(user_ids2)} unique movieIds are handled from the Netflix dataset.")
print(f"{len(np.unique(movieIds2))} unique userIds are handled from the Netflix dataset.")

2289

10

1627

10

### Function Explanation

`train_val_test_split`

1. **Shuffle the Data**:
   - The input data is shuffled using `data.sample(frac=1, random_state=42)` to ensure randomness. `random_state=42` ensures reproducibility.

2. **Calculate Set Sizes**:
   - The sizes of each set (training, validation, and test) are calculated based on the provided ratios and the total number of samples in the data.

3. **Split the Data**:
   - The shuffled data is split into three sets: training, validation, and test.
   - The training data contains the first `num_train` samples.
   - The validation data contains the next `num_val` samples, starting from the index immediately following the last training sample.
   - The test data contains the remaining samples, starting from the index immediately following the last validation sample.

4. **Reset Index**:
   - The index of each set is reset to ensure that it starts from 0 and increases incrementally.

5. **Return Sets**:
   - The function returns the training, validation, and test sets as pandas DataFrames.

In [None]:
def train_val_test_split(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Splits the data into training, validation, and test sets, simultaneously ensuring no training data flows into validation or test data.

    Parameters:
    - data: pandas DataFrame containing the data to be split.
    - train_ratio: float, ratio of the training set size to the total data size (default: 0.8).
    - val_ratio: float, ratio of the validation set size to the total data size (default: 0.1).
    - test_ratio: float, ratio of the test set size to the total data size (default: 0.1).

    Returns:
    - train_data: pandas DataFrame, training set.
    - val_data: pandas DataFrame, validation set.
    - test_data: pandas DataFrame, test set.
    """
    # Shuffle the data
    data_shuffled = data.sample(frac=1, random_state=42)

    # Calculate the sizes of each set
    num_samples = len(data_shuffled)
    num_train = int(train_ratio * num_samples)
    num_val = int(val_ratio * num_samples)
    num_test = num_samples - num_train - num_val

    # Split the data into train, validation, and test sets
    train_data = data_shuffled[:num_train]
    
    # Below is ensured the validation data and the test data starts after the indices which are already in the training data, ensuring that no training data will flow into validation of test data.
    val_data = data_shuffled[num_train:num_train+num_val]
    test_data = data_shuffled[num_train+num_val:]

    # Reset index for each set
    train_data.reset_index(drop=True, inplace=True)
    val_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    return train_data, val_data, test_data

### Set-up user-item matrix
First we will create a user-item matrix which records all the user-item interactions.


### `create_user_item_matrix` Function Explanation

### Steps:
1. **Extract Review Data**:
   - Extract the review data from the provided DataFrame, which contains user IDs, ratings, and movie IDs.

2. **Create User and Movie IDs Arrays**:
   - Extract user IDs, ratings, and movie IDs from the review data and concatenate them into separate arrays.
   - Generate dictionaries to map user IDs and movie IDs to unique indices in the user-item matrix.

3. **Initialize User-Item Matrix**:
   - Determine the dimensions of the user-item matrix based on the number of unique users and movies.
   - Initialize an empty user-item matrix filled with NaN values.

4. **Populate User-Item Matrix**:
   - Iterate through the review data and populate the user-item matrix with ratings.
   - Map user and movie IDs to their corresponding indices in the matrix and insert the ratings.

5. **Return Results**:
   - Return the user-item matrix along with dictionaries mapping user and movie IDs to indices, and arrays containing user and movie IDs.
  
### Functions Used and Purpose:

- **`np.concatenate()`**: Used to concatenate arrays containing user IDs, ratings, and movie IDs extracted from the review data.
- **`enumerate()`**: Used to iterate over the unique user IDs and movie IDs and generate indices for mapping.
- **`np.unique()`**: Used to find the unique user IDs and movie IDs in the review data.
- **`np.full()`**: Used to initialize an empty user-item matrix filled with NaN values.
- **`zip()`**: Used to iterate over multiple iterables simultaneously (user IDs, movie IDs, ratings).
- **`enumerate()`**: Used to iterate over the indices and elements of an iterable (user IDs, movie IDs) simultaneously.
- **Indexing and Slicing**: Used to access and modify elements in arrays and matrices.

In [None]:
def create_user_item_matrix(train_test_val_set):
    """
    Creates a user-item matrix from the provided dataset containing review data.

    Parameters:
    train_test_val_set (DataFrame): DataFrame containing review data with columns 'review_data',
                                    which is a list of dictionaries with keys 'userId', 'rating',
                                    and 'movieId'.

    Returns:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies), the matrix is an NumPy array which contains lists of user-item interactions, meaning a user and their corresponding ratings to the movieIds.    
    
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    
    user_ids (numpy.ndarray): Array containing user IDs corresponding to each rating in the matrix.
    
    movie_ids (numpy.ndarray): Array containing movie IDs corresponding to each rating in the matrix.

    """
    review_data = train_test_val_set['review_data'].values
    user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
    ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
    movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_test_val_set['movieId'], review_data)])

    # create dictionaries to map user IDs and movie IDs to unique indices to map over
    user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
    movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

    # initialize an empty user-item matrix
    user_count = len(user_id_dict)
    movie_count = len(movie_id_dict)
    user_item_matrix = np.full((user_count, movie_count), np.nan)

    # populate the user-item matrix with ratings from the dataset
    for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
        user_index = user_id_dict[user_id]
        movie_index = movie_id_dict[movie_id]
        user_item_matrix[user_index, movie_index] = rating

    return user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds

### Compute similarity:
Regularly, cosine similarity is often used to measure the similarity between users based on their preferences or ratings for items (in this case, movies). Cosine similarity ranges from -1 to 1, where:

- 1 indicates perfect similarity,
- 0 indicates no similarity, and
- -1 indicates perfect dissimilarity.

### Interpretation:

- **Positive Cosine Similarity**: Users are similar in their preferences or ratings for movies.
- **Zero Cosine Similarity**: Users have no similarity in their preferences.
- **Negative Cosine Similarity**: Users are dissimilar in their preferences, tending towards opposite ratings for movies.

### Practical Implication:

If one user likes certain types of movies, the other user tends to dislike them, or vice versa. In other words, users with negative cosine similarities have contrasting preferences, making them less suitable for recommending movies to each other.

___

To see how similar users are we will compute the similarity between them. I will use cosine similarity as distance measure. The manhatten norm will be used to decrease computational weight instead of euclidian norm.

### Explanation `calculate_user_similarity_manhattan` Function

This function calculates the cosine similarity matrix between users based on their ratings using the Manhattan norm.

1. **Thresholding**: First, the function applies thresholding to the user ratings matrix. Ratings below the threshold are set to 0, ensuring that only significant ratings are considered.

2. **Dot Product Calculation**: It then computes the dot product of each pair of row vectors (users) in the thresholded matrix. This represents the similarity between users based on their common rated items.

3. **Norm Calculation**: Next, it calculates the norms (magnitude) of each row vector, considering only values above the threshold. This step prepares for the normalization process.

4. **Normalization**: The dot products are divided by the norms of the corresponding row vectors, effectively normalizing the similarity values. This step ensures that users with a large number of ratings are not favored over users with fewer ratings.

5. **Setting Diagonal to 0**: Finally, the diagonal elements of the similarity matrix are set to 0 to avoid self-similarity, as a user's rating should not be compared to itself.

### Explanation of NumPy Functions

- **np.dot**: Computes the dot product of arrays. Here, it calculates the dot product of the thresholded user ratings matrix with its transpose, resulting in the similarity matrix.
  
- **np.where**: Returns indices where a condition is true. It's used here to apply thresholding to the user ratings matrix.
  
- **np.sum**: Computes the sum of array elements. It calculates the norms of each row vector after thresholding, which are then used for normalization.
  
- **np.abs**: Computes the absolute value element-wise. Used to ensure positive values for norms calculation.
  
- **np.fill_diagonal**: Fills the diagonal of an array with a specified value. It's used to set diagonal elements of the similarity matrix to 0 to avoid self-similarity.

In [None]:
def calculate_user_similarity_manhattan(user_ratings_matrix, threshold):
    """
    Calculate user similarity using Manhattan distance-based similarity measure.

    Parameters:
    user_ratings_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    threshold (float): Threshold value for considering ratings in the similarity calculation.

    Returns:
    similarity_matrix (numpy.ndarray): Matrix representing similarity between users based on the Manhattan distance.

    The Manhattan distance-based similarity measure is calculated as follows:
    1. Compute the dot product of each pair of row vectors in the user_ratings_matrix, considering only values above the threshold.
    2. Calculate the norms of each row vector, considering only values above the threshold.
    3. Replace zero norms with a small value to avoid division by zero.
    4. Calculate the similarity matrix using broadcasting, where the similarity between users i and j is given by the dot product
       divided by the product of their norms.
    5. Set diagonal elements to 0 to avoid self-similarity.

    """
    # Fill in the missing data with 0s
    user_ratings_matrix = np.nan_to_num(user_ratings_matrix, nan=0)
    
    # this line calculates the dot product of each pair of row vectors, therefore how similar they are, considering only values above the threshold to decrease computational weight
    dot_products = np.dot(np.where(user_ratings_matrix >= threshold, user_ratings_matrix, 0), user_ratings_matrix.T)
    
    # this line calculates the distance using the manhatten norm, because the absolute values of the elements are summed up
    norms = np.sum(np.abs(np.where(user_ratings_matrix >= threshold, user_ratings_matrix, 0)), axis=1)
    
    # Replace zero norms with a small value to avoid division by zero
    norms[norms == 0] = 1e-8
    
    # compute similarity matrix using broadcasting
    similarity_matrix = dot_products / (norms[:, None] * norms)
    
    # Set diagonal elements to 0 to avoid self-similarity
    np.fill_diagonal(similarity_matrix, 0)
    
    return similarity_matrix

## UserKNN classifier:
During user based classification, the most likely rating is found by letting the nearest neighbors 'vote' on the item. Using the similarity weights, each vote carries a different weight. By multiplying the similarity weights with the ratings of an item the user has not watched but its neighbours have, the most likely and therefore predicted weight is computed.
### Function Explanation

`generate_user_knn_recommendations_classifier`

1. **Ensure User ID Existence**:
   - Check if the user ID exists in the provided dictionary. If not found, print an error message and return an empty list.

2. **Find User Index**:
   - Find the index of the user in the user-item matrix based on the provided user ID.

3. **Find Similar Users**:
   - Retrieve the similarity scores between the target user and all other users.
   - Sort the indices based on similarity in descending order and select the top `k` most similar users.

4. **Identify Rated Movies**:
   - Find movies that have been rated by the selected similar users.

5. **Calculate Votes**:
   - Calculate the "votes" for each movie from the similar users based on their ratings and similarity weights.

6. **Select Top Recommendations**:
   - Select the top 5 movies with the highest votes and return them as recommendations.

**Numpy Functions Explanation**

- `np.argsort`: Returns the indices that would sort an array in ascending order. By using `[::-1]`, it sorts the indices in descending order.
- `np.where`: Returns the indices of elements that satisfy a given condition. In this case, it's used to find movies that have been rated by similar users.
- `np.dot`: Computes the dot product of two arrays. Here, it's used to calculate the weighted sum of ratings from similar users.
- `np.argsort` (again): It's used to find the indices that would sort the `votes` array in ascending order. By selecting the last 5 indices (`[-5:]`) and reversing them (`[::-1]`), we get the indices of the top 5 movies.

In [None]:
def generate_user_knn_classifier_with_movies(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN) collaborative filtering with neighborhood-based classification.

    Parameters:
    user_id (int): ID of the user for whom recommendations are to be generated.
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    recommendations (list): List of tuples containing recommended movie IDs and their predicted ratings for the given user.
    """
    # Ensure user ID exists in the dictionary
    if user_id not in user_id_dict:
        print(f"User with ID {user_id} not found.")
        return []

    # find the index of the user in the user-item matrix
    user_index = user_id_dict[user_id]

    # this line calculates the similarity score between the target and other users and sorts it descendingly
    similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

    # the indices of users in the previous line are then used here to find the ratings of those users
    rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

    # this line is selecting the ratings of the similar users which are the similar users indices
    neighbor_ratings = user_item_matrix[similar_users_indices][:, rated_movies]
    # thie line is computing similarity weights
    similarity_weights = user_similarity_matrix[user_index, similar_users_indices][:, np.newaxis] # by converting to column vector the matrix multiplication can be performed because the shape is now the same as neighbor_ratings
    # this line is performing a dot product of the ratings and the weights to compute the predicted rating
    votes = np.dot(neighbor_ratings.T, similarity_weights).flatten()

    # Find the top 5 ratings with the maximum votes
    top_indices = np.argsort(votes)[-5:][::-1]
    top_recommendations = [(list(movie_id_dict.keys())[list(movie_id_dict.values()).index(rated_movies[idx])], votes[idx]) for idx in top_indices]

    return top_recommendations

### See a first batch of recommendations:

By using the functions above to recommend movies above the following results are generated for each dataset:

In [None]:
train_data_netflix, val_data_netflix, test_data_netflix = train_val_test_split(df)
train_data_movielens, val_data_movielens, test_data_movielens = train_val_test_split(df2)

# set up paramters and threshold for similarity
k=1
threshold=0.5

### `Netflix:`

Training data:

In [None]:
# set up
user_item_matrix_train1, user_id_dict_train1, movie_id_dict_train1, user_ids_train1, movieIds_train1 = create_user_item_matrix(train_data_netflix)
# _, user_ratings_matrix_classified_train1 = computing_neutral_scores(user_item_matrix_train1)
user_similarity_matrix_manhattan_train1 = calculate_user_similarity_manhattan(user_item_matrix_train1, threshold)

# generate recommendations
user_id_train1 = user_ids_train1[1]
top5_pred_train1 = generate_user_knn_classifier_with_movies(user_id_train1, user_item_matrix_train1, user_similarity_matrix_manhattan_train1, user_id_dict_train1, movie_id_dict_train1, k)

# print result
print(f"UserKNN classifier Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_train1}:")
for movie_id, predicted_rating in top5_pred_train1:
    print(f"Movie ID: {movie_id}, Predicted Rating: {predicted_rating}")

UserKNN classifier Recommendations: 
Top 5 recommended movies with predicted ratings for user 1174811:
Movie ID: 1611, Predicted Rating: 4.0


Validation data

In [None]:
# set up
user_item_matrix_val1, user_id_dict_val1, movie_id_dict_val1, user_ids_val1, movieIds_val1 = create_user_item_matrix(val_data_netflix)
# _, user_ratings_matrix_classified_val1 = computing_neutral_scores(user_item_matrix_val1)
user_similarity_matrix_manhattan_val1 = calculate_user_similarity_manhattan(user_item_matrix_val1, threshold)

# generate recommendations
user_id_val1 = user_ids_val1[1]
top5_pred_val1 = generate_user_knn_classifier_with_movies(user_id_val1, user_item_matrix_val1, user_similarity_matrix_manhattan_val1, user_id_dict_val1, movie_id_dict_val1, k)

# print result
print(f"UserKNN classifier Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_val1}:")
for movie_id, rating in top5_pred_val1:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN classifier Recommendations: 
Top 5 recommended movies with predicted ratings for user 1998800:
Movie ID: 402, Predicted Rating: 3.0


### `Movielens`

In [None]:
# set up
user_item_matrix_train2, user_id_dict_train2, movie_id_dict_train2, user_ids_train2, movieIds_train2 = create_user_item_matrix(train_data_movielens)
# _, user_ratings_matrix_classified_train2 = computing_neutral_scores(user_item_matrix_train2)
user_similarity_matrix_manhattan_train2 = calculate_user_similarity_manhattan(user_item_matrix_train2, threshold)

# generate recommendations
user_id_train2 = user_ids_train2[1]
top5_pred_train2 = generate_user_knn_classifier_with_movies(user_id_train2, user_item_matrix_train2, user_similarity_matrix_manhattan_train2, user_id_dict_train2, movie_id_dict_train2, k)

# print result
print(f"UserKNN classifier Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_train2}:")
for movie_id, rating in top5_pred_train2:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN classifier Recommendations: 
Top 5 recommended movies with predicted ratings for user 306139:
Movie ID: 7019, Predicted Rating: 3.5


Validation data

In [None]:
# set up
user_item_matrix_val2, user_id_dict_val2, movie_id_dict_val2, user_ids_val2, movieIds_val2 = create_user_item_matrix(val_data_movielens)
# _, user_ratings_matrix_classified_val2 = computing_neutral_scores(user_item_matrix_val2)
user_similarity_matrix_manhattan_val2 = calculate_user_similarity_manhattan(user_item_matrix_val2, threshold)

# generate recommendations
user_id_val2 = user_ids_val2[1]
top5_pred_val2 = generate_user_knn_classifier_with_movies(user_id_val2, user_item_matrix_val2, user_similarity_matrix_manhattan_val2, user_id_dict_val2, movie_id_dict_val2, k)

# print result
print(f"UserKNN classifier Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id_val2}:")
for movie_id, rating in top5_pred_val2:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN classifier Recommendations: 
Top 5 recommended movies with predicted ratings for user 39104:
Movie ID: 901, Predicted Rating: 1.0


Even though the movieIds which are recommended are the same, the predicted rating differs somewhat, already indicating a difference between the two userKNN models.

## Baseline performance

To assess performance, we are going to compare the original ratings matrix with the predicted one after the userKnn model. In order to do so, we will generate a predicted rating matrix with the two functions below. Essentially an array of predicted ratings is generated in contrast with the tuple in the previous function `generate_user_knn_classifier_with_movies` with the top 5 results. The secon functions will append these in a new matrix. 

### Function Explanation

`generate_predictions_array`
1. Ensure that the provided user ID exists in the dictionary. If not found, print an error message and return an empty array.
2. Find the index of the user in the user-item matrix based on the provided user ID.
3. Get the similarity scores between the target user and all other users, then sort the indices based on similarity in descending order and select the top `k` most similar users.
4. Find movies that have been rated by the selected similar users.
5. Calculate the "votes" for each movie from the similar users based on their ratings and similarity weights.
6. Generate an array of predicted ratings for the given user and all movies. Initialize with NaN for unrated movies, and fill in the predicted ratings for rated movies.

`generate_predicted_user_item_matrix`
1. Initialize an empty matrix to hold the predicted ratings for all users and movies.
2. Iterate over each user in the `user_id_dict`.
3. For each user, generate predictions using the `generate_predictions_array` function and fill in the corresponding row in the predicted matrix.
4. Return the predicted user-item matrix.

In [None]:
def generate_predictions_array(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates movie ratings predictions for a given user using user-based k-nearest neighbors (KNN) collaborative filtering with neighborhood-based classification.

    Parameters:
    user_id (int): ID of the user for whom ratings are to be predicted.
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    predicted_ratings (numpy.ndarray): Array containing predicted ratings for the given user and all movies in movie_id_dict.
    """
    # Ensure user ID exists in the dictionary
    if user_id not in user_id_dict:
        print(f"User with ID {user_id} not found.")
        return np.array([])

    # Find the index of the user in the user-item matrix
    user_index = user_id_dict[user_id]

    # Get similarity scores of the user with other users and sort indices
    similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

    # Find movies rated by similar users
    rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

    # Calculate votes from neighbors
    neighbor_ratings = user_item_matrix[similar_users_indices][:, rated_movies]
    similarity_weights = user_similarity_matrix[user_index, similar_users_indices][:, np.newaxis]
    votes = np.dot(neighbor_ratings.T, similarity_weights).flatten()

    # Find the predicted ratings for the given user and all movies
    predicted_ratings = np.full(len(movie_id_dict), np.nan)  # Initialize with NaN for unrated movies
    for movie_id, movie_index in movie_id_dict.items():
        if movie_index in rated_movies:
            idx = np.where(rated_movies == movie_index)[0][0]
            predicted_ratings[movie_index] = votes[idx]

    return predicted_ratings

def generate_predicted_user_item_matrix(user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates the predicted user-item matrix using user-based k-nearest neighbors (KNN) collaborative filtering with neighborhood-based classification for all users.

    Parameters:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    predicted_matrix (numpy.ndarray): Predicted user-item matrix containing ratings for all users and movies.
    """
    num_users = user_item_matrix.shape[0]
    num_movies = len(movie_id_dict)
    predicted_matrix = np.zeros((num_users, num_movies))

    # Iterate over each user
    for user_id in user_id_dict:
        predicted_ratings = generate_predictions_array(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k)
        predicted_matrix[user_id_dict[user_id]] = predicted_ratings

    return predicted_matrix

**We will use RMSE as performance metric, using the function below to compute it:**

### Function explanation

`compute_rmse`
1. **Handle Implicit Ratings**: 
   - Convert `NaN` values in both `original_ratings` and `predicted_ratings` arrays to 0s. This is done using `np.nan_to_num()` function to ensure that non-rated items are treated as having a rating of 0 for comparison.
   
2. **Flatten Arrays**:
   - Flatten both `original_ratings` and `predicted_ratings` arrays into 1D arrays to facilitate making masks.

3. **Remove Unrated Items**:
   - Create a mask to filter out entries where the original rating is 0 (unrated items). Only ratings for rated items are considered for RMSE calculation.

4. **Compute Squared Differences**:
   - Calculate the squared differences between original and predicted ratings for the rated items.

5. **Compute Mean Squared Error (MSE)**:
   - Compute the mean squared error (MSE) by averaging the squared differences.

6. **Compute RMSE**:
   - Compute the square root of the mean squared error to obtain the RMSE value, which indicates the average difference between the original and predicted ratings.

7. **Return RMSE**:
   - Return the computed RMSE value as the output of the function.

In [None]:
def compute_rmse(original_ratings, predicted_ratings):
    """
    Computes the Root Mean Square Error (RMSE) between the original ratings and the predicted ratings. MovieIds a user has not interacted with is turned into 0 for now.

    Parameters:
    original_ratings (numpy.ndarray): Array containing the original ratings.
    predicted_ratings (numpy.ndarray): Array containing the predicted ratings.

    Returns:
    float: The RMSE value.
    
    """
    # handle implicit ratings with 0s for now
    original_ratings = np.nan_to_num(original_ratings, nan=0, posinf=0, neginf=0)
    predicted_ratings = np.nan_to_num(predicted_ratings, nan=0, posinf=0, neginf=0)

    # make 1d arrays by flattening them to be able to make masks
    original_ratings_flat = original_ratings.flatten()
    predicted_ratings_flat = predicted_ratings.flatten()
    
    # remove entries with no original rating (unrated items)
    mask = original_ratings_flat != 0
    original_ratings_flat = original_ratings_flat[mask]
    predicted_ratings_flat = predicted_ratings_flat[mask]
    
    # Compute the squared differences
    squared_diff = np.square(original_ratings_flat - predicted_ratings_flat)
    
    # Compute the mean squared error
    mse = np.mean(squared_diff)
    
    # Compute the square root of the mean squared error to get RMSE
    rmse = np.sqrt(mse)
    
    return rmse

`Netflix`

In [None]:
predicted_ratings_matrix_train1 = generate_predicted_user_item_matrix(user_item_matrix_train1, user_similarity_matrix_manhattan_train1, user_id_dict_train1, movie_id_dict_train1, k=1)
predicted_ratings_matrix_val1 = generate_predicted_user_item_matrix(user_item_matrix_val1, user_similarity_matrix_manhattan_val1, user_id_dict_val1, movie_id_dict_val1, k=1)

In [None]:
train1_rmse = compute_rmse(user_item_matrix_train1, predicted_ratings_matrix_train1)
print("RMSE on training set:", train1_rmse)
val1_rmse = compute_rmse(user_item_matrix_val1, predicted_ratings_matrix_val1)
print("RMSE on validation set:", val1_rmse)

RMSE on training set: 1.4827554220313284
RMSE on validation set: 1.3819681854419696


`Movielens`

In [None]:
predicted_ratings_matrix_train2 = generate_predicted_user_item_matrix(user_item_matrix_train2, user_similarity_matrix_manhattan_train2, user_id_dict_train2, movie_id_dict_train2, k=1)
predicted_ratings_matrix_val2 = generate_predicted_user_item_matrix(user_item_matrix_val2, user_similarity_matrix_manhattan_val2, user_id_dict_val2, movie_id_dict_val2, k=1)

In [None]:
train2_rmse = compute_rmse(user_item_matrix_train2, predicted_ratings_matrix_train2)
print("RMSE on training set:", train2_rmse)
val2_rmse = compute_rmse(user_item_matrix_val2, predicted_ratings_matrix_val2)
print("RMSE on validation set:", val2_rmse)

RMSE on training set: 1.4119215307445996
RMSE on validation set: 2.9022185204469926


## Hyperparameter tuning

Now we have recorded some baseline performance let's find the optimal value for K

`Netflix`

In [None]:
k_list = [1, 4, 10, 15]
rmse_list = []
best_k_train1 = None
best_rmse = float('inf')

for k in k_list:
    predicted_item_matrix = generate_predicted_user_item_matrix(user_item_matrix_train1, user_similarity_matrix_manhattan_train1, user_id_dict_train1, movie_id_dict_train1, k=k)
    
    # Compute Root Mean Square Error (RMSE)
    rmse = compute_rmse(user_item_matrix_train1, predicted_item_matrix)
    
    # Append the RMSE value to the list
    rmse_list.append(rmse)
    
    # Check if current k gives the best RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_k_train1 = k

# Print the result descendingly
for i, k_value in enumerate(k_list):
    print(f"K-value: {k_value} | RMSE: {rmse_list[i]}")

print(f"\nBest K-value: {best_k_train1} | Best RMSE: {best_rmse}")

K-value: 1 | RMSE: 1.4827554220313284
K-value: 4 | RMSE: 9.566969571226839
K-value: 10 | RMSE: 27.390711164809886
K-value: 15 | RMSE: 41.754973541219535

Best K-value: 1 | Best RMSE: 1.4827554220313284


`Movielens`

In [None]:
k_list = [1, 4, 10, 15]
rmse_list = []
best_k_train2 = None
best_rmse = float('inf')

for k in k_list:
    predicted_item_matrix = generate_predicted_user_item_matrix(user_item_matrix_train2, user_similarity_matrix_manhattan_train2, user_id_dict_train2, movie_id_dict_train2, k=k)
    
    # Compute Root Mean Square Error (RMSE)
    rmse = compute_rmse(user_item_matrix_train2, predicted_item_matrix)
    
    # Append the RMSE value to the list
    rmse_list.append(rmse)
    
    # Check if current k gives the best RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_k_train2 = k

# Print the result descendingly
for i, k_value in enumerate(k_list):
    print(f"K-value: {k_value} | RMSE: {rmse_list[i]}")

print(f"\nBest K-value: {best_k_train2} | Best RMSE: {best_rmse}")

K-value: 1 | RMSE: 1.4119215307445996
K-value: 4 | RMSE: 10.744248400797044
K-value: 10 | RMSE: 30.475285775571844
K-value: 15 | RMSE: 46.74670403580703

Best K-value: 1 | Best RMSE: 1.4119215307445996


## Final predictions on test set:

`Netflix`

In [None]:
# set up
user_item_matrix_test1, user_id_dict_test1, movie_id_dict_test1, user_ids_test1, movieIds_test1 = create_user_item_matrix(test_data_netflix)
# _, user_ratings_matrix_classified_test1 = computing_neutral_scores(user_item_matrix_test1)
user_similarity_matrix_manhattan_test1 = calculate_user_similarity_manhattan(user_item_matrix_test1, threshold)

# set up predictions matrix
predicted_item_matrix_test1 = generate_predicted_user_item_matrix(user_item_matrix_test1, user_similarity_matrix_manhattan_test1, user_id_dict_test1, movie_id_dict_test1, k=best_k_train1)

# compute Root Mean Square Error (RMSE)
rmse_test1 = compute_rmse(user_item_matrix_test1, predicted_item_matrix_test1)
# print result on test set
print("RMSE on test set:", rmse_test1)

RMSE on test set: 1.647965252571853


`Movielens`

In [None]:
# set up
user_item_matrix_test2, user_id_dict_test2, movie_id_dict_test2, user_ids_test2, movieIds_test2 = create_user_item_matrix(test_data_movielens)
# _, user_ratings_matrix_classified_test2 = computing_neutral_scores(user_item_matrix_test2)
user_similarity_matrix_manhattan_test2 = calculate_user_similarity_manhattan(user_item_matrix_test2, threshold)

# set up predictions matrix
predicted_item_matrix_test2 = generate_predicted_user_item_matrix(user_item_matrix_test2, user_similarity_matrix_manhattan_test2, user_id_dict_test2, movie_id_dict_test2, k=best_k_train2)

# compute Root Mean Square Error (RMSE)
rmse_test2 = compute_rmse(user_item_matrix_test2, predicted_item_matrix_test2)
# print result on test set
print("RMSE on test set:", rmse_test2)

RMSE on test set: 1.1902380714238083


# Overall conclusion:

Insert conclusion:

# OLD

In [None]:
# def create_predicted_user_item_matrix(user_id, user_ratings_matrix_normed, user_similarity_matrix, movie_id_dict, user_id_dict, threshold=0.5, k=1):
#     """
#     Recommend movies to a user based on the KNN classifier approach.

#     Parameters:
#     user_id (int): ID of the target user.
#     user_ratings_matrix_normed (numpy.ndarray): Matrix representing users' ratings normalized with neutral scores.
#     user_similarity_matrix (numpy.ndarray): Matrix representing similarity between users.
#     movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
#     user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
#     threshold (float): Threshold value for considering positive ratings.
#     k (int): Number of neighbors to consider.
#     top_n (int): Number of movies to recommend.

#     Returns:
#     predicted_classes_array (numpy.ndarray): Array of predicted classes for every movie ID in movie_id_dict.
#     """
    
#     # Find the target user's index in the similarity matrix
#     user_index = user_id_dict[user_id]

#     # Find indices of k most similar users (excluding the target user)
#     similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][1:k+1]

#     # Find movies not rated by the target user
#     unrated_movies = np.where(user_ratings_matrix_normed[user_index] == 0)[0]

#     # Predict whether the target user will like or dislike each unrated movie
#     predicted_classes = []
#     for movie_index in unrated_movies:
#         neighbor_ratings = user_ratings_matrix_normed[similar_users_indices, movie_index]
#         positive_votes = np.sum(neighbor_ratings >= threshold)
#         negative_votes = np.sum(neighbor_ratings < threshold)
#         predicted_class = 1 if positive_votes > negative_votes else 0
#         predicted_classes.append(predicted_class)

#     # Create an array of predicted classes for every movie ID in movie_id_dict
#     predicted_classes_array = np.array([''] * len(movie_id_dict))
#     for movie_index, predicted_class in zip(unrated_movies, predicted_classes):
#         predicted_classes_array[movie_index] = predicted_class

#     return predicted_classes_array

# def create_predicted_user_item_matrix(user_ratings_matrix_normed, user_similarity_matrix, movie_id_dict, user_id_dict, threshold=0.5, k=1):
#     """
#     Recommend movies to users based on the KNN classifier approach.

#     Parameters:
#     user_ratings_matrix_normed (numpy.ndarray): Matrix representing users' ratings normalized with neutral scores.
#     user_similarity_matrix (numpy.ndarray): Matrix representing similarity between users.
#     movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
#     user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
#     threshold (float): Threshold value for considering positive ratings.
#     k (int): Number of neighbors to consider.

#     Returns:
#     predicted_classes_matrix (numpy.ndarray): Matrix of predicted classes for every user and every movie ID in movie_id_dict.
#     """
    
#     num_users = user_ratings_matrix_normed.shape[0]
#     num_movies = len(movie_id_dict)
    
#     # Initialize predicted classes matrix
#     predicted_classes_matrix = np.zeros((num_users, num_movies), dtype=int)

#     for user_id in user_id_dict.keys():
#         # Find the target user's index in the similarity matrix
#         user_index = user_id_dict[user_id]

#         # Find indices of k most similar users (excluding the target user)
#         similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][1:k+1]

#         # Find movies not rated by the target user
#         unrated_movies = np.where(user_ratings_matrix_normed[user_index] == 0)[0]

#         # Predict whether the target user will like or dislike each unrated movie
#         predicted_classes = []
#         for movie_index in unrated_movies:
#             neighbor_ratings = user_ratings_matrix_normed[similar_users_indices, movie_index]
#             positive_votes = np.sum(neighbor_ratings >= threshold)
#             negative_votes = np.sum(neighbor_ratings < threshold)
#             predicted_class = 1 if positive_votes > negative_votes else 0
#             predicted_classes.append(predicted_class)

#         # Update predicted classes matrix for the current user
#         predicted_classes_matrix[user_index, unrated_movies] = predicted_classes

#     return predicted_classes_matrix

In [None]:
# def recommend_movies_classification(user_id, user_ratings_matrix_normed, user_similarity_matrix, movie_id_dict, user_id_dict, threshold=0.5, k=1, top_n=5):
#     """
#     Recommend movies to a user based on the KNN classifier approach.

#     Parameters:
#     user_id (int): ID of the target user.
#     user_ratings_matrix_normed (numpy.ndarray): Matrix representing users' ratings normalized with neutral scores.
#     user_similarity_matrix (numpy.ndarray): Matrix representing similarity between users.
#     movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
#     threshold (float): Threshold value for considering positive ratings.
#     k (int): Number of neighbors to consider.
#     top_n (int): Number of movies to recommend.

#     Returns:
#     recommended_movies (list): List of recommended movie IDs.
#     """

#     # Find the target user's index in the similarity matrix
#     user_index = user_id_dict[user_id]

#     # Find indices of k most similar users (excluding the target user)
#     similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][1:k+1]

#     # Find movies not rated by the target user
#     # unrated_movies = np.where(np.isnan(user_ratings_matrix_normed[user_index]))[0]
#     unrated_movies = np.where(user_ratings_matrix_normed[user_index] == 0)[0]


#     # Predict whether the target user will like or dislike each unrated movie
#     predicted_classes = []
#     for movie_index in unrated_movies:
#         neighbor_ratings = user_ratings_matrix_normed[similar_users_indices, movie_index]
#         positive_votes = np.sum(neighbor_ratings >= threshold)
#         negative_votes = np.sum(neighbor_ratings < threshold)
#         predicted_class = 'positive' if positive_votes > negative_votes else 'negative'
#         predicted_classes.append(predicted_class)

#     # Count the votes for each movie class
#     unique_classes, class_counts = np.unique(predicted_classes, return_counts=True)

#     # Recommend movies with the majority predicted class
#     recommended_movies = []
#     for movie_index, predicted_class in zip(unrated_movies, predicted_classes):
#         if class_counts[np.where(unique_classes == predicted_class)[0][0]] >= k // 2 + 1:  # Majority voting
#             movie_id = list(movie_id_dict.keys())[list(movie_id_dict.values()).index(movie_index)]
#             recommended_movies.append(movie_id)
#             if len(recommended_movies) == top_n:
#                 break

#     return recommended_movies

In [None]:
# threshold = 0
# k=1
# top_n=5

# user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds = create_user_item_matrix(df)
# user_ratings_matrix_normed, user_ratings_matrix_classified = computing_neutral_scores(user_item_matrix)
# user_similarity_matrix_manhattan = calculate_user_similarity_manhattan(user_ratings_matrix_normed, threshold=0) # still explain why treshold on 0.5!!!!!!!

# user_id = user_ids[1]  # Specify the target user ID
# recommended_movies = recommend_movies_classification(user_id, user_ratings_matrix_normed, user_similarity_matrix_manhattan, movie_id_dict, user_id_dict, threshold, k, top_n)

# print(f"Recommended Movies for User {user_id} with corresponding rating classification:")
# for movie_id, predicted_class in recommended_movies:
#     print(f"MovieId: {movie_id}, Predicted rating class: {predicted_class}")

old code which works for user knn classifier

In [None]:
# def generate_user_knn_classifier(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
#     """
#     Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN) collaborative filtering with neighborhood-based classification.

#     Parameters:
#     user_id (int): ID of the user for whom recommendations are to be generated.
#     user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
#     user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
#     user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
#     movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
#     k (int): Number of nearest neighbors to consider for recommendations.

#     Returns:
#     recommendations (list): List of tuples containing recommended movie IDs and their predicted ratings for the given user.
#     """
#     # Ensure user ID exists in the dictionary
#     if user_id not in user_id_dict:
#         print(f"User with ID {user_id} not found.")
#         return []

#     # Find the index of the user in the user-item matrix
#     user_index = user_id_dict[user_id]

#     # Get similarity scores of the user with other users and sort indices
#     similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

#     # Find movies rated by similar users
#     rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

#     # Determine the possible rating values
#     possible_ratings = np.unique(user_item_matrix)

#     # Initialize dictionary to store the votes for each rating
#     votes = {rating: 0 for rating in possible_ratings}

#     # Calculate votes from neighbors
#     for rating in possible_ratings:
#         for movie in rated_movies:
#             # Check if the neighbor has given the rating to the movie
#             neighbor_ratings = user_item_matrix[similar_users_indices, movie]
#             matching_indices = np.where(neighbor_ratings == rating)[0]
            
#             # Sum up the similarity weights of neighbors giving the rating
#             if len(matching_indices) > 0:
#                 votes[rating] += np.sum(user_similarity_matrix[user_index, similar_users_indices[matching_indices]])

#     # Find the rating with the maximum vote
#     predicted_rating = max(votes, key=votes.get)

#     return predicted_rating

# def generate_user_knn_classifier_with_movies(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
#     """
#     Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN) collaborative filtering with neighborhood-based classification.

#     Parameters:
#     user_id (int): ID of the user for whom recommendations are to be generated.
#     user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
#     user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
#     user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
#     movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
#     k (int): Number of nearest neighbors to consider for recommendations.

#     Returns:
#     recommendations (list): List of tuples containing recommended movie IDs and their predicted ratings for the given user.
#     """
#     # Ensure user ID exists in the dictionary
#     if user_id not in user_id_dict:
#         print(f"User with ID {user_id} not found.")
#         return []

#     # Find the index of the user in the user-item matrix
#     user_index = user_id_dict[user_id]

#     # Get similarity scores of the user with other users and sort indices
#     similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

#     # Find movies rated by similar users
#     rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

#     # Determine the possible rating values
#     possible_ratings = np.unique(user_item_matrix)

#     # Initialize dictionary to store the votes for each rating
#     votes = {rating: 0 for rating in possible_ratings}

#     # Calculate votes from neighbors
#     for rating in possible_ratings:
#         for movie in rated_movies:
#             # Check if the neighbor has given the rating to the movie
#             neighbor_ratings = user_item_matrix[similar_users_indices, movie]
#             matching_indices = np.where(neighbor_ratings == rating)[0]
            
#             # Sum up the similarity weights of neighbors giving the rating
#             if len(matching_indices) > 0:
#                 votes[rating] += np.sum(user_similarity_matrix[user_index, similar_users_indices[matching_indices]])

#     # Find the top 5 ratings with the maximum votes
#     top_ratings = sorted(votes.items(), key=lambda x: x[1], reverse=True)[:5]
    
#     # Convert movie indices back to movie IDs
#     top_recommendations = [(list(movie_id_dict.keys())[list(movie_id_dict.values()).index(movie_index)], rating) for movie_index, rating in top_ratings]

#     return top_recommendations


### Preprocessing of ratings in user-item matrix:
We might suggest filling the empty values with 0s, but that can create issues with recommendation engines. 

If we were to fill this NaN with a 0, we would be incorrectly implying they greatly disliked! We are going to center each user’s ratings around 0 by deducting the row average and then fill in the missing values with 0. This means the missing data is replaced with neutral scores.

### `computing_neutral_scores` Function Explanation

### Functions Used and Purpose:
- **`np.nanmean()`**: Used to calculate the average rating for each user while handling NaN (missing) values.
  - **`axis=1`**: Specifies that the calculation is done along the rows (i.e., for each user).
- **`np.nan_to_num()`**: Used to fill in missing data (NaN) with zeros while preserving non-NaN values.
- **`np.reshape(-1, 1)`**: Used to reshape the array to ensure proper broadcasting during subtraction.
- **Indexing and Slicing**: Used to access elements in arrays and matrices.

### Steps:
1. **Calculate Average Ratings**:
   - Use `np.nanmean()` to compute the average rating for each user along the rows of the user-item matrix. This handles missing ratings (NaN) gracefully, computing the mean while ignoring NaN values.

2. **Center Ratings Around 0**:
   - Subtract the average ratings from each user's ratings in the user-item matrix. This centers each user's ratings around 0, effectively removing the user bias from the ratings.

3. **Fill Missing Data with Zeros**:
   - Use `np.nan_to_num()` to replace missing data (NaN) with zeros while preserving the existing non-NaN values. This ensures that missing ratings are treated neutrally (i.e., as if the user has not rated the item).

4. **Return Normalized User Ratings**:
   - Return the resulting normalized user ratings matrix, where missing ratings have been replaced with zeros and each user's ratings are centered around 0.

In [None]:
# def computing_neutral_scores(user_item_matrix, threshold=0):
#     """
#     Compute neutral scores for user-item interactions in a user-item matrix.

#     Parameters:
#     user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).

#     Returns:
#     user_ratings_matrix_normed (numpy.ndarray): Matrix representing users' ratings normalized with neutral scores.
#     """
#     # Calculate the average rating for each user
#     avg_ratings = np.nanmean(user_item_matrix, axis=1)

#     # Center each user's ratings around 0
#     user_ratings_matrix_centered = user_item_matrix - avg_ratings.reshape(-1, 1)

#     # Fill in the missing data with 0s
#     user_ratings_matrix_normed = np.nan_to_num(user_ratings_matrix_centered, nan=0)

#     # classify ratings for performance comparison later
#     user_ratings_matrix_classified = np.where(user_ratings_matrix_normed >= threshold, 1, 0)

#     return user_ratings_matrix_normed, user_ratings_matrix_classified