In [29]:
import pandas as pd
import numpy as np

In [30]:
from IPython.core.interactiveshell import InteractiveShell
# Make Jupyter echo the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"
# HTML allows pretty-printing pandas frames so they can be copied elsewhere (e.g. to slides).
# NOTE(review): HTML is not referenced in the cells visible here - confirm it is used later.
from IPython.core.display import HTML

In [31]:
# Load the cleaned Netflix and MovieLens tables from their Parquet files (paths are relative
# to the notebook's working directory). Later cells read the 'movieId' and 'review_data' columns.
netflix_df = pd.read_parquet('cleaned/netflix_parquet')
movielens_df = pd.read_parquet('cleaned/movielens_parquet')

In [32]:
# Keep only movies whose review list holds between MIN_REVIEWS and MAX_REVIEWS entries
# (inclusive); rows whose 'review_data' is None are dropped.
MIN_REVIEWS, MAX_REVIEWS = 30, 350


def has_review_count_in_range(reviews, low=MIN_REVIEWS, high=MAX_REVIEWS):
    """Return True when `reviews` is not None and low <= len(reviews) <= high."""
    return reviews is not None and low <= len(reviews) <= high


netflix_df = netflix_df[netflix_df['review_data'].apply(has_review_count_in_range)]
movielens_df = movielens_df[movielens_df['review_data'].apply(has_review_count_in_range)]

In [33]:
# Draw a reproducible 10-movie sample (fixed seed) from each dataset, keeping only the
# two columns used downstream. The bare names display the sampled frames.
df = netflix_df[['movieId', 'review_data']].sample(n=10, random_state=42)
df
df2 = movielens_df[['movieId', 'review_data']].sample(n=10, random_state=42)
df2

Unnamed: 0,movieId,review_data
648,649,"[{'date': 2002-01-09, 'rating': 1.0, 'userId':..."
84,85,"[{'date': 2005-07-11, 'rating': 4.0, 'userId':..."
926,927,"[{'date': 2005-12-05, 'rating': 3.0, 'userId':..."
734,735,"[{'date': 2005-07-06, 'rating': 4.0, 'userId':..."
1336,1337,"[{'date': 2005-06-08, 'rating': 3.0, 'userId':..."
1522,1523,"[{'date': 2005-05-17, 'rating': 5.0, 'userId':..."
967,968,"[{'date': 2004-11-08, 'rating': 5.0, 'userId':..."
978,979,"[{'date': 2005-08-27, 'rating': 5.0, 'userId':..."
1252,1253,"[{'date': 2005-09-17, 'rating': 1.0, 'userId':..."
1725,1726,"[{'date': 2005-12-16, 'rating': 4.0, 'userId':..."


Unnamed: 0,movieId,review_data
1932,2029,"[{'date': 2000-01-18, 'rating': 4.0, 'userId':..."
16102,89305,"[{'date': 2011-12-19, 'rating': 4.0, 'userId':..."
18076,101088,"[{'date': 2020-05-10, 'rating': 2.5, 'userId':..."
12563,64285,"[{'date': 2009-01-29, 'rating': 4.5, 'userId':..."
546,554,"[{'date': 2000-03-20, 'rating': 1.0, 'userId':..."
6352,6517,"[{'date': 2009-12-14, 'rating': 2.5, 'userId':..."
5133,5264,"[{'date': 2020-03-17, 'rating': 2.0, 'userId':..."
23832,131050,"[{'date': 2015-07-08, 'rating': 5.0, 'userId':..."
3871,3991,"[{'date': 2001-07-05, 'rating': 3.0, 'userId':..."
7591,8157,"[{'date': 2009-02-06, 'rating': 4.5, 'userId':..."


In [34]:
# Flatten the nested per-movie review lists of the Netflix sample into parallel 1-D arrays
# (one entry per individual rating): the rating user, the rating value, and the movie rated.
review_data = df['review_data'].values
user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(df['movieId'], review_data)])
len(user_ids)               # total number of ratings in the Netflix sample
len(np.unique(movieIds))    # number of distinct movies in the Netflix sample

# Same flattening for the MovieLens sample. The movie IDs must come from df2 (MovieLens):
# pairing df's Netflix IDs with MovieLens reviews would label every rating with the wrong movie.
review_data2 = df2['review_data'].values
user_ids2 = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data2])
ratings2 = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data2])
movieIds2 = np.concatenate([[movieId] * len(row) for movieId, row in zip(df2['movieId'], review_data2)])
len(user_ids2)              # total number of ratings in the MovieLens sample
len(np.unique(movieIds2))   # number of distinct movies in the MovieLens sample

1932

10

1000

10

### Set-up user-item matrix
First we will create a user-item matrix which records all the user-item interactions.


### `create_user_item_matrix` Function Explanation

### Steps:
1. **Extract Review Data**:
   - Extract the review data from the provided DataFrame, which contains user IDs, ratings, and movie IDs.

2. **Create User and Movie IDs Arrays**:
   - Extract user IDs, ratings, and movie IDs from the review data and concatenate them into separate arrays.
   - Generate dictionaries to map user IDs and movie IDs to unique indices in the user-item matrix.

3. **Initialize User-Item Matrix**:
   - Determine the dimensions of the user-item matrix based on the number of unique users and movies.
   - Initialize an empty user-item matrix filled with NaN values.

4. **Populate User-Item Matrix**:
   - Iterate through the review data and populate the user-item matrix with ratings.
   - Map user and movie IDs to their corresponding indices in the matrix and insert the ratings.

5. **Return Results**:
   - Return the user-item matrix along with dictionaries mapping user and movie IDs to indices, and arrays containing user and movie IDs.
  
### Functions Used and Purpose:

- **`np.concatenate()`**: Used to concatenate arrays containing user IDs, ratings, and movie IDs extracted from the review data.
- **`enumerate()`**: Used to iterate over the unique user IDs and movie IDs and generate indices for mapping.
- **`np.unique()`**: Used to find the unique user IDs and movie IDs in the review data.
- **`np.full()`**: Used to initialize an empty user-item matrix filled with NaN values.
- **`zip()`**: Used to iterate over multiple iterables simultaneously (user IDs, movie IDs, ratings).
- **`enumerate()`**: Used to iterate over the indices and elements of an iterable (user IDs, movie IDs) simultaneously.
- **Indexing and Slicing**: Used to access and modify elements in arrays and matrices.

In [35]:
def create_user_item_matrix(train_test_val_set):
    """
    Build a dense user x movie rating matrix from nested per-movie review lists.

    Parameters:
    train_test_val_set (DataFrame): Frame with a 'movieId' column and a 'review_data' column.
                                    Each 'review_data' cell is a sequence of dictionaries that
                                    provide at least the keys 'userId' and 'rating'.

    Returns:
    user_item_matrix (numpy.ndarray): Float matrix of shape (n_users, n_movies). Cell [u, m] holds
                                      the rating user u gave movie m, or NaN when there is none.
                                      If a (user, movie) pair occurs more than once, the last
                                      occurrence wins.

    user_id_dict (dict): Maps each distinct user ID to its row index (IDs in sorted order).

    movie_id_dict (dict): Maps each distinct movie ID to its column index (IDs in sorted order).

    user_ids (numpy.ndarray): Flattened user IDs, one entry per individual rating.

    movie_ids (numpy.ndarray): Flattened movie IDs, aligned element-wise with user_ids.

    """
    movie_column = train_test_val_set['movieId']
    review_lists = train_test_val_set['review_data'].values

    def _collect(field):
        # One array per movie, then a single concatenation into a flat per-rating array.
        per_movie = [np.array([review[field] for review in reviews]) for reviews in review_lists]
        return np.concatenate(per_movie)

    user_ids = _collect('userId')
    ratings = _collect('rating')
    # Repeat each movie ID once per review so it lines up with user_ids / ratings.
    movieIds = np.concatenate(
        [[movie_id] * len(reviews) for movie_id, reviews in zip(movie_column, review_lists)]
    )

    # Assign consecutive row / column positions to the sorted distinct IDs.
    distinct_users = np.unique(user_ids)
    distinct_movies = np.unique(movieIds)
    user_id_dict = dict(zip(distinct_users, range(len(distinct_users))))
    movie_id_dict = dict(zip(distinct_movies, range(len(distinct_movies))))

    # Start from an all-NaN matrix: NaN marks "this user has not rated this movie".
    user_item_matrix = np.full((len(user_id_dict), len(movie_id_dict)), np.nan)

    # Write every rating into its (user row, movie column) cell.
    for user_id, movie_id, rating in zip(user_ids, movieIds, ratings):
        user_item_matrix[user_id_dict[user_id], movie_id_dict[movie_id]] = rating

    return user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds

### Preprocessing of ratings in user-item matrix:
We might suggest filling the empty values with 0s, but that can create issues with recommendation engines. 

If we were to fill a NaN with a 0, we would be incorrectly implying that the user greatly disliked the movie. Instead, we center each user's ratings around 0 by subtracting the user's (row) average and then fill the missing values with 0. This way, missing data is replaced with a neutral score.

### `computing_neutral_scores` Function Explanation

### Functions Used and Purpose:
- **`np.nanmean()`**: Used to calculate the average rating for each user while handling NaN (missing) values.
  - **`axis=1`**: Specifies that the calculation is done along the rows (i.e., for each user).
- **`np.nan_to_num()`**: Used to fill in missing data (NaN) with zeros while preserving non-NaN values.
- **`np.reshape(-1, 1)`**: Used to reshape the array to ensure proper broadcasting during subtraction.
- **Indexing and Slicing**: Used to access elements in arrays and matrices.

### Steps:
1. **Calculate Average Ratings**:
   - Use `np.nanmean()` to compute the average rating for each user along the rows of the user-item matrix. This handles missing ratings (NaN) gracefully, computing the mean while ignoring NaN values.

2. **Center Ratings Around 0**:
   - Subtract the average ratings from each user's ratings in the user-item matrix. This centers each user's ratings around 0, effectively removing the user bias from the ratings.

3. **Fill Missing Data with Zeros**:
   - Use `np.nan_to_num()` to replace missing data (NaN) with zeros while preserving the existing non-NaN values. This ensures that missing ratings are treated neutrally (i.e., as if the user has not rated the item).

4. **Return Normalized User Ratings**:
   - Return the resulting normalized user ratings matrix, where missing ratings have been replaced with zeros and each user's ratings are centered around 0.

In [36]:
def computing_neutral_scores(user_item_matrix):
    """
    Mean-center every user's ratings and replace missing ratings with a neutral 0.

    Parameters:
    user_item_matrix (numpy.ndarray): 2-D matrix of ratings with one row per user and one
                                      column per movie; NaN marks a missing rating.

    Returns:
    user_ratings_matrix_normed (numpy.ndarray): Matrix of the same shape in which each row has had
                                                its NaN-ignoring mean subtracted and every NaN
                                                has been replaced by 0.
    """
    # Per-user mean over the rated movies only (NaN entries are ignored)
    row_means = np.nanmean(user_item_matrix, axis=1)

    # Turn the means into a column vector so they broadcast across each user's row
    centered = user_item_matrix - row_means[:, np.newaxis]

    # Unrated cells are still NaN after centering; map them to the neutral score 0
    return np.nan_to_num(centered, nan=0)

### Compute similarity:
Cosine similarity is commonly used to measure the similarity between users based on their preferences or ratings for items (in this case, movies). Cosine similarity ranges from -1 to 1, where:

- 1 indicates perfect similarity,
- 0 indicates no similarity, and
- -1 indicates perfect dissimilarity.

### Interpretation:

- **Positive Cosine Similarity**: Users are similar in their preferences or ratings for movies.
- **Zero Cosine Similarity**: Users have no similarity in their preferences.
- **Negative Cosine Similarity**: Users are dissimilar in their preferences, tending towards opposite ratings for movies.

### Practical Implication:

If one user likes certain types of movies, the other user tends to dislike them, or vice versa. In other words, users with negative cosine similarities have contrasting preferences, making them less suitable for recommending movies to each other.

___

To see how similar users are, we compute the similarity between them using a cosine-style similarity measure. To reduce the computational cost, the Manhattan (L1) norm is used for the normalization instead of the Euclidean (L2) norm.

### Explanation `calculate_user_similarity_manhattan` Function

This function calculates the cosine similarity matrix between users based on their ratings using the Manhattan norm.

1. **Thresholding**: First, the function applies thresholding to the user ratings matrix. Ratings below the threshold are set to 0, ensuring that only significant ratings are considered.

2. **Dot Product Calculation**: It then computes the dot product of each pair of row vectors (users) in the thresholded matrix. This represents the similarity between users based on their common rated items.

3. **Norm Calculation**: Next, it calculates the norms (magnitude) of each row vector, considering only values above the threshold. This step prepares for the normalization process.

4. **Normalization**: The dot products are divided by the norms of the corresponding row vectors, effectively normalizing the similarity values. This step ensures that users with a large number of ratings are not favored over users with fewer ratings.

5. **Setting Diagonal to 0**: Finally, the diagonal elements of the similarity matrix are set to 0 to avoid self-similarity, as a user's rating should not be compared to itself.

### Explanation of NumPy Functions

- **np.dot**: Computes the dot product of arrays. Here, it calculates the dot product of the thresholded user ratings matrix with its transpose, resulting in the similarity matrix.
  
- **np.where**: With three arguments (`np.where(condition, x, y)`), selects elements from `x` where the condition is true and from `y` otherwise. It's used here to apply thresholding to the user ratings matrix.
  
- **np.sum**: Computes the sum of array elements. It calculates the norms of each row vector after thresholding, which are then used for normalization.
  
- **np.abs**: Computes the absolute value element-wise. Used to ensure positive values for norms calculation.
  
- **np.fill_diagonal**: Fills the diagonal of an array with a specified value. It's used to set diagonal elements of the similarity matrix to 0 to avoid self-similarity.

In [37]:
def calculate_user_similarity_manhattan(user_ratings_matrix, threshold):
    """
    Calculate a cosine-style user-user similarity that is normalized with Manhattan (L1) norms.

    Parameters:
    user_ratings_matrix (numpy.ndarray): 2-D matrix of (typically mean-centered) ratings with one
                                         row per user and one column per movie.
    threshold (float): Ratings strictly below this value are treated as 0 in the calculation.

    Returns:
    similarity_matrix (numpy.ndarray): Symmetric (n_users, n_users) matrix. Entry [i, j] is the dot
                                       product of the thresholded rows i and j divided by the
                                       product of their L1 norms; the diagonal is 0.

    Steps:
    1. Threshold the matrix once: entries below `threshold` become 0.
    2. Compute the dot product of each pair of thresholded rows.
    3. Compute the L1 norm of each thresholded row and replace zero norms with a small value
       to avoid division by zero.
    4. Divide each dot product by the product of the two users' norms.
    5. Set diagonal elements to 0 to avoid self-similarity.

    Note: both operands of the dot product are thresholded. Thresholding only one of them makes
    the result asymmetric and lets a user whose thresholded row is all zeros (norm replaced by
    1e-8) receive enormous similarity values from a non-zero numerator.
    """
    # Work on a float array so the small-norm replacement below is not truncated for int input
    ratings = np.asarray(user_ratings_matrix, dtype=float)

    # Apply the threshold once and reuse it for both the numerator and the norms
    thresholded = np.where(ratings >= threshold, ratings, 0.0)

    # Pairwise dot products between thresholded user rows
    dot_products = np.dot(thresholded, thresholded.T)

    # L1 norm of each thresholded row
    norms = np.sum(np.abs(thresholded), axis=1)

    # Replace zero norms with a small value to avoid division by zero (their numerators are 0)
    norms[norms == 0] = 1e-8

    # Normalize every pair (i, j) by norm_i * norm_j
    similarity_matrix = dot_products / np.outer(norms, norms)

    # Set diagonal elements to 0 to avoid self-similarity
    np.fill_diagonal(similarity_matrix, 0)

    return similarity_matrix

## Perform User-Based KNN

The top 5 recommendations represent the movies that are most highly rated by users who are most similar to the target user, based on the user-based k-nearest neighbors (KNN) collaborative filtering algorithm.

### Explanation `generate_user_knn_regressor` function:
**Recommendations Calculation Process**
1. **Neighbour selection:** based on the cosine similarity number, the top k similar users are selected. This would be positive cosine similarity scores, as they indicate similarity. 
2. **Aggregation of Ratings:** For each movie that the nearest neighbours have rated, the ratings are aggregated.

3. **Average Ratings Calculation:** the aggregated ratings are divided by the number of neighbors who rated each movie to calculate the average rating for each movie.

4. **Top Recommendations:** Finally, the top 5 movies with the highest average ratings are selected as the recommendations for the target user. These are the movies that are predicted to be most preferred by the target user based on the ratings of their nearest neighbors.

**Explanation of NumPy Functions**

1. **np.argsort**: Returns the indices that would sort an array. Used to find indices of the k most similar users in descending order.
  
2. **np.where**: Returns indices of elements satisfying a condition. Used to find movies rated by similar users (not NaN).

3. **np.sum**: Computes sum of array elements. Used to aggregate ratings and counts for each movie across similar users.

4. **np.divide**: Performs element-wise division. Used to calculate average ratings for each movie by dividing aggregated ratings by the number of similar users who rated each movie. Handles division by zero errors.

5. **np.argsort (again)**: Finds indices that would sort movies by average ratings in descending order. Used to select top 5 recommendations.

In [38]:
def generate_user_knn_regressor(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k, top_n=5):
    """
    Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN)
    collaborative filtering, scoring each movie by the plain average of the neighbours' ratings.

    Parameters:
    user_id (int): ID of the user for whom recommendations are to be generated.
    user_item_matrix (numpy.ndarray): Matrix of users' ratings for movies; NaN marks "not rated".
    user_similarity_matrix (numpy.ndarray): Matrix of pairwise similarities between users.
    user_id_dict (dict): Dictionary mapping user IDs to row indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to column indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.
    top_n (int): Number of recommendations to return (default 5).

    Returns:
    recommendations (list): List of (movie ID, predicted rating) tuples sorted by predicted rating
                            in descending order. Movies that none of the k neighbours rated get a
                            predicted rating of 0. Returns [] (after printing a message) when
                            user_id is unknown.
    """
    # Ensure user ID exists in the dictionary
    if user_id not in user_id_dict:
        print(f"User with ID {user_id} not found.")
        return []

    # Find the index of the user in the user-item matrix
    user_index = user_id_dict[user_id]

    # Indices of the k users with the highest similarity to the target user
    similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

    # Ratings of the selected neighbours, shape (k, n_movies)
    neighbour_ratings = user_item_matrix[similar_users_indices]
    rated_mask = ~np.isnan(neighbour_ratings)

    # Per-movie sum and count over only the neighbours who actually rated the movie.
    # A plain np.sum would turn the total into NaN as soon as one neighbour has not rated it.
    movie_ratings = np.nansum(neighbour_ratings, axis=0)
    movie_counts = rated_mask.sum(axis=0)

    # Average rating per movie; movies without any neighbour rating stay at 0
    average_ratings = np.divide(
        movie_ratings,
        movie_counts,
        out=np.zeros_like(movie_ratings, dtype=float),
        where=movie_counts != 0,
    )

    # Sort movies by average ratings in descending order
    sorted_indices = np.argsort(average_ratings)[::-1]

    # Build the column-index -> movie-ID lookup once instead of scanning the dict per movie
    index_to_movie_id = {index: movie_id for movie_id, index in movie_id_dict.items()}

    top_recommendations = [(index_to_movie_id[int(movie_index)], average_ratings[movie_index])
                           for movie_index in sorted_indices[:top_n]]
    return top_recommendations

## UserKNN classifier:
In contrast to userKNN regressor, we will now recommend items based on majority voting. Which will consist of the following:

Based on the cosine similarity, the nearest neighbours will be selected, just like in the KNN regressor. Afterwards, instead of computing the avg rating and then computing the items which have the highest avg. ratings, KNN classifier counts the items which have the same rating and recommends the item to the user which has the highest count of the same ratings, in other words: **majority vote**.

Alternatively, a **weighted average** can be used as an approach in the userKNN classifier. The ratings of the nearest neighbours are assigned weights based on their similarity to the target user, so more similar neighbours have a greater influence on the prediction.

Because of the preprocessing of the data, the ratings are generally not exactly the same. **Therefore, weighted average will be used**. 

### `generate_user_knn_recommendations_classifier` Function Explanation

### Steps:
1. **Input Validation**:
   - Check if the provided user ID exists in the user ID dictionary. If not found, print a message and return an empty list.

2. **Find Similar Users**:
   - Get the similarity scores of the target user with other users from the similarity matrix.
   - Sort the indices of similar users based on their similarity scores in descending order.
   - Select the top `k` similar users for consideration.

3. **Calculate Weighted Average Ratings**:
   - For each movie rated by the selected similar users:
     - Calculate the weighted sum of ratings, where the weights are the similarity scores between the target user and the similar users.
     - Accumulate the sum of similarities for normalization.
     - Compute the weighted average rating for each movie.

4. **Sort Recommendations**:
   - Sort the movies by their weighted average ratings in descending order.

5. **Convert Indices to Movie IDs**:
   - Convert the indices of the top recommended movies back to their corresponding movie IDs using the `movie_id_dict`.

6. **Return Recommendations**:
   - Return a list of tuples containing the movie IDs and their predicted ratings, limited to the top 5 recommendations.

In [39]:
def generate_user_knn_recommendations_classifier(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k, top_n=5):
    """
    Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN)
    collaborative filtering with a similarity-weighted average of the neighbours' ratings.

    Parameters:
    user_id (int): ID of the user for whom recommendations are to be generated.
    user_item_matrix (numpy.ndarray): Matrix of users' ratings for movies; NaN marks "not rated".
    user_similarity_matrix (numpy.ndarray): Matrix of pairwise similarities between users.
    user_id_dict (dict): Dictionary mapping user IDs to row indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to column indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.
    top_n (int): Number of recommendations to return (default 5).

    Returns:
    recommendations (list): List of (movie ID, predicted rating) tuples sorted by predicted rating
                            in descending order. For each movie the prediction is
                            sum(similarity * rating) / sum(similarity), taken over the neighbours
                            who rated that movie; it is 0 when no neighbour rated the movie or
                            when the similarities sum to 0. Returns [] (after printing a message)
                            when user_id is unknown.
    """
    # Ensure user ID exists in the dictionary
    if user_id not in user_id_dict:
        print(f"User with ID {user_id} not found.")
        return []

    # Find the index of the user in the user-item matrix
    user_index = user_id_dict[user_id]

    # Indices of the k users with the highest similarity to the target user
    similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

    # Similarities and ratings of the selected neighbours
    neighbour_similarities = np.asarray(
        user_similarity_matrix[user_index, similar_users_indices], dtype=float
    )
    neighbour_ratings = user_item_matrix[similar_users_indices]
    rated_mask = ~np.isnan(neighbour_ratings)

    # A neighbour contributes weight to a movie only if they rated it, so both the numerator and
    # the denominator are computed per movie (not accumulated across movies) and ignore NaNs.
    weights = np.where(rated_mask, neighbour_similarities[:, None], 0.0)
    weighted_sums = np.sum(np.where(rated_mask, neighbour_ratings, 0.0) * weights, axis=0)
    similarity_sums = np.sum(weights, axis=0)

    # Weighted average per movie; stays 0 where the similarity sum is 0
    weighted_avg_ratings = np.divide(
        weighted_sums,
        similarity_sums,
        out=np.zeros_like(weighted_sums, dtype=float),
        where=similarity_sums != 0,
    )

    # Sort movies by weighted average ratings in descending order
    sorted_indices = np.argsort(weighted_avg_ratings)[::-1]

    # Build the column-index -> movie-ID lookup once instead of scanning the dict per movie
    index_to_movie_id = {index: movie_id for movie_id, index in movie_id_dict.items()}

    recommendations_classifier = [(index_to_movie_id[int(movie_index)], weighted_avg_ratings[movie_index])
                                  for movie_index in sorted_indices[:top_n]]
    return recommendations_classifier

### See a first batch of recommendations:

Using the functions defined above to recommend movies generates the following results:

`Netflix dataset:`

In [40]:
# Netflix sample: build the raw user-item matrix, mean-center it per user, then compute the
# user-user similarity matrix from the centered ratings.
user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds = create_user_item_matrix(df)
user_ratings_matrix_normed = computing_neutral_scores(user_item_matrix)
user_similarity_matrix_manhattan = calculate_user_similarity_manhattan(user_ratings_matrix_normed, threshold=0.5) # TODO: justify the choice of threshold=0.5

In [41]:
# Demo: take the second entry of the flattened user-ID array as the target user and print the
# regressor's recommendations using a single nearest neighbour (k=1).
# Note that the raw (un-centered) user_item_matrix is passed, so predictions are on the rating scale.
user_id = user_ids[1]
user_knn_recommendations = generate_user_knn_regressor(user_id, user_item_matrix, user_similarity_matrix_manhattan, user_id_dict, movie_id_dict, k=1)
print(f"UserKNN regressor Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id}:")
for movie_id, rating in user_knn_recommendations:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN regressor Recommendations: 
Top 5 recommended movies with predicted ratings for user 2012897:
Movie ID: 979, Predicted Rating: 4.0
Movie ID: 1726, Predicted Rating: 0.0
Movie ID: 1523, Predicted Rating: 0.0
Movie ID: 1337, Predicted Rating: 0.0
Movie ID: 1253, Predicted Rating: 0.0


In [42]:
# Demo: same target user as the regressor cell, now scored with the similarity-weighted
# average variant, again using a single nearest neighbour (k=1).
user_id = user_ids[1]
recommendations_knn_classifier = generate_user_knn_recommendations_classifier(user_id, user_item_matrix, user_similarity_matrix_manhattan, user_id_dict, movie_id_dict, k=1)
print(f"UserKNN classifier Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id}:")
for movie_id, rating in recommendations_knn_classifier:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN classifier Recommendations: 
Top 5 recommended movies with predicted ratings for user 2012897:
Movie ID: 1726, Predicted Rating: 0.0
Movie ID: 1523, Predicted Rating: 0.0
Movie ID: 1337, Predicted Rating: 0.0
Movie ID: 1253, Predicted Rating: 0.0
Movie ID: 979, Predicted Rating: 0.0


Even though the movieIds which are recommended are the same, the predicted rating differs somewhat, already indicating a difference between the two userKNN models.

`MovieLens dataset:`

In [43]:
# MovieLens sample: build the raw user-item matrix, mean-center it per user, then compute the
# user-user similarity matrix from the centered ratings.
user_item_matrix2, user_id_dict2, movie_id_dict2, user_ids2, movieIds2 = create_user_item_matrix(df2)
user_ratings_matrix_normed2 = computing_neutral_scores(user_item_matrix2)
user_similarity_matrix_manhattan2 = calculate_user_similarity_manhattan(user_ratings_matrix_normed2, threshold=0.5) # TODO: justify the choice of threshold=0.5

In [44]:
# Demo: take the second entry of the flattened MovieLens user-ID array as the target user and
# print the regressor's recommendations using a single nearest neighbour (k=1).
user_id2 = user_ids2[1]
user_knn_recommendations2 = generate_user_knn_regressor(user_id2, user_item_matrix2, user_similarity_matrix_manhattan2, user_id_dict2, movie_id_dict2, k=1)
print(f"UserKNN regressor Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id2}:")
for movie_id, rating in user_knn_recommendations2:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN regressor Recommendations: 
Top 5 recommended movies with predicted ratings for user 144354:
Movie ID: 8157, Predicted Rating: 3.5
Movie ID: 131050, Predicted Rating: 0.0
Movie ID: 101088, Predicted Rating: 0.0
Movie ID: 89305, Predicted Rating: 0.0
Movie ID: 64285, Predicted Rating: 0.0


In [45]:
# Demo: same MovieLens target user as the regressor cell, now scored with the
# similarity-weighted average variant, again using a single nearest neighbour (k=1).
user_id2 = user_ids2[1]
recommendations_knn_classifier2 = generate_user_knn_recommendations_classifier(user_id2, user_item_matrix2, user_similarity_matrix_manhattan2, user_id_dict2, movie_id_dict2, k=1)
print(f"UserKNN classifier Recommendations: \nTop 5 recommended movies with predicted ratings for user {user_id2}:")
for movie_id, rating in recommendations_knn_classifier2:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating}")

UserKNN classifier Recommendations: 
Top 5 recommended movies with predicted ratings for user 144354:
Movie ID: 131050, Predicted Rating: 0.0
Movie ID: 101088, Predicted Rating: 0.0
Movie ID: 89305, Predicted Rating: 0.0
Movie ID: 64285, Predicted Rating: 0.0
Movie ID: 8157, Predicted Rating: 0.0


## Baseline performance