In [56]:
import pandas as pd
import numpy as np

In [57]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [58]:
netflix_df = pd.read_parquet('cleaned/netflix_parquet')

In [59]:
# netflix_df = netflix_df[netflix_df['review_data'].apply(lambda x: len(x) if x is not None else 0) > 500]
netflix_df = netflix_df[netflix_df['review_data'].apply(lambda x: 30 <= len(x) <= 350 if x is not None else False)]

In [60]:
netflix_df

Unnamed: 0,movieId,year,title,review_data,genres
1,2,2004,Isle of Man TT 2004 Review,"[{'date': 2005-09-05, 'rating': 4.0, 'userId':...",
3,4,1994,Paula Abdul's Get Up & Dance,"[{'date': 2005-09-06, 'rating': 3.0, 'userId':...",[Family]
6,7,1992,8 Man,"[{'date': 2001-11-04, 'rating': 2.0, 'userId':...","[Action, Sci-Fi]"
8,9,1991,Class of Nuke 'Em High 2,"[{'date': 2004-11-20, 'rating': 2.0, 'userId':...","[Comedy, Horror, Sci-Fi]"
9,10,2001,Fighter,"[{'date': 2004-02-09, 'rating': 2.0, 'userId':...",
...,...,...,...,...,...
1953,1954,2000,Deliberate Intent,"[{'date': 2003-10-11, 'rating': 3.0, 'userId':...","[Thriller, Drama]"
1956,1957,1977,The Mighty Peking Man,"[{'date': 2004-07-21, 'rating': 2.0, 'userId':...","[Action, Adventure, Horror, Sci-Fi]"
1957,1958,2002,WWE: Before They Were Superstars 2,"[{'date': 2005-06-16, 'rating': 2.0, 'userId':...","[Documentary, Sport]"
1959,1960,2003,Smack: Vol. 1,"[{'date': 2004-09-22, 'rating': 3.0, 'userId':...",


In [61]:
df = (netflix_df.sample(n=250,random_state=42))[['movieId','review_data']]
df

Unnamed: 0,movieId,review_data
648,649,"[{'date': 2002-01-09, 'rating': 1.0, 'userId':..."
84,85,"[{'date': 2005-07-11, 'rating': 4.0, 'userId':..."
926,927,"[{'date': 2005-12-05, 'rating': 3.0, 'userId':..."
734,735,"[{'date': 2005-07-06, 'rating': 4.0, 'userId':..."
1336,1337,"[{'date': 2005-06-08, 'rating': 3.0, 'userId':..."
...,...,...
1089,1090,"[{'date': 2005-05-11, 'rating': 2.0, 'userId':..."
1280,1281,"[{'date': 2005-04-11, 'rating': 3.0, 'userId':..."
248,249,"[{'date': 2004-11-29, 'rating': 4.0, 'userId':..."
181,182,"[{'date': 2002-02-20, 'rating': 4.0, 'userId':..."


In [62]:
review_data = df['review_data'].values
user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(df['movieId'], review_data)])
len(user_ids)
len(np.unique(movieIds))

43956

250

### Set-up user-item matrix

In [63]:
def create_user_item_matrix(train_test_val_set):
    """
    Creates a user-item matrix from the provided dataset containing review data.

    Parameters:
    train_test_val_set (DataFrame): DataFrame containing review data with columns 'review_data',
                                    which is a list of dictionaries with keys 'userId', 'rating',
                                    and 'movieId'.

    Returns:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies), the matrix is an NumPy array which contains lists of user-item interactions, meaning a user and their corresponding ratings to the movieIds.    
    
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    
    user_ids (numpy.ndarray): Array containing user IDs corresponding to each rating in the matrix.
    
    movie_ids (numpy.ndarray): Array containing movie IDs corresponding to each rating in the matrix.

    """
    review_data = train_test_val_set['review_data'].values
    user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
    ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
    movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_test_val_set['movieId'], review_data)])

    # create dictionaries to map user IDs and movie IDs to unique indices to map over
    user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
    movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

    # initialize an empty user-item matrix
    user_count = len(user_id_dict)
    movie_count = len(movie_id_dict)
    user_item_matrix = np.full((user_count, movie_count), np.nan)

    # populate the user-item matrix with ratings from the dataset
    for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
        user_index = user_id_dict[user_id]
        movie_index = movie_id_dict[movie_id]
        user_item_matrix[user_index, movie_index] = rating

    return user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds

In [64]:
user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds = create_user_item_matrix(df)
user_item_matrix.shape
np.unique(user_item_matrix)

# Get unique values and their counts
unique_values, value_counts = np.unique(user_item_matrix, return_counts=True)

# Print unique values and their counts
for value, count in zip(unique_values, value_counts):
    print(f"Value: {value}, Count: {count}")

(28675, 250)

array([ 1.,  2.,  3.,  4.,  5., nan])

Value: 1.0, Count: 6717
Value: 2.0, Count: 7806
Value: 3.0, Count: 13759
Value: 4.0, Count: 9586
Value: 5.0, Count: 6088
Value: nan, Count: 7124794


We might suggest filling the empty values with 0s, but that can create issues with recommendation engines. 

If we were to fill this NaN with a 0, we would be incorrectly implying they greatly disliked! We are going to center each user’s ratings around 0 by deducting the row average and then fill in the missing values with 0. This means the missing data is replaced with neutral scores.

In [65]:
# Calculate the average rating for each user
avg_ratings = np.nanmean(user_item_matrix, axis=1)

# Center each user's ratings around 0
user_ratings_matrix_centered = user_item_matrix - avg_ratings.reshape(-1, 1)

# Fill in the missing data with 0s
user_ratings_matrix_normed = np.nan_to_num(user_ratings_matrix_centered, nan=0)

In [66]:
user_ratings_matrix_normed
np.unique(user_ratings_matrix_normed)
user_ratings_matrix_normed.shape

# Get unique values and their counts
unique_values_normed, value_counts_normed = np.unique(user_ratings_matrix_normed, return_counts=True)

# Print unique values and their counts
for value, count in zip(unique_values_normed, value_counts_normed):
    print(f"Value: {value}, Count: {count}")

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

array([-3.33333333, -3.14285714, -3.        , -2.83333333, -2.8       ,
       -2.75      , -2.66666667, -2.6       , -2.57142857, -2.55555556,
       -2.55      , -2.53333333, -2.5       , -2.42857143, -2.42857143,
       -2.4       , -2.375     , -2.36363636, -2.33333333, -2.28571429,
       -2.27272727, -2.25      , -2.22222222, -2.2       , -2.18181818,
       -2.16949153, -2.16666667, -2.16666667, -2.15384615, -2.14285714,
       -2.125     , -2.11111111, -2.        , -1.9375    , -1.88888889,
       -1.875     , -1.85714286, -1.83333333, -1.8       , -1.78947368,
       -1.7826087 , -1.77777778, -1.77272727, -1.75      , -1.73469388,
       -1.73333333, -1.71428571, -1.7       , -1.68421053, -1.68085106,
       -1.66666667, -1.64705882, -1.625     , -1.61538462, -1.6       ,
       -1.6       , -1.57142857, -1.57142857, -1.55555556, -1.55      ,
       -1.54545455, -1.53333333, -1.5       , -1.47368421, -1.46666667,
       -1.44827586, -1.44444444, -1.4375    , -1.42857143, -1.4 

(28675, 250)

Value: -3.333333333333333, Count: 1
Value: -3.1428571428571432, Count: 1
Value: -3.0, Count: 2
Value: -2.8333333333333335, Count: 1
Value: -2.8, Count: 4
Value: -2.75, Count: 5
Value: -2.6666666666666665, Count: 13
Value: -2.6, Count: 2
Value: -2.5714285714285716, Count: 1
Value: -2.5555555555555554, Count: 2
Value: -2.55, Count: 1
Value: -2.533333333333333, Count: 1
Value: -2.5, Count: 6
Value: -2.428571428571429, Count: 1
Value: -2.4285714285714284, Count: 1
Value: -2.4, Count: 6
Value: -2.375, Count: 1
Value: -2.3636363636363638, Count: 1
Value: -2.3333333333333335, Count: 16
Value: -2.2857142857142856, Count: 3
Value: -2.272727272727273, Count: 1
Value: -2.25, Count: 14
Value: -2.2222222222222223, Count: 4
Value: -2.2, Count: 3
Value: -2.1818181818181817, Count: 5
Value: -2.169491525423729, Count: 4
Value: -2.166666666666667, Count: 1
Value: -2.1666666666666665, Count: 1
Value: -2.153846153846154, Count: 1
Value: -2.142857142857143, Count: 2
Value: -2.125, Count: 2
Value: -2.111111

### Compute similarity:
Regularly, cosine similarity is often used to measure the similarity between users based on their preferences or ratings for items (in this case, movies). Cosine similarity ranges from -1 to 1, where:

- 1 indicates perfect similarity,
- 0 indicates no similarity, and
- -1 indicates perfect dissimilarity.

### Interpretation:

- **Positive Cosine Similarity**: Users are similar in their preferences or ratings for movies.
- **Zero Cosine Similarity**: Users have no similarity in their preferences.
- **Negative Cosine Similarity**: Users are dissimilar in their preferences, tending towards opposite ratings for movies.

### Practical Implication:

If one user likes certain types of movies, the other user tends to dislike them, or vice versa. In other words, users with negative cosine similarities have contrasting preferences, making them less suitable for recommending movies to each other.

___

To see how similar users are we will compute the similarity between them. I will use cosine similarity as distance measure. The manhatten norm will be used to decrease computational weight instead of euclidian norm.

### Explanation of calculate_user_similarity_manhattan Function

This function calculates the cosine similarity matrix between users based on their ratings using the Manhattan norm.

1. **Thresholding**: First, the function applies thresholding to the user ratings matrix. Ratings below the threshold are set to 0, ensuring that only significant ratings are considered.

2. **Dot Product Calculation**: It then computes the dot product of each pair of row vectors (users) in the thresholded matrix. This represents the similarity between users based on their common rated items.

3. **Norm Calculation**: Next, it calculates the norms (magnitude) of each row vector, considering only values above the threshold. This step prepares for the normalization process.

4. **Normalization**: The dot products are divided by the norms of the corresponding row vectors, effectively normalizing the similarity values. This step ensures that users with a large number of ratings are not favored over users with fewer ratings.

5. **Setting Diagonal to 0**: Finally, the diagonal elements of the similarity matrix are set to 0 to avoid self-similarity, as a user's rating should not be compared to itself.

### Explanation of NumPy Functions

- **np.dot**: Computes the dot product of arrays. Here, it calculates the dot product of the thresholded user ratings matrix with its transpose, resulting in the similarity matrix.
  
- **np.where**: Returns indices where a condition is true. It's used here to apply thresholding to the user ratings matrix.
  
- **np.sum**: Computes the sum of array elements. It calculates the norms of each row vector after thresholding, which are then used for normalization.
  
- **np.abs**: Computes the absolute value element-wise. Used to ensure positive values for norms calculation.
  
- **np.fill_diagonal**: Fills the diagonal of an array with a specified value. It's used to set diagonal elements of the similarity matrix to 0 to avoid self-similarity.

In [67]:
def calculate_user_similarity_manhattan(user_ratings_matrix, threshold):
    # Calculate dot product of each pair of row vectors, considering only values above the threshold
    dot_products = np.dot(np.where(user_ratings_matrix >= threshold, user_ratings_matrix, 0), user_ratings_matrix.T)
    
    # Calculate norms of each row vector, considering only values above the threshold
    norms = np.sum(np.abs(np.where(user_ratings_matrix >= threshold, user_ratings_matrix, 0)), axis=1)
    
    # Replace zero norms with a small value to avoid division by zero
    norms[norms == 0] = 1e-8
    
    # Calculate similarity matrix using broadcasting
    similarity_matrix = dot_products / (norms[:, None] * norms)
    
    # Set diagonal elements to 0 to avoid self-similarity
    np.fill_diagonal(similarity_matrix, 0)
    
    return similarity_matrix

# Example usage:
# user_ratings_matrix_normed is your matrix of normalized user-item interactions
user_similarity_matrix_manhattan = calculate_user_similarity_manhattan(user_ratings_matrix_normed, threshold=0.5)
print("Cosine similarity matrix between users using Manhattan norm (with threshold):")
print(user_similarity_matrix_manhattan)

Cosine similarity matrix between users using Manhattan norm (with threshold):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [68]:
user_similarity_matrix_manhattan

user_similarity_matrix_manhattan.shape

# Get unique values and their counts
unique_values_sim, value_counts_sim = np.unique(user_similarity_matrix_manhattan, return_counts=True)

# Print unique values and their counts
for value, count in zip(unique_values_sim, value_counts_sim):
    print(f"Value: {value}, Count: {count}")

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

(28675, 28675)

Value: -178260869.5652174, Count: 3
Value: -178260869.56521738, Count: 31
Value: -178260869.56521735, Count: 5
Value: -171428571.42857146, Count: 2
Value: -171428571.42857143, Count: 13
Value: -171428571.4285714, Count: 1
Value: -170000000.0, Count: 2
Value: -160000000.00000003, Count: 4
Value: -160000000.0, Count: 91
Value: -159999999.99999997, Count: 22
Value: -159999999.99999994, Count: 1
Value: -157142857.14285716, Count: 6
Value: -157142857.14285713, Count: 1
Value: -157142857.1428571, Count: 7
Value: -155555555.55555552, Count: 16
Value: -124782608.69565217, Count: 1
Value: -120000000.0, Count: 1
Value: -114285714.2857143, Count: 1
Value: -108888888.88888888, Count: 1
Value: -104761904.76190476, Count: 1
Value: -104347826.08695653, Count: 1
Value: -91428571.42857143, Count: 1
Value: -89999999.99999999, Count: 7
Value: -89130434.78260869, Count: 5
Value: -88888888.8888889, Count: 2
Value: -88888888.88888888, Count: 4
Value: -88888888.88888887, Count: 2
Value: -87500000.00000001, C

### Perform KNN

We will be working with user 1's similarity scores and find their nearest neighbors, meaning the most similar other users. Based on the ratings those neighbors gave a movie, we can infer what rating user 1 would give give it if they saw it.

Below you will find how KNN works, before implemting the scikitlearn function:

In [69]:
user_similarity_matrix_manhattan[1, :]

array([0., 0., 0., ..., 0., 0., 0.])

In [70]:
# Isolate the similarity scores for the first user and sort descendingly
user_similarity_scores = user_similarity_matrix_manhattan[1, :]
ordered_similarities_indices = np.argsort(user_similarity_scores)[::-1]  # Sort in descending order
ordered_similarities = user_similarity_scores[ordered_similarities_indices]

# Find the top 10 most similar users (excluding self-similarity)
nearest_neighbors = ordered_similarities_indices[1:11]

# Extract the ratings of the neighbors
neighbor_ratings = user_ratings_matrix_normed[nearest_neighbors]

# Calculate the mean rating given by the users nearest neighbors
mean_rating = np.mean(neighbor_ratings[:, 2])  # Assuming you want the mean rating for item 2
print(mean_rating)

0.0


In [71]:
user_similarity_matrix_manhattan

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## User-Based KNN

The top 5 recommendations represent the movies that are most highly rated by users who are most similar to the target user, based on the user-based k-nearest neighbors (KNN) collaborative filtering algorithm.

### Recommendations Calculation Process

1. **Neighbour selection:** based on the cosine similarity number, the top k similar users are selected. This would be positive cosine similarity scores, as they indicate similarity. 
2. **Aggregation of Ratings:**:For each movie that the nearest neighbours have rated that the target user has not, the ratings are aggregated.

3. **Average Ratings Calculation:** the aggregated ratings are divided by the number of neighbors who rated each movie to calculate the average rating for each movie.

4. **Top Recommendations:** Finally, the top 5 movies with the highest average ratings are selected as the recommendations for the target user. These are the movies that are predicted to be most preferred by the target user based on the ratings of their nearest neighbors.

### Explanation of NumPy Functions

1. **np.argsort**: Returns the indices that would sort an array. Used to find indices of the k most similar users in descending order.
  
2. **np.where**: Returns indices of elements satisfying a condition. Used to find movies rated by similar users (not NaN).

3. **np.sum**: Computes sum of array elements. Used to aggregate ratings and counts for each movie across similar users.

4. **np.divide**: Performs element-wise division. Used to calculate average ratings for each movie by dividing aggregated ratings by the number of similar users who rated each movie. Handles division by zero errors.

5. **np.argsort (again)**: Finds indices that would sort movies by average ratings in descending order. Used to select top 5 recommendations.

In [72]:
def generate_user_knn_recommendations(user_id, user_item_matrix, user_similarity_matrix, user_id_dict, movie_id_dict, k):
    """
    Generates movie recommendations for a given user using user-based k-nearest neighbors (KNN) collaborative filtering.

    Parameters:
    user_id (int): ID of the user for whom recommendations are to be generated.
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_similarity_matrix (numpy.ndarray): Matrix representing cosine similarity between users.
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    k (int): Number of nearest neighbors to consider for recommendations.

    Returns:
    recommendations (list): List of tuples containing recommended movie IDs and their predicted or expected ratings for the given user.
    """
    # Ensure user ID exists in the dictionary
    if user_id not in user_id_dict:
        print(f"User with ID {user_id} not found.")
        return []

    # Find the index of the user in the user-item matrix
    user_index = user_id_dict[user_id]

    # Get similarity scores of the user with other users and sort indices
    similar_users_indices = np.argsort(user_similarity_matrix[user_index])[::-1][:k]

    # Find movies rated by similar users
    rated_movies = np.where(~np.isnan(user_item_matrix[similar_users_indices]))[1]

    # Calculate average ratings for each movie
    movie_ratings = np.zeros_like(user_item_matrix[0])
    movie_counts = np.zeros_like(user_item_matrix[0], dtype=int)
    
    # Aggregate ratings and counts for each movie
    for movie in rated_movies:
        movie_ratings[movie] += np.sum(user_item_matrix[similar_users_indices, movie])
        movie_counts[movie] += np.sum(~np.isnan(user_item_matrix[similar_users_indices, movie]))
    
    # Calculate average ratings
    average_ratings = np.divide(movie_ratings, movie_counts, out=np.zeros_like(movie_ratings), where=movie_counts!=0)

    # Sort movies by average ratings in descending order
    sorted_indices = np.argsort(average_ratings)[::-1]

    # Convert movie indices back to movie IDs and return top 5 recommendations
    top_recommendations = [(list(movie_id_dict.keys())[list(movie_id_dict.values()).index(movie_index)], average_ratings[movie_index])
                           for movie_index in sorted_indices[:5]]
    return top_recommendations

# Example usage:
user_id = user_ids[1]  # Example user ID
user_knn_recommendations = generate_user_knn_recommendations(user_id, user_item_matrix, user_similarity_matrix_manhattan, user_id_dict, movie_id_dict, k=1)
print("Recommended movies for user", user_id)
print(user_knn_recommendations)

Recommended movies for user 2012897
[(928, 3.0), (1106, 3.0), (1131, 2.0), (736, 0.0), (726, 0.0)]


Attempt sparse

In [73]:
# from scipy.sparse import csr_matrix

# # Assuming user_ratings_matrix_normed is your numpy matrix
# user_ratings_matrix_normed_sparse = csr_matrix(user_ratings_matrix_normed)

In [74]:
# def calculate_user_similarity_manhattan_sparse(user_ratings_matrix, threshold=0):
#     # Apply threshold to the sparse matrix
#     user_ratings_matrix = csr_matrix(user_ratings_matrix)  # Ensure matrix is in CSR format
#     user_ratings_matrix.data[user_ratings_matrix.data < threshold] = 0
    
#     # Calculate dot product of each pair of row vectors
#     dot_products = user_ratings_matrix.dot(user_ratings_matrix.T)
    
#     # Calculate norms of each row vector
#     norms = user_ratings_matrix.sum(axis=1).A1  # Sum along rows and convert to 1-D array
    
#     # Replace zero norms with a small value to avoid division by zero
#     norms[norms == 0] = 1e-8
    
#     # Calculate similarity matrix using broadcasting
#     outer_norms = norms[:, None]  # Convert norms to a column vector for broadcasting
#     similarity_matrix = dot_products / (outer_norms * outer_norms.T)
    
#     # Set diagonal elements to 0 to avoid self-similarity
#     similarity_matrix.setdiag(0)
    
#     return similarity_matrix

In [75]:
# # Example usage:
# user_similarity_matrix_manhattan_sparse = calculate_user_similarity_manhattan_sparse(user_ratings_matrix_normed, threshold=4)
# print("Cosine similarity matrix between users using Manhattan norm for sparse matrices (with threshold):")
# print(user_similarity_matrix_manhattan_sparse)