In [88]:
import pandas as pd
import numpy as np

In [89]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [90]:
netflix_df = pd.read_parquet('cleaned/netflix_parquet')

In [91]:
netflix_df = netflix_df[netflix_df['review_data'].apply(lambda x: len(x) if x is not None else 0) > 100]

In [92]:
netflix_df

Unnamed: 0,movieId,year,title,review_data,genres
0,1,2003,Dinosaur Planet,"[{'date': 2005-09-06, 'rating': 3.0, 'userId':...","[Documentary, Animation, Family]"
1,2,2004,Isle of Man TT 2004 Review,"[{'date': 2005-09-05, 'rating': 4.0, 'userId':...",
2,3,1997,Character,"[{'date': 2003-03-29, 'rating': 4.0, 'userId':...","[Crime, Drama, Mystery]"
3,4,1994,Paula Abdul's Get Up & Dance,"[{'date': 2005-09-06, 'rating': 3.0, 'userId':...",[Family]
4,5,2004,The Rise and Fall of ECW,"[{'date': 2005-02-08, 'rating': 5.0, 'userId':...","[Documentary, Sport]"
...,...,...,...,...,...
1956,1957,1977,The Mighty Peking Man,"[{'date': 2004-07-21, 'rating': 2.0, 'userId':...","[Action, Adventure, Horror, Sci-Fi]"
1958,1959,1961,Splendor in the Grass,"[{'date': 2005-07-07, 'rating': 4.0, 'userId':...","[Drama, Romance]"
1959,1960,2003,Smack: Vol. 1,"[{'date': 2004-09-22, 'rating': 3.0, 'userId':...",
1960,1961,1939,Port of Shadows,"[{'date': 2005-08-12, 'rating': 3.0, 'userId':...",


In [93]:
df = (netflix_df.sample(n=10,random_state=42))[['movieId','review_data']]
df

Unnamed: 0,movieId,review_data
1663,1664,"[{'date': 2005-07-08, 'rating': 5.0, 'userId':..."
1227,1228,"[{'date': 2005-07-07, 'rating': 4.0, 'userId':..."
847,848,"[{'date': 2003-05-30, 'rating': 3.0, 'userId':..."
1621,1622,"[{'date': 2004-12-05, 'rating': 3.0, 'userId':..."
1400,1401,"[{'date': 2001-12-11, 'rating': 4.0, 'userId':..."
385,386,"[{'date': 2001-12-05, 'rating': 3.0, 'userId':..."
256,257,"[{'date': 2003-08-13, 'rating': 3.0, 'userId':..."
617,618,"[{'date': 2005-08-07, 'rating': 2.0, 'userId':..."
1425,1426,"[{'date': 2003-08-27, 'rating': 3.0, 'userId':..."
1355,1356,"[{'date': 2005-09-17, 'rating': 1.0, 'userId':..."


In [94]:
review_data = df['review_data'].values
user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(df['movieId'], review_data)])
len(user_ids)
len(np.unique(movieIds))

38332

10

In [95]:
def create_user_item_matrix(train_test_val_set):
    """
    Creates a user-item matrix from the provided dataset containing review data.

    Parameters:
    train_test_val_set (DataFrame): DataFrame containing review data with columns 'review_data',
                                    which is a list of dictionaries with keys 'userId', 'rating',
                                    and 'movieId'.

    Returns:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies), the matrix is an NumPy array which contains lists of user-item interactions, meaning a user and their corresponding ratings to the movieIds.    
    
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    
    user_ids (numpy.ndarray): Array containing user IDs corresponding to each rating in the matrix.
    
    movie_ids (numpy.ndarray): Array containing movie IDs corresponding to each rating in the matrix.

    """
    review_data = train_test_val_set['review_data'].values
    user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
    ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
    movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_test_val_set['movieId'], review_data)])

    # create dictionaries to map user IDs and movie IDs to unique indices to map over
    user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
    movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

    # initialize an empty user-item matrix
    user_count = len(user_id_dict)
    movie_count = len(movie_id_dict)
    user_item_matrix = np.full((user_count, movie_count), np.nan)

    # populate the user-item matrix with ratings from the dataset
    for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
        user_index = user_id_dict[user_id]
        movie_index = movie_id_dict[movie_id]
        user_item_matrix[user_index, movie_index] = rating

    return user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds

In [96]:
user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds = create_user_item_matrix(df)
user_item_matrix

array([[ 4., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [ 4., nan, nan, ..., nan, nan,  3.],
       ...,
       [ 4., nan, nan, ..., nan, nan, nan],
       [ 4., nan, nan, ..., nan, nan, nan],
       [ 4., nan, nan, ..., nan, nan, nan]])

We might suggest filling the empty values with 0s, but that can create issues with recommendation engines. 

If we were to fill this NaN with a 0, we would be incorrectly implying they greatly disliked! An alternative is to center each user’s ratings around 0 by deducting the row average and then fill in the missing values with 0. This means the missing data is replaced with neutral scores.

In [106]:
# Get the average rating for each user 
avg_ratings = np.mean(user_item_matrix, axis=1)

# Center each user's ratings around 0
user_ratings_matrix_centered = user_item_matrix - avg_ratings.reshape(-1, 1)

# Fill in the missing data with 0s
user_ratings_matrix_normed = np.nan_to_num(user_ratings_matrix_centered, nan=0.0)

In [112]:
user_ratings_matrix_normed
np.unique(user_ratings_matrix_normed)
user_ratings_matrix_normed.shape

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

array([-0.6, -0.4, -0.1,  0. ,  0.4,  0.6,  0.9,  1.4,  2.6])

(34790, 10)

To see how similar users are we will compute the similarity between them. I will use cosine similarity as distance measure. The manhatten norm will be used to decrease computational weight.

In [114]:
# Function to calculate cosine similarity between two vectors using Manhattan norm
def cosine_similarity_manhattan(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1, ord=1)
    norm_vec2 = np.linalg.norm(vec2, ord=1)
    
    if norm_vec1 != 0 and norm_vec2 != 0:
        return dot_product / (norm_vec1 * norm_vec2)
    else:
        return 0

# Function to calculate cosine similarity matrix between users using Manhattan norm
def calculate_user_similarity_manhattan(user_ratings_matrix):
    # Calculate dot product of each pair of row vectors
    dot_products = np.dot(user_ratings_matrix, user_ratings_matrix.T)
    
    # Calculate norms of each row vector
    norms = np.linalg.norm(user_ratings_matrix, ord=1, axis=1)
    
    # Replace zero norms with a small value to avoid division by zero
    norms[norms == 0] = 1e-8
    
    # Calculate similarity matrix using broadcasting
    similarity_matrix = dot_products / np.outer(norms, norms)
    
    # Set diagonal elements to 0 to avoid self-similarity
    np.fill_diagonal(similarity_matrix, 0)
    
    return similarity_matrix

# Assuming user_ratings_matrix_normed is your matrix of normalized user-item interactions
user_similarity_matrix_manhattan = calculate_user_similarity_manhattan(user_ratings_matrix_normed)

print("Cosine similarity matrix between users using Manhattan norm:")
print(user_similarity_matrix_manhattan)

Cosine similarity matrix between users using Manhattan norm:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Perform KNN

To find the nearest neighbours: most similar users