In [143]:
import numpy as np
import pandas as pd

In [144]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [145]:
df = pd.read_parquet('cleaned/netflix_parquet')

To start with, I am going to make multiple random samples using random sample with replacement:

In [146]:
# make samples using random sampling:
sample_3quarter = df.sample(frac=3/4, random_state=42)
sample_half = df.sample(frac=1/2, random_state=42)
sample_third = df.sample(frac=1/3, random_state=42)
sample_quarter = df.sample(frac=1/4, random_state=42)
sample_sixth = df.sample(frac=1/6, random_state=42)
sample_tenth = df.sample(frac=1/10, random_state=42)
sample_25th = df.sample(frac=1/25, random_state=42)

### Feature engineering:

Year and title will be dropped:

In [147]:
netflix_df = sample_tenth.drop(['year','title'],axis=1)
netflix_df

Unnamed: 0,movieId,review_data
123,124,"[{'date': 2002-04-01, 'rating': 3.0, 'userId':..."
1193,13434,"[{'date': 2003-02-20, 'rating': 2.0, 'userId':..."
462,4601,"[{'date': 2003-12-27, 'rating': 3.0, 'userId':..."
351,352,"[{'date': 2003-10-13, 'rating': 4.0, 'userId':..."
1058,9560,"[{'date': 2003-07-06, 'rating': 4.0, 'userId':..."
...,...,...
367,4506,"[{'date': 2005-07-06, 'rating': 4.0, 'userId':..."
1415,13656,"[{'date': 2003-11-21, 'rating': 2.0, 'userId':..."
768,9270,"[{'date': 2002-09-26, 'rating': 4.0, 'userId':..."
1218,13459,"[{'date': 2000-07-18, 'rating': 2.0, 'userId':..."


#### Let's work with movies and reviews first, add other features later:

Only rating and userId of dictionary will be kept to accomplish this.

In [148]:
netflix_df['review_data'] = netflix_df['review_data'].apply(lambda x: None if x is None else [{'userId': review['userId'], 'rating': review['rating']} for review in x if 'userId' in review and 'rating' in review])

Then, let's split our data into train, validation and test sets where we ensure that no training data flows into test and validation sets:

In [149]:
def train_val_test_split(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Splits the data into training, validation, and test sets.

    Parameters:
    - data: pandas DataFrame containing the data to be split.
    - train_ratio: float, ratio of the training set size to the total data size (default: 0.8).
    - val_ratio: float, ratio of the validation set size to the total data size (default: 0.1).
    - test_ratio: float, ratio of the test set size to the total data size (default: 0.1).

    Returns:
    - train_data: pandas DataFrame, training set.
    - val_data: pandas DataFrame, validation set.
    - test_data: pandas DataFrame, test set.
    """
    # Shuffle the data
    data_shuffled = data.sample(frac=1, random_state=42)

    # Calculate the sizes of each set
    num_samples = len(data_shuffled)
    num_train = int(train_ratio * num_samples)
    num_val = int(val_ratio * num_samples)
    num_test = num_samples - num_train - num_val

    # Split the data into train, validation, and test sets
    train_data = data_shuffled[:num_train]
    # Below is ensured the validation data and the test data starts after the indices which are already in the training data, ensuring that no training data will flow into validation of test data.
    val_data = data_shuffled[num_train:num_train+num_val]
    test_data = data_shuffled[num_train+num_val:]

    # Reset index for each set
    train_data.reset_index(drop=True, inplace=True)
    val_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    return train_data, val_data, test_data

Let's split the data accordingly:

In [150]:
train_data, val_data, test_data = train_val_test_split(netflix_df)

Subsequently, let's define some function to make our life easer for the compatibility of more datasets. We gather unique item and user ids, create user-item matrix which will be centered, followed by performing SVD en making recommendations using the dot product between the decomposed matrices resulting from SVD:

In [156]:
def create_user_item_matrix(train_test_val_set):
    review_data = train_test_val_set['review_data'].values
    user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
    ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
    movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data)])

    # create dictionaries to map user IDs and movie IDs to unique indices to map over
    user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
    movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

    # initialize an empty user-item matrix
    user_count = len(user_id_dict)
    movie_count = len(movie_id_dict)
    user_item_matrix = np.zeros((user_count, movie_count))

    # populate the user-item matrix with ratings from netflix dataset
    for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
        user_index = user_id_dict[user_id]
        movie_index = movie_id_dict[movie_id]
        user_item_matrix[user_index, movie_index] = rating

    return user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds

# I will center the data in the function below, to make the matrix more robust to handle variations in user ratings
def center_data(user_item_matrix):
    user_means = np.mean(user_item_matrix, axis=1)
    centered_user_item_matrix = user_item_matrix - user_means[:, np.newaxis]
    return centered_user_item_matrix, user_means

# I will decompose the user item matrix in this function using numpy
def apply_svd(centered_user_item_matrix, num_latent_factors):
    # U, sigma and Vt are created using the svd function from numpy
    U, Sigma, Vt = np.linalg.svd(centered_user_item_matrix, full_matrices=False)
    # set up sigma, which is the diagonal matrix from the decomposition
    Sigma = np.diag(Sigma[:num_latent_factors])
    # set up U and Vt which have to orthonormal to each other to ensure U represents each user and Vt represents each item, otherwise the total matrix would not add up.
    U = U[:, :num_latent_factors]
    Vt = Vt[:num_latent_factors, :]
    return U, Sigma, Vt

def compute_recommendations_for_all_users(U, Sigma, Vt, user_means, user_ids, num_recommendations):
    all_recommendations = {}
    all_predicted_centered_ratings = {}  # Store predicted ratings for all users
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}

    for user_id in user_ids:
        user_index = user_id_to_index[user_id]
        
        # matrix multriplication between Sigma and Vt, to reconstruct an item matrix with less features, followed by the dot product of U and the reconstruction of item matrix. It essentially calculates the predicted ratings for each item for the given user based on their latent given ratings. user_means[user_index] rules out the items the user already interacted with.
        user_ratings = np.dot(U[user_index, :], np.dot(Sigma, Vt)) + user_means[user_index]
        
        # mask out items the user has already interacted with
        user_ratings[user_item_matrix[user_index, :] > 0] = -np.inf
        
        # sort the predicted ratings in descending order
        top_indices = np.argsort(user_ratings)[::-1]
        
        # store the top predicted ratings for the current user
        all_predicted_centered_ratings[user_id] = user_ratings
        
        # select the top 'num_recommendations' items as recommendations
        top_items = top_indices[:num_recommendations] + 1
        
        # store the top recommended items for the current user
        all_recommendations[user_id] = top_items
    
    # return both the recommendations and the predicted ratings, so that we can evaluate the predicted ratings compared to the existing ratings in model evaluation
    return all_recommendations, all_predicted_centered_ratings

In [157]:
user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds = create_user_item_matrix(train_data)
# get unique movieIds, use set to ensure unique values and create list from it
user_ids = list(set(movieIds))
item_ids = list(set(user_ids))

In [158]:
user_id_dict

{'1000033': 0,
 '1000035': 1,
 '1000038': 2,
 '1000053': 3,
 '100006': 4,
 '1000062': 5,
 '1000079': 6,
 '1000084': 7,
 '1000094': 8,
 '1000095': 9,
 '1000104': 10,
 '1000122': 11,
 '1000153': 12,
 '1000170': 13,
 '1000176': 14,
 '1000183': 15,
 '1000192': 16,
 '1000195': 17,
 '1000215': 18,
 '1000225': 19,
 '1000232': 20,
 '1000234': 21,
 '1000264': 22,
 '1000270': 23,
 '1000285': 24,
 '1000287': 25,
 '100029': 26,
 '1000301': 27,
 '1000303': 28,
 '1000328': 29,
 '100035': 30,
 '1000380': 31,
 '1000383': 32,
 '1000386': 33,
 '1000387': 34,
 '1000404': 35,
 '1000406': 36,
 '1000410': 37,
 '1000412': 38,
 '1000419': 39,
 '1000427': 40,
 '1000432': 41,
 '1000433': 42,
 '1000438': 43,
 '1000439': 44,
 '1000441': 45,
 '1000452': 46,
 '1000457': 47,
 '1000458': 48,
 '1000461': 49,
 '1000481': 50,
 '1000485': 51,
 '1000489': 52,
 '1000517': 53,
 '1000527': 54,
 '1000546': 55,
 '1000554': 56,
 '1000559': 57,
 '1000561': 58,
 '1000569': 59,
 '100057': 60,
 '1000571': 61,
 '1000596': 62,
 '1000

In [159]:
unique_values = np.unique(user_item_matrix)
print("Unique values in the user-item matrix:", unique_values)
user_item_matrix

Unique values in the user-item matrix: [0. 1. 2. 3. 4. 5.]


array([[4., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.]])

Code which works without functions to evaluate:

In [160]:
# select the Number of Latent Factors
num_latent_factors = 4

# unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
centered_user_item_matrix, user_means = center_data(user_item_matrix)

# apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
U, Sigma, Vt = apply_svd(centered_user_item_matrix, num_latent_factors)
U 
Sigma
Vt

# define number of recommendations per user
num_recommendations = 4

# compute the recommendations
all_recommendations = compute_recommendations_for_all_users(U, Sigma, Vt, user_means, user_ids, num_recommendations)

array([[-0.00298226, -0.00105855, -0.00022081,  0.00091144],
       [-0.00283966, -0.00073889,  0.00521205,  0.00210447],
       [-0.00273062, -0.00316922,  0.00028598, -0.00016714],
       ...,
       [-0.00052413,  0.00143719,  0.00398661,  0.00179054],
       [-0.00061305,  0.00116704, -0.00131438, -0.00418057],
       [-0.00163837, -0.00190153,  0.00017159, -0.00010028]])

array([[1480.19981372,    0.        ,    0.        ,    0.        ],
       [   0.        ,  887.98194436,    0.        ,    0.        ],
       [   0.        ,    0.        ,  772.40907707,    0.        ],
       [   0.        ,    0.        ,    0.        ,  701.57819756]])

array([[-8.08372698e-01, -1.94172837e-02,  1.72785552e-02,
         3.03946990e-02,  1.71108747e-02,  3.14586237e-02,
         2.68222298e-02, -3.20702369e-02,  2.00142850e-02,
         3.11595248e-02,  3.12965181e-02,  2.96517257e-02,
         3.04956655e-02,  1.93967576e-02,  3.15445291e-02,
         1.06771989e-02,  3.14275230e-02,  3.14368316e-02,
         2.19215861e-02, -1.19902604e-03,  1.01577558e-02,
         2.96076530e-02,  3.15960550e-02, -1.93955254e-01,
         3.06565263e-02,  3.02269115e-02,  3.13766075e-02,
         1.17975779e-02,  3.14347355e-02,  3.15368135e-02,
         3.15301334e-02, -1.81486954e-01,  3.11323534e-02,
         2.80787560e-02,  3.06963237e-02,  3.14611452e-02,
         2.93358645e-02,  2.88734908e-02,  2.12136070e-02,
         3.00237668e-02,  3.11743597e-02,  4.24759670e-03,
         3.02555947e-02,  3.06256615e-02,  1.67006968e-02,
         2.53947974e-02, -3.29352258e-01,  3.15174643e-02,
         2.77504623e-02,  2.78257492e-02,  3.10886015e-0

In [161]:
all_recommendations

({4610: array([ 47,  24, 114,  55], dtype=int64),
  4625: array([82, 60, 94, 72], dtype=int64),
  9233: array([ 82,  32,  59, 114], dtype=int64),
  9240: array([ 47,  68, 101,  23], dtype=int64),
  4636: array([ 82,  32,  59, 114], dtype=int64),
  30: array([ 82,  94, 111, 114], dtype=int64),
  9249: array([ 82,  94, 111,  32], dtype=int64),
  9255: array([47,  1, 24, 94], dtype=int64),
  44: array([ 82,  32,  59, 114], dtype=int64),
  45: array([60, 32, 24, 47], dtype=int64),
  50: array([ 82,  94,  60, 114], dtype=int64),
  52: array([92, 48, 10, 27], dtype=int64),
  9270: array([24, 60, 32, 82], dtype=int64),
  4666: array([ 47,  24,  55, 111], dtype=int64),
  4667: array([ 82,  32,  59, 114], dtype=int64),
  4671: array([ 82,  32,  59, 114], dtype=int64),
  66: array([ 94, 114,  72,  32], dtype=int64),
  13383: array([ 82,  32,  59, 114], dtype=int64),
  71: array([ 82,  32,  59, 114], dtype=int64),
  77: array([ 82,  32,  59, 114], dtype=int64),
  79: array([60, 94, 72, 82], dtype

### Include time feature in matrix:

In [162]:
# train_data2, val_data2, test_data2 = train_val_test_split(df)

In [163]:
# # Extract titles, user IDs, ratings, and dates
# review_data2 = train_data2['review_data'].values
# user_ids2 = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data2])
# ratings2 = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data2])
# dates = np.concatenate([np.array([entry['date'] for entry in row], dtype='datetime64') for row in review_data2])
# movieIds2 = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data2)])

In [164]:
# train_data

In [165]:
# movieIds2
# user_ids2
# ratings
# dates

In [166]:
# # define function to convert datetime64[D] to months to normalize the dates
# def get_month(date):
#     month = (date.astype('datetime64[M]').astype(int) % 12) + 1
#     return month

# # Convert datetime64[D] dates to months
# months = np.array([get_month(date) for date in dates])

In [167]:
# # Create dictionaries to map user IDs and movie IDs to unique indices
# user_id_dict2 = {user_id: index for index, user_id in enumerate(np.unique(user_ids2))}
# movie_id_dict2 = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds2))}

# # Initialize the user-item-time matrix
# user_count2 = len(user_id_dict2)
# movie_count2 = len(movie_id_dict2)
# matrix_3d = np.zeros((user_count2, movie_count2, 2))

# # Populate the matrix with ratings and normalized timestamps
# for user_id, movie_id, rating, month in zip(user_ids2, movieIds2, ratings2, months):
#     user_index = user_id_dict2[user_id]
#     movie_index = movie_id_dict2[movie_id]
#     matrix_3d[user_index, movie_index] = [rating, month]

In [168]:
# unique_values = np.unique(matrix_3d)
# print("Unique values in the user-item-time matrix:", unique_values)
# matrix_3d

In [169]:
# # set is used because it does not allow for duplicates
# user_ids2 = set()

# # iterate over each row
# for index, row in train_data2.iterrows():
#     # iterate over each dictionary in the 'review_data' column of the current row
#     for review_dict in row['review_data']:
#         user_id = review_dict.get('userId')  # Extract userId from the dictionary
#         if user_id:  # Check if userId exists
#             user_ids2.add(int(user_id))  # Add user ID to the set

# user_ids2 = list(user_ids2)

In [170]:
# # put movieids in set so duplicates are not allowed here either
# item_ids2 = list(set(train_data2['movieId'].unique()))

In [171]:
# def center_data_3d(matrix_3d):
#     # Calculate mean along the second axis (movies axis)
#     user_means = np.mean(matrix_3d, axis=(1, 2), keepdims=True)
#     # Subtract the mean from the original matrix
#     centered_user_item_matrix_3d = matrix_3d - user_means
#     return centered_user_item_matrix_3d, user_means

# def apply_svd_3d(centered_user_item_matrix_3d, num_latent_factors):
#     # Reshape the matrix to be 2D for SVD
#     reshaped_matrix = centered_user_item_matrix_3d.reshape(centered_user_item_matrix_3d.shape[0], -1)
#     # Perform SVD
#     U, Sigma, Vt = np.linalg.svd(reshaped_matrix, full_matrices=False)
#     # Keep only the specified number of latent factors
#     U = U[:, :num_latent_factors]
#     Sigma = np.diag(Sigma[:num_latent_factors])
#     Vt = Vt[:num_latent_factors, :]
#     return U, Sigma, Vt

# def compute_recommendations_for_all_users_3d(U, Sigma, Vt, user_means, user_ids2, num_recommendations):
#     all_recommendations = {}
#     user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids2)}
#     for user_id in user_ids2:
#         user_index = user_id_to_index[user_id]
#         # Perform dot product for each user
#         user_ratings = np.dot(U[user_index], np.dot(Sigma, Vt)) + user_means[user_index]
#         # Exclude items already interacted with
#         user_ratings[user_item_matrix[user_index] > 0] = -np.inf
#         # Get indices of top recommendations
#         top_indices = np.argsort(user_ratings)[::-1][:num_recommendations]
#         top_items = top_indices + 1
#         all_recommendations[user_id] = top_items
#     return all_recommendations

In [172]:
# # select the Number of Latent Factors
# num_latent_factors = 4 

# # unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
# centered_user_item_matrix_3d, user_means = center_data_3d(matrix_3d)

# # apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
# U, Sigma, Vt = apply_svd_3d(centered_user_item_matrix_3d, num_latent_factors)
# U 
# Sigma
# Vt

# # define number of recommendations per user
# num_recommendations = 4

# # compute the recommendations
# all_recommendations2 = compute_recommendations_for_all_users_3d(U, Sigma, Vt, user_means, user_ids, num_recommendations)

In [173]:
# all_recommendations2

# Redundant but maybe useful for troubleshooting:

In [174]:
# # Extract unique user IDs from the dataset
# dataset_user_ids = set()
# for review_list in train_data['review_data']:
#     for review_dict in review_list:
#         user_id = review_dict.get('userId')
#         if user_id:
#             dataset_user_ids.add(user_id)

# # Check if all user IDs in the matrix are also in the dataset, and vice versa
# user_ids_in_dataset_not_in_matrix = dataset_user_ids - set(user_ids)
# user_ids_in_matrix_not_in_dataset = set(user_ids) - dataset_user_ids
# len(user_ids_in_dataset_not_in_matrix)
# len(user_ids_in_matrix_not_in_dataset)

User item matrix with pandas:

In [175]:
# # extract review dates, user IDs, and ratings using NumPy
# review_data = train_data['review_data'].values
# user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
# ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])

# # Extract movie titles
# movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data)])

# # Create a DataFrame with review dates, user IDs, ratings, and movie titles
# review_df = pd.DataFrame({'userId': user_ids, 'rating': ratings, 'movieId': movieIds})

# # Pivot review_df to get user-item matrix with reviews as values
# user_item_matrix_df = review_df.pivot_table(index='userId', columns='movieId', values='rating')

# # Fill NaN values with 0
# user_item_matrix_df = user_item_matrix_df.fillna(0)

# # Convert DataFrame to NumPy array
# user_item_matrix = user_item_matrix_df.to_numpy()

# user_item_matrix

In [176]:
# # extract titles from dataframe, user IDs, and ratings from dictionary using NumPy
# review_data = train_data['review_data'].values
# user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
# ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
# movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data)])

# # create dictionaries to map user IDs and movie IDs to unique indices to map over
# user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
# movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

# # initialize an empty user-item matrix
# user_count = len(user_id_dict)
# movie_count = len(movie_id_dict)
# user_item_matrix = np.zeros((user_count, movie_count))

# # populate the user-item matrix with ratings from netflix dataset
# for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
#     user_index = user_id_dict[user_id]
#     movie_index = movie_id_dict[movie_id]
#     user_item_matrix[user_index, movie_index] = rating

In [177]:
# # set is used because it does not allow for duplicates
# user_ids = set()

# # iterate over each row
# for index, row in train_data.iterrows():
#     # iterate over each dictionary in the 'review_data' column of the current row
#     for review_dict in row['review_data']:
#         user_id = review_dict.get('userId')  # Extract userId from the dictionary
#         if user_id:  # Check if userId exists
#             user_ids.add(int(user_id))  # Add user ID to the set

# user_ids = list(user_ids)

In [178]:
# # put movieids in set so duplicates are not allowed here either
# item_ids = list(set(train_data['movieId'].unique()))

Function which computes recommendations and returns recommendations only, not the predicted ratings for every item after the svd matrix dot product:

In [179]:
# I will compute recommendations for each user_id in the training/test/validation data by performing the dot product between the previous reviews in the matrix by the reconstruction of the user item matrix with less features
# def compute_recommendations_for_all_users(U, Sigma, Vt, user_means, user_ids, num_recommendations):
#     all_recommendations = {}
#     user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
#     for user_id in user_ids:
#         user_index = user_id_to_index[user_id]
#         # matrix multriplication between Sigma and Vt, to reconstruct an item matrix with less features, followed by the dot product of U and the reconstruction of item matrix. It essentially calculates the predicted ratings for each item for the given user based on their latent given ratings. user_means[user_index] rules out the items the user already interacted with.
#         user_ratings = np.dot(U[user_index, :], np.dot(Sigma, Vt)) + user_means[user_index]
#         user_ratings[user_item_matrix[user_index, :] > 0] = -np.inf
#         top_indices = np.argsort(user_ratings)[::-1][:num_recommendations]
#         top_items = top_indices + 1
#         all_recommendations[user_id] = top_items
#     return all_recommendations

Old functions to extract unique user and item ids

In [180]:
# def extract_unique_user_ids(train_test_val_set):
#     user_ids = set()
#     # iterate over each row
#     for index, row in train_test_val_set.iterrows():
#         # iterate over each dictionary in the 'review_data' column of the current row
#         for review_dict in row['review_data']:
#             user_id = review_dict.get('userId')  # Extract userId from the dictionary
#             if user_id:  # Check if userId exists
#                 user_ids.add(int(user_id))  # Add user ID to the set
#     return list(user_ids)

# def extract_unique_movie_ids(train_test_val_set):
#     movie_ids = set(train_test_val_set['movieId'].unique())
#     return list(movie_ids)