In [1]:
import numpy as np
import pandas as pd

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [3]:
netflix_df = pd.read_parquet('cleaned/netflix_parquet')

In [4]:
# make samples using random sampling:
sample_3quarter = netflix_df.sample(frac=3/4, random_state=42)
sample_half = netflix_df.sample(frac=1/2, random_state=42)
sample_third = netflix_df.sample(frac=1/3, random_state=42)
sample_quarter = netflix_df.sample(frac=1/4, random_state=42)
sample_sixth = netflix_df.sample(frac=1/6, random_state=42)
sample_tenth = netflix_df.sample(frac=1/10, random_state=42)
sample_25th = netflix_df.sample(frac=1/25, random_state=42)

### Feature engineering:

A user-item matrix will be created.

In [5]:
sample_tenth = sample_tenth.drop(['year','title'],axis=1)
sample_tenth

Unnamed: 0,movieId,review_data
123,124,"[{'date': 2002-04-01, 'rating': 3.0, 'userId':..."
1193,13434,"[{'date': 2003-02-20, 'rating': 2.0, 'userId':..."
462,4601,"[{'date': 2003-12-27, 'rating': 3.0, 'userId':..."
351,352,"[{'date': 2003-10-13, 'rating': 4.0, 'userId':..."
1058,9560,"[{'date': 2003-07-06, 'rating': 4.0, 'userId':..."
...,...,...
367,4506,"[{'date': 2005-07-06, 'rating': 4.0, 'userId':..."
1415,13656,"[{'date': 2003-11-21, 'rating': 2.0, 'userId':..."
768,9270,"[{'date': 2002-09-26, 'rating': 4.0, 'userId':..."
1218,13459,"[{'date': 2000-07-18, 'rating': 2.0, 'userId':..."


#### Let's work with movies and reviews first, add other features later:

Only rating and userId of dictionary will be kept to accomplish this.

In [6]:
sample_tenth['review_data'] = sample_tenth['review_data'].apply(lambda x: None if x is None else [{'userId': review['userId'], 'rating': review['rating']} for review in x if 'userId' in review and 'rating' in review])

In [7]:
def train_val_test_split(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Splits the data into training, validation, and test sets.

    Parameters:
    - data: pandas DataFrame containing the data to be split.
    - train_ratio: float, ratio of the training set size to the total data size (default: 0.8).
    - val_ratio: float, ratio of the validation set size to the total data size (default: 0.1).
    - test_ratio: float, ratio of the test set size to the total data size (default: 0.1).

    Returns:
    - train_data: pandas DataFrame, training set.
    - val_data: pandas DataFrame, validation set.
    - test_data: pandas DataFrame, test set.
    """
    # Shuffle the data
    data_shuffled = data.sample(frac=1, random_state=42)

    # Calculate the sizes of each set
    num_samples = len(data_shuffled)
    num_train = int(train_ratio * num_samples)
    num_val = int(val_ratio * num_samples)
    num_test = num_samples - num_train - num_val

    # Split the data into train, validation, and test sets
    train_data = data_shuffled[:num_train]
    # Below is ensured the validation data and the test data starts after the indices which are already in the training data, ensuring that no training data will flow into validation of test data.
    val_data = data_shuffled[num_train:num_train+num_val]
    test_data = data_shuffled[num_train+num_val:]

    # Reset index for each set
    train_data.reset_index(drop=True, inplace=True)
    val_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    return train_data, val_data, test_data

In [8]:
train_data, val_data, test_data = train_val_test_split(sample_tenth)

In [9]:
train_data

Unnamed: 0,movieId,review_data
0,240,"[{'userId': '1109700', 'rating': 5.0}, {'userI..."
1,52,"[{'userId': '1402412', 'rating': 4.0}, {'userI..."
2,290,"[{'userId': '1989800', 'rating': 4.0}, {'userI..."
3,44,"[{'userId': '1734805', 'rating': 5.0}, {'userI..."
4,4554,"[{'userId': '2508228', 'rating': 5.0}, {'userI..."
...,...,...
109,13668,"[{'userId': '41412', 'rating': 4.0}, {'userId'..."
110,13489,"[{'userId': '276813', 'rating': 3.0}, {'userId..."
111,9255,"[{'userId': '370762', 'rating': 5.0}, {'userId..."
112,9331,"[{'userId': '364518', 'rating': 3.0}, {'userId..."


#### First we need to find unique user and item(movie)ids:

In [10]:
# # set is used because it does not allow for duplicates
# user_ids = set()

# # iterate over each row
# for index, row in train_data.iterrows():
#     # iterate over each dictionary in the 'review_data' column of the current row
#     for review_dict in row['review_data']:
#         user_id = review_dict.get('userId')  # Extract userId from the dictionary
#         if user_id:  # Check if userId exists
#             user_ids.add(int(user_id))  # Add user ID to the set
# # user_ids = list(user_ids)
# # user_ids = [int(user_id) for user_id in user_ids]



In [18]:
# put movieids in set so duplicates are not allowed for here either
item_ids = list(set(train_data['movieId'].unique()))
# generate user_ids using list comprehension
user_ids = {int(review_dict.get('userId')) for index, row in train_data.iterrows() for review_dict in row['review_data'] if review_dict.get('userId')}
# convert to list to create an order which generation of recommendations can loop through
user_ids = list(user_ids)
user_ids
len(user_ids)

[1572864,
 2621442,
 1048579,
 2621443,
 524291,
 6,
 524295,
 7,
 1048586,
 2097163,
 1048594,
 524307,
 524308,
 524312,
 524313,
 1048600,
 524315,
 524319,
 2097185,
 1048612,
 2621481,
 1572905,
 1048619,
 42,
 1048624,
 2621491,
 1048629,
 1572917,
 524341,
 2621497,
 1048633,
 2097210,
 59,
 1572925,
 1572923,
 2621506,
 1572931,
 1572933,
 2621510,
 1572935,
 524358,
 79,
 2621520,
 2097232,
 1048660,
 2621524,
 1572950,
 87,
 1572951,
 524373,
 524378,
 2097244,
 94,
 1572958,
 1572960,
 97,
 2097249,
 1572963,
 2097252,
 2097253,
 524390,
 2097254,
 2097256,
 524389,
 524396,
 1572976,
 2097264,
 524402,
 2097267,
 116,
 1572980,
 524408,
 2097275,
 1048700,
 2097277,
 1572992,
 1048708,
 134,
 1048711,
 2097287,
 524422,
 1573000,
 2621578,
 1572999,
 2621581,
 2097296,
 2621587,
 1048728,
 1048731,
 158,
 1048735,
 168,
 169,
 524458,
 524460,
 1048749,
 2097327,
 1048756,
 183,
 524471,
 1048763,
 188,
 2097339,
 1048766,
 192,
 2097345,
 195,
 2621635,
 2621637,
 1573062,

238312

In [12]:
print("The user/item matrix will be {} x {}. Therefore its likely the matrix will be very sparse.".format(len(user_ids),len(item_ids))) 

The user/item matrix will be 238312 x 114. Therefore its likely the matrix will be very sparse.


#### Now we will populate the matrix with the training data values:

In [13]:
# Assuming is a list of unique user IDs
user_ids_dict = {uid: idx for idx, uid in enumerate(user_ids)}

# Determine the number of users and movies
num_users = len(user_ids)
num_movies = len(item_ids)

# Initialize user-item matrix with zeros
user_item_matrix = np.zeros((num_users, num_movies))

for i, (reviews, movie_id) in enumerate(zip(train_data['review_data'], train_data['movieId'])):
    for review in reviews:
        user_idx = user_ids_dict.get(review['userId'])
        if user_idx is not None:  # Check if user exists in user_ids
            rating = round(review['rating'], 6)
            # Populate the user-item matrix using the user index as the row index
            user_item_matrix[user_idx, i] = rating

In [14]:
user_item_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
unique_values = np.unique(user_item_matrix)
print("Unique values in the user-item matrix:", unique_values)
user_item_matrix

Unique values in the user-item matrix: [0.]


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
# I will center the data in the function below, to account for variations in the ratings
def center_data(user_item_matrix):
    user_means = np.mean(user_item_matrix, axis=1)
    centered_user_item_matrix = user_item_matrix - user_means[:, np.newaxis]
    return centered_user_item_matrix, user_means

# I will decompose the user item matrix in this function using numpy
def apply_svd(centered_user_item_matrix, num_latent_factors):
    # U, sigma and Vt are created using the svd function from numpy
    U, Sigma, Vt = np.linalg.svd(centered_user_item_matrix, full_matrices=False)
    # set up sigma, which is the diagonal matrix from the decomposition
    Sigma = np.diag(Sigma[:num_latent_factors])
    # set up U and Vt which have to orthonormal to each other to ensure U represents each user and Vt represents each item, otherwise the total matrix would not add up.
    U = U[:, :num_latent_factors]
    Vt = Vt[:num_latent_factors, :]
    return U, Sigma, Vt

def compute_recommendations_for_all_users(U, Sigma, Vt, user_means, user_ids, num_recommendations):
    all_recommendations = {}
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
    for user_id in user_ids:
        user_index = user_id_to_index[user_id]
        # matrix multriplication between Sigma and Vt, to reconstruct an item matrix with less features, followed by the dot product of U and the reconstruction of item matrix. It essentially calculates the predicted ratings for each item for the given user based on their latent given ratings. user_means[user_index] rules out the items the user already interacted with.
        user_ratings = np.dot(U[user_index, :], np.dot(Sigma, Vt)) + user_means[user_index]
        user_ratings[user_item_matrix[user_index, :] > 0] = -np.inf
        top_indices = np.argsort(user_ratings)[::-1][:num_recommendations]
        top_items = top_indices + 1
        all_recommendations[user_id] = top_items
    return all_recommendations

In [20]:
# Step 4: Select the Number of Latent Factors
num_latent_factors = 4  # Example: Set the number of latent factors

# Unpack the tuple returned by center_data
centered_user_item_matrix, user_means = center_data(user_item_matrix)

# Apply SVD using the centered matrix
U, Sigma, Vt = apply_svd(centered_user_item_matrix, num_latent_factors)

# Define num_recommendations
num_recommendations = 4

# Compute recommendations for all users
all_recommendations = compute_recommendations_for_all_users(U, Sigma, Vt, user_means, user_ids, num_recommendations)

In [28]:
all_recommendations

{1572864: array([114,  29,  31,  32], dtype=int64),
 2621442: array([114,  29,  31,  32], dtype=int64),
 1048579: array([114,  29,  31,  32], dtype=int64),
 2621443: array([114,  29,  31,  32], dtype=int64),
 524291: array([114,  29,  31,  32], dtype=int64),
 6: array([114,  29,  31,  32], dtype=int64),
 524295: array([114,  29,  31,  32], dtype=int64),
 7: array([114,  29,  31,  32], dtype=int64),
 1048586: array([114,  29,  31,  32], dtype=int64),
 2097163: array([114,  29,  31,  32], dtype=int64),
 1048594: array([114,  29,  31,  32], dtype=int64),
 524307: array([114,  29,  31,  32], dtype=int64),
 524308: array([114,  29,  31,  32], dtype=int64),
 524312: array([114,  29,  31,  32], dtype=int64),
 524313: array([114,  29,  31,  32], dtype=int64),
 1048600: array([114,  29,  31,  32], dtype=int64),
 524315: array([114,  29,  31,  32], dtype=int64),
 524319: array([114,  29,  31,  32], dtype=int64),
 2097185: array([114,  29,  31,  32], dtype=int64),
 1048612: array([114,  29,  31, 