In [1]:
import numpy as np
import pandas as pd

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [3]:
netflix_df = pd.read_parquet('cleaned/netflix_parquet')

### Feature engineering:

A user-item matrix will be created.

In [4]:
netflix_df = netflix_df.drop(['year','title'],axis=1)
netflix_df

Unnamed: 0,movieId,review_data
0,1,"[{'date': 2005-09-06, 'rating': 3.0, 'userId':..."
1,2,"[{'date': 2005-09-05, 'rating': 4.0, 'userId':..."
2,3,"[{'date': 2003-03-29, 'rating': 4.0, 'userId':..."
3,4,"[{'date': 2005-09-06, 'rating': 3.0, 'userId':..."
4,5,"[{'date': 2005-02-08, 'rating': 5.0, 'userId':..."
...,...,...
1428,13669,"[{'date': 2005-07-07, 'rating': 4.0, 'userId':..."
1429,13670,"[{'date': 2002-09-04, 'rating': 1.0, 'userId':..."
1430,13671,"[{'date': 2005-11-28, 'rating': 4.0, 'userId':..."
1431,13672,"[{'date': 2005-07-07, 'rating': 5.0, 'userId':..."


#### Let's work with movies and reviews first, add other features later:

Only rating and userId of dictionary will be kept to accomplish this.

In [5]:
netflix_df['review_data'] = netflix_df['review_data'].apply(lambda x: None if x is None else [{'userId': review['userId'], 'rating': review['rating']} for review in x if 'userId' in review and 'rating' in review])

Make samples:

In [6]:
# make samples using random sampling:
sample_3quarter = netflix_df.sample(frac=3/4, random_state=42)
sample_half = netflix_df.sample(frac=1/2, random_state=42)
sample_third = netflix_df.sample(frac=1/3, random_state=42)
sample_quarter = netflix_df.sample(frac=1/4, random_state=42)
sample_sixth = netflix_df.sample(frac=1/6, random_state=42)
sample_tenth = netflix_df.sample(frac=1/10, random_state=42)
sample_25th = netflix_df.sample(frac=1/25, random_state=42)

In [7]:
def train_val_test_split(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Splits the data into training, validation, and test sets.

    Parameters:
    - data: pandas DataFrame containing the data to be split.
    - train_ratio: float, ratio of the training set size to the total data size (default: 0.8).
    - val_ratio: float, ratio of the validation set size to the total data size (default: 0.1).
    - test_ratio: float, ratio of the test set size to the total data size (default: 0.1).

    Returns:
    - train_data: pandas DataFrame, training set.
    - val_data: pandas DataFrame, validation set.
    - test_data: pandas DataFrame, test set.
    """
    # Shuffle the data
    data_shuffled = data.sample(frac=1, random_state=42)

    # Calculate the sizes of each set
    num_samples = len(data_shuffled)
    num_train = int(train_ratio * num_samples)
    num_val = int(val_ratio * num_samples)
    num_test = num_samples - num_train - num_val

    # Split the data into train, validation, and test sets
    train_data = data_shuffled[:num_train]
    # Below is ensured the validation data and the test data starts after the indices which are already in the training data, ensuring that no training data will flow into validation of test data.
    val_data = data_shuffled[num_train:num_train+num_val]
    test_data = data_shuffled[num_train+num_val:]

    # Reset index for each set
    train_data.reset_index(drop=True, inplace=True)
    val_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    return train_data, val_data, test_data

In [8]:
train_data, val_data, test_data = train_val_test_split(sample_tenth)

With numpy for speed (does not work yet):

In [9]:
# Extract review data, user IDs, and ratings using NumPy
review_data = train_data['review_data'].values
user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])

# Extract movie titles
movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data)])

# Step 1: Create dictionaries to map user IDs and movie IDs to unique indices
user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

# Step 2: Initialize an empty user-item matrix
user_count = len(user_id_dict)
movie_count = len(movie_id_dict)
user_item_matrix = np.zeros((user_count, movie_count))

# Step 3: Populate the user-item matrix with ratings
for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
    user_index = user_id_dict[user_id]
    movie_index = movie_id_dict[movie_id]
    user_item_matrix[user_index, movie_index] = rating

# Print the user-item matrix
user_item_matrix

array([[4., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.]])

In [10]:
unique_values = np.unique(user_item_matrix)
print("Unique values in the user-item matrix:", unique_values)
user_item_matrix

Unique values in the user-item matrix: [0. 1. 2. 3. 4. 5.]


array([[4., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.]])

#### First we need to find unique user and item(movie)ids:

In [11]:
# set is used because it does not allow for duplicates
user_ids = set()

# iterate over each row
for index, row in train_data.iterrows():
    # iterate over each dictionary in the 'review_data' column of the current row
    for review_dict in row['review_data']:
        user_id = review_dict.get('userId')  # Extract userId from the dictionary
        if user_id:  # Check if userId exists
            user_ids.add(int(user_id))  # Add user ID to the set

user_ids = list(user_ids)

In [12]:
# put movieids in set so duplicates are not allowed here either
item_ids = list(set(train_data['movieId'].unique()))

#### Now we will populate the matrix with the training data values:

In [14]:
unique_values = np.unique(user_item_matrix)
print("Unique values in the user-item matrix:", unique_values)
user_item_matrix

Unique values in the user-item matrix: [0. 1. 2. 3. 4. 5.]


array([[4., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.]])

In [15]:
# I will center the data in the function below, to make the matrix more robust to handle variations in user ratings
def center_data(user_item_matrix):
    user_means = np.mean(user_item_matrix, axis=1)
    centered_user_item_matrix = user_item_matrix - user_means[:, np.newaxis]
    return centered_user_item_matrix, user_means

# I will decompose the user item matrix in this function using numpy
def apply_svd(centered_user_item_matrix, num_latent_factors):
    # U, sigma and Vt are created using the svd function from numpy
    U, Sigma, Vt = np.linalg.svd(centered_user_item_matrix, full_matrices=False)
    # set up sigma, which is the diagonal matrix from the decomposition
    Sigma = np.diag(Sigma[:num_latent_factors])
    # set up U and Vt which have to orthonormal to each other to ensure U represents each user and Vt represents each item, otherwise the total matrix would not add up.
    U = U[:, :num_latent_factors]
    Vt = Vt[:num_latent_factors, :]
    return U, Sigma, Vt

def compute_recommendations_for_all_users(U, Sigma, Vt, user_means, user_ids, num_recommendations):
    all_recommendations = {}
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
    for user_id in user_ids:
        user_index = user_id_to_index[user_id]
        # matrix multriplication between Sigma and Vt, to reconstruct an item matrix with less features, followed by the dot product of U and the reconstruction of item matrix. It essentially calculates the predicted ratings for each item for the given user based on their latent given ratings. user_means[user_index] rules out the items the user already interacted with.
        user_ratings = np.dot(U[user_index, :], np.dot(Sigma, Vt)) + user_means[user_index]
        user_ratings[user_item_matrix[user_index, :] > 0] = -np.inf
        top_indices = np.argsort(user_ratings)[::-1][:num_recommendations]
        top_items = top_indices + 1
        all_recommendations[user_id] = top_items
    return all_recommendations

In [16]:
# select the Number of Latent Factors
num_latent_factors = 4 

# unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
centered_user_item_matrix, user_means = center_data(user_item_matrix)

# apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
U, Sigma, Vt = apply_svd(centered_user_item_matrix, num_latent_factors)
U 
Sigma
Vt

# define number of recommendations per user
num_recommendations = 4

# compute the recommendations
all_recommendations = compute_recommendations_for_all_users(U, Sigma, Vt, user_means, user_ids, num_recommendations)

array([[-0.00298226, -0.00105855, -0.00022081,  0.00091144],
       [-0.00283966, -0.00073889,  0.00521205,  0.00210447],
       [-0.00273062, -0.00316922,  0.00028598, -0.00016714],
       ...,
       [-0.00052413,  0.00143719,  0.00398661,  0.00179054],
       [-0.00061305,  0.00116704, -0.00131438, -0.00418057],
       [-0.00163837, -0.00190153,  0.00017159, -0.00010028]])

array([[1480.19981372,    0.        ,    0.        ,    0.        ],
       [   0.        ,  887.98194436,    0.        ,    0.        ],
       [   0.        ,    0.        ,  772.40907707,    0.        ],
       [   0.        ,    0.        ,    0.        ,  701.57819756]])

array([[-8.08372698e-01, -1.94172837e-02,  1.72785552e-02,
         3.03946990e-02,  1.71108747e-02,  3.14586237e-02,
         2.68222298e-02, -3.20702369e-02,  2.00142850e-02,
         3.11595248e-02,  3.12965181e-02,  2.96517257e-02,
         3.04956655e-02,  1.93967576e-02,  3.15445291e-02,
         1.06771989e-02,  3.14275230e-02,  3.14368316e-02,
         2.19215861e-02, -1.19902604e-03,  1.01577558e-02,
         2.96076530e-02,  3.15960550e-02, -1.93955254e-01,
         3.06565263e-02,  3.02269115e-02,  3.13766075e-02,
         1.17975779e-02,  3.14347355e-02,  3.15368135e-02,
         3.15301334e-02, -1.81486954e-01,  3.11323534e-02,
         2.80787560e-02,  3.06963237e-02,  3.14611452e-02,
         2.93358645e-02,  2.88734908e-02,  2.12136070e-02,
         3.00237668e-02,  3.11743597e-02,  4.24759670e-03,
         3.02555947e-02,  3.06256615e-02,  1.67006968e-02,
         2.53947974e-02, -3.29352258e-01,  3.15174643e-02,
         2.77504623e-02,  2.78257492e-02,  3.10886015e-0

In [17]:
all_recommendations

{1572864: array([ 47,  24, 114,  55], dtype=int64),
 2621442: array([82, 60, 94, 72], dtype=int64),
 1048579: array([ 82,  32,  59, 114], dtype=int64),
 2621443: array([ 47,  68, 101,  23], dtype=int64),
 524291: array([ 82,  32,  59, 114], dtype=int64),
 6: array([ 82,  94, 111, 114], dtype=int64),
 524295: array([ 82,  94, 111,  32], dtype=int64),
 7: array([47,  1, 24, 94], dtype=int64),
 1048586: array([ 82,  32,  59, 114], dtype=int64),
 2097163: array([60, 32, 24, 47], dtype=int64),
 1048594: array([ 82,  94,  60, 114], dtype=int64),
 524307: array([92, 48, 10, 27], dtype=int64),
 524308: array([24, 60, 32, 82], dtype=int64),
 524312: array([ 47,  24,  55, 111], dtype=int64),
 524313: array([ 82,  32,  59, 114], dtype=int64),
 1048600: array([ 82,  32,  59, 114], dtype=int64),
 524315: array([ 94, 114,  72,  32], dtype=int64),
 524319: array([ 82,  32,  59, 114], dtype=int64),
 2097185: array([ 82,  32,  59, 114], dtype=int64),
 1048612: array([ 82,  32,  59, 114], dtype=int64),


Redundant but maybe useful for troubleshooting:

In [18]:
# # Extract unique user IDs from the dataset
# dataset_user_ids = set()
# for review_list in train_data['review_data']:
#     for review_dict in review_list:
#         user_id = review_dict.get('userId')
#         if user_id:
#             dataset_user_ids.add(user_id)

# # Check if all user IDs in the matrix are also in the dataset, and vice versa
# user_ids_in_dataset_not_in_matrix = dataset_user_ids - set(user_ids)
# user_ids_in_matrix_not_in_dataset = set(user_ids) - dataset_user_ids
# len(user_ids_in_dataset_not_in_matrix)
# len(user_ids_in_matrix_not_in_dataset)

User item matrix with pandas:

In [19]:
# # extract review dates, user IDs, and ratings using NumPy
# review_data = train_data['review_data'].values
# user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
# ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])

# # Extract movie titles
# movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data)])

# # Create a DataFrame with review dates, user IDs, ratings, and movie titles
# review_df = pd.DataFrame({'userId': user_ids, 'rating': ratings, 'movieId': movieIds})

# # Pivot review_df to get user-item matrix with reviews as values
# user_item_matrix_df = review_df.pivot_table(index='userId', columns='movieId', values='rating')

# # Fill NaN values with 0
# user_item_matrix_df = user_item_matrix_df.fillna(0)

# # Convert DataFrame to NumPy array
# user_item_matrix = user_item_matrix_df.to_numpy()

# user_item_matrix