In [26]:
import numpy as np
import pandas as pd

In [27]:
# Notebook display configuration.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # echo the value of every top-level expression in a cell, not just the last one
# NOTE(review): HTML is not used in the cells visible here; it is also exposed via the public IPython.display module.
from IPython.core.display import HTML # to pretty-print pandas DataFrames so they can be copied elsewhere (e.g. into ppt slides)

In [28]:
# Read the parquet data stored at the relative path 'cleaned/netflix_parquet'
# (resolved against the kernel's current working directory) into a DataFrame.
matrix_df = pd.read_parquet('cleaned/netflix_parquet')

In [29]:
# Display the loaded frame; per the rendered output, each review_data cell holds
# a sequence of review dicts with 'date', 'rating' and 'userId' keys.
matrix_df

Unnamed: 0,movieId,year,title,review_data
0,1,2003,Dinosaur Planet,"[{'date': 2005-09-06, 'rating': 3.0, 'userId':..."
1,2,2004,Isle of Man TT 2004 Review,"[{'date': 2005-09-05, 'rating': 4.0, 'userId':..."
2,3,1997,Character,"[{'date': 2003-03-29, 'rating': 4.0, 'userId':..."
3,4,1994,Paula Abdul's Get Up & Dance,"[{'date': 2005-09-06, 'rating': 3.0, 'userId':..."
4,5,2004,The Rise and Fall of ECW,"[{'date': 2005-02-08, 'rating': 5.0, 'userId':..."
...,...,...,...,...
768,13508,1999,The League of Gentlemen: Series 1,"[{'date': 2003-12-18, 'rating': 2.0, 'userId':..."
769,13509,1998,Little City,"[{'date': 2003-05-29, 'rating': 3.0, 'userId':..."
770,13510,1959,Last Train from Gun Hill,"[{'date': 2005-09-02, 'rating': 3.0, 'userId':..."
771,13511,1993,Much Ado About Nothing,"[{'date': 2000-10-01, 'rating': 4.0, 'userId':..."


### Feature engineering:

A user-item rating matrix will be created, with one row per movie (item) and one column per user.

In [30]:
# Drop the movie metadata columns that are not needed for the user-item matrix.
# errors='ignore' keeps the cell safe to re-run: without it, a second execution
# raises KeyError because 'year' and 'title' have already been removed from
# matrix_df by the first run.
matrix_df = matrix_df.drop(columns=['year', 'title'], errors='ignore')
print(matrix_df)

     movieId                                        review_data
0          1  [{'date': 2005-09-06, 'rating': 3.0, 'userId':...
1          2  [{'date': 2005-09-05, 'rating': 4.0, 'userId':...
2          3  [{'date': 2003-03-29, 'rating': 4.0, 'userId':...
3          4  [{'date': 2005-09-06, 'rating': 3.0, 'userId':...
4          5  [{'date': 2005-02-08, 'rating': 5.0, 'userId':...
..       ...                                                ...
768    13508  [{'date': 2003-12-18, 'rating': 2.0, 'userId':...
769    13509  [{'date': 2003-05-29, 'rating': 3.0, 'userId':...
770    13510  [{'date': 2005-09-02, 'rating': 3.0, 'userId':...
771    13511  [{'date': 2000-10-01, 'rating': 4.0, 'userId':...
772    13512  [{'date': 2001-04-26, 'rating': 4.0, 'userId':...

[773 rows x 2 columns]


#### Let's work with movies and reviews first, add other features later:

Only the `userId` and `rating` keys of each review dictionary will be kept to accomplish this; the `date` key is dropped.

In [31]:
def _keep_user_and_rating(reviews):
    """Reduce each review dict to its 'userId' and 'rating' entries.

    A missing cell (None) is passed through unchanged, and reviews lacking
    either of the two keys are left out of the result.
    """
    if reviews is None:
        return None
    trimmed = []
    for entry in reviews:
        if 'userId' not in entry or 'rating' not in entry:
            continue
        trimmed.append({'userId': entry['userId'], 'rating': entry['rating']})
    return trimmed


matrix_df['review_data'] = matrix_df['review_data'].apply(_keep_user_and_rating)

First we need to find the unique user IDs and item (movie) IDs:

In [33]:
# Collect every distinct userId that appears in any review; a set is used
# because it silently discards duplicates.
user_ids = set()

# Iterate over the review_data column directly: only that column is needed,
# so there is no reason to build a full row Series per movie with iterrows().
for reviews in matrix_df['review_data']:
    if reviews is None:
        # The trimming step keeps missing review lists as None; skip them
        # instead of failing with "TypeError: 'NoneType' object is not iterable".
        continue
    for review_dict in reviews:
        user_id = review_dict.get('userId')
        # Compare against None explicitly: a plain truthiness test would also
        # discard a legitimate userId of 0.
        if user_id is not None:
            user_ids.add(user_id)

# Freeze the set into a list so every user gets a stable positional index.
user_ids = list(user_ids)

In [34]:
# Series.unique() already removes duplicates and returns the movieIds in order
# of first appearance. Do not wrap the result in set(): a set has no guaranteed
# order, so item_ids[i] would no longer have to match row i of matrix_df, which
# is the row position used when the matrix is filled.
item_ids = list(matrix_df['movieId'].unique())

We will initialize the matrix now:

In [35]:
# Matrix dimensions: one row per movie id, one column per user id.
# NOTE(review): rows are later addressed by DataFrame row position, so this
# assumes movieId values are unique in matrix_df — confirm.
num_items, num_users = len(item_ids), len(user_ids)
# Start from an all-zero float array; a cell stays 0 until a rating is written to it.
user_item_matrix = np.zeros(shape=(num_items, num_users))

Now we will populate the matrix with the ratings stored in `matrix_df`:

In [37]:
# Map each userId to its column position once, so every lookup below is a
# constant-time dict access instead of a linear user_ids.index() scan.
user_ids_dict = {uid: idx for idx, uid in enumerate(user_ids)}

# Row i of the matrix corresponds to the i-th row of matrix_df.
for i, reviews in enumerate(matrix_df['review_data']):
    if reviews is None:
        # Movies without review data keep an all-zero row; iterating None
        # would raise TypeError.
        continue
    for review in reviews:
        user_idx = user_ids_dict.get(review['userId'])
        if user_idx is not None:  # skip users that are missing from user_ids
            user_item_matrix[i, user_idx] = review['rating']

In [38]:
# Display the populated matrix; entries that were never assigned a rating remain 0.
user_item_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])