In [130]:
import numpy as np
import pandas as pd

In [131]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [132]:
# Load the Netflix and MovieLens tables from parquet.
# The paths are relative, so they resolve against the notebook's working directory.
netflix_df = pd.read_parquet('cleaned/netflix_parquet')
movielens_df = pd.read_parquet('cleaned/movielens_parquet')

In [133]:
# Preview the Netflix frame, then peek at the genres value of the first MovieLens row.
# Both expressions are rendered because ast_node_interactivity was set to "all" above.
netflix_df
movielens_df['genres'].iloc[0]

Unnamed: 0,movieId,year,title,review_data
0,1,2003,Dinosaur Planet,"[{'date': 2005-09-06, 'rating': 3.0, 'userId':..."
1,2,2004,Isle of Man TT 2004 Review,"[{'date': 2005-09-05, 'rating': 4.0, 'userId':..."
2,3,1997,Character,"[{'date': 2003-03-29, 'rating': 4.0, 'userId':..."
3,4,1994,Paula Abdul's Get Up & Dance,"[{'date': 2005-09-06, 'rating': 3.0, 'userId':..."
4,5,2004,The Rise and Fall of ECW,"[{'date': 2005-02-08, 'rating': 5.0, 'userId':..."
...,...,...,...,...
768,13508,1999,The League of Gentlemen: Series 1,"[{'date': 2003-12-18, 'rating': 2.0, 'userId':..."
769,13509,1998,Little City,"[{'date': 2003-05-29, 'rating': 3.0, 'userId':..."
770,13510,1959,Last Train from Gun Hill,"[{'date': 2005-09-02, 'rating': 3.0, 'userId':..."
771,13511,1993,Much Ado About Nothing,"[{'date': 2000-10-01, 'rating': 4.0, 'userId':..."


array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'],
      dtype=object)

### Feature engineering:

A user-item matrix will be created.

In [134]:
# Keep only the columns needed for the user-item matrix (movieId + review_data).
# errors='ignore' keeps this cell idempotent: because netflix_df is rebound here,
# re-running the cell would otherwise raise KeyError on the already-dropped columns.
netflix_df = netflix_df.drop(columns=['year', 'title'], errors='ignore')
print(netflix_df)
# Inspect the raw review records of the first movie.
netflix_df['review_data'].iloc[0]

     movieId                                        review_data
0          1  [{'date': 2005-09-06, 'rating': 3.0, 'userId':...
1          2  [{'date': 2005-09-05, 'rating': 4.0, 'userId':...
2          3  [{'date': 2003-03-29, 'rating': 4.0, 'userId':...
3          4  [{'date': 2005-09-06, 'rating': 3.0, 'userId':...
4          5  [{'date': 2005-02-08, 'rating': 5.0, 'userId':...
..       ...                                                ...
768    13508  [{'date': 2003-12-18, 'rating': 2.0, 'userId':...
769    13509  [{'date': 2003-05-29, 'rating': 3.0, 'userId':...
770    13510  [{'date': 2005-09-02, 'rating': 3.0, 'userId':...
771    13511  [{'date': 2000-10-01, 'rating': 4.0, 'userId':...
772    13512  [{'date': 2001-04-26, 'rating': 4.0, 'userId':...

[773 rows x 2 columns]


array([{'date': datetime.date(2005, 9, 6), 'rating': 3.0, 'userId': '1488844'},
       {'date': datetime.date(2005, 5, 13), 'rating': 5.0, 'userId': '822109'},
       {'date': datetime.date(2005, 10, 19), 'rating': 4.0, 'userId': '885013'},
       {'date': datetime.date(2005, 12, 26), 'rating': 4.0, 'userId': '30878'},
       {'date': datetime.date(2004, 5, 3), 'rating': 3.0, 'userId': '823519'},
       {'date': datetime.date(2005, 11, 17), 'rating': 3.0, 'userId': '893988'},
       {'date': datetime.date(2004, 8, 5), 'rating': 4.0, 'userId': '124105'},
       {'date': datetime.date(2004, 4, 22), 'rating': 3.0, 'userId': '1248029'},
       {'date': datetime.date(2004, 5, 9), 'rating': 4.0, 'userId': '1842128'},
       {'date': datetime.date(2005, 5, 11), 'rating': 3.0, 'userId': '2238063'},
       {'date': datetime.date(2005, 5, 19), 'rating': 4.0, 'userId': '1503895'},
       {'date': datetime.date(2005, 6, 6), 'rating': 5.0, 'userId': '2207774'},
       {'date': datetime.date(2004, 8

#### Let's work with movies and reviews first, add other features later:

To accomplish this, only the `rating` and `userId` fields of each review dictionary will be kept.

In [135]:
def _keep_user_and_rating(reviews):
    """Reduce each review dict to its 'userId' and 'rating' entries.

    Returns None when ``reviews`` is None. Reviews missing either key are
    dropped from the resulting list.
    """
    if reviews is None:
        return None
    slimmed = []
    for review in reviews:
        if 'userId' in review and 'rating' in review:
            slimmed.append({'userId': review['userId'], 'rating': review['rating']})
    return slimmed


netflix_df['review_data'] = netflix_df['review_data'].apply(_keep_user_and_rating)

#### First we need to find the unique user IDs and item (movie) IDs:

In [136]:
# Collect every distinct userId that appears in any movie's review list.
# A set is used because it does not allow duplicates.
unique_user_ids = {
    review_dict['userId']
    for reviews in netflix_df['review_data']
    if reviews is not None          # the slimming step can leave None for a movie
    for review_dict in reviews
    if review_dict.get('userId')    # skip missing / empty user IDs
}

# Sort before freezing into a list: set iteration order for strings changes
# between interpreter sessions (hash randomisation), which would silently
# change the user -> matrix-row assignment on every kernel restart.
user_ids = sorted(unique_user_ids)

In [137]:
# Distinct movie IDs in order of first appearance in netflix_df.
# Series.unique() already removes duplicates; wrapping it in set() would only
# scramble the order, so item_ids[i] would no longer correspond to the i-th movie.
item_ids = list(netflix_df['movieId'].unique())

In [138]:
# Report the dimensions the dense user-item matrix will have (users x movies).
print(
    f"The user/item matrix will be {len(user_ids)} x {len(item_ids)}. "
    "Therefore its likely the matrix will be very sparse."
)

The user/item matrix will be 417530 x 773. Therefore its likely the matrix will be very sparse.


#### Now we will populate the matrix with the ratings from `netflix_df`:

In [139]:
# Row lookup: userId -> row index (its position in user_ids).
user_ids_dict = {uid: idx for idx, uid in enumerate(user_ids)}
# Column lookup: movieId -> column index (its position in item_ids).
# Using the movie ID rather than the DataFrame row position keeps the columns
# aligned with item_ids and stays in bounds even if a movieId appears in
# more than one row.
item_ids_dict = {mid: idx for idx, mid in enumerate(item_ids)}

# Determine the number of users and movies
num_users = len(user_ids)
num_movies = len(item_ids)

# Initialize user-item matrix with zeros; a 0 therefore means "no rating recorded".
user_item_matrix = np.zeros((num_users, num_movies))

for reviews, movie_id in zip(netflix_df['review_data'], netflix_df['movieId']):
    if reviews is None:  # movie without any review data
        continue
    movie_idx = item_ids_dict[movie_id]
    for review in reviews:
        user_idx = user_ids_dict.get(review['userId'])
        if user_idx is not None:  # Check if user exists in user_ids
            rating = round(review['rating'], 6)
            # A later review by the same user for the same movie overwrites the earlier one.
            user_item_matrix[user_idx, movie_idx] = rating

In [140]:
# Show the dense user-item matrix (NumPy abbreviates the repr of large arrays).
user_item_matrix

array([[0., 0., 0., ..., 0., 4., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [141]:
# Sanity check: list the distinct values stored in the matrix
# (np.unique returns them sorted).
unique_values = np.unique(user_item_matrix)
print("Unique values in the user-item matrix:", unique_values)
user_item_matrix

Unique values in the user-item matrix: [0. 1. 2. 3. 4. 5.]


array([[0., 0., 0., ..., 0., 4., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [142]:
# Step 2: Center the Data
def center_data(user_item_matrix):
    """Mean-centre each user's ratings, ignoring unrated (zero) entries.

    A 0 in the matrix means "no rating", not "rating of 0". Averaging over the
    whole row would therefore drag every user's mean toward 0 and turn unrated
    cells into negative pseudo-ratings. Here the mean is taken over the rated
    entries only, and unrated cells stay at 0 after centring.

    Parameters
    ----------
    user_item_matrix : 2-D array-like of shape (users, items)
        Ratings, with 0 marking an unrated item. The input is not modified.

    Returns
    -------
    centered_user_item_matrix : np.ndarray
        Same shape as the input; rated cells hold ``rating - user_mean``,
        unrated cells hold 0.
    user_means : np.ndarray
        1-D array with one mean per user; 0 for users without any rating.
    """
    ratings = np.asarray(user_item_matrix, dtype=float)
    rated = ratings != 0
    rated_counts = rated.sum(axis=1)
    # Zeros add nothing to the row sum, so sum / count is the mean over rated
    # cells. maximum(..., 1) avoids 0/0 for users with no ratings (mean = 0).
    user_means = ratings.sum(axis=1) / np.maximum(rated_counts, 1)
    # The subtraction allocates a new array, so the caller's matrix is untouched.
    centered_user_item_matrix = ratings - user_means[:, np.newaxis]
    centered_user_item_matrix[~rated] = 0.0
    return centered_user_item_matrix, user_means

# Step 3: Apply SVD
def apply_svd(centered_user_item_matrix, num_latent_factors):
    """Compute a truncated SVD of the given matrix.

    Runs the reduced (``full_matrices=False``) decomposition and keeps the
    first ``num_latent_factors`` components.

    Returns
    -------
    tuple
        ``(U, Sigma, Vt)``: the leading columns of U, a square diagonal matrix
        of the leading singular values, and the leading rows of Vt.
    """
    k = num_latent_factors
    left_vectors, singular_values, right_vectors_t = np.linalg.svd(
        centered_user_item_matrix, full_matrices=False
    )
    return left_vectors[:, :k], np.diag(singular_values[:k]), right_vectors_t[:k, :]

# Step 4: Select the Number of Latent Factors
# How many singular values/vectors to keep; presumably passed to apply_svd as
# its num_latent_factors argument (no call is shown in these cells).
num_latent_factors = 2  # Example: Set the number of latent factors

# Step 5: Compute Recommendations
def compute_recommendations(U, Sigma, Vt, user_means, user_id, num_recommendations):
    """Rank the items a user has not rated yet by their reconstructed score.

    Reads the module-level ``user_item_matrix`` (to mask items the user has
    already rated) and ``user_ids_dict`` (to translate a user ID into a row).

    Parameters
    ----------
    U, Sigma, Vt : np.ndarray
        Truncated SVD factors, e.g. as returned by ``apply_svd``.
    user_means : np.ndarray
        Per-user offset added back to the reconstructed scores.
    user_id : hashable or int
        A key of ``user_ids_dict``. For backward compatibility, an integer that
        is not a key is treated as a 1-based row number.
    num_recommendations : int
        Maximum number of items to return.

    Returns
    -------
    np.ndarray
        1-based column positions of the best-scoring unrated items, best first.
        Fewer than ``num_recommendations`` entries are returned when the user
        has fewer unrated items than requested.
        NOTE(review): these are column positions + 1. They equal movie IDs only
        if the IDs are exactly 1..N in column order; confirm with callers
        before mapping through ``item_ids`` instead.

    Raises
    ------
    KeyError
        If ``user_id`` is neither a known ID nor an integer.
    IndexError
        If an integer ``user_id`` falls outside ``1..number of rows in U``.
    """
    if user_id in user_ids_dict:
        user_index = user_ids_dict[user_id]
    elif isinstance(user_id, (int, np.integer)) and not isinstance(user_id, bool):
        # Legacy calling convention: 1-based row number.
        user_index = user_id - 1
        # Without this check, user_id == 0 would silently wrap to the last row.
        if not 0 <= user_index < U.shape[0]:
            raise IndexError(f"user_id {user_id} is out of range 1..{U.shape[0]}")
    else:
        raise KeyError(f"Unknown user_id: {user_id!r}")

    user_ratings = np.dot(U[user_index, :], np.dot(Sigma, Vt)) + user_means[user_index]
    # Filter out items the user has already interacted with
    already_rated = user_item_matrix[user_index, :] > 0
    user_ratings[already_rated] = -np.inf
    ranked = np.argsort(user_ratings)[::-1]
    # Drop rated items explicitly so they can never be returned, even when
    # more recommendations are requested than unrated items exist.
    top_indices = ranked[~already_rated[ranked]][:num_recommendations]
    return top_indices + 1  # Add 1 to match item IDs (assuming item IDs start from 1)

# Step 6: Recommendation Generation
def generate_recommendations(user_id, num_recommendations):
    """Convenience wrapper around ``compute_recommendations``.

    Uses the module-level ``U``, ``Sigma``, ``Vt`` and ``user_means``, which
    must be defined before this function is called, and returns whatever
    ``compute_recommendations`` returns for the given user.
    """
    return compute_recommendations(
        U,
        Sigma,
        Vt,
        user_means,
        user_id,
        num_recommendations,
    )