In [44]:
import numpy as np
import pandas as pd

In [45]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [46]:
netflix_df = pd.read_parquet('cleaned/netflix_parquet')

In [47]:
# make samples using random sampling:
sample_3quarter = netflix_df.sample(frac=3/4, random_state=42)
sample_half = netflix_df.sample(frac=1/2, random_state=42)
sample_third = netflix_df.sample(frac=1/3, random_state=42)
sample_quarter = netflix_df.sample(frac=1/4, random_state=42)
sample_sixth = netflix_df.sample(frac=1/6, random_state=42)
sample_tenth = netflix_df.sample(frac=1/10, random_state=42)
sample_25th = netflix_df.sample(frac=1/25, random_state=42)

### Feature engineering:

A user-item matrix will be created.

In [48]:
sample_tenth = sample_tenth.drop(['year','title'],axis=1)
sample_tenth

Unnamed: 0,movieId,review_data
123,124,"[{'date': 2002-04-01, 'rating': 3.0, 'userId':..."
1193,13434,"[{'date': 2003-02-20, 'rating': 2.0, 'userId':..."
462,4601,"[{'date': 2003-12-27, 'rating': 3.0, 'userId':..."
351,352,"[{'date': 2003-10-13, 'rating': 4.0, 'userId':..."
1058,9560,"[{'date': 2003-07-06, 'rating': 4.0, 'userId':..."
...,...,...
367,4506,"[{'date': 2005-07-06, 'rating': 4.0, 'userId':..."
1415,13656,"[{'date': 2003-11-21, 'rating': 2.0, 'userId':..."
768,9270,"[{'date': 2002-09-26, 'rating': 4.0, 'userId':..."
1218,13459,"[{'date': 2000-07-18, 'rating': 2.0, 'userId':..."


#### Let's work with movies and reviews first, add other features later:

Only rating and userId of dictionary will be kept to accomplish this.

In [49]:
sample_tenth['review_data'] = sample_tenth['review_data'].apply(lambda x: None if x is None else [{'userId': review['userId'], 'rating': review['rating']} for review in x if 'userId' in review and 'rating' in review])

In [52]:
def train_val_test_split(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Splits the data into training, validation, and test sets.

    Parameters:
    - data: pandas DataFrame containing the data to be split.
    - train_ratio: float, ratio of the training set size to the total data size (default: 0.8).
    - val_ratio: float, ratio of the validation set size to the total data size (default: 0.1).
    - test_ratio: float, ratio of the test set size to the total data size (default: 0.1).

    Returns:
    - train_data: pandas DataFrame, training set.
    - val_data: pandas DataFrame, validation set.
    - test_data: pandas DataFrame, test set.
    """
    # Shuffle the data
    data_shuffled = data.sample(frac=1, random_state=42)

    # Calculate the sizes of each set
    num_samples = len(data_shuffled)
    num_train = int(train_ratio * num_samples)
    num_val = int(val_ratio * num_samples)
    num_test = num_samples - num_train - num_val

    # Split the data into train, validation, and test sets
    train_data = data_shuffled[:num_train]
    val_data = data_shuffled[num_train:num_train+num_val]
    test_data = data_shuffled[num_train+num_val:num_train+num_val+num_test]

    return train_data, val_data, test_data

In [53]:
train_data, val_data, test_data = train_val_test_split(sample_tenth)

#### First we need to find unique user and item(movie)ids:

In [54]:
# set is used because it does not allow for duplicates
user_ids = set()

# iterate over each row
for index, row in train_data.iterrows():
    # iterate over each dictionary in the 'review_data' column of the current row
    for review_dict in row['review_data']:
        user_id = review_dict.get('userId')  # Extract userId from the dictionary
        if user_id:  # Check if userId exists
            user_ids.add(user_id)  # Add user ID to the set

user_ids = list(user_ids)

In [55]:
# put movieids in set so duplicates are not allowed for here either
item_ids = list(set(train_data['movieId'].unique()))

In [56]:
print("The user/item matrix will be {} x {}. Therefore its likely the matrix will be very sparse.".format(len(user_ids),len(item_ids))) 

The user/item matrix will be 238312 x 114. Therefore its likely the matrix will be very sparse.


#### Now we will populate the matrix with the matrix_df values:

In [57]:
# Assuming user_ids is a list of unique user IDs
user_ids_dict = {uid: idx for idx, uid in enumerate(user_ids)}

# Determine the number of users and movies
num_users = len(user_ids)
num_movies = len(item_ids)

# Initialize user-item matrix with zeros
user_item_matrix = np.zeros((num_users, num_movies))

for i, (reviews, movie_id) in enumerate(zip(train_data['review_data'], train_data['movieId'])):
    for review in reviews:
        user_idx = user_ids_dict.get(review['userId'])
        if user_idx is not None:  # Check if user exists in user_ids
            rating = round(review['rating'], 6)
            # Populate the user-item matrix using the user index as the row index
            user_item_matrix[user_idx, i] = rating

In [58]:
user_item_matrix

array([[0., 0., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [3., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [59]:
unique_values = np.unique(user_item_matrix)
print("Unique values in the user-item matrix:", unique_values)
user_item_matrix

Unique values in the user-item matrix: [0. 1. 2. 3. 4. 5.]


array([[0., 0., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [3., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [60]:
# Step 2: Center the Data
def center_data(user_item_matrix):
    user_means = np.mean(user_item_matrix, axis=1)
    centered_user_item_matrix = user_item_matrix - user_means[:, np.newaxis]
    return centered_user_item_matrix, user_means

# Step 3: Apply SVD
def apply_svd(centered_user_item_matrix, num_latent_factors):
    U, Sigma, Vt = np.linalg.svd(centered_user_item_matrix, full_matrices=False)
    Sigma = np.diag(Sigma[:num_latent_factors])
    U = U[:, :num_latent_factors]
    Vt = Vt[:num_latent_factors, :]
    return U, Sigma, Vt

# Step 4: Select the Number of Latent Factors
num_latent_factors = 2  # Example: Set the number of latent factors

# Step 5: Compute Recommendations
def compute_recommendations(U, Sigma, Vt, user_means, user_id, num_recommendations):
    user_index = user_id - 1
    user_ratings = np.dot(U[user_index, :], np.dot(Sigma, Vt)) + user_means[user_index]
    # Filter out items the user has already interacted with
    user_ratings[user_item_matrix[user_index, :] > 0] = -np.inf
    top_indices = np.argsort(user_ratings)[::-1][:num_recommendations]
    return top_indices + 1  # Add 1 to match item IDs (assuming item IDs start from 1)

# Step 6: Recommendation Generation
def generate_recommendations(user_id, num_recommendations):
    recommendations = compute_recommendations(U, Sigma, Vt, user_means, user_id, num_recommendations)
    return recommendations

In [72]:
user_ids

['1707083',
 '1225166',
 '271571',
 '1187504',
 '376197',
 '343079',
 '2314153',
 '593400',
 '984864',
 '1775345',
 '11059',
 '203982',
 '17083',
 '369256',
 '2195799',
 '1451179',
 '1275924',
 '270212',
 '2067793',
 '817811',
 '2388852',
 '1043570',
 '2462248',
 '821900',
 '291156',
 '2469581',
 '1454436',
 '360846',
 '2632347',
 '1860633',
 '1222147',
 '2216584',
 '1978818',
 '488519',
 '592636',
 '1102182',
 '1155747',
 '2071147',
 '1754534',
 '2433552',
 '1487026',
 '1227725',
 '1958917',
 '346499',
 '789999',
 '1186156',
 '2378681',
 '2343337',
 '1657798',
 '1645783',
 '1225309',
 '2394320',
 '260430',
 '24373',
 '1072822',
 '1062539',
 '1183014',
 '1462925',
 '1523528',
 '108733',
 '385413',
 '1657452',
 '235786',
 '156948',
 '1994031',
 '1978844',
 '1816429',
 '458577',
 '445879',
 '1904886',
 '2378599',
 '2063145',
 '2352323',
 '408117',
 '386278',
 '506115',
 '2365159',
 '480163',
 '2360901',
 '1682857',
 '2370132',
 '1675249',
 '1815109',
 '1756385',
 '802808',
 '1696864',
 '

In [75]:
centered_user_item_matrix, user_means = center_data(user_item_matrix)
U, Sigma, Vt = apply_svd(centered_user_item_matrix, num_latent_factors)
num_recommendations = 2
# Iterate over each user ID in the user_ids list
for user_id in user_ids:
    user_id = int(user_id)
    # Generate recommendations for the current user
    recommendations = generate_recommendations(user_id, num_recommendations)
    print("Recommendations for user", user_id, ":", recommendations)

IndexError: index 1707082 is out of bounds for axis 0 with size 238312