In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from utils import get_content_similarity
import ast
import json

In [19]:
# df_ratings = pd.read_csv('dataset/ratings_small.csv') 
df_ratings = pd.read_csv('dataset/ratings_generated.csv', low_memory=False)
df_movies = pd.read_csv('dataset/movies_data_embeddings.csv', low_memory=False)
df_movies['embeddings'] = df_movies['embeddings'].apply(json.loads)

# Optionally, convert lists back to NumPy arrays
df_movies['embeddings'] = df_movies['embeddings'].apply(np.array)

In [20]:
def create_user_movie_matrix(ratings):
    """
    Create a user-movie matrix with users as rows and movies as columns.
    Missing entries are filled with 0.
    """
    user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
    return user_movie_matrix

user_movie_matrix = create_user_movie_matrix(df_ratings)
user_movie_matrix

movieId,5,6,11,12,13,14,15,16,17,18,...,455661,456018,456781,458298,459802,459928,460024,461297,461805,465044
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,1.0,4.0,5.0,4.0,3.0,1.0,1.0,2.0,5.0,...,2.0,0.0,3.0,4.0,0.0,4.0,2.0,2.0,0.0,1.0
2,0.0,3.0,5.0,5.0,5.0,2.0,3.0,3.0,0.0,1.0,...,3.0,0.0,0.0,5.0,3.0,5.0,3.0,3.0,2.0,0.0
3,5.0,4.0,5.0,0.0,0.0,4.0,2.0,4.0,5.0,4.0,...,1.0,1.0,1.0,5.0,3.0,5.0,3.0,5.0,1.0,2.0
4,4.0,4.0,0.0,2.0,0.0,0.0,4.0,3.0,5.0,0.0,...,0.0,5.0,4.0,1.0,5.0,5.0,0.0,5.0,5.0,2.0
5,0.0,2.0,5.0,4.0,4.0,2.0,1.0,4.0,2.0,5.0,...,1.0,3.0,2.0,4.0,2.0,3.0,3.0,2.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,3.0,0.0,0.0,4.0,0.0,4.0,3.0,5.0,0.0,4.0,...,5.0,5.0,0.0,1.0,0.0,5.0,5.0,4.0,3.0,2.0
297,3.0,2.0,5.0,0.0,0.0,4.0,0.0,5.0,1.0,4.0,...,3.0,5.0,3.0,3.0,3.0,3.0,4.0,2.0,0.0,1.0
298,3.0,4.0,3.0,3.0,0.0,0.0,1.0,5.0,0.0,4.0,...,5.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,1.0
299,0.0,1.0,5.0,0.0,1.0,5.0,0.0,5.0,5.0,3.0,...,0.0,0.0,0.0,4.0,2.0,2.0,1.0,0.0,1.0,1.0


In [21]:
def compute_svd(user_movie_matrix, k=20):
    """
    Perform SVD on the user-movie matrix and reduce to k latent factors.
    """
    matrix = user_movie_matrix.values
    U, sigma, Vt = np.linalg.svd(matrix, full_matrices=False)
    # Reduce to k latent factors
    U_k = U[:, :k]
    sigma_k = np.diag(sigma[:k])
    Vt_k = Vt[:k, :]
    return U_k, sigma_k, Vt_k

# Compute SVD
k = 400  # Number of latent factors
U_k, sigma_k, Vt_k = compute_svd(user_movie_matrix, k)

In [22]:
def predict_ratings(U_k, sigma_k, Vt_k):
    """
    Reconstruct the user-movie matrix to predict ratings.
    """
    return np.dot(np.dot(U_k, sigma_k), Vt_k)

predicted_ratings = predict_ratings(U_k, sigma_k, Vt_k)
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_movie_matrix.index, columns=user_movie_matrix.columns)
predicted_ratings_df

movieId,5,6,11,12,13,14,15,16,17,18,...,455661,456018,456781,458298,459802,459928,460024,461297,461805,465044
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.000000e+00,1.000000e+00,4.000000e+00,5.000000e+00,4.000000e+00,3.000000e+00,1.000000e+00,1.0,2.000000e+00,5.000000e+00,...,2.000000e+00,1.776357e-15,3.000000e+00,4.000000e+00,-5.773160e-15,4.000000e+00,2.000000e+00,2.000000e+00,-4.218847e-15,1.000000e+00
2,-2.712053e-12,3.000000e+00,5.000000e+00,5.000000e+00,5.000000e+00,2.000000e+00,3.000000e+00,3.0,-1.909584e-14,1.000000e+00,...,3.000000e+00,1.110223e-14,-1.071365e-14,5.000000e+00,3.000000e+00,5.000000e+00,3.000000e+00,3.000000e+00,2.000000e+00,-2.442491e-15
3,5.000000e+00,4.000000e+00,5.000000e+00,-2.137179e-14,-1.176836e-14,4.000000e+00,2.000000e+00,4.0,5.000000e+00,4.000000e+00,...,1.000000e+00,1.000000e+00,1.000000e+00,5.000000e+00,3.000000e+00,5.000000e+00,3.000000e+00,5.000000e+00,1.000000e+00,2.000000e+00
4,4.000000e+00,4.000000e+00,-8.959500e-13,2.000000e+00,1.554312e-15,-1.793010e-14,4.000000e+00,3.0,5.000000e+00,-3.080869e-14,...,7.993606e-15,5.000000e+00,4.000000e+00,1.000000e+00,5.000000e+00,5.000000e+00,1.332268e-15,5.000000e+00,5.000000e+00,2.000000e+00
5,-1.898481e-12,2.000000e+00,5.000000e+00,4.000000e+00,4.000000e+00,2.000000e+00,1.000000e+00,4.0,2.000000e+00,5.000000e+00,...,1.000000e+00,3.000000e+00,2.000000e+00,4.000000e+00,2.000000e+00,3.000000e+00,3.000000e+00,2.000000e+00,2.000000e+00,3.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,3.000000e+00,8.453377e-13,-5.141165e-13,4.000000e+00,-4.485301e-14,4.000000e+00,3.000000e+00,5.0,-6.572520e-14,4.000000e+00,...,5.000000e+00,5.000000e+00,-4.218847e-15,1.000000e+00,-5.107026e-15,5.000000e+00,5.000000e+00,4.000000e+00,3.000000e+00,2.000000e+00
297,3.000000e+00,2.000000e+00,5.000000e+00,-1.261213e-13,-2.713801e-14,4.000000e+00,1.110223e-14,5.0,1.000000e+00,4.000000e+00,...,3.000000e+00,5.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00,4.000000e+00,2.000000e+00,1.909584e-14,1.000000e+00
298,3.000000e+00,4.000000e+00,3.000000e+00,3.000000e+00,-5.273559e-14,-2.278733e-14,1.000000e+00,5.0,-6.727952e-14,4.000000e+00,...,5.000000e+00,5.000000e+00,-1.554312e-15,-9.436896e-16,-6.411538e-15,1.398881e-14,1.000000e+00,-2.442491e-15,4.000000e+00,1.000000e+00
299,-3.503864e-13,1.000000e+00,5.000000e+00,-8.676393e-14,1.000000e+00,5.000000e+00,-1.976197e-14,5.0,5.000000e+00,3.000000e+00,...,-1.831868e-15,-1.762479e-15,-1.132427e-14,4.000000e+00,2.000000e+00,2.000000e+00,1.000000e+00,-5.107026e-15,1.000000e+00,1.000000e+00


In [23]:
def collaborative_recommendations(user_id, top_n=10):
    """
    Get top N recommended movies for a user based on predicted ratings.
    """
    user_ratings = predicted_ratings_df.loc[user_id]
    # Exclude already rated movies
    rated_movies = user_movie_matrix.loc[user_id]
    unrated_movies = user_ratings[rated_movies == 0]
    # Get the top N recommendations
    top_recommendations = unrated_movies.sort_values(ascending=False).head(top_n)
    return top_recommendations

In [24]:
# Step 6: Hybrid Recommendations
def hybrid_recommendations(user_id, movie_id, top_n=10, alpha=0.5):
    """
    Generate hybrid recommendations for a user based on content similarity and collaborative filtering.
    alpha: Weight for blending content-based and collaborative scores.
    """
    # Content-based similar movies
    similar_movies = get_content_similarity(df_movies, movie_id, 4*top_n)
    print(similar_movies['title'])
    # Add collaborative filtering scores
    similar_movies['collab_score'] = similar_movies['id'].apply(lambda x: predicted_ratings_df.loc[user_id, x]
                                                                     if x in predicted_ratings_df.columns else 0)

    # Combine scores
    similar_movies['hybrid_score'] = alpha * similar_movies['similarity'] + (1 - alpha) * similar_movies['collab_score']

    # Return movies sorted by hybrid score
    return similar_movies.sort_values(by='hybrid_score', ascending=False).head(top_n)

In [25]:
user_id = 1
movie_id = 5  # Movie for which recommendations are requested
top_n = 5  # Number of recommendations

recommendations = hybrid_recommendations(user_id, movie_id, top_n=top_n, alpha=0.5)
recommendations

645                 The Apartment
15064                 Powder Room
17246             Table for Three
13028            About Last Night
5980                  The Bellboy
11244                       Hotel
7135                   Waiting...
2201                   Body Shots
14150              Bachelor Night
9158     Vacancy 2: The First Cut
15346        Bachelor Party Vegas
16532                    Room 314
5182                  Plaza Suite
15200             Screwball Hotel
8971                   Management
2334              Room at the Top
19810           A Year and Change
3702             California Suite
15008     Week-End at the Waldorf
5136      Blame It on the Bellboy
Name: title, dtype: object


Unnamed: 0,adult,id,original_language,original_title,overview,popularity,runtime,tagline,title,vote_average,...,release_year,overview_keywords,tags,directors,characters,actors,embeddings,similarity,collab_score,hybrid_score
13028,False,222899,en,About Last Night,A modern reimagining of the classic romantic c...,9.543004,100.0,It's about compromise. It's about love. It's a...,About Last Night,6.0,...,2014.0,"['modern', 'reimagining', 'classic', 'romantic...",['duringcreditsstinger'],['Steve Pink'],"['Bernie', 'Danny', 'Joan']","['Kevin Hart', 'Michael Ealy', 'Regina Hall']","[-0.14264850318431854, -0.13995902240276337, -...",0.506749,5.0,2.753374
15346,False,14505,en,Bachelor Party Vegas,A planned evening of debauchery in Las Vegas t...,6.262514,91.0,This is one weekend they will never forget!,Bachelor Party Vegas,5.0,...,2006.0,"['plan', 'evening', 'debauchery', 'las', 'vega...","['female nudity', 'sex', 'bachelor party']",['Eric Bernt'],"['Z-Bob', 'Nathan', 'Ash']","['Kal Penn', 'Jonathan Bennett', 'Donald Faison']","[-0.015568161383271217, 0.05240566283464432, -...",0.489975,5.0,2.744988
16532,False,60002,en,Room 314,How many stories can one hotel room tell? Watc...,0.172008,0.0,How many stories can one hotel room tell?,Room 314,4.0,...,2006.0,"['story', 'hotel', 'room', 'tell', 'watch', 'c...",['independent film'],[],[],[],"[0.1762130707502365, -0.2454240620136261, -0.0...",0.489015,5.0,2.744507
5136,False,34151,en,Blame It on the Bellboy,"Mike Lawton, Maurice Horton, and Melvin Orton ...",7.777791,78.0,"Mix-ups, Mishaps, Madness, and Mayhem... It's ...",Blame It on the Bellboy,5.8,...,1992.0,"['mike', 'lawton', 'maurice', 'horton', 'melvi...",[],['Mark Herman'],"['Melvyn Orton', 'Mike Lawton / Charlton Black...","['Dudley Moore', 'Bryan Brown', 'Richard Griff...","[-0.14066843688488007, -0.0925675556063652, -0...",0.478118,5.0,2.739059
645,False,284,en,The Apartment,Bud Baxter is a minor clerk in a huge New York...,11.994281,125.0,"Movie-wise, there has never been anything like...",The Apartment,8.1,...,1960.0,"['bud', 'baxter', 'minor', 'clerk', 'huge', 'y...","['new york', ""new year's eve"", 'lovesickness',...","['Billy Wilder', 'Hal W. Polaire', 'David Salv...","['C.C. Baxter', 'Fran Kubelik', 'Jeff D. Sheld...","['Jack Lemmon', 'Shirley MacLaine', 'Fred MacM...","[-0.018236879259347916, 0.03725520521402359, -...",0.527283,4.0,2.263642
