In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from utils import get_content_similarity
import ast
import json

In [25]:
df_ratings = pd.read_csv('dataset/ratings_small.csv') 
df_movies = pd.read_csv('dataset/movies_data_embeddings.csv', low_memory=False)
df_movies['embeddings'] = df_movies['embeddings'].apply(json.loads)

# Optionally, convert lists back to NumPy arrays
df_movies['embeddings'] = df_movies['embeddings'].apply(np.array)

In [3]:
def create_user_movie_matrix(ratings):
    """
    Create a user-movie matrix with users as rows and movies as columns.
    Missing entries are filled with 0.
    """
    user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
    return user_movie_matrix

user_movie_matrix = create_user_movie_matrix(df_ratings)
user_movie_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
def compute_svd(user_movie_matrix, k=20):
    """
    Perform SVD on the user-movie matrix and reduce to k latent factors.
    """
    matrix = user_movie_matrix.values
    U, sigma, Vt = np.linalg.svd(matrix, full_matrices=False)
    # Reduce to k latent factors
    U_k = U[:, :k]
    sigma_k = np.diag(sigma[:k])
    Vt_k = Vt[:k, :]
    return U_k, sigma_k, Vt_k

# Compute SVD
k = 20  # Number of latent factors
U_k, sigma_k, Vt_k = compute_svd(user_movie_matrix, k)

In [5]:
def predict_ratings(U_k, sigma_k, Vt_k):
    """
    Reconstruct the user-movie matrix to predict ratings.
    """
    return np.dot(np.dot(U_k, sigma_k), Vt_k)

predicted_ratings = predict_ratings(U_k, sigma_k, Vt_k)
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_movie_matrix.index, columns=user_movie_matrix.columns)
predicted_ratings_df

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.044813,0.013889,0.001341,-0.005832,-0.039115,0.043074,0.004063,-0.007086,-0.004045,0.033885,...,-0.001459,-0.001540,0.020654,0.001150,0.001725,-0.002677,0.030982,-0.000678,-0.000407,-0.002918
2,0.666567,1.479836,-0.047319,0.127629,0.259239,0.589083,0.126781,0.040950,0.022299,2.242133,...,0.003219,0.002013,-0.002494,-0.003948,-0.005921,-0.022349,-0.003741,0.000397,0.000238,0.006437
3,1.048507,0.330970,-0.012771,0.007109,0.030008,0.097194,-0.072403,-0.020532,-0.048307,0.451688,...,-0.002163,-0.005212,0.008056,-0.006371,-0.009556,-0.017288,0.012084,0.005364,0.003218,-0.004326
4,1.832262,1.435050,0.286621,0.170728,-0.011755,-0.582135,0.009321,-0.036736,0.150644,1.467122,...,0.028887,0.006862,0.068589,0.008977,0.013465,0.049192,0.102883,-0.007377,-0.004426,0.057773
5,1.945208,1.357930,0.552549,0.107910,0.633452,0.032023,0.271999,0.164337,0.068556,0.511015,...,-0.003500,-0.004342,0.001920,0.002994,0.004491,-0.000533,0.002880,0.005864,0.003518,-0.007001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,1.530127,0.864579,0.480869,0.149693,0.497642,1.393595,0.619800,0.049702,0.266400,1.295869,...,0.003672,0.000660,0.008511,0.004436,0.006655,-0.000604,0.012767,-0.002617,-0.001570,0.007344
668,0.494503,-0.124331,0.029578,0.022238,0.016577,0.287814,0.000319,-0.009426,-0.002144,0.040967,...,-0.003015,-0.001573,-0.001612,0.006020,0.009030,-0.034325,-0.002419,-0.000110,-0.000066,-0.006031
669,0.539520,0.034518,0.104565,0.020079,0.089153,0.118650,0.030190,0.000717,0.008118,-0.039229,...,-0.003970,0.003272,-0.007095,0.000513,0.000769,0.007917,-0.010643,-0.000039,-0.000023,-0.007940
670,1.431368,0.121979,0.196389,0.056162,0.138813,0.694786,0.161205,0.005677,0.070996,0.279289,...,0.009560,-0.002998,0.003663,0.000146,0.000219,-0.020511,0.005494,0.003497,0.002098,0.019120


In [6]:
def collaborative_recommendations(user_id, top_n=10):
    """
    Get top N recommended movies for a user based on predicted ratings.
    """
    user_ratings = predicted_ratings_df.loc[user_id]
    # Exclude already rated movies
    rated_movies = user_movie_matrix.loc[user_id]
    unrated_movies = user_ratings[rated_movies == 0]
    # Get the top N recommendations
    top_recommendations = unrated_movies.sort_values(ascending=False).head(top_n)
    return top_recommendations

In [49]:
# Step 6: Hybrid Recommendations
def hybrid_recommendations(user_id, movie_id, top_n=10, alpha=0.5):
    """
    Generate hybrid recommendations for a user based on content similarity and collaborative filtering.
    alpha: Weight for blending content-based and collaborative scores.
    """
    # Content-based similar movies
    similar_movies = get_content_similarity(df_movies, movie_id, 4*top_n)
    print(similar_movies['title'])
    # Add collaborative filtering scores
    similar_movies['collab_score'] = similar_movies['id'].apply(lambda x: predicted_ratings_df.loc[user_id, x]
                                                                     if x in predicted_ratings_df.columns else 0)

    # Combine scores
    similar_movies['hybrid_score'] = alpha * similar_movies['similarity'] + (1 - alpha) * similar_movies['collab_score']

    # Return movies sorted by hybrid score
    return similar_movies.sort_values(by='hybrid_score', ascending=False).head(top_n)

In [50]:
user_id = 1
movie_id = 15  # Movie for which recommendations are requested
top_n = 5  # Number of recommendations

recommendations = hybrid_recommendations(user_id, movie_id, top_n=top_n, alpha=0.5)
recommendations

1598                                       Saboteur
7232                                         Edison
7731                                           Gacy
1834                                            8MM
1668                                   Detroit 9000
9842                                           2:13
20225                                 True Identity
20358                               Somebody's Hero
7648                                Everyone's Hero
6094                                   The Stranger
9269                            Law Abiding Citizen
5572                          While the City Sleeps
9505                       The House on 92nd Street
2589                                American Psycho
10964                                   The Bat Man
18911                              The Captive City
13571                        A Dangerous Profession
6254                                   Brother John
5648     Investigation of a Citizen Above Suspicion
3999        

Unnamed: 0,adult,id,original_language,original_title,overview,popularity,runtime,tagline,title,vote_average,...,release_year,overview_keywords,tags,directors,characters,actors,embeddings,similarity,collab_score,hybrid_score
1598,False,31997,en,Saboteur,Aircraft factory worker Barry Kane goes on the...,4.911826,108.0,You'd like to say - IT CAN'T HAPPEN HERE!... b...,Saboteur,6.8,...,1942.0,"['aircraft', 'factory', 'worker', 'barry', 'ka...","['falsely accused', 'suspense', 'aircraft fact...","['Alfred Hitchcock', 'Fred Frank', 'Adele Cann...","['Pat Martin', 'Barry Kane', 'Charles Tobin']","['Priscilla Lane', 'Robert Cummings', 'Otto Kr...","[-0.16216124594211578, 0.0444406159222126, -0....",0.545571,0.0,0.272785
7232,False,10064,en,Edison,"Upon discovering a den of corrupt policemen, a...",5.394304,99.0,"In this city, only the cops are above the law.",Edison,5.4,...,2005.0,"['discover', 'den', 'corrupt', 'policeman', 'f...","['journalist', 'police brutality', 'metropolis...",['David J. Burke'],"['Moses Ashford', 'Levon Wallace', 'Josh Polla...","['Morgan Freeman', 'Kevin Spacey', 'Justin Tim...","[-0.18535935878753662, -0.08495169878005981, -...",0.537901,0.0,0.26895
7731,False,27387,en,Gacy,Based on a true story of serial killer a model...,6.677536,88.0,Friend. Neighbor. Killer.,Gacy,4.4,...,2003.0,"['base', 'true', 'story', 'serial', 'killer', ...","['serial killer', 'democrat']",['Clive Saunders'],"['John Wayne Gacy, Jr.', 'John Gacy, Sr.', 'To...","['Mark Holton', 'Adam Baldwin', 'Charlie Weber']","[-0.1652667373418808, 0.051793426275253296, -0...",0.531825,0.0,0.265912
1834,False,8224,en,8MM,"A small, seemingly innocuous plastic reel of f...",7.473718,123.0,You can't prepare for where the truth will tak...,8MM,6.1,...,1999.0,"['small', 'seemingly', 'innocuous', 'plastic',...","['pornography', 'porn actor', 'loss of daughte...","['Joel Schumacher', 'Alan Edmisten', 'Mads Han...","['Tom Welles', 'Max California', 'Eddie Poole']","['Nicolas Cage', 'Joaquin Phoenix', 'James Gan...","[-0.05856899544596672, -0.05534164234995842, -...",0.496173,0.0,0.248087
1668,False,85837,en,Detroit 9000,After a fundraiser for a black politician is r...,0.37055,106.0,It's the murder capital of the world. And the ...,Detroit 9000,6.0,...,1973.0,"['fundraiser', 'black', 'politician', 'rob', '...","['hitman', 'police', 'party', 'murder', 'money...",['Arthur Marks'],"['Lt. Danny Bassett', 'Ruby Harris', 'Sergeant...","['Alex Rocco', 'Vonetta McGee', 'Hari Rhodes']","[-0.21278412640094757, -0.08952146768569946, -...",0.496147,0.0,0.248074
