In [19]:
import pandas as pd
import numpy as np
from utils import get_content_similarity
import json

In [20]:
# Load the ratings table and the movie table that carries an 'embeddings' column.
df_ratings = pd.read_csv('dataset/ratings_generated.csv', low_memory=False)
df_movies = pd.read_csv('dataset/movies_data_embeddings.csv', low_memory=False)

# Each embedding cell is JSON text: decode it, then wrap the result in a NumPy array.
df_movies['embeddings'] = (
    df_movies['embeddings']
    .apply(json.loads)
    .apply(np.array)
)

In [21]:
def create_user_movie_matrix(ratings):
    """Reshape long-format ratings into a user-by-movie table.

    Args:
        ratings: DataFrame with 'userId', 'movieId' and 'rating' columns.

    Returns:
        DataFrame indexed by 'userId' with one column per 'movieId'; cells
        hold the rating, and cells with no value are filled with 0.
    """
    wide_ratings = ratings.pivot(index='userId', columns='movieId', values='rating')
    return wide_ratings.fillna(0)

# Build the user-by-movie rating table from the loaded ratings and display it.
user_movie_matrix = create_user_movie_matrix(df_ratings)
user_movie_matrix

movieId,5,6,11,12,13,14,15,16,17,18,...,455661,456018,456781,458298,459802,459928,460024,461297,461805,465044
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,1.0,4.0,5.0,4.0,3.0,1.0,1.0,2.0,5.0,...,2.0,0.0,3.0,4.0,0.0,4.0,2.0,2.0,0.0,1.0
2,0.0,3.0,5.0,5.0,5.0,2.0,3.0,3.0,0.0,1.0,...,3.0,0.0,0.0,5.0,3.0,5.0,3.0,3.0,2.0,0.0
3,5.0,4.0,5.0,0.0,0.0,4.0,2.0,4.0,5.0,4.0,...,1.0,1.0,1.0,5.0,3.0,5.0,3.0,5.0,1.0,2.0
4,4.0,4.0,0.0,2.0,0.0,0.0,4.0,3.0,5.0,0.0,...,0.0,5.0,4.0,1.0,5.0,5.0,0.0,5.0,5.0,2.0
5,0.0,2.0,5.0,4.0,4.0,2.0,1.0,4.0,2.0,5.0,...,1.0,3.0,2.0,4.0,2.0,3.0,3.0,2.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,3.0,0.0,0.0,4.0,0.0,4.0,3.0,5.0,0.0,4.0,...,5.0,5.0,0.0,1.0,0.0,5.0,5.0,4.0,3.0,2.0
297,3.0,2.0,5.0,0.0,0.0,4.0,0.0,5.0,1.0,4.0,...,3.0,5.0,3.0,3.0,3.0,3.0,4.0,2.0,0.0,1.0
298,3.0,4.0,3.0,3.0,0.0,0.0,1.0,5.0,0.0,4.0,...,5.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,1.0
299,0.0,1.0,5.0,0.0,1.0,5.0,0.0,5.0,5.0,3.0,...,0.0,0.0,0.0,4.0,2.0,2.0,1.0,0.0,1.0,1.0


In [22]:
def compute_svd(user_movie_matrix, k=20):
    """Compute a truncated SVD of the user-by-movie table.

    Args:
        user_movie_matrix: DataFrame whose ``.values`` is the 2-D rating matrix.
        k: Number of leading singular values/vectors to keep.

    Returns:
        Tuple ``(U_k, sigma_k, Vt_k)``: the first ``k`` left singular vectors
        (as columns), a diagonal matrix of the first ``k`` singular values,
        and the first ``k`` right singular vectors (as rows).
    """
    # Reduced SVD first; the truncation to k components happens by slicing.
    left, singular_values, right_t = np.linalg.svd(
        user_movie_matrix.values, full_matrices=False
    )
    return left[:, :k], np.diag(singular_values[:k]), right_t[:k, :]

k = 200  # number of leading singular values/vectors to keep (overrides compute_svd's default of 20)
U_k, sigma_k, Vt_k = compute_svd(user_movie_matrix, k)

In [23]:
def predict_ratings(U_k, sigma_k, Vt_k):
    """Multiply the truncated SVD factors back into a single matrix.

    Args:
        U_k: Left factor.
        sigma_k: Middle (singular value) factor.
        Vt_k: Right factor.

    Returns:
        The product ``(U_k . sigma_k) . Vt_k``.
    """
    scaled_left = np.dot(U_k, sigma_k)
    return np.dot(scaled_left, Vt_k)

# Rebuild the low-rank approximation and label it with the same user/movie
# axes as user_movie_matrix so it can be looked up by userId and movieId.
predicted_ratings = predict_ratings(U_k, sigma_k, Vt_k)
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_movie_matrix.index, columns=user_movie_matrix.columns)
predicted_ratings_df

movieId,5,6,11,12,13,14,15,16,17,18,...,455661,456018,456781,458298,459802,459928,460024,461297,461805,465044
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.995719,1.094210,5.104703,3.990432,3.721595,3.216500,0.800072,1.135165,2.406389,4.579373,...,2.548966,2.472938,2.798762,3.867859,1.585321,3.958448,2.779355,3.199304,1.266020,1.363318
2,2.187329,2.168844,5.949948,4.014375,4.818516,1.678524,3.668751,2.454160,0.905711,1.261258,...,3.863479,1.638981,0.698955,3.825366,1.863504,3.388516,1.522669,3.371634,1.797914,0.862577
3,4.357223,2.443939,5.478785,0.255670,0.599130,2.719034,1.153149,3.367220,2.951093,3.543109,...,0.636095,1.043671,1.667218,5.140607,1.789504,3.894502,3.450246,4.630219,1.923521,1.123040
4,3.873280,3.718954,0.972098,3.082024,0.482781,0.702436,2.678592,1.986596,4.952734,0.068086,...,0.606871,4.644738,3.926422,0.474387,2.602791,5.155706,0.630305,2.825728,4.126551,3.172374
5,-0.446806,1.849725,3.311257,4.838129,3.583744,2.802359,1.941673,4.297351,2.067231,3.800283,...,2.690444,2.819085,3.258106,2.665853,3.056611,1.946508,3.665217,1.722246,2.530174,3.321082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,2.510524,0.928352,1.284286,4.599774,1.377725,3.596236,3.997840,4.546889,-0.852859,3.088160,...,4.692245,3.565650,1.494044,1.566857,1.407845,3.451951,4.567842,3.894074,3.164406,1.630131
297,2.346710,4.026140,2.720596,0.315244,0.472967,4.473582,0.972641,4.189309,1.132582,4.150004,...,3.785724,3.559208,2.799961,2.135025,1.953980,3.317359,3.852439,2.232175,0.591534,1.657947
298,2.355009,3.024893,2.520976,2.173161,0.626169,0.177161,1.716518,4.128797,-1.281584,4.105161,...,3.353091,4.887209,1.800900,0.134102,0.165778,0.726922,-0.224037,-0.815442,3.128026,0.839683
299,0.940466,-0.220263,4.352632,0.826094,1.030691,5.370588,1.347399,2.919950,4.749657,2.842538,...,1.079467,-0.123649,1.341027,2.488760,1.166458,2.542539,0.903595,0.547683,0.709485,1.816941


In [24]:
def collaborative_recommendations(user_id, top_n=10):
    """Return the highest predicted ratings among movies the user has not rated.

    Reads the notebook-level ``predicted_ratings_df`` and ``user_movie_matrix``.
    A movie counts as unrated when its entry in ``user_movie_matrix`` is 0.

    Args:
        user_id: Row label to look up in both tables.
        top_n: Maximum number of entries to return.

    Returns:
        Series of predicted ratings keyed by movie column label, sorted from
        highest to lowest.
    """
    not_yet_rated = user_movie_matrix.loc[user_id] == 0
    candidate_scores = predicted_ratings_df.loc[user_id][not_yet_rated]
    return candidate_scores.sort_values(ascending=False).head(top_n)

In [40]:
def hybrid_recommendations(user_id, movie_id, top_n=10, alpha=0.5):
    """Rank content-similar movies by a blend of similarity and predicted rating.

    Reads the notebook-level ``df_movies`` and ``predicted_ratings_df``.

    Args:
        user_id: Row label of the user in ``predicted_ratings_df``.
        movie_id: Seed movie passed to ``get_content_similarity``.
        top_n: Number of rows to return; ``4 * top_n`` is passed to
            ``get_content_similarity`` (presumably the candidate count --
            confirm against utils).
        alpha: Weight on the 'similarity' column; ``1 - alpha`` goes to the
            predicted rating.

    Returns:
        The ``top_n`` candidate rows with the highest 'hybrid_score', with
        'collab_score' and 'hybrid_score' columns added.

    Raises:
        KeyError: If ``user_id`` is not a row of ``predicted_ratings_df``.
    """
    # Copy so the score columns are added to a frame owned by this function:
    # avoids SettingWithCopyWarning and never mutates whatever the helper returned.
    similar_movies = get_content_similarity(df_movies, movie_id, 4*top_n).copy()
    print(similar_movies['original_title'][:5])

    # The user's predicted-rating row is loop-invariant; fetch it once.
    user_predictions = predicted_ratings_df.loc[user_id]
    known_movie_ids = user_predictions.index

    # Candidates without a predicted rating get a collaborative score of 0.
    similar_movies['collab_score'] = similar_movies['id'].apply(
        lambda x: user_predictions.loc[x] if x in known_movie_ids else 0
    )

    # NOTE(review): 'similarity' and 'collab_score' are blended without rescaling.
    # If similarity lies in [0, 1] while predicted ratings are on the rating
    # scale, the collaborative term dominates the ranking -- confirm intended.
    similar_movies['hybrid_score'] = (
        alpha * similar_movies['similarity']
        + (1 - alpha) * similar_movies['collab_score']
    )
    return similar_movies.sort_values(by='hybrid_score', ascending=False).head(top_n)

In [47]:
# Demo inputs for the hybrid recommender.
user_id = 1
movie_id = 5
top_n = 5

# Pass the top_n variable rather than a repeated literal so that editing the
# value above actually changes the number of recommendations returned.
recommendations = hybrid_recommendations(user_id, movie_id, top_n=top_n, alpha=0.5)
recommendations[['id', 'title']]

645         The Apartment
15064         Powder Room
17246     Table for Three
13028    About Last Night
5980          The Bellboy
Name: original_title, dtype: object


Unnamed: 0,id,title
13028,222899,About Last Night
5136,34151,Blame It on the Bellboy
645,284,The Apartment
15064,226363,Powder Room
16532,60002,Room 314


In [43]:
# Compare the SVD-predicted scores (printed) with the stored ratings (displayed)
# for the recommended movie ids. Relies on user_id and recommendations being
# defined by the hybrid-recommendation demo cell.
print(predicted_ratings_df.loc[user_id, recommendations["id"]])
user_movie_matrix.loc[user_id, recommendations["id"]]

movieId
222899    4.821535
34151     4.403131
284       4.150017
226363    4.038229
60002     4.059041
Name: 1, dtype: float64


movieId
222899    5.0
34151     5.0
284       4.0
226363    4.0
60002     5.0
Name: 1, dtype: float64