In [26]:
import numpy as np
import pandas as pd
import pickle
from surprise import Reader, Dataset
from sklearn.linear_model import Ridge
import json

In [24]:
df_merged = pd.read_csv('../data/merged_movies.csv', encoding='utf-8')
df_ratings = pd.read_csv("../data/ratings.csv")
df_ratings = df_ratings.groupby('movieId').filter(lambda x: len(x) > 10).groupby('userId').filter(lambda x: len(x) > 5)

reader = Reader()
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

In [27]:
# Load the SVD model from the saved file
with open('svd_model.pkl', 'rb') as f:
    svd = pickle.load(f)

    new_user_id = df_ratings['userId'].max() + 1  # Assign an ID to the new user
    
    # Select random movie IDs from actual movie IDs in df_ratings
    movie_ids = df_ratings['movieId'].unique()
    random_movie_ids = np.random.choice(movie_ids, size=10, replace=False)
    
    # Generate random user ratings for these movies
    new_user_ratings = {int(movie_id): np.random.randint(1, 6) for movie_id in random_movie_ids}
    
    # Step 1: Map raw IDs to inner IDs and collect item factors and biases
    item_inner_ids = {}
    qi_list = []
    bi_list = []
    ratings_list = []
    
    for raw_iid, rating in new_user_ratings.items():
        try:
            inner_id = trainset.to_inner_iid(raw_iid)
            item_inner_ids[raw_iid] = inner_id
            qi = svd.qi[inner_id]  # Item factors
            bi = svd.bi[inner_id]  # Item bias
            qi_list.append(qi)
            bi_list.append(bi)
            ratings_list.append(rating)
        except ValueError:
            # The item was not in the training set
            print(f"MovieId {raw_iid} not in training set, skipping.")
    
    # Step 2: Set up the least squares problem to solve for pu and bu
    ratings_arr = np.array(ratings_list)
    global_mean = trainset.global_mean
    bi_arr = np.array(bi_list)
    y = ratings_arr - global_mean - bi_arr  # Target variable
    
    qi_arr = np.array(qi_list)  # Design matrix without bias term
    X = np.hstack([np.ones((qi_arr.shape[0], 1)), qi_arr])  # Add bias term
    
    # Solve for theta = [bu, pu]
    # theta, residuals, rank, s = np.linalg.lstsq(X, y, rcond=None)
    # bu = theta[0]
    # pu = theta[1:]
    # use Ridge regression instead of `np.linalg.lstsq` for efficiency
    ridge = Ridge(alpha=1.0)
    ridge.fit(X, y)
    theta = ridge.coef_
    bu = theta[0]
    pu = theta[1:]
    
    # Step 3: Predict ratings for all items
    # Get all item factors and biases
    qi_all = svd.qi
    bi_all = svd.bi
    
    # Compute predicted ratings
    r_hat = global_mean + bu + bi_all + qi_all.dot(pu)
    
    # Map inner IDs to raw IDs
    inner_ids = np.arange(trainset.n_items)
    raw_ids = [int(trainset.to_raw_iid(inner_id)) for inner_id in inner_ids]

    min_length = min(len(raw_ids), len(r_hat))
    raw_ids = raw_ids[:min_length]
    r_hat = r_hat[:min_length]

    # Ensure lengths are consistent
    if len(raw_ids) != len(r_hat):
        print(f"Mismatch: raw_ids length = {len(raw_ids)}, r_hat length = {len(r_hat)}")
        # Align lengths here if necessary or investigate why they differ
        exit()
    
    # Create a DataFrame with predictions
    predictions_df = pd.DataFrame({
        'movieId': raw_ids,
        'est_rating': r_hat
    })
    
    # Remove movies the user has already rated
    rated_movie_ids = set(new_user_ratings.keys())
    predictions_df = predictions_df[~predictions_df['movieId'].isin(rated_movie_ids)]
    
    # Step 4: Get top 10 recommendations
    top_recommendations = predictions_df.sort_values('est_rating', ascending=False).head(10)
    
    # print(f"Top 10 recommendations for new user:")
    # print(top_recommendations[['movieId', 'est_rating']])

    top_recommendation_details = pd.merge(
        top_recommendations, 
        df_merged, 
        left_on='movieId', 
        right_on='id'
    )
    
    # Select only relevant columns for the JSON response
    top_recommendation_details = top_recommendation_details[['movieId', 'imdb_id', 'tmdb_id', 'title', 'release_date', 'genres', 'cast', 'crew', 'keywords', 'est_rating']]
    
    # Convert to dictionary format (or JSON-like structure)
    json_response = top_recommendation_details.to_dict(orient='records')
    
    # Print or return the JSON response
    print(json.dumps(json_response, indent=4))  # Pretty print the JSON
    

[
    {
        "movieId": 4369,
        "imdb_id": 231844,
        "tmdb_id": 4369,
        "title": "Just a Question of Love",
        "release_date": "2000-01-01",
        "genres": "[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",
        "cast": "[{'cast_id': 5, 'character': 'Laurent', 'credit_id': '52fe43bdc3a36847f806bd69', 'gender': 0, 'id': 38449, 'name': 'Cyrille Thouvenin', 'order': 0, 'profile_path': '/lJXcAuLhjL1LuFeCkpIEryplyvD.jpg'}, {'cast_id': 6, 'character': 'C\u00e9dric', 'credit_id': '52fe43bdc3a36847f806bd6d', 'gender': 2, 'id': 38450, 'name': 'St\u00e9phan Gu\u00e9rin-Tilli\u00e9', 'order': 1, 'profile_path': '/AdFNoHxVeVMRzVI612oNuhHOLnx.jpg'}, {'cast_id': 7, 'character': 'Carole', 'credit_id': '52fe43bdc3a36847f806bd71', 'gender': 0, 'id': 38451, 'name': 'Caroline Veyt', 'order': 2, 'profile_path': None}, {'cast_id': 8, 'character': 'Emma', 'credit_id': '52fe43bdc3a36847f806bd75', 'gender': 1, 'id': 25346, 'name': 'Eva Darlan', 'order': 3, 'prof