# Collaborative Filtering with Surprise
---  

## Concept
Surprise is a Python scikit (an add-on package for SciPy) that ships with a range of recommender algorithms and similarity metrics, making it easy to build and analyze recommender systems.

<!-- <img src="nmf.png" width="600" height="400"> -->

## Loading Libraries
***Implementing the Surprise Library to Model a Recommender System***

In [1]:
# required libs and packages 

import sys
import time # for counting the time for each steps

import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import Reader


# import pandas as pd
from surprise import KNNWithMeans, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV
import time
# loading similarity search model

from surprise.prediction_algorithms import KNNWithMeans, SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering


from PIL import Image

## Functions of the Workflow

In [None]:


def perform_grid_search_ratings(file_path, model_gscv, sim_options=None, measures=None, cv=5):
    """
    Run a Surprise grid search over similarity options on a ratings CSV file.

    Parameters:
    - file_path: str, path to the CSV file containing the ratings. The columns
      'userId' and 'movieId' are renamed to 'user' and 'item'; a 'rating'
      column is required.
    - model_gscv: algorithm class handed to GridSearchCV (e.g. KNNWithMeans).
    - sim_options: dict or None, grid of similarity options. When None, a
      default grid over "name", "min_support" and "user_based" is used.
    - measures: list of str or None, evaluation measures. When None,
      ["rmse", "mae"] is used.
    - cv: cross-validation setting forwarded to GridSearchCV (default 5).

    Returns:
    - gs: Fitted GridSearchCV object with results.
    - time_elapsed: float, seconds spent fitting the grid search.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if measures is None:
        measures = ["rmse", "mae"]

    # Load the ratings and use the column names expected further down.
    df_rating = pd.read_csv(file_path)
    df_rating = df_rating.rename(columns={'userId': 'user', 'movieId': 'item'})

    # Convert the DataFrame to a Surprise dataset; the rating scale is taken
    # from the smallest and largest rating present in the file.
    reader = Reader(rating_scale=(df_rating['rating'].min(), df_rating['rating'].max()))
    data = Dataset.load_from_df(df_rating[["user", "item", "rating"]], reader)

    # Fall back to the default similarity grid if none was provided.
    if sim_options is None:
        sim_options = {
            "name": ["msd", "cosine"],
            "min_support": [3, 4, 5],
            "user_based": [False, True],
        }

    # Only the similarity options are searched over.
    param_grid = {"sim_options": sim_options}

    # Time only the fitting step, not the data loading.
    time_start = time.perf_counter()
    gs = GridSearchCV(model_gscv, param_grid, measures=measures, cv=cv)
    gs.fit(data)
    time_elapsed = time.perf_counter() - time_start

    return gs, time_elapsed

In [140]:
def fun_Recommender(test_user, df_rating, df_movies, model, no_recom):
    """Recommend the movies with the highest predicted rating for one user.

    Parameters:
    - test_user: id of the user, as stored in df_rating['user'].
    - df_rating: DataFrame with at least the columns 'user' and 'item'.
    - df_movies: DataFrame with a 'movieId' column; it must contain every
      movie id that occurs in df_rating['item'].
    - model: fitted object exposing predict(user, item) whose result has an
      `est` attribute (the estimated rating).
    - no_recom: int, number of rows to return.

    Returns:
    - The first `no_recom` rows of df_movies (indexed by 'movieId'), ordered
      by decreasing predicted rating. Movies the user has already rated get
      a score of 0, so they rank behind every unseen movie with a positive
      prediction.
    """
    # Predict the rating of test_user for every movie present in the ratings.
    movie_ids = df_rating['item'].unique()
    predicted = [model.predict(test_user, movie_id).est for movie_id in movie_ids]
    df_user = pd.DataFrame({'movieId': movie_ids, 'rating': predicted})

    # Movies already rated by this user are pushed to the bottom of the ranking.
    seen_movies = df_rating.loc[df_rating['user'] == test_user, 'item']
    df_user.loc[df_user['movieId'].isin(seen_movies), 'rating'] = 0

    ranked_ids = df_user.sort_values(by='rating', ascending=False)['movieId']
    return df_movies.set_index('movieId').loc[ranked_ids].head(no_recom)

In [273]:
import requests
from PIL import Image
from io import BytesIO
import os

def get_movie_posters(movie_tmbds, api_key, save_directory='./images', timeout=10):
    """
    Download the poster of each TMDB movie and save it as a local image file.

    Parameters:
    - movie_tmbds: list of int, a list of TMDB movie IDs.
    - api_key: str, TMDB API key.
    - save_directory: str, path to the directory where images will be saved
      (default is './images'); it is created when missing.
    - timeout: seconds to wait for each HTTP request before giving up
      (default is 10).

    Returns:
    - image_paths: list of str, local paths of the posters that were saved.
      Movies whose details or poster could not be fetched are skipped after
      a message is printed.
    """
    # Ensure the directory for saving images exists
    os.makedirs(save_directory, exist_ok=True)

    image_paths = []

    for movie_tmbd in movie_tmbds:
        print(f"Fetching poster for movie ID: {movie_tmbd}")
        base_url = f'https://api.themoviedb.org/3/movie/{movie_tmbd}'

        # Ask the TMDB API for the movie details; a timeout keeps one stalled
        # request from blocking the whole batch.
        response = requests.get(base_url, params={'api_key': api_key}, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch movie details for ID {movie_tmbd}. Status code: {response.status_code}")
            continue

        poster_path = response.json().get('poster_path')
        if not poster_path:
            print(f"Poster not found for movie ID {movie_tmbd}.")
            continue

        poster_url = f'https://image.tmdb.org/t/p/w500{poster_path}'

        # Download the image itself and make sure an image was actually
        # returned before trying to decode it.
        response_image = requests.get(poster_url, timeout=timeout)
        if response_image.status_code != 200:
            print(f"Failed to download poster for movie ID {movie_tmbd}. Status code: {response_image.status_code}")
            continue

        image_path = os.path.join(save_directory, f"poster_tmbd_{movie_tmbd}.jpg")
        try:
            with Image.open(BytesIO(response_image.content)) as image:
                image.save(image_path)
        except OSError as err:
            # Covers undecodable image data as well as write failures.
            print(f"Could not save poster for movie ID {movie_tmbd}: {err}")
            continue

        image_paths.append(image_path)
        print(f"Saved poster for movie ID {movie_tmbd} at {image_path}")

    return image_paths

In [None]:
def func_image_merger(image_paths, show_type, save_directory='./images'):
    """Merge images into one horizontal strip and one vertical strip.

    Both merged images are always written to `save_directory` as
    'merged_image_horizontal.jpg' and 'merged_image_vertical.jpg'; only the
    one selected by `show_type` is displayed.

    Parameters:
    - image_paths: list of str, paths of the images to merge, in order.
    - show_type: 'hor' displays the horizontal merge; any other value
      displays the vertical merge.
    - save_directory: str, directory for the merged files (default is
      './images'); it is created when missing.

    Returns:
    - The result of Image.show() for the selected merged image.

    Raises:
    - ValueError: if image_paths is empty.
    """
    import os
    from contextlib import ExitStack

    image_paths = list(image_paths)
    if not image_paths:
        raise ValueError("image_paths must contain at least one image path")

    os.makedirs(save_directory, exist_ok=True)

    # ExitStack closes every opened source image, even if a later one fails.
    with ExitStack() as stack:
        images = [stack.enter_context(Image.open(img_path)) for img_path in image_paths]

        # Horizontal merge: widths add up, the tallest image sets the height.
        total_width = sum(img.width for img in images)
        max_height = max(img.height for img in images)

        # Vertical merge: heights add up, the widest image sets the width.
        total_height = sum(img.height for img in images)
        max_width = max(img.width for img in images)

        merged_image_horizontal = Image.new('RGB', (total_width, max_height))
        merged_image_vertical = Image.new('RGB', (max_width, total_height))

        # Paste images side by side for the horizontal merge.
        x_offset = 0
        for img in images:
            merged_image_horizontal.paste(img, (x_offset, 0))
            x_offset += img.width

        # Paste images on top of each other for the vertical merge.
        y_offset = 0
        for img in images:
            merged_image_vertical.paste(img, (0, y_offset))
            y_offset += img.height

    # Save the merged images
    merged_image_horizontal.save(os.path.join(save_directory, 'merged_image_horizontal.jpg'))
    merged_image_vertical.save(os.path.join(save_directory, 'merged_image_vertical.jpg'))

    if show_type == 'hor':
        return merged_image_horizontal.show()
    return merged_image_vertical.show()


## Reading the Tables: Getting the Data Sets

In [2]:
# load_data.py

# Read the ratings, movies and links tables from the data directory,
# in that order.
df_rating, df_movies, df_links = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/ml-latest-small/ratings.csv',
        './data/ml-latest-small/movies_modified.csv',
        './data/ml-latest-small/links.csv',
    )
)


In [3]:
def fun_min_rating_out(df, min_user_ratings, min_movie_ratings):
    """Shrink a ratings table by dropping sparse users and sparse movies.

    A row is kept only when its 'userId' occurs in strictly more than
    `min_user_ratings` rows of `df` AND its 'movieId' occurs in strictly more
    than `min_movie_ratings` rows of `df`. Both counts are taken on the
    unfiltered input.

    Parameters:
    - df: DataFrame with the columns 'userId' and 'movieId'.
    - min_user_ratings: threshold on the number of ratings per user.
    - min_movie_ratings: threshold on the number of ratings per movie.

    Returns:
    - The filtered rows of `df`.
    """
    # Number of ratings per user / per movie in the original table.
    ratings_per_user = df['userId'].value_counts()
    ratings_per_movie = df['movieId'].value_counts()

    # Ids whose count exceeds the respective threshold.
    active_users = ratings_per_user[ratings_per_user > min_user_ratings].index
    popular_movies = ratings_per_movie[ratings_per_movie > min_movie_ratings].index

    keep_row = df['movieId'].isin(popular_movies) & df['userId'].isin(active_users)
    return df[keep_row]


In [4]:
# Sanity check: compare the shape of the ratings table before and after
# fun_min_rating_out is applied with both thresholds set to 20.
print('The original data frame shape:\t{}'.format(df_rating.shape))

print('The new data frame shape:\t{}'.format(fun_min_rating_out(df_rating, 20, 20).shape))

The original data frame shape:	(100836, 4)
The new data frame shape:	(66405, 4)


## Reading the Tables: Getting the right data set for the Surprise 

In [33]:
# Reduce the ratings table with both thresholds set to 20.
df_rating_red = fun_min_rating_out(df_rating, 20, 20)
print(df_rating_red.head())

# Take the rating scale from the ratings actually observed instead of
# hard-coding it, so the scale handed to Surprise matches the data.
reader = Reader(rating_scale=(df_rating['rating'].min(), df_rating['rating'].max()))
data = Dataset.load_from_df(df_rating_red[['userId', 'movieId', 'rating']], reader)

data

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


<surprise.dataset.DatasetAutoFolds at 0x1e26de37f10>

## Benchmarking

In [7]:
# One single-row summary DataFrame per algorithm is collected here.
benchmark = []

algorithms = [
    SVD(),
    SVDpp(),
    SlopeOne(),
    NMF(),
    NormalPredictor(),
    KNNBaseline(),
    KNNBasic(),
    KNNWithMeans(),
    KNNWithZScore(),
    BaselineOnly(),
    CoClustering(),
]

# Iterate over all algorithms
for algorithm in algorithms:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average every reported column over the folds; this works for any
    # number of folds and yields one row with index 0.
    tmp = pd.DataFrame(results).mean(axis=0).to_frame().T

    # Label the row with the algorithm's class name.
    tmp['Algorithm'] = type(algorithm).__name__
    print(tmp)

    benchmark.append(tmp)

   test_rmse  fit_time  test_time Algorithm
0    0.84893  0.557259   0.151026       SVD
   test_rmse   fit_time  test_time Algorithm
0   0.835842  13.629437   4.143286     SVDpp
   test_rmse  fit_time  test_time Algorithm
0    0.85203  0.411435   2.447098  SlopeOne
   test_rmse  fit_time  test_time Algorithm
0   0.876786  0.932221   0.145815       NMF
   test_rmse  fit_time  test_time        Algorithm
0   1.367854  0.062498   0.093739  NormalPredictor
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
   test_rmse  fit_time  test_time    Algorithm
0   0.841798  0.229144   2.057142  KNNBaseline
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity 

In [32]:
# Stack the first row of every benchmark entry into one table, indexed by
# algorithm name and ordered by increasing test RMSE.
df = (
    pd.concat([entry.iloc[[0]] for entry in benchmark], ignore_index=True)
    .reindex(columns=['test_rmse', 'fit_time', 'test_time', 'Algorithm'])
    .set_index('Algorithm')
    .sort_values('test_rmse', ascending=True)
)
df

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.835842,13.629437,4.143286
KNNBaseline,0.841798,0.229144,2.057142
SVD,0.84893,0.557259,0.151026
BaselineOnly,0.849464,0.109367,0.119779
KNNWithZScore,0.851475,0.192694,1.85185
KNNWithMeans,0.851837,0.156232,1.676322
SlopeOne,0.85203,0.411435,2.447098
NMF,0.876786,0.932221,0.145815
CoClustering,0.895987,1.328033,0.15103
KNNBasic,0.904195,0.135416,1.541544
