In [1]:
import pandas as pd
import numpy as np
import math
import warnings

warnings.filterwarnings("error")

ratings = pd.read_csv("dataset/ratings.csv").drop('timestamp', axis = 1) #open csv file and drop 'timestamp' column, it is not necessary
movies = pd.read_csv("dataset/movies.csv").drop(['genres'], axis = 1) #open csv file and drop 'timestamp'  and 'genres' columns, they are not necessary
rows_count = len(ratings) #number of rows
print(f'ratings.csv has {rows_count} rows. ') #check number of rows


ratings.csv has 100836 rows. 


In [23]:
def jaccard_similarity(df_user1, df_user2): #Jaccard similarity function for computing similarities between users
    merged_ratings = df_user1.merge(df_user2, on = "movieId", how = "inner") #number of ratings of movies in common
    same_rates = len(set(merged_ratings['rating_x']) & set(merged_ratings['rating_y'])) #number of same rates on the same movies
    if merged_ratings.empty == True or same_rates == 0:
        return math.nan

    try:
        jac = len(same_rates)/len(merged_ratings)
    except (RuntimeWarning, ZeroDivisionError):
        return math.nan

    return jac

In [27]:
def euclidean_distance_similarity(df_user1, df_user2):
    merged_ratings = df_user1.merge(df_user2, on = "movieId", how = "inner") #merged df with movies rated by both users
    if merged_ratings.empty == True:
        return math.nan
    
    ratings_user1 = merged_ratings['rating_x']
    ratings_user2 = merged_ratings['rating_y']
    dis = np.sqrt(np.sum((ratings_user1**2) - (ratings_user2**2)))

    return dis

In [11]:
def cosine_similarity(df_user1, df_user2): #Cosine similarity function for computing similarities between users
    merged_ratings = df_user1.merge(df_user2, on = "movieId", how = "inner") #merged df with movies rated by both users
    if merged_ratings.empty == True:
        return math.nan
    
    ratings_user1 = merged_ratings['rating_x'] #vector of ratings of user1
    ratings_user2 = merged_ratings['rating_y'] #vector of ratings of user2
    
    num = np.sum((ratings_user1 * ratings_user2))
    den = np.sqrt(np.sum((ratings_user1**2))) * np.sqrt(np.sum((ratings_user2**2)))

    try:
        cos = num/den
    except (RuntimeWarning, ZeroDivisionError):
        return math.nan
    
    return cos

In [22]:
def pearson_correlation(df_user1, df_user2): #Pearson correlation function for computing similarities between users
    merged_ratings = df_user1.merge(df_user2, on = "movieId", how = "inner") #merged df with movies rated by both users
    if merged_ratings.empty == True:
        return math.nan
    
    ratings_user1 = merged_ratings['rating_x']
    ratings_user2 = merged_ratings['rating_y']
    mean_user1 = ratings_user1.mean() #mean of ratings of user1
    mean_user2 = ratings_user2.mean() #mean of ratings of user2
    
    num = np.sum((ratings_user1 - mean_user1)*(ratings_user2 - mean_user2))
    den = np.sqrt(np.sum((ratings_user1 - mean_user1)**2)) * np.sqrt(np.sum((ratings_user2 - mean_user2)**2))
    try:
        coef = num/den
    except (RuntimeWarning, ZeroDivisionError):
        return math.nan
    
    return coef

In [3]:
def prediction(userId, top10, movieId, ratings): #prediction function
    df_userA = ratings[ratings['userId'] == userId]
    userA_mean = df_userA['rating'].mean()
    users_for_film = ratings[ratings['movieId'] == movieId].drop(['movieId', 'rating'], axis = 1) #users that rated a specific film
    merged_users = set(users_for_film['userId']) & top10.keys() #get the users in top10 that rated the film in input
            
    num = 0
    den = 0
    if bool(set):
        for user in merged_users:
            df_userB = ratings[ratings['userId'] == user]
            sim = top10.get(user) #get pearson correlation value between the users
            if not math.isnan(sim):
                num += sim * (df_userB[df_userB['movieId'] == movieId].iloc[0]['rating'] - df_userB['rating'].mean())
                den += sim

        try:
            div = num/den
            pred = userA_mean + div
        except (RuntimeWarning, ZeroDivisionError):
            return math.nan
    else:
        return math.nan
    
    return pred

In [28]:
USER_A = 200
USER_A_ratings = ratings[ratings["userId"] == USER_A]
USER_A_mean = USER_A_ratings['rating'].mean()
USER_A_films = USER_A_ratings['movieId']

def get_similarity(user, dict):
    user_b_ratings = ratings[ratings['userId'] == user]
    
    #pearson correlation
    #corr = pearson_correlation(USER_A_ratings, user_b_ratings) #calculate pearson correlation value between the users
    #if not math.isnan(corr):
        #dict.update({user : corr}) #add the user and the related pearson correlation value
    
    #jaccard similarity
    #jac = jaccard_similarity(USER_A_ratings, user_b_ratings)
    #if not math.isnan(jac):
        #dict.update({user : jac})

    #cosine similarity
    #cos = cosine_similarity(USER_A_ratings, user_b_ratings)
    #if not math.isnan(cos):
        #dict.update({user : cos})

    #euclidean distance
    dis = euclidean_distance_similarity(USER_A_ratings, user_b_ratings)
    if not math.isnan(dis):
        dict.update({user : dis})

def get_prediction(movie, top10usr, dict):
    prd = prediction(USER_A, top10usr, movie, ratings) #calculate prediction on a specific movie
    if not math.isnan(prd):
        dict.update({movies[movies['movieId'] == movie].iloc[0]['title'] : prd}) #add the film title and the prediction to the dict


def main():
    #ratings = pd.read_csv("dataset/ratings.csv").drop('timestamp', axis = 1) #open csv file and drop 'timestamp' column, it is not necessary
    users = ratings.groupby('userId').groups.keys() #all the users of the dataset
    topusr = dict() #dictionary of the result
    pd.DataFrame.from_dict(users).map(lambda x: get_similarity(x, topusr) if(x != USER_A) else None) #call the function of similarity on every user
    top10usr = dict(sorted(topusr.items(), key=lambda x:x[1], reverse=True)[:10]) #get top 10 users similar to user in input
    print(f'top 10 users similar to user {USER_A} are {top10usr}')
    
    mvs = pd.concat([movies,USER_A_films]).drop_duplicates(keep=False) #get movies not rated by user
    #pred = dict()
    mvs_sugg = dict() #movies to sugget to user in input
    mvs['movieId'].map(lambda x: get_prediction(x, top10usr, mvs_sugg))
    top10mvs = dict(sorted(mvs_sugg.items(), key=lambda x:x[1], reverse=True)[:10]) #get top 10 films to suggest to user in input
    print(f'top 10 films to suggest to user {USER_A} in input are {top10mvs}')


main()

top 10 users similar to user 200 are {414: 95.06971126494494, 68: 90.8129946648606, 480: 81.11257609027098, 474: 80.02968199362034, 274: 78.16009211867653, 599: 76.83911764199274, 380: 76.04110730387873, 448: 75.06164133563827, 177: 74.8715566820939, 249: 74.30006729472053}
top 10 films to suggest to user 200 in input are {'Once Upon a Time in America (1984)': 5.962508994885765, 'Towering Inferno, The (1974)': 5.962508994885765, 'True Grit (1969)': 5.962508994885765, 'The Big Bus (1976)': 5.962508994885765, 'Rollerball (1975)': 5.962508994885765, 'Victory (a.k.a. Escape to Victory) (1981)': 5.962508994885765, 'Gigantic (A Tale of Two Johns) (2002)': 5.962508994885765, 'Seve (2014)': 5.962508994885765, 'On the Beach (1959)': 5.667830199165833, 'Badlands (1973)': 5.667830199165833}
