In [185]:
import pandas as pd
import numpy as np
import math
import warnings

# Turn every warning into an exception instead of letting it print and continue.
warnings.filterwarnings("error")

# Ratings table: the 'timestamp' column is not needed, so it is dropped.
ratings = (
    pd.read_csv("dataset/ratings.csv")
    .drop(columns='timestamp')
)
# Movies table: the 'title' and 'genres' columns are not needed, so they are dropped.
movies = (
    pd.read_csv("dataset/movies.csv")
    .drop(columns=['title', 'genres'])
)
rows_count = ratings.shape[0]  # number of rating rows
print(f'ratings.csv has {rows_count} rows. ')  # sanity check on the loaded size


ratings.csv has 100836 rows. 


In [186]:
def pearson_correlation(df_user1, df_user2):
    """Pearson correlation between two users, computed on co-rated movies.

    Parameters
    ----------
    df_user1, df_user2 : pandas.DataFrame
        Ratings of one user each. Both frames must have a 'movieId' column
        (used as the join key) and a 'rating' column.

    Returns
    -------
    float
        The correlation coefficient, or NaN when the users share no movie or
        when the denominator is zero (e.g. a single co-rated movie, or one
        user gave the same rating to every co-rated movie).
    """
    # Inner join keeps only the movies rated by both users; the two rating
    # columns come out as 'rating_x' (user1) and 'rating_y' (user2).
    merged_ratings = df_user1.merge(df_user2, on="movieId", how="inner")
    if merged_ratings.empty:
        return math.nan

    ratings_user1 = merged_ratings['rating_x']
    ratings_user2 = merged_ratings['rating_y']
    # Deviations from each user's mean over the co-rated movies only.
    dev_user1 = ratings_user1 - ratings_user1.mean()
    dev_user2 = ratings_user2 - ratings_user2.mean()

    num = np.sum(dev_user1 * dev_user2)
    den = np.sqrt(np.sum(dev_user1 ** 2)) * np.sqrt(np.sum(dev_user2 ** 2))

    # Check the zero denominator explicitly rather than relying on a global
    # warnings filter to turn numpy's 0/0 RuntimeWarning into an exception.
    if den == 0:
        return math.nan

    return num / den

#pearson_correlation(user1_ratings, user2_ratings)

In [215]:
def prediction(userId, movieId, ratings):
    """Predict the rating `userId` would give to `movieId`.

    Mean-centred, similarity-weighted average over the other users who rated
    the movie:

        pred = mean(userId) + sum(sim * (r_b - mean(b))) / sum(|sim|)

    where `sim` is `pearson_correlation` between the target user and
    neighbour `b`, and `r_b` is the neighbour's rating of the movie.

    Parameters
    ----------
    userId : id of the user to predict for.
    movieId : id of the movie to predict.
    ratings : pandas.DataFrame with 'userId', 'movieId' and 'rating' columns.

    Returns
    -------
    float
        The predicted rating, or NaN when no neighbour with a defined,
        non-zero similarity has rated the movie.
    """
    df_userA = ratings[ratings['userId'] == userId]
    userA_mean = df_userA['rating'].mean()

    # Ratings of the target movie by everyone except the target user: the
    # user must not act as their own neighbour.
    is_neighbour_rating = (ratings['movieId'] == movieId) & (ratings['userId'] != userId)
    movie_ratings = ratings[is_neighbour_rating]

    num = 0.0
    den = 0.0
    for user, rating in zip(movie_ratings['userId'], movie_ratings['rating']):
        df_userB = ratings[ratings['userId'] == user]
        sim = pearson_correlation(df_userA, df_userB)
        if math.isnan(sim):
            continue
        num += sim * (rating - df_userB['rating'].mean())
        # Normalise by the absolute similarity: summing signed values lets
        # positive and negative neighbours cancel, which can drive the
        # denominator towards zero or flip the sign of the adjustment.
        den += abs(sim)

    # No usable neighbour: the prediction is undefined.
    if den == 0:
        return math.nan

    return userA_mean + num / den


In [None]:
USER_A = 1  # id of the user the recommendations are computed for
# All rating rows that belong to USER_A.
USER_A_ratings = ratings.loc[ratings["userId"] == USER_A]
# Average rating given by USER_A.
USER_A_mean = USER_A_ratings.loc[:, 'rating'].mean()
# Ids of the movies USER_A has already rated.
USER_A_films = USER_A_ratings.loc[:, 'movieId']

def func(x, dic):
    """Record in `dic` the similarity between USER_A and user `x`.

    The Pearson correlation between USER_A's ratings and the ratings of user
    `x` is stored as `dic[x]`; nothing is stored when the correlation is NaN.
    """
    similarity = pearson_correlation(USER_A_ratings, ratings[ratings['userId'] == x])
    if math.isnan(similarity):
        return
    dic[x] = similarity

def func2(x, dic):
    """Record in `dic` the rating predicted for USER_A on movie `x`.

    The prediction is stored as `dic[x]`; nothing is stored when it is NaN.
    """
    predicted = prediction(USER_A, x, ratings)
    if math.isnan(predicted):
        return
    dic[x] = predicted


def main():
    """Print the 10 users most similar to USER_A and 10 movies to suggest.

    Reads the notebook-level `ratings`, `movies`, `USER_A` and `USER_A_films`
    and delegates the per-user / per-movie work to `func` and `func2`.
    """
    # Every user id in the dataset, sorted, as plain Python ints.
    users = sorted(ratings['userId'].unique().tolist())
    sim = dict()  # userId -> similarity with USER_A
    for user in users:
        if user != USER_A:
            func(user, sim)
    top10usr = dict(sorted(sim.items(), key=lambda x: x[1], reverse=True)[:10])  # 10 most similar users
    print(f'top 10 users similar to user in input are {top10usr}')

    # Movies USER_A has not rated yet, in the order they appear in `movies`.
    not_rated = ~movies['movieId'].isin(USER_A_films)
    mvs = movies.loc[not_rated, 'movieId']
    pred = dict()  # movieId -> predicted rating for USER_A
    for movie in mvs.tolist():
        func2(movie, pred)
    top10mvs = dict(sorted(pred.items(), key=lambda x: x[1], reverse=True)[:10])  # 10 highest predicted movies
    print(f'top 10 films to suggest to user in input are {top10mvs}')


# Run the pipeline: print USER_A's most similar users and the suggested films.
main()