In [1]:
import pandas as pd
import numpy as np
import math
from scipy.spatial import distance

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [7]:
def calculate_sim_score_users(u1_movie_scores, u2_movie_scores):
    """Return the distance between two users over the movies both have rated.

    A score of 0 is treated as "not rated": any position where either user
    has a 0 is skipped. The result is the Euclidean distance over the
    co-rated positions divided by the number of co-rated positions, rounded
    to three decimals, so smaller values mean more similar users.

    Returns ``math.inf`` when the two users have no co-rated movie.
    """
    shared_count = 0
    squared_diff_total = 0
    for pos in range(len(u1_movie_scores)):
        first, second = u1_movie_scores[pos], u2_movie_scores[pos]
        # Only co-rated movies contribute to the distance.
        if first == 0 or second == 0:
            continue
        shared_count += 1
        squared_diff_total += abs(first - second) ** 2

    if shared_count == 0:
        return math.inf
    return round(math.sqrt(squared_diff_total) / shared_count, 3)


In [8]:
def calculate_weighted_average(sim_scores, ratings):
    """Combine neighbour ratings into a single predicted rating.

    Each entry of ``ratings`` is weighted by the entry of ``sim_scores`` at
    the same position; the result is the weighted sum divided by the total
    of the weights.
    """
    weighted_sum = np.dot(sim_scores, ratings)
    total_weight = sum(sim_scores)
    return weighted_sum / total_weight


In [9]:
def CFUserToUser(data_file, target_u, target_m, n_neigbors=2):
    """User-user collaborative filtering for one (user, movie) pair.

    Parameters
    ----------
    data_file : path of a CSV utility matrix with a ``movies`` column (used
        as the row index) and one column per user; 0 means "not rated".
    target_u : column label of the user to predict for (e.g. ``"u5"``).
    target_m : row label of the movie to predict (e.g. ``"m2"``).
    n_neigbors : maximum number of nearest users to use.

    Returns
    -------
    (neighbors, rating)
        ``neighbors`` maps the labels of the closest users to their distance
        from ``target_u``, ordered from closest to farthest. It can hold
        fewer than ``n_neigbors`` entries when not enough users qualify.
        ``rating`` is the weighted average of those users' ratings of
        ``target_m`` (weights are the distances), rounded to two decimals,
        or ``math.nan`` when no user qualifies.
    """
    # Read the utility matrix; rows are movies, columns are users.
    df_utility = pd.read_csv(data_file).set_index("movies")

    # The target user's scores do not change per candidate, so build them once.
    target_user_movie_scores = list(df_utility[target_u])
    target_movie_ratings = df_utility.loc[target_m]

    # Distance from the target user to every user who can act as a neighbour.
    # Column labels are used directly instead of parsing the last character
    # of the label, which broke for multi-digit users such as "u10".
    dic_user_sim = {}
    neighbor_ratings = {}
    for user, user_rating in target_movie_ratings.items():
        # Skip the target user and anyone who has not rated the target movie.
        if user == target_u or user_rating == 0:
            continue
        user_distance = calculate_sim_score_users(
            target_user_movie_scores, list(df_utility[user])
        )
        # An infinite distance means no co-rated movie: not a usable neighbour.
        if math.isinf(user_distance):
            continue
        dic_user_sim[user] = user_distance
        neighbor_ratings[user] = user_rating

    # Keep the n closest users (smallest distance first; ties keep column order).
    closest = sorted(dic_user_sim.items(), key=lambda item: item[1])[:n_neigbors]
    sorted_dic_user_sim = dict(closest)
    if not sorted_dic_user_sim:
        return sorted_dic_user_sim, math.nan

    sim_scores = list(sorted_dic_user_sim.values())
    ratings = [neighbor_ratings[user] for user in sorted_dic_user_sim]
    weighted_average = round(calculate_weighted_average(sim_scores, ratings), 2)

    return sorted_dic_user_sim, weighted_average
    

In [10]:
# Inputs for the demo run: the utility-matrix CSV and the user/movie pair to predict
data_file = "f22_utility_mat.csv"
target_u = "u5"
target_m = "m2"

# User-user collaborative filtering with the two nearest neighbours
dic_neighbor, rating = CFUserToUser(data_file, target_u, target_m, n_neigbors=2)

# Expected: dic_neighbor = {'u3': 0.333, 'u4': 0.707}  (the 2 closest users and their distances to u5)
#           rating = 4.32
dic_neighbor
rating

{'u3': 0.333, 'u4': 0.707}

4.32

In [6]:
# Display the utility matrix with the "movies" column as the row index
pd.read_csv("f22_utility_mat.csv", index_col="movies")

Unnamed: 0_level_0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12
movies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
m1,1,0,3,0,0,5,0,0,5,0,4,0
m2,0,0,5,4,0,0,4,0,0,2,1,3
m3,2,4,0,1,2,0,3,0,4,3,5,0
m4,0,2,4,0,5,0,0,4,0,0,2,0
m5,0,0,4,3,4,2,0,0,0,0,2,5
m6,1,0,3,0,3,0,0,2,0,0,4,0
