# Collaborative Filtering
## Creating our example

In [1]:
# Column order of every rating list below: one entry per movie title.
movies = [
    "Harry Potter",
    "The Lord of the Rings",
    "James Bond",
    "Fast & Furious",
    "Rush Hour",
    "Pretty Woman",
]

# One rating list per user, aligned with `movies`; -1 marks a movie the user has not rated.
user_a = [8, 9, 7, -1, 4, 3]
user_b = [-1, 8, 8, 8, 3, 3]
user_c = [4, -1, 3, -1, 8, 9]
user_d = [5, 5, 1, 2, 9, -1]

# `users` holds references to the lists above (not copies).
users = [user_a, user_b, user_c, user_d]

In [2]:
import pandas as pd

# Rows are users, columns are movies; the frame is only used to display the ratings.
user_labels = ["User A", "User B", "User C", "User D"]
df_movie_ratings = pd.DataFrame(data=users, index=user_labels, columns=movies)
df_movie_ratings

Unnamed: 0,Harry Potter,The Lord of the Rings,James Bond,Fast & Furious,Rush Hour,Pretty Woman
User A,8,9,7,-1,4,3
User B,-1,8,8,8,3,3
User C,4,-1,3,-1,8,9
User D,5,5,1,2,9,-1


## Functions needed to calculate our prediction

This function takes the rating lists of two users and returns, for each of them, the ratings of the movies that both users have rated (the matching ratings):

In [3]:
def get_matching_ratings(ratings_a, ratings_b):
    """Return the ratings of the items that both users have rated.

    The two rating lists are compared position by position; a position is
    kept only if neither list holds the "not rated" marker -1 there.

    Args:
        ratings_a: ratings of the first user, -1 meaning "not rated".
        ratings_b: ratings of the second user, aligned with `ratings_a`.

    Returns:
        A tuple `(matching_a, matching_b)` of two equally long lists with the
        co-rated entries of each user, in their original order. Both lists are
        empty when the users have no rated item in common.
    """
    matching_a, matching_b = [], []
    for rating_a, rating_b in zip(ratings_a, ratings_b):
        if rating_a != -1 and rating_b != -1:
            matching_a.append(rating_a)
            matching_b.append(rating_b)

    # Building the lists directly (instead of unzipping) keeps the empty case
    # well-defined: no overlap yields ([], []) rather than an IndexError.
    return matching_a, matching_b

The Pearson correlation coefficient $c_{a,u} = \frac{covar(r_a,r_u)}{\sigma_{r_a}\sigma_{r_u}}$ is computed over the $m$ movies that both users $a$ and $u$ have rated, where:
- $covar(r_a,r_u) = \frac{\sum_{i=1}^{m} (r_{a,i} - \overline{r}_a)(r_{u,i} - \overline{r}_u)}{m}$
- $\overline{r}_x = \frac{\sum_{i=1}^{m} r_{x,i}}{m}$
- $\sigma_{r_x} = \sqrt{\frac{\sum_{i=1}^{m} (r_{x,i} - \overline{r}_x)^2}{m}}$

In [4]:
import math

def pearsons_coefficient(ratings_a, ratings_b):
    """Compute the Pearson correlation of two users over their co-rated items.

    Only the positions returned by `get_matching_ratings` (both ratings
    present) take part in the means, standard deviations and covariance.

    Args:
        ratings_a: ratings of the first user, -1 meaning "not rated".
        ratings_b: ratings of the second user, aligned with `ratings_a`.

    Returns:
        A tuple `(coefficient, number_of_matches)`: the correlation rounded to
        two decimals and the number of co-rated items it is based on. The
        correlation is undefined when `get_matching_ratings` yields empty
        lists or when one user's co-rated ratings are all identical (zero
        standard deviation); 0.0 is returned as the coefficient in those cases
        instead of raising ZeroDivisionError.
    """
    matching_ratings_a, matching_ratings_b = get_matching_ratings(ratings_a, ratings_b)
    number_of_matches = len(matching_ratings_a)

    # nothing to correlate
    if number_of_matches == 0:
        return 0.0, 0

    mean_a = sum(matching_ratings_a) / number_of_matches
    mean_b = sum(matching_ratings_b) / number_of_matches

    distance_to_avg_a = [rating - mean_a for rating in matching_ratings_a]
    distance_to_avg_b = [rating - mean_b for rating in matching_ratings_b]

    # population standard deviations (divide by m, not m - 1)
    standard_deviation_a = math.sqrt(sum(distance * distance for distance in distance_to_avg_a) / number_of_matches)
    standard_deviation_b = math.sqrt(sum(distance * distance for distance in distance_to_avg_b) / number_of_matches)

    denominator = standard_deviation_a * standard_deviation_b
    # a user with constant ratings on the overlap has no variance to correlate with
    if denominator == 0:
        return 0.0, number_of_matches

    covariance = sum(
        distance_a * distance_b
        for distance_a, distance_b in zip(distance_to_avg_a, distance_to_avg_b)
    ) / number_of_matches

    return round(covariance / denominator, 2), number_of_matches

The coefficient for every pair of users:

In [5]:
# Every unordered pair of users, keyed by the row label used in the table below.
user_pairs = {
    "A - B": (user_a, user_b),
    "A - C": (user_a, user_c),
    "A - D": (user_a, user_d),
    "B - C": (user_b, user_c),
    "B - D": (user_b, user_d),
    "C - D": (user_c, user_d),
}

# Each entry is (coefficient, number of matching ratings), in the order of `user_pairs`.
coefficients = [pearsons_coefficient(first, second) for first, second in user_pairs.values()]
c_ab, c_ac, c_ad, c_bc, c_bd, c_cd = coefficients

df_coefficients = pd.DataFrame(
    coefficients,
    index=list(user_pairs),
    columns=["Coefficient", "Number of matching ratings"],
)
df_coefficients

Unnamed: 0,Coefficient,Number of matching ratings
A - B,0.94,4
A - C,-0.95,4
A - D,-0.57,4
B - C,-0.99,3
B - D,-0.88,4
C - D,0.94,3


We get a prediction for one rating via $p_{a,i} = \overline{r}_a + \frac{\sum_{u=1}^{n} w_{a,u}(r_{u,i} - \overline{r}_u)}{\sum_{u=1}^{n} w_{a,u}}$, where:
- $w_{a,u} = s_{a,u} \cdot c_{a,u}$ with $s_{a,u} = 1$ if $m > 50$, else $s_{a,u} = \frac{m}{50}$, where $m$ is the number of matching ratings between $a$ and $u$

In [6]:
def get_prediction_for_movie(users, user, movie, movies, weighting, n_neighbors=1):
    """Predict the rating of one user for one movie from the most similar users.

    The prediction is the user's own mean rating plus the weighted average of
    how far the selected neighbours rated the movie above or below their own
    mean (the neighbour's mean is taken over the items co-rated with `user`).

    Args:
        users: list of rating lists, one per user, -1 meaning "not rated".
        user: index into `users` of the user to predict for.
        movie: title of the movie to predict.
        movies: list of movie titles, aligned with every rating list.
        weighting: if falsy, every neighbour gets weight 1; otherwise a
            neighbour gets weight 1 when it shares more than 50 ratings with
            `user` and `round(m / 50, 2)` for `m` shared ratings otherwise.
        n_neighbors: how many of the best-correlated users to use (default 1).

    Returns:
        The existing rating if the user already rated the movie; otherwise the
        prediction rounded to one decimal. If no other user with co-rated items
        has rated the movie, the user's rounded mean rating is returned.

    Raises:
        ValueError: if `movie` is not in `movies`.
        ZeroDivisionError: if `user` has not rated any movie at all.
    """
    index_of_movie = movies.index(movie)
    ratings_of_user = users[user]

    # if there is an existing rating we return that one
    if ratings_of_user[index_of_movie] != -1:
        return ratings_of_user[index_of_movie]

    # take the mean of the user over everything they have rated
    known_ratings = [rating for rating in ratings_of_user if rating != -1]
    mean_of_user = sum(known_ratings) / len(known_ratings)

    # entry: ((coefficient, number of matching ratings), index of the compared user)
    # only users that have rated the movie in question and share at least one
    # rated item with `user` can contribute
    coefficients = []
    for index_of_other, ratings_of_other in enumerate(users):
        if ratings_of_other[index_of_movie] == -1:
            continue
        coefficient = pearsons_coefficient(ratings_of_user, ratings_of_other)
        if coefficient[1] > 0:
            coefficients.append((coefficient, index_of_other))

    # sort by coefficient (best first) and keep the n best neighbours
    coefficients.sort(key=lambda entry: entry[0][0], reverse=True)
    coefficients = coefficients[:n_neighbors]

    # weight is 1 for more than 50 matching ratings (or when weighting is off),
    # m/50 otherwise
    # NOTE: the formula in the accompanying text defines w = s * c; this
    # implementation uses only the significance factor s as the weight.
    weights = []
    for (_, number_of_matches), index_of_other in coefficients:
        if not weighting or number_of_matches > 50:
            significance = 1
        else:
            significance = round(number_of_matches / 50, 2)
        weights.append((significance, index_of_other))

    numerator = 0    # sum of weight * (neighbour's rating - neighbour's mean)
    denominator = 0  # sum of weights
    for weight, index_of_other in weights:
        matching_of_other = get_matching_ratings(ratings_of_user, users[index_of_other])[1]
        mean_of_other = sum(matching_of_other) / len(matching_of_other)
        numerator += weight * (users[index_of_other][index_of_movie] - mean_of_other)
        denominator += weight

    # no usable neighbour: the best available estimate is the user's own mean
    if denominator == 0:
        return round(mean_of_user, 1)

    return round(mean_of_user + (numerator / denominator), 1)

## Resulting predictions

In [7]:
# (index of the user, movie title) for every rating that is still -1
missing_ratings = [
    (0, "Fast & Furious"),
    (1, "Harry Potter"),
    (2, "The Lord of the Rings"),
    (2, "Fast & Furious"),
    (3, "Pretty Woman"),
]

# Compute all predictions before writing any of them back, so that each
# prediction is based on the original ratings only.
predictions = [
    get_prediction_for_movie(users, user_index, title, movies, False)
    for user_index, title in missing_ratings
]
update_a, update_b, update_c_1, update_c_2, update_d = predictions

# NOTE: this writes into the rating lists held by `users` in place; cells run
# again after this one will see the filled-in values instead of -1.
for (user_index, title), prediction in zip(missing_ratings, predictions):
    users[user_index][movies.index(title)] = prediction

df_updated_movie_ratings = pd.DataFrame(
    users,
    index=["User A", "User B", "User C", "User D"],
    columns=movies,
)
df_updated_movie_ratings

Unnamed: 0,Harry Potter,The Lord of the Rings,James Bond,Fast & Furious,Rush Hour,Pretty Woman
User A,8.0,9.0,7,8.7,4,3.0
User B,8.2,8.0,8,8.0,3,3.0
User C,4.0,6.0,3,3.0,8,9.0
User D,5.0,5.0,1,2.0,9,8.4
