### User Collaborative Filtering


In [48]:
import numpy as np
import pandas as pd

In [49]:
# Utility matrix: one row per user, one column per movie.
# A NaN entry means the user has not rated that movie.
df_utility = pd.DataFrame(
    data=dict(
        Narnia1=np.array([5, 5, 1, np.nan]),
        Narnia2=np.array([5, 4, 1, 1]),
        LOTR1=np.array([4, 4, 5, 5]),
        LOTR2=np.array([np.nan, 1, 5, 5]),
        XMEN=np.array([np.nan, np.nan, 5, 4]),
        XMENUnited=np.array([np.nan, np.nan, 5, 4]),
    ),
    index=["Chloe", "Carlos", "Angelica", "Sara"],
)
# Display only: show blanks instead of NaN. The result is not assigned,
# so df_utility itself still holds NaN for the missing ratings.
df_utility.replace({np.nan: ""})

Unnamed: 0,Narnia1,Narnia2,LOTR1,LOTR2,XMEN,XMENUnited
Chloe,5.0,5,4,,,
Carlos,5.0,4,4,1.0,,
Angelica,1.0,1,5,5.0,5.0,5.0
Sara,,1,5,5.0,4.0,4.0


In [50]:
# Alice's known ratings; reindexing onto the movie columns leaves NaN for
# every movie she has not rated.
alice = pd.Series({"Narnia2": 1, "LOTR2": 5, "XMENUnited": 5}).reindex(df_utility.columns)
alice

Narnia1       NaN
Narnia2       1.0
LOTR1         NaN
LOTR2         5.0
XMEN          NaN
XMENUnited    5.0
dtype: float64

In [51]:
# Add Alice to df_utility as a new row labelled "Alice".
# Assigning through .loc modifies df_utility itself (no new frame is created),
# so every later cell sees five users. Alice's values are matched to the movie
# columns through the index of the `alice` Series.
df_utility.loc["Alice"] = alice

# Show the utility matrix with the new row
df_utility


Unnamed: 0,Narnia1,Narnia2,LOTR1,LOTR2,XMEN,XMENUnited
Chloe,5.0,5.0,4.0,,,
Carlos,5.0,4.0,4.0,1.0,,
Angelica,1.0,1.0,5.0,5.0,5.0,5.0
Sara,,1.0,5.0,5.0,4.0,4.0
Alice,,1.0,,5.0,,5.0


In [52]:
def raw_cos(a, b):
    """
      Cosine similarity between two pandas `Series`, computed only over the
      index labels where both series hold a non-null value.

      Returns `dot(a, b) / (|a| * |b|)` on those shared labels. Returns `0`
      when either series is empty, when there is no label with a value in
      both, or when either restricted vector has zero norm.
    """

    # Nothing to compare if either input has no entries at all
    if a.empty or b.empty:
        return 0

    # Align the two series side by side and keep only fully populated rows
    shared_labels = pd.concat([a, b], axis=1).dropna().index
    if len(shared_labels) == 0:
        return 0

    # Restrict both inputs to the labels they have in common
    lhs = a[shared_labels]
    rhs = b[shared_labels]

    numerator = np.dot(lhs, rhs)
    lhs_norm = np.linalg.norm(lhs)
    rhs_norm = np.linalg.norm(rhs)

    # Only divide when both norms are non-zero; otherwise the ratio is undefined
    if lhs_norm != 0 and rhs_norm != 0:
        return numerator / (lhs_norm * rhs_norm)
    return 0

There are multiple ways to compute the similarity between users. Below are two examples: cosine similarity and adjusted (mean-centered) cosine similarity.
Adjusted cosine similarity mean-centers each user's ratings before the similarity is calculated.
Mean-centering reduces the bias from users who consistently give high ratings to all movies, and it also reduces the influence of outliers with very large values.

In [47]:
def _pairwise_similarity(ratings, similarity):
    """
      Build the square user-by-user matrix obtained by applying `similarity`
      to every ordered pair of rows of `ratings`.

      `ratings` is a DataFrame with one row per user. `similarity` is a
      function that takes two row `Series` and returns a number (here
      `raw_cos`). The result is labelled by `ratings.index` on both axes.
    """
    users = ratings.index
    matrix = pd.DataFrame(index=users, columns=users)
    for row_user in users:
        for col_user in users:
            matrix.loc[row_user, col_user] = similarity(
                ratings.loc[row_user], ratings.loc[col_user])
    return matrix


# Plain cosine on the raw ratings
raw_cos_matrix = _pairwise_similarity(df_utility, raw_cos)

# Adjusted cosine: subtract each user's own mean rating (row mean over the
# movies they rated) and then apply the same cosine function
mean_centered = df_utility.sub(df_utility.mean(axis=1), axis=0)
adj_cos_matrix = _pairwise_similarity(mean_centered, raw_cos)

# let's compare results in a matrix
print('Cos -  Not Mean Centered')
print(raw_cos_matrix)
print('\nAdjusted Cos - Mean Centered')
print(adj_cos_matrix)

Cos -  Not Mean Centered
             Chloe    Carlos  Angelica      Sara     Alice
Chloe          1.0  0.994536  0.710669  0.765705       1.0
Carlos    0.994536       1.0  0.619103  0.706897  0.428086
Angelica  0.710669  0.619103       1.0  0.993898       1.0
Sara      0.765705  0.706897  0.993898       1.0  0.993912
Alice          1.0  0.428086       1.0  0.993912       1.0

Adjusted Cost - Mean Centered
             Chloe    Carlos  Angelica      Sara     Alice
Chloe          1.0  0.246183 -0.816497 -0.763386      -1.0
Carlos    0.246183       1.0 -0.632456 -0.446719 -0.613941
Angelica -0.816497 -0.632456       1.0  0.903696       1.0
Sara     -0.763386 -0.446719  0.903696       1.0  0.936083
Alice         -1.0 -0.613941       1.0  0.936083       1.0


Now that we have the similarity matrix, we can predict a rating for every user-item pair that is still missing.
This fills in the user-item rating matrix with predictions; an entry stays empty when none of the selected similar users has rated that item.

In [53]:

# Number of most similar users (neighbours) used for each prediction
k = 2

# Work on a float copy of the adjusted-cosine matrix so adj_cos_matrix itself
# is left untouched, then zero the diagonal (a user's similarity with
# themselves shouldn't count).
similarity_values = adj_cos_matrix.to_numpy(dtype=float, copy=True)
np.fill_diagonal(similarity_values, 0)
similarity_matrix = pd.DataFrame(
    similarity_values,
    index=adj_cos_matrix.index,
    columns=adj_cos_matrix.columns,
)

# Mean rating of every user over the movies they actually rated (NaN skipped)
user_means = df_utility.mean(axis=1, skipna=True)

# Create a copy of the original matrix to fill in missing values
completed_matrix = df_utility.copy()

# Fill in missing values
for user in df_utility.index:
    # Candidate neighbours: only users with similarity greater than 0, most
    # similar first. The zeroed diagonal keeps the user out of their own
    # neighbourhood; a stable sort makes ties deterministic.
    user_similarities = similarity_matrix.loc[user]
    candidates = user_similarities[user_similarities > 0].sort_values(
        ascending=False, kind="stable")

    for item in df_utility.columns:
        # Ratings the user already gave are kept as they are
        if pd.notnull(df_utility.loc[user, item]):
            continue

        # Among the candidates, keep those who rated this item and take the
        # k most similar of them, so unrated neighbours don't use up a slot
        has_rating = df_utility.loc[candidates.index, item].notna()
        top_k_rated_users = candidates[has_rating].head(k)
        if top_k_rated_users.empty:
            # No usable neighbour: the prediction stays missing
            continue

        # Similarity-weighted average of how far each neighbour's rating of
        # this item is from that neighbour's own mean rating
        neighbour_ids = top_k_rated_users.index
        deviations = df_utility.loc[neighbour_ids, item] - user_means[neighbour_ids]
        weighted_deviation = (top_k_rated_users * deviations).sum()
        total_weight = top_k_rated_users.abs().sum()

        # Prediction = the user's own mean rating shifted by that deviation
        completed_matrix.loc[user, item] = user_means[user] + \
            weighted_deviation / total_weight

# Order rows (users) and columns (items) alphabetically for display
completed_matrix = completed_matrix.sort_index(axis=0).sort_index(axis=1)

display(completed_matrix)

Unnamed: 0,LOTR1,LOTR2,Narnia1,Narnia2,XMEN,XMENUnited
Alice,4.935534,5.0,1.0,1.0,4.452041,5.0
Angelica,5.0,5.0,1.0,1.0,5.0,5.0
Carlos,4.0,1.0,5.0,4.0,,
Chloe,4.0,2.166667,5.0,5.0,,
Sara,5.0,5.0,1.133333,1.0,4.0,4.0


We can then use this matrix to recommend, for a given user, the items with the highest predicted ratings among those the user has not yet rated.

In [54]:
# get all items not yet rated by the user
not_rated = df_utility.loc[user][df_utility.loc[user].isnull()].index

# return the index to the n highest rated items
print(completed_matrix.loc[user, not_rated].sort_values(ascending=False).head(2))

LOTR1    4.935534
XMEN     4.452041
Name: Alice, dtype: float64


We can see that the top 2 movies recommended for Alice are LOTR1 and XMEN.