In [1]:
# data manipulation:
import pandas as pd
import numpy as np

# stats:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import pearsonr
from scipy import stats
import math

# plotting and images:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import Image

# import warnings
# warnings.filterwarnings("ignore")

---
## About the data:
In the first notebook in this chapter we look at one of the ways in which companies like Netflix provide recommendations based on the rating behavior of other users.

The technique we'll be looking at is called **"User-based Collaborative Filtering"**. It relies on measuring how similar the target user (the user whose rating we want to predict) is to the other users who have already rated the item the target user is considering purchasing, watching, etc.

More details below!

---
### Import the data

In [2]:
# read the ratings table from the 'Model' sheet, keep only the user/movie block,
# and index it by user name (makes the look-ups further down easier):
user_df = (
    pd.read_excel('FinalUserBased.xlsx', sheet_name='Model', header=6)
    .iloc[:7, 2:9]
    .rename(columns={'Unnamed: 2': 'user'})
    .set_index('user')
)
user_df

Unnamed: 0_level_0,Sixth Sense,Sully,Still Alice,Superman,DodgeBall,Parasite
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Lana,2.5,3.5,3.0,3.5,2.5,
George,3.0,3.5,1.5,5.0,3.5,3.0
Manuel,2.5,3.0,,3.5,,4.0
Charles,,3.5,3.0,4.0,2.5,4.5
Noel,3.0,4.0,2.0,3.0,2.0,3.0
James,3.0,4.0,,5.0,3.5,3.0
Theresa,,4.5,,4.0,1.0,


---
### Get mean rating score for all users:
Compute each user's mean rating across the movies they have rated (a row-wise mean; missing ratings are ignored).

In [3]:
# get all movie title columns; 'mean_rating' is excluded by name so that re-running
# this cell neither lists it as a movie nor averages it back into itself:
movies = [col for col in user_df.columns if col != 'mean_rating']

# take the mean per user (row-wise; unrated movies are NaN and are skipped by .mean()):
user_df['mean_rating'] = user_df[movies].mean(axis=1).round(2)
user_df

Unnamed: 0_level_0,Sixth Sense,Sully,Still Alice,Superman,DodgeBall,Parasite,mean_rating
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Lana,2.5,3.5,3.0,3.5,2.5,,3.0
George,3.0,3.5,1.5,5.0,3.5,3.0,3.25
Manuel,2.5,3.0,,3.5,,4.0,3.25
Charles,,3.5,3.0,4.0,2.5,4.5,3.5
Noel,3.0,4.0,2.0,3.0,2.0,3.0,2.83
James,3.0,4.0,,5.0,3.5,3.0,3.7
Theresa,,4.5,,4.0,1.0,,3.17


---
### Get similarity measure for all users:
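First, we need to reshape the ratings into a movies × users table, so that every user is a column and the pairwise correlations between users can be computed.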

In [4]:
# build a movies x users ratings table: drop the derived 'mean_rating' column by name
# (not by position) and transpose so that every user becomes a column.
# Missing ratings are left as NaN -- pandas' DataFrame.corr() skips them pairwise,
# so nothing needs to be filled in here.
sim_df = user_df.drop(columns='mean_rating').transpose()
sim_df

user,Lana,George,Manuel,Charles,Noel,James,Theresa
Sixth Sense,2.5,3.0,2.5,,3.0,3.0,
Sully,3.5,3.5,3.0,3.5,4.0,4.0,4.5
Still Alice,3.0,1.5,,3.0,2.0,,
Superman,3.5,5.0,3.5,4.0,3.0,5.0,4.0
DodgeBall,2.5,3.5,,2.5,2.0,3.5,1.0
Parasite,,3.0,4.0,4.5,3.0,3.0,


### Similarity Matrix (correlations):

In [5]:
# calculate user-user similarity (Pearson correlation between the users' rating columns).
# The movies x users table is derived from user_df here instead of reading the previous
# value of sim_df: overwriting sim_df with its own .corr() would turn a second run of
# this cell into "correlations of correlations".
ratings_by_user = user_df.drop(columns='mean_rating').transpose()
sim_df = ratings_by_user.corr().round(2)
sim_df

user,Lana,George,Manuel,Charles,Noel,James,Theresa
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Lana,1.0,0.4,0.87,0.94,0.6,0.85,0.99
George,0.4,1.0,0.2,0.31,0.41,0.96,0.38
Manuel,0.87,0.2,1.0,1.0,-0.26,0.13,-1.0
Charles,0.94,0.31,1.0,1.0,0.57,0.03,0.89
Noel,0.6,0.41,-0.26,0.57,1.0,0.21,0.92
James,0.85,0.96,0.13,0.03,0.21,1.0,0.66
Theresa,0.99,0.38,-1.0,0.89,0.92,0.66,1.0


---
### Build Prediction Matrix:

In [6]:
# put each user's ratings, mean rating and similarity-to-every-user side by side,
# matching the two tables on their shared user index:
pred_df = pd.merge(user_df, sim_df, how='inner', left_index=True, right_index=True)
pred_df

Unnamed: 0_level_0,Sixth Sense,Sully,Still Alice,Superman,DodgeBall,Parasite,mean_rating,Lana,George,Manuel,Charles,Noel,James,Theresa
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Lana,2.5,3.5,3.0,3.5,2.5,,3.0,1.0,0.4,0.87,0.94,0.6,0.85,0.99
George,3.0,3.5,1.5,5.0,3.5,3.0,3.25,0.4,1.0,0.2,0.31,0.41,0.96,0.38
Manuel,2.5,3.0,,3.5,,4.0,3.25,0.87,0.2,1.0,1.0,-0.26,0.13,-1.0
Charles,,3.5,3.0,4.0,2.5,4.5,3.5,0.94,0.31,1.0,1.0,0.57,0.03,0.89
Noel,3.0,4.0,2.0,3.0,2.0,3.0,2.83,0.6,0.41,-0.26,0.57,1.0,0.21,0.92
James,3.0,4.0,,5.0,3.5,3.0,3.7,0.85,0.96,0.13,0.03,0.21,1.0,0.66
Theresa,,4.5,,4.0,1.0,,3.17,0.99,0.38,-1.0,0.89,0.92,0.66,1.0


---
### Define the "User-based Collaborative Filtering" similarity score function:
Using the prediction matrix above, we compute the target user's predicted rating for the target item (a similarity-weighted score) with the formula below, where the sums run over the other users who have rated the target item:
$$
\text{Predicted rating} = \text{(target user's average rating)} + \frac{\sum\limits_{\text{other users}} \text{(other user's similarity to target user)} \times \big(\text{(other user's rating for target item)} - \text{(other user's average rating)}\big)}{\sum\limits_{\text{other users}} \left|\text{other user's similarity to target user}\right|}
$$
(See page 238)

In [115]:
# function definition:
def predict_rating(df, user_name, target_item):
    """Predict a user's rating for an item with user-based collaborative filtering.

    The prediction is the target user's mean rating plus the similarity-weighted
    average of how far the other users rated the item above/below their own mean:

        mean(target) + sum(sim * (rating - mean(other))) / sum(|sim|)

    Parameters
    ----------
    df : pandas.DataFrame
        Prediction matrix indexed by user. It must contain a column named
        ``target_item`` (ratings, NaN where unrated), a ``'mean_rating'`` column,
        and a column named ``user_name`` holding every user's similarity to the
        target user.
    user_name : label
        Index label of the target user (also the name of the similarity column).
    target_item : label
        Column name of the item whose rating is predicted.

    Returns
    -------
    float
        Predicted rating rounded to 2 decimals. When no other user contributes
        (nobody else rated the item, or all usable similarities are zero) the
        target user's own mean rating is returned instead.

    Raises
    ------
    KeyError
        If ``user_name`` is not in the index/columns, or ``target_item`` /
        ``'mean_rating'`` is not a column of ``df``.
    """
    # get mean rating for target user (single .loc lookup, no chained indexing):
    target_mean_rating = df.loc[user_name, 'mean_rating']

    # drop target user from matrix (row) to avoid computing its own values:
    neighbours = df.drop(user_name, axis=0)

    # keep only users that actually rated the target item AND have a defined
    # similarity to the target user. notna() is used rather than ">= 0" so valid
    # negative ratings (centred scales) are kept, and NaN similarities cannot
    # poison the sums below.
    usable = neighbours[target_item].notna() & neighbours[user_name].notna()
    neighbours = neighbours[usable]

    # other users' similarity to the target user:
    similarities = neighbours[user_name].to_numpy(dtype=float)

    # rating for the item relative to each user's own mean rating (removes the
    # per-user base rate):
    rating_offsets = (neighbours[target_item] - neighbours['mean_rating']).to_numpy(dtype=float)

    # define numerator and denominator for the formula:
    numerator = np.sum(similarities * rating_offsets)
    denominator = np.sum(np.abs(similarities))

    # no informative neighbours -> the best available estimate is the user's own mean
    # (previously this path raised ZeroDivisionError or returned NaN):
    if denominator == 0:
        return round(float(target_mean_rating), 2)

    # predicted rating:
    return round(float(target_mean_rating + numerator / denominator), 2)

In [118]:
# predicted rating of the movie 'Still Alice' for the user 'Theresa' (who has not rated it):
predict_rating(df=pred_df, user_name='Theresa', target_item='Still Alice')

2.58

As we can see from the output above, Theresa's predicted rating for "Still Alice" should be $2.58$.

---