In [1]:
# data manipulation:
import pandas as pd
import numpy as np

# stats:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import pearsonr
from scipy import stats
import math

# plotting and images:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import Image

# import warnings
# warnings.filterwarnings("ignore")

---
## About the data:
In the second notebook in this chapter we look at one of the ways in which companies like Amazon provide recommendations based on item similarity to previoulsy purchased items, or previously watched movies.

The technique we'll be looking at is called "Item-based Collaborative Filtering". It relies on finding the similarity of the target item under evaluation to previously purchased/watched items.

More details below!

---
### Import the data

In [2]:
item_df = pd.read_excel('FinalItemBased.xlsx', header=6).iloc[:7, 2:9].rename(columns={'Unnamed: 2': 'user'}).fillna(np.nan)
item_df.set_index('user', inplace=True)
item_df

Unnamed: 0_level_0,Sixth Sense,Sully,Still Alice,Superman,DodgeBall,Parasite
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Lana,2.5,3.5,3.0,3.5,2.5,
George,3.0,3.5,1.5,5.0,3.5,3.0
Manuel,2.5,3.0,,3.5,,4.0
Charles,,3.5,3.0,4.0,2.5,4.5
Noel,3.0,4.0,2.0,3.0,2.0,3.0
James,3.0,4.0,,5.0,3.5,3.0
Theresa,,4.5,,4.0,1.0,


---
### Find item correlations:
In this case, the item correlations (seem to) come from the user ratings. 

In [3]:
item_corr = item_df.corr()
item_corr

Unnamed: 0,Sixth Sense,Sully,Still Alice,Superman,DodgeBall,Parasite
Sixth Sense,1.0,0.763763,-0.944911,0.48795,0.333333,-1.0
Sully,0.763763,1.0,-0.333333,0.111803,-0.645497,-0.633866
Still Alice,-0.944911,-0.333333,1.0,-0.42289,-0.485662,0.944911
Superman,0.48795,0.111803,-0.42289,1.0,0.657952,-0.296464
DodgeBall,0.333333,-0.645497,-0.485662,0.657952,1.0,-0.333333
Parasite,-1.0,-0.633866,0.944911,-0.296464,-0.333333,1.0


---
### Define item similarity function:
$$
\text{Similarity score }= \text{average ratings for $target\ user$ } + \frac{\sum_\limits{other\ items} \text{($other\ items$ similarity to $target\ item$)} * \text{(($target\ user$ ratings for $target\ item$}) - (\text{$target\ user$ average ratings)})}{\sum_\limits{other\ items}|\text{$other\ items$ similarity to $target\ item$}|}
$$
(See page 229)

In [4]:
def item_based_similarity(user_df, item_corr_df, target_user, target_item):
    # calculate target user's mean rating:
    target_user_mean_rating = np.array(round(user_df.loc[target_user].mean(), 2))
    
    # drop target item from user df (to avoid computing its values in the formula):
    user_df.drop(target_item, axis=1, inplace=True)
    
    # drop target item from correlation df (to avoid computing its values in the formula):
    item_corr_df.drop(target_item, axis=1, inplace=True)
    
    # get other item's correlation to target item (long var name, I know...):
    other_items_corr_to_target_item = np.array(round(item_corr_df.loc[target_item], 2))
    
    # get target user's rating for other movies:
    target_user_ratings_other_items = np.array(round(user_df.loc[target_user], 2))
    
    # get the index of unrated (nan) item values:
    unrated = np.argwhere(np.isnan(target_user_ratings_other_items))
    
    # set correlations to NaN for unrated items (to avoid calculating lift and denominator for unrated items):
    # i.e. if user hasnt rated other items, then correlations are irrelevant.
    for item in unrated:
        other_items_corr_to_target_item[item] = np.nan
    
    # compute similarity score:
    lift = np.nansum(target_user_ratings_other_items - target_user_mean_rating)
    numerator = np.nansum(other_items_corr_to_target_item * lift)
    denominator = np.nansum(abs(other_items_corr_to_target_item))
    
    # score:
    sim_score = np.round(target_user_mean_rating + (numerator / denominator), 2)
    
    return sim_score

In [5]:
item_based_similarity(item_df, item_corr, 'Theresa', 'Still Alice')

3.18

**Note:** as per the text, Amazon uses item-based collaborative filtering since it is supposed to be more stable over time than user-based collaborative filtering. This is partly because the matrix of user correlations needs to be updated more frequently.

---