### Imports

In [1]:
import pandas as pd
import numpy as np
import random
import math

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
import recmetrics

from sklearn.metrics import mean_absolute_error

from surprise import Dataset, get_dataset_dir, Reader, accuracy
from surprise import KNNWithZScore
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate, KFold

import warnings
import itertools
from itertools import chain

np.random.seed(1) # control randomness

## Memory-Based Collaborative Filtering (Mean Prediction)

This section uses a small ratings matrix to demonstrate a basic memory-based collaborative filtering system by predicting the rating for an active user. The generated prediction is the mean rating those within the active user's neighbourhood have given the item.

In [2]:
# matrix dimensions
number_of_movies = 5
number_of_users = 5

movies = [] # array of movie names
for i in range(1, number_of_movies+1): movies.append('movie' + str(i))

ratings_matrix = pd.DataFrame(np.random.randint(1,5,size=(number_of_users, number_of_movies)),
                              columns=movies).rename_axis('userId')

# missing movie ratings
ratings_matrix.at[0,'movie1']= np.nan
ratings_matrix.at[1,'movie1']= np.nan
ratings_matrix.at[4,'movie4']= np.nan

ratings_matrix

Unnamed: 0_level_0,movie1,movie2,movie3,movie4,movie5
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,,4,1,1.0,4
1,,4,2,4.0,1
2,1.0,2,1,4.0,2
3,1.0,3,2,3.0,1
4,3.0,2,3,,4


In [3]:
def user_based_prediction(ratings_matrix, userId, k, movie_predict):
    """
    :type ratings_matrix: pd.DataFrame()
    :type userId: int
    :type k: int
    :type movie_predict: str
    :rtype: int
    """
    userId = ratings_matrix.index.get_loc(userId) # index of user_i
    
    # remove users with no ratings of the movie
    ratings_matrix = pd.concat([ratings_matrix.iloc[[userId]],
                                ratings_matrix[ratings_matrix[movie_predict].notna()]])
    
    distance_matrix = ratings_matrix.T.corr(method='pearson') # similarity matrix
    print(distance_matrix, end="\n\n")
    
    user_weights = distance_matrix[userId].drop(userId).sort_values(ascending=False) # similarity list for user_i
    print(user_weights, end="\n\n")

    # indices of the k most similar users
    most_similar_users = user_weights.iloc[:k].index
    print(most_similar_users, end="\n\n")
       
    _similar_user_ratings = []
    
    # obtain k most similar users ratings
    temp = ratings_matrix.loc[most_similar_users]    
    for index, row in temp.iterrows():
        _similar_user_ratings.append(row[movie_predict])
    
    return np.mean(_similar_user_ratings) # average neighbourhood rating


We use the function to generate a prediction on userId: 1 for the item: movie1 by taking the three most similar neighbours of the active user and computing the mean value of their ratings. As is shown, userId: 0 has no present rating for movie1 so this user is removed from our potential neighbourhood corpus.

In [4]:
print('User-Based Prediction: ' + str(user_based_prediction(ratings_matrix, 1, 2, 'movie1')))

userId         1         2         3         4
userId                                        
1       1.000000  0.573964  0.986440 -0.981981
2       0.573964  1.000000  0.612372  0.000000
3       0.986440  0.612372  1.000000 -0.852803
4      -0.981981  0.000000 -0.852803  1.000000

userId
3    0.986440
2    0.573964
4   -0.981981
Name: 1, dtype: float64

Int64Index([3, 2], dtype='int64', name='userId')

User-Based Prediction: 1.0


### Item-Based Prediction

The previous function computes a user-based prediction by creating a neighbourhood of similar users, however, memory-based filtering can also be item-based which generates a predicted rating in relation to how the active user has rated other items.

In [5]:
def item_based_prediction(ratings_matrix, userId, k, movie_predict):
    """
    :type ratings_matrix: pd.DataFrame()
    :type userId: int
    :type k: int
    :type movie_predict: str
    :rtype: int
    """
    userId = ratings_matrix.index.get_loc(userId) # index of user_i
    
    # remove items the active user has not rated ([active user, other users with no null values])
    ratings_matrix = pd.concat([ratings_matrix.iloc[[userId]],
                                ratings_matrix[ratings_matrix[movie_predict].notna()]])
    
    distance_matrix = ratings_matrix.corr(method='pearson') # similarity matrix
    print(distance_matrix, end="\n\n")
    
    item_weights = distance_matrix[movie_predict].drop(movie_predict).sort_values(ascending=False) # similarity list for item_j
    print(item_weights, end="\n\n")

    # indices of the k most similar items
    most_similar_items = item_weights.iloc[:k].index
    print(most_similar_items, end="\n\n")
       
    _similar_user_ratings = []
    
    # obtain k most similar item ratings
    temp = ratings_matrix.T.loc[most_similar_items]   
    for index, row in temp.iterrows():
        _similar_user_ratings.append(row[userId])
    
    return np.mean(_similar_user_ratings) # average neighbourhood rating

In [6]:
print('Item-Based Prediction: ' + str(item_based_prediction(ratings_matrix, 1, 2, 'movie1')))

          movie1    movie2    movie3  movie4    movie5
movie1  1.000000 -0.500000  0.866025     NaN  0.944911
movie2 -0.500000  1.000000  0.000000     0.0 -0.738549
movie3  0.866025  0.000000  1.000000    -0.5  0.577350
movie4       NaN  0.000000 -0.500000     1.0  0.500000
movie5  0.944911 -0.738549  0.577350     0.5  1.000000

movie5    0.944911
movie3    0.866025
movie2   -0.500000
movie4         NaN
Name: movie1, dtype: float64

Index(['movie5', 'movie3'], dtype='object')

Item-Based Prediction: 1.5


## Measuring Prediction Accuracy

To simulate a real-life example dataset, we use a larger matrix with the addition of sparsity i.e., a large portion of missing rating values. To assess a prediction, we use methods derived from machine learning to create a set of test users to generate an accuracy value by using an error measurement known as mean absolute error (MAE).

In [7]:
# matrix dimensions
number_of_movies = 100
number_of_users = 1000

# we use item indexes as opposed to regular strings to represent movies as it is easier for computation later
ratings_matrix = pd.DataFrame(np.random.randint(1,5,size=(number_of_users, number_of_movies)),
                              columns=range(0, number_of_movies)).rename_axis('userId')

# randomly add specific percentage of missing values 
percentage = 0.75
index = [(row, col) for row in range(ratings_matrix.shape[0]) for col in range(ratings_matrix.shape[1])]
for row, col in random.sample(index, int(round(percentage * len(index)))):
    ratings_matrix.iat[row, col] = np.nan
    
ratings_matrix # columns -> movieId

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,3.0,3.0,,,4.0,,...,,,,,,,,,,
1,,4.0,,,,2.0,,,,,...,,,2.0,,,,4.0,,,3.0
2,,1.0,,,,,,,,,...,4.0,4.0,,,,,,,,
3,1.0,4.0,,,,2.0,,,4.0,4.0,...,,4.0,3.0,1.0,,,,,,4.0
4,4.0,,,,,,,,,,...,,4.0,,2.0,,1.0,,1.0,3.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,1.0,,,,,,,3.0,,...,,,,,,4.0,,,,1.0
996,2.0,,,,,,2.0,4.0,1.0,,...,4.0,,3.0,,,,3.0,,,
997,,2.0,,,,,,,,,...,,,2.0,,,,,,,
998,,,,,1.0,,,,1.0,,...,,,,,,,,4.0,3.0,


In [8]:
# number of missing cells
ratings_matrix.isnull().sum().sum()

75000

### Redefine user-based prediction function

Sparser matrices can occassionally generate null correlation pair values which can skew the weights. We can simply replace these values with 0 to signify no correlation between the users.

In [9]:
def user_based_prediction(ratings_matrix, norm, userId, k, movie_predict):
    """
    :type ratings_matrix: pd.DataFrame()
    :type userId: int
    :type k: int
    :type movie_predict: str
    :rtype: int
    """
    # ensure entry is missing
    if (not math.isnan(ratings_matrix.at[userId, movie_predict])):
        raise Exception("Rating not Missing")
    
    userId = ratings_matrix.index.get_loc(userId) # index of user_i
    
    # remove users with no ratings of the movie
    ratings_matrix = pd.concat([ratings_matrix.iloc[[userId]],
                                ratings_matrix[ratings_matrix[movie_predict].notna()]])
    
    distance_matrix = ratings_matrix.T.corr(method='pearson') # similarity matrix
    user_weights = distance_matrix[userId].fillna(0) # similarity list for user_i

    # indices of the n most similar users
    most_similar_users = distance_matrix[userId].drop(userId).sort_values(ascending = False).index[:k]

    temp = ratings_matrix.loc[most_similar_users] # mean and weights for n most similar users

    temp['mean'] = temp.mean(axis=1)
    temp['std'] = temp.std(axis=1)
    temp['weight'] = user_weights
    
    _sum = 0
    _weight = 0
    
    if norm == 'mean_centred':
        # mean centred prediction
        for index, row in temp.iterrows():
            _sum += (row[movie_predict] - row['mean']) * row['weight']
            _weight += abs(row['weight'])

        return ratings_matrix.loc[userId].mean() + (_sum / _weight) # return mean centred prediction
    
    elif norm == 'z_score' :
        # z-score prediction
        for index, row in temp.iterrows():
            _sum +=  row['weight'] * ((row[movie_predict] - row['mean']) / row['std'])
            _weight += abs(row['weight'])
    
        return ratings_matrix.loc[userId].mean() + (_sum / _weight) # return z score prediction
    
    else: # weighted average
        for index, row in temp.iterrows():
            _sum +=  row['weight'] * row[movie_predict]
            _weight += abs(row['weight'])
    
        return ratings_matrix.loc[userId].mean() + (_sum / _weight) # return weighted average prediction

### Testing

A predetermined number of users is used as our test set to measure the accuracy of our prediction.

In [10]:
'''
Generates a set of test users from a ratings matrix by selecting a predetermined number of users, saving their ratings,
and hiding them to then generate a prediction and measure the accuracy score.
'''
def generate_test_users(n):
    """
    :type n: int
    :rtype: pd.DataFrame()
    """
    test_users = ratings_matrix.sample(n) # randomly sampled test users

    series = []
    for index, row in test_users.iterrows():
        while True:
            item_rating = row.sample(axis=0) # get random rating value
            if (item_rating.values == item_rating.values): # ensure rating is not missing         
                ratings_matrix.at[item_rating.name, item_rating.keys()[0]] = np.nan # hide rating

                s = pd.Series([item_rating.name, item_rating.keys()[0], item_rating.values[0]])           
                series.append(s)
                break

    # dataframe representing test users
    return pd.DataFrame(series).rename(columns={0: 'userId', 1: "itemId", 2: "actual"}).astype('int').set_index('userId')

In [11]:
test_users_df = generate_test_users(250) # 250 randomly generated test users
test_users_df

Unnamed: 0_level_0,itemId,actual
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
616,92,4
444,25,4
463,94,4
571,50,4
741,68,3
...,...,...
961,53,2
750,69,2
415,15,1
760,53,4


For each user of our test set, we then compute a weighted average prediction which is an improvement from the mean as it takes into account the similarities between users.

In [12]:
k = 40 # neighbourhood size

predicted_ratings = []
for index, row in test_users_df.iterrows():
    pred = user_based_prediction(ratings_matrix, '', row.name, k, row.itemId) # weighted average
    predicted_ratings.append(pred)

testset = test_users_df.copy()
testset['predicted'] = predicted_ratings

In [13]:
print(testset, end='\n\n')
print('MAE: ' + str(mean_absolute_error(testset['actual'], testset['predicted'])))

        itemId  actual  predicted
userId                           
616         92       4   4.741269
444         25       4   4.671717
463         94       4   5.459725
571         50       4   5.615901
741         68       3   4.916254
...        ...     ...        ...
961         53       2   4.664261
750         69       2   4.945387
415         15       1   4.982232
760         53       4   5.356771
763         44       1   5.357222

[250 rows x 3 columns]

MAE: 2.587923411427463


### Mean-centring and Z-score Normalisation

The script below includes the use of two normalisation techniques to compute the predicted rating; mean centring and z-score normalisation. Similar to cross validation (another machine learning technique), the script below computes the mean absolute error for the sampled test users and repeats this for each normalisation technique 5 times with the aim of seeing how normalisation affects the MAE of our estimated ratings.

In [14]:
norm = ['weighted_average', 'mean_centred', 'z_score']
k = 40 # neighbourhood size

df_rows = []
for i in range(5):
    testset = generate_test_users(250)
    
    temp = []
    for i in range(len(norm)):
        predicted_ratings = []

        for index, row in test_users_df.iterrows():
            prediction = user_based_prediction(ratings_matrix, norm[i], row.name, k, row.itemId)
            predicted_ratings.append(prediction)
        
        temp.append(mean_absolute_error(testset['actual'], predicted_ratings))
    df_rows.append(temp)
    
results = pd.DataFrame(df_rows, columns=norm).T
results

Unnamed: 0,0,1,2,3,4
weighted_average,2.611374,2.519954,2.465037,2.445958,2.526026
mean_centred,1.047638,0.974495,1.006373,0.985126,1.038001
z_score,1.044736,0.974187,1.005482,0.982919,1.035998


In [15]:
results['mean'], results['median'] = results.apply(np.mean, axis=1), results.apply(np.median, axis=1)
results

Unnamed: 0,0,1,2,3,4,mean,median
weighted_average,2.611374,2.519954,2.465037,2.445958,2.526026,2.51367,2.519954
mean_centred,1.047638,0.974495,1.006373,0.985126,1.038001,1.010327,1.006373
z_score,1.044736,0.974187,1.005482,0.982919,1.035998,1.008664,1.005482


# Introduction to Surprise

Surprise implementation of the our user-based algorithm (kNN) used to evaluate the results of our previously defined test set

In [16]:
# convert ratings matrix to list of ratings
df = ratings_matrix.reset_index().set_index('userId').stack().reset_index()
df.columns = (["userId", "itemId", "rating"])

df

Unnamed: 0,userId,itemId,rating
0,0,4,3.0
1,0,5,3.0
2,0,8,4.0
3,0,13,1.0
4,0,14,3.0
...,...,...,...
23495,999,61,3.0
23496,999,84,2.0
23497,999,88,1.0
23498,999,89,1.0


In [17]:
# convert dataframe to surprise object dataset
data = Dataset.load_from_df(df[["userId", "itemId", "rating"]], Reader(rating_scale=(1, 5)))

trainset = data.build_full_trainset() # full dataset

In [18]:
sim_options = {
    "name": "pearson", # similarity method
    "user_based": True,  # user-based or item-based 
}

algo_user_based = KNNWithZScore(sim_options=sim_options)
algo_user_based.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x23edf3ec970>

In [19]:
algo_user_based.estimate(0, 7) # demo

(2.565643959779639, {'actual_k': 40})

In [20]:
testset = test_users_df.copy()

predicted_ratings = []
for index, row in testset.iterrows():
    pred = algo_user_based.estimate(row.name, row.itemId)[0]
    predicted_ratings.append(pred)

testset['predicted'] = predicted_ratings

In [21]:
print(testset, end='\n\n')
print('MAE: ' + str(mean_absolute_error(testset['actual'], testset['predicted'])))

        itemId  actual  predicted
userId                           
616         92       4   2.877148
444         25       4   2.518736
463         94       4   2.830961
571         50       4   2.847812
741         68       3   2.353519
...        ...     ...        ...
961         53       2   2.476065
750         69       2   2.939575
415         15       1   2.455481
760         53       4   2.605621
763         44       1   2.445488

[250 rows x 3 columns]

MAE: 0.9551928485424012
