In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader
from surprise import KNNWithMeans
from surprise.model_selection import KFold, cross_validate, train_test_split, LeaveOneOut
from surprise import accuracy
from tabulate import tabulate
from collections import defaultdict

# Modify the default prediction function for the TOP-N Scenario

In [2]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
from surprise.prediction_algorithms.algo_base import AlgoBase
from surprise.prediction_algorithms.predictions import PredictionImpossible
from six import iteritems
import heapq

class SymmetricAlgo(AlgoBase):
    """This is an abstract class aimed to ease the use of symmetric algorithms.
    A symmetric algorithm is an algorithm that can can be based on users or on
    items indifferently, e.g. all the algorithms in this module.
    When the algo is user-based x denotes a user and y an item. Else, it's
    reversed.
    """

    def __init__(self, sim_options={}, verbose=True, **kwargs):

        AlgoBase.__init__(self, sim_options=sim_options, **kwargs)
        self.verbose = verbose

    def fit(self, trainset):

        AlgoBase.fit(self, trainset)

        ub = self.sim_options['user_based']
        self.n_x = self.trainset.n_users if ub else self.trainset.n_items
        self.n_y = self.trainset.n_items if ub else self.trainset.n_users
        self.xr = self.trainset.ur if ub else self.trainset.ir
        self.yr = self.trainset.ir if ub else self.trainset.ur

        return self

    def switch(self, u_stuff, i_stuff):
        """Return x_stuff and y_stuff depending on the user_based field."""

        if self.sim_options['user_based']:
            return u_stuff, i_stuff
        else:
            return i_stuff, u_stuff


class RankingKNN(SymmetricAlgo):

    def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs):

        SymmetricAlgo.__init__(self, sim_options=sim_options,
                               verbose=verbose, **kwargs)

        self.k = k
        self.min_k = min_k

    def fit(self, trainset):

        SymmetricAlgo.fit(self, trainset)
        self.sim = self.compute_similarities()

        return self

    def estimate(self, u, i):

        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        x, y = self.switch(u, i)

        neighbors = [(x2, self.sim[x, x2], r) for (x2, r) in self.yr[y]]
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[1])

        est = 0

        # compute weighted average
        sum_sim = actual_k = 0
        for (nb, sim, r) in k_neighbors:
            if sim > 0:
                sum_sim += sim
                actual_k += 1

        if actual_k < self.min_k:
            est = 0
        else:
            est += sum_sim

        details = {'actual_k': actual_k}
        return est, details

# Read and load data

In [9]:
#ub_algo = RankingKNN(k=80, min_k=5, sim_options={'name': 'cosine', 'user_based': True})
#ib_algo = RankingKNN(k=80, min_k=5, sim_options={'name': 'cosine', 'user_based': False})
ub_algo = KNNWithMeans(k=60, min_k=5, sim_options={'name': 'pearson_baseline', 'shrinkage': 50, 'user_based': True})
ib_algo = KNNWithMeans(k=40, min_k=5, sim_options={'name': 'pearson_baseline', 'shrinkage': 50, 'user_based': False})

In [4]:
# Read both the datasets from the files using pandas
movielens_df = pd.read_csv("../data/u.data", sep="\t", header=None)
movielens_df.columns = ["userID", "itemID", "rating", "timestamp"]
pda_df = pd.read_csv("../data/train-PDA2018.csv", sep=",")
print(movielens_df.head())
print("\n\n")
print(pda_df.head())

   userID  itemID  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596



   userID  itemID  rating  timeStamp
0       5     648       5  978297876
1       5    1394       5  978298237
2       5    3534       5  978297149
3       5     104       4  978298558
4       5    2735       5  978297919


# Subsampling

In [5]:
# Sample the data such that every user has rated at least 10 items and every item has been by at least 10 users
print("Shapes before sub-sampling:")
print(movielens_df.shape)
print(pda_df.shape)

# Movielens users all have at least 20 ratings so no need to subsample the user values
ml_subsampled = movielens_df[movielens_df['itemID'].isin(movielens_df['itemID'].value_counts()[movielens_df['itemID'].value_counts()>10].index)]
pda_subsampled = pda_df[pda_df['itemID'].isin(pda_df['itemID'].value_counts()[pda_df['itemID'].value_counts()>10].index)]
pda_subsampled = pda_subsampled[pda_subsampled['userID'].isin(pda_subsampled['userID'].value_counts()[pda_subsampled['userID'].value_counts()>10].index)]
print("\nShapes after sub-sampling:")
print(ml_subsampled.shape)
print(pda_subsampled.shape)

Shapes before sub-sampling:
(100000, 4)
(470711, 4)

Shapes after sub-sampling:
(97623, 4)
(465154, 4)


In [6]:
# Create the training datasets using Surprise's reader class
reader = Reader(rating_scale=(1,5)) # We have ratings from 1 to 5 so we create the rating scale

# Load the data from the dataframes
movielens_dataset = Dataset.load_from_df(ml_subsampled.iloc[:,0:3], reader)
pda_dataset = Dataset.load_from_df(pda_subsampled.iloc[:,0:3], reader)

# Build full trainsets to print out the data loaded above
# mls_train = movielens_dataset.build_full_trainset()
# pda_train = pda_dataset.build_full_trainset()
mls_train, mls_test = train_test_split(data=movielens_dataset, test_size=0.2)
pda_train, pda_test = train_test_split(data=pda_dataset, test_size=0.2)

# Print out some basic information about the datasets
print("General information on the training sets we will be using \n")
print("1) Number of items in each dataset", " ML100k:", mls_train.n_items, "PDA:", pda_train.n_items)
print("2) Number of users in each dataset", " ML100k:", mls_train.n_users, "PDA:", pda_train.n_users)
print("3) Number of ratings in each dataset", " ML100k:", mls_train.n_ratings, "PDA:", pda_train.n_ratings)
print("4) Mean rating", " ML100k:", mls_train.global_mean, "PDA:", pda_train.global_mean)

General information on the training sets we will be using 

1) Number of items in each dataset  ML100k: 1119 PDA: 1753
2) Number of users in each dataset  ML100k: 943 PDA: 4706
3) Number of ratings in each dataset  ML100k: 78098 PDA: 372123
4) Mean rating  ML100k: 3.5481062255115368 PDA: 3.6341102269948378


In [7]:
# Check that we have ratings in the train set for all the users in the test set
items_not_in_train = []
for _,itemId, _ in mls_test:    
    if itemId not in mls_train.ir.keys():
        items_not_in_train.append(itemId)
        
def user_seen_items(userId):
    return [train_itemId for train_itemId, rating in mls_train.ur[userId]]

# Fit the data to the model and generate rating predictions

In [10]:
ub_algo.fit(mls_train)
predictions = ub_algo.test(mls_test)
predictions_df = pd.DataFrame(predictions)
predictions_df = predictions_df.iloc[:,:-1] 
predictions_df

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


Unnamed: 0,uid,iid,r_ui,est
0,648,746,4.0,3.628166
1,198,511,4.0,3.973553
2,360,1,3.0,4.043833
3,293,199,5.0,3.717179
4,115,284,2.0,3.487037
...,...,...,...,...
19520,450,164,4.0,3.864094
19521,6,496,4.0,4.217721
19522,255,447,3.0,2.982379
19523,735,321,3.0,2.897516


# Top-N Problem

In [11]:
def GetTopN(predictions, n, minimumRating, criterion):
    topN = defaultdict(list)
    
    for index, row in predictions.iterrows():
        if (row[criterion] >= minimumRating):
            topN[int(row.uid)].append((int(row.iid), row[criterion]))

    for userID, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[int(userID)] = ratings[:n]

    return topN

In [12]:
def precision_recall_at_k(predictions, k=5, threshold=3.5):
    top_n_recoms_est = GetTopN(predictions_df, n=k, minimumRating=threshold, criterion="est")
    top_n_recoms_real = GetTopN(predictions_df, n=k, minimumRating=threshold, criterion="r_ui")
    above_threshold = predictions_df[predictions_df.r_ui >= threshold]

    precisions = {}
    recalls = {}

    for uid, est_topn in top_n_recoms_est.items():
        # Get items the user has already rated
        already_seen = user_seen_items(uid)
        # Get relevant items for the user
        n_rel_for_user = len(above_threshold[above_threshold.uid == uid])        
        tp = 0
        # Penalize the scores if:
        # - The item we are recommending was never seen in the training set (how could we recommend what we don't know?)
        # - The user has already rated this item: It's not a good recommendation since the user already knows it/has seen it
        for est_itemId, _ in est_topn:
            if(est_itemId in items_not_in_train or est_itemId in already_seen):
                tp += 0
            else:
                for real_itemId, _ in top_n_recoms_real[uid]:
                    if (est_itemId == real_itemId):
                        tp +=1
        
        precisions[uid] = tp/k
        recalls[uid] = tp/n_rel_for_user if n_rel_for_user != 0 else 0

    return precisions, recalls

In [280]:
from sklearn.metrics import ndcg_score
def ndcg_for_rec(predictions, threshold=3.5):
    top_n_recoms_est = GetTopN(predictions, n=predictions.shape[0], minimumRating=threshold, criterion="est")
    top_n_recoms_real = GetTopN(predictions, n=predictions.shape[0], minimumRating=threshold, criterion="r_ui")
    ndcgs = {}

    for uid, real_topn in top_n_recoms_real.items():
        user_gt = [true_rating[0] for true_rating in real_topn]
        est_top_for_current = top_n_recoms_est[uid]
        predicted_items = [est_rating[0] for est_rating in est_top_for_current]
        predicted_scores = [est_rating[1] for est_rating in est_top_for_current]
        gain_scores = np.zeros(len(user_gt)).tolist()

        ndcg_val = 0
        
        if(len(predicted_items) == 0):
            ndcg_val += 0
        elif(len(predicted_items) == 1 or len(user_gt) == 1):
            if(predicted_items[0] == user_gt[0]):
                ndcg_val += 1
        else:
            for i, pred_item in enumerate(predicted_items):
                for j, gt_item in enumerate(user_gt):
                    if(gt_item==pred_item):
                        gain_scores[j] = i
            gain_scores =[int(elem) for elem in gain_scores]
            ndcg_val += ndcg_score(np.asarray([user_gt]), np.asarray([gain_scores]))
            
        #print("GT:", user_gt)
        #print("Pred:", predicted_items)
        #print(gain_scores)
        #print( "\n\n")
        ndcgs[uid] = ndcg_val

    return ndcgs

In [281]:
#pres_at_5, recalls_at_5 = precision_recall_at_k(predictions_df, k=5)
#pres_at_10, recalls_at_10 = precision_recall_at_k(predictions_df, k=10)
ndcgs = ndcg_for_rec(predictions_df)

In [282]:
avg_pre_5 = np.array([pres_at_5[k] for k in pres_at_5.keys()]).mean()
avg_rec_5 = np.array([recalls_at_5[k] for k in recalls_at_5.keys()]).mean()
print("Precision and Recall @5:", avg_pre_5, avg_rec_5)
avg_pre_10 = np.array([pres_at_10[k] for k in pres_at_10.keys()]).mean()
avg_rec_10 = np.array([recalls_at_10[k] for k in recalls_at_10.keys()]).mean()
print("Precision and Recall @10:", avg_pre_10, avg_rec_10)
avg_ndcg = np.array([ndcgs[k] for k in ndcgs.keys()]).mean()
print("NDCG:", avg_ndcg)

Precision and Recall @5: 0.36523754345307075 0.32982892985517315
Precision and Recall @10: 0.36326767091541134 0.48000042990615494
NDCG: 0.7041892827871443
