In [1]:
import pandas as pd
import numpy as np
from surprise import KNNWithMeans
from surprise import Dataset, Reader
from surprise.model_selection import KFold, cross_validate
from tabulate import tabulate
from collections import defaultdict

# Part 1: Using Surprise

## Read and load the data using Surprise

In [2]:
# Read both the datasets from the files using pandas
movielens_df = pd.read_csv("../data/u.data", sep="\t", header=None)
movielens_df.columns = ["userID", "itemID", "rating", "timestamp"]
pda_df = pd.read_csv("../data/train-PDA2018.csv", sep=",")
print("ML100K\n", movielens_df.head())
print("\n\n")
print("PDA2018\n", pda_df.head())

# Sample the data such that every user has rated at least 10 items and every item has been by at least 10 users
print("Shapes before sub-sampling:")
print(movielens_df.shape)
print(pda_df.shape)

# Movielens users all have at least 20 ratings so no need to subsample the user values
ml_subsampled = movielens_df[movielens_df['itemID'].isin(movielens_df['itemID'].value_counts()[movielens_df['itemID'].value_counts()>10].index)]
pda_subsampled = pda_df[pda_df['itemID'].isin(pda_df['itemID'].value_counts()[pda_df['itemID'].value_counts()>10].index)]
pda_subsampled = pda_subsampled[pda_subsampled['userID'].isin(pda_subsampled['userID'].value_counts()[pda_subsampled['userID'].value_counts()>10].index)]
print("\nShapes after sub-sampling:")
print(ml_subsampled.shape)
print(pda_subsampled.shape)

# Create the training datasets using Surprise's reader class
reader = Reader(rating_scale=(1,5)) # We have ratings from 1 to 5 so we create the rating scale

# Load the data from the dataframes
movielens_dataset = Dataset.load_from_df(movielens_df.iloc[:,0:3], reader)
pda_dataset = Dataset.load_from_df(pda_df.iloc[:,0:3], reader)

# Build full trainsets to print out the data loaded above
mls_train = movielens_dataset.build_full_trainset()
pda_train = pda_dataset.build_full_trainset()

# Print out some basic information about the datasets
print("\n Some general information on the training sets we will be using:")
print("1) Number of items in each dataset", " ML100k:", mls_train.n_items, "PDA:", pda_train.n_items)
print("2) Number of users in each dataset", " ML100k:", mls_train.n_users, "PDA:", pda_train.n_users)
print("3) Number of ratings in each dataset", " ML100k:", mls_train.n_ratings, "PDA:", pda_train.n_ratings)
print("4) Mean rating", " ML100k:", mls_train.global_mean, "PDA:", pda_train.global_mean)

ML100K
    userID  itemID  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596



PDA2018
    userID  itemID  rating  timeStamp
0       5     648       5  978297876
1       5    1394       5  978298237
2       5    3534       5  978297149
3       5     104       4  978298558
4       5    2735       5  978297919
Shapes before sub-sampling:
(100000, 4)
(470711, 4)

Shapes after sub-sampling:
(97623, 4)
(465154, 4)

 Some general information on the training sets we will be using:
1) Number of items in each dataset  ML100k: 1682 PDA: 1824
2) Number of users in each dataset  ML100k: 943 PDA: 5690
3) Number of ratings in each dataset  ML100k: 100000 PDA: 470711
4) Mean rating  ML100k: 3.52986 PDA: 3.638361967321775


## Surprise experiment parameters and variables


### Define auxilliary functions to get Top-N and then calculate metrics

In [3]:
# Check that we have ratings in the train set for all the users in the test set
def find_items_not_in_trainset(trainset, testset):
    items_not_in_train = []
    for _,itemId, _ in testset:    
        if itemId not in trainset.ir.keys():
            items_not_in_train.append(itemId)
            
    return items_not_in_train

        
def user_seen_items(userId):
    return [train_itemId for train_itemId, rating in mls_train.ur[userId]]

In [4]:
def GetTopN(predictions, n, minimumRating, criterion):
    topN = defaultdict(list)
    
    for index, row in predictions.iterrows():
        if (row[criterion] >= minimumRating):
            topN[int(row.uid)].append((int(row.iid), row[criterion]))

    for userID, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[int(userID)] = ratings[:n]

    return topN

In [5]:
def precision_recall_at_k(predictions, items_not_in_train, k, threshold=3.5):
    top_n_recoms_est = GetTopN(predictions, n=k, minimumRating=threshold, criterion="est")
    top_n_recoms_real = GetTopN(predictions, n=k, minimumRating=threshold, criterion="r_ui")
    above_threshold = predictions[predictions.r_ui >= threshold]

    precisions = {}
    recalls = {}

    for uid, est_topn in top_n_recoms_est.items():
        # Get items the user has already rated
        already_seen = user_seen_items(uid)
        # Get relevant items for the user
        n_rel_for_user = len(above_threshold[above_threshold.uid == uid])        
        tp = 0
        # Penalize the scores if:
        # - The item we are recommending was never seen in the training set (how could we recommend what we don't know?)
        # - The user has already rated this item: It's not a good recommendation since the user already knows it/has seen it
        for est_itemId, _ in est_topn:
            if(est_itemId in items_not_in_train or est_itemId in already_seen):
                tp += 0
            else:
                for real_itemId, _ in top_n_recoms_real[uid]:
                    if (est_itemId == real_itemId):
                        tp +=1
        
        precisions[uid] = tp/k
        recalls[uid] = tp/n_rel_for_user if n_rel_for_user != 0 else 0

    return precisions, recalls

In [6]:
from sklearn.metrics import ndcg_score
def ndcg_for_rec(predictions, threshold=3.5):
    top_n_recoms_est = GetTopN(predictions, n=predictions.shape[0], minimumRating=threshold, criterion="est")
    top_n_recoms_real = GetTopN(predictions, n=predictions.shape[0], minimumRating=threshold, criterion="r_ui")
    ndcgs = {}

    for uid, real_topn in top_n_recoms_real.items():
        user_gt = [true_rating[0] for true_rating in real_topn]
        est_top_for_current = top_n_recoms_est[uid]
        predicted_items = [est_rating[0] for est_rating in est_top_for_current]
        predicted_scores = [est_rating[1] for est_rating in est_top_for_current]
        gain_scores = np.zeros(len(user_gt)).tolist()

        ndcg_val = 0
        
        if(len(predicted_items) == 0):
            ndcg_val += 0
        elif(len(predicted_items) == 1 or len(user_gt) == 1):
            if(predicted_items[0] == user_gt[0]):
                ndcg_val += 1
        else:
            for i, pred_item in enumerate(predicted_items):
                for j, gt_item in enumerate(user_gt):
                    if(gt_item==pred_item):
                        gain_scores[j] = i
            gain_scores =[int(elem) for elem in gain_scores]
            ndcg_val += ndcg_score(np.asarray([user_gt]), np.asarray([gain_scores]))
            
        #print("GT:", user_gt)
        #print("Pred:", predicted_items)
        #print(gain_scores)
        #print( "\n\n")
        ndcgs[uid] = ndcg_val

    return ndcgs

### Run 5-fold cross validation

In [8]:
""" EXPERIMENT VARIABLES"""
# List that will contain the results of all experiments
results_table = []
kf = KFold(n_splits=5) # define number of k splits for cross validation using Surprise KFold

# Algorithms we will be using in this section
algorithms = {
    "UserKNN": KNNWithMeans(k=60, sim_options={'name': 'cosine', 'shrinkage': 25, 'user_based': True}),
    "ItemKNN": KNNWithMeans(k=40, sim_options={'name': 'cosine', 'shrinkage': 25, 'user_based': False}),
}
# Datasets
datasets = {
    "ML100": movielens_dataset,
    "PDA2018": pda_dataset
}

In [9]:
for dataset in datasets.keys():
    for algorithm in algorithms.keys():
        print("Running 5-fold cross validation with", algorithm, "on", dataset, "dataset ...")
        
        # Run cross validation
        best_prec_5 = 0
        best_prec_10 = 0
        best_rec_5 = 0
        best_rec_10 = 0
        fold_nr = 0
        
        for trainset, testset in kf.split(movielens_dataset):
            print("Fold  number", fold_nr)
            algorithms[algorithm].fit(trainset)
            predictions = algorithms[algorithm].test(testset)
            predictions_df = pd.DataFrame(predictions)
            predictions_df = predictions_df.iloc[:,:-1]
            items_not_in_trainset = find_items_not_in_trainset(trainset, testset)
            print(len(items_not_in_trainset))
            pres_at_5, recalls_at_5 = precision_recall_at_k(predictions_df, items_not_in_trainset, k=5)
            pres_at_10, recalls_at_10 = precision_recall_at_k(predictions_df, items_not_in_trainset, k=10)
            ndcgs = ndcg_for_rec(predictions_df)
            avg_pre_5 = np.array([pres_at_5[k] for k in pres_at_5.keys()]).mean()
            avg_rec_5 = np.array([recalls_at_5[k] for k in recalls_at_5.keys()]).mean()
            avg_pre_10 = np.array([pres_at_10[k] for k in pres_at_10.keys()]).mean()
            avg_rec_10 = np.array([recalls_at_10[k] for k in recalls_at_10.keys()]).mean()
            avg_ndcg = np.array([ndcgs[k] for k in ndcgs.keys()]).mean()
            if (avg_pre_5 > best_prec_5):
                best_prec_5 = avg_pre_5
            if (avg_pre_10 > best_prec_10):
                best_prec_10 = avg_pre_10
            if (avg_rec_5 > best_rec_5):
                best_rec_5 = avg_rec_5
            if (avg_rec_10 > best_rec_10):
                best_rec_10 = avg_rec_10
            fold_nr += 1
            break
        
        new_line = [dataset+"-"+algorithm, best_prec_5, best_prec_10, best_rec_5, best_rec_10, avg_ndcg]
        results_table.append(new_line)
        print()

Running 5-fold cross validation with UserKNN on ML100 dataset ...
Fold  number 0
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
12

Running 5-fold cross validation with ItemKNN on ML100 dataset ...
Fold  number 0
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
6

Running 5-fold cross validation with UserKNN on PDA2018 dataset ...
Fold  number 0
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
13

Running 5-fold cross validation with ItemKNN on PDA2018 dataset ...
Fold  number 0
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
5



# Results

In [127]:
# Display results of running the algorithms
results_table_headers = ["Recommender", "Pre@5", "Pre@10", "Rec@5", "Rec@10", "NDCG"]
print(tabulate(results_table, results_table_headers, tablefmt="pipe"))

| Recommender     |    Pre@5 |   Pre@10 |    Rec@5 |   Rec@10 |     NDCG |
|:----------------|---------:|---------:|---------:|---------:|---------:|
| ML100-UserKNN   | 0.353387 | 0.360161 | 0.320818 | 0.477432 | 0.719624 |
| ML100-ItemKNN   | 0.345903 | 0.348597 | 0.306494 | 0.452833 | 0.733329 |
| PDA2018-UserKNN | 0.345392 | 0.358295 | 0.302015 | 0.459918 | 0.729926 |
| PDA2018-ItemKNN | 0.352242 | 0.354596 | 0.315589 | 0.458352 | 0.732392 |


In [142]:
# Export data
# Export the results to a csv file
# results_df = pd.DataFrame(results_table, columns=["Recommender", "Pre@5", "Pre@10", "Rec@5", "Rec@10", "NDCG"])
# results_df.to_csv("../data/items_recommendation_results.csv")