In [1]:
import sys

import re
import pandas as pd
from pandas import Series, DataFrame

%pylab inline
import matplotlib.pyplot as plt
from collections import defaultdict

from surprise import SVD
from surprise import NMF
from surprise import SVDpp
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

print(sys.version)

Populating the interactive namespace from numpy and matplotlib
3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 19:07:31) 
[GCC 7.3.0]


In [2]:
def get_top_n(predictions, n=10):


    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est, true_r))
    #uid:[(iid,est),(iid,est)]
    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings#[:n]

    return top_n

In [3]:
def ndcg(y_true, y_pred, k=None, powered=False):
    def dcg(scores, k=None, powered=False):
        if k is None:
            k = scores.shape[0]
        if not powered:
            ret = scores[0]
            for i in range(1, k):
                ret += scores[i] / np.log2(i + 1)
            return ret
        else:
            ret = 0
            for i in range(k):
                ret += (2 ** scores[i] - 1) / np.log2(i + 2)
            return ret
    
    ideal_sorted_scores = np.sort(y_true)[::-1]
    ideal_dcg_score = dcg(ideal_sorted_scores, k=k, powered=powered)
    
    pred_sorted_ind = np.argsort(y_pred)[::-1]
    pred_sorted_scores = y_true[pred_sorted_ind]
    dcg_score = dcg(pred_sorted_scores, k=k, powered=powered)
    
    return dcg_score / ideal_dcg_score

def ndcg1(y_true, y_pred, k=None):
    return ndcg(y_true, y_pred, k=k, powered=False)

def ndcg2(y_true, y_pred, k=None):
    return ndcg(y_true, y_pred, k=k, powered=True)

In [6]:
# Load the training data set
data_frame1 = pd.read_csv("res/sample/train.csv")
reader1 = Reader(rating_scale=(1, data_frame1.shape[0]))
trainset = Dataset.load_from_df(data_frame1[['userId','movieId','rating']], reader1).build_full_trainset()

In [7]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f936cdf5898>

In [8]:
# Load the testing dataset
data_frame = pd.read_csv("res/sample/test.csv")
reader = Reader(rating_scale=(1, data_frame.shape[0]))
testset = Dataset.load_from_df(data_frame[['userId','movieId','rating']], reader)

In [15]:
# Get the predictions
predictions = algo.test(testset.build_full_trainset().build_testset())

In [16]:
# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.7874


0.7874242509812788

In [17]:
# Then compute MAE
accuracy.mae(predictions)

MAE:  0.5969


0.5968981805741898

In [19]:
# Then get top 10 recommendations for each user from the test data predictions
top_n = get_top_n(predictions, n=10)

#print(top_n)

users_est = defaultdict(list)
users_true=defaultdict(list)

for uid, user_ratings in top_n.items():
    users_est[uid].append([est for (_, est,_) in user_ratings])
    users_true[uid].append([true_r for (_,_,true_r) in user_ratings])


#print (users_true)
#print (users_est)

In [20]:
# Then compute NDCG
ndcg_list=[]
for uid in top_n:
    
    for i in users_true[uid]:
        y_true=np.asarray(i)#.reshape(-1,1)
    for i in users_est[uid]:
        y_pred=np.asarray(i)#.reshape(-1,1)
        ndcg_list.append(ndcg1(y_true, y_pred, k=None))
      

ndcg_list = [i for i in ndcg_list if str(i) != 'nan']
mean(ndcg_list)

0.9718838203880806

In [30]:
# Then print the predictions to csv file
pred_df = pd.DataFrame(predictions)
pred_df.rename(columns={'uid': 'UserID', 'iid': 'MovieID', 'r_ui': 'TrueRating', 'est': 'PredictedRating'}, inplace=True)
pred_df.head()
pred_df = pred_df[['UserID','MovieID', 'TrueRating','PredictedRating']]
pred_df.to_csv("svd_predictions.csv", index=False, header = True)