In [0]:
# imports

import pandas as pd

In [0]:
# size tag of the raw dataset; it becomes part of the predictions CSV file name

raw_data_size = "1M"

In [0]:
# Model whose saved predictions are evaluated; the name becomes part of the
# predictions CSV file name.
# Alternatives listed by the author (swap the value below to use one):
#   'NMF', 'SlopeOne', 'KNNBasic', 'KNNBaseline',
#   'KNNWithMeans', 'KNNWithZScore', 'CoClustering'

algorithm = "SVD"

In [5]:
# load the predictions saved for the selected model and dataset size

PATH_DIR = '/content/drive/My Drive/'

# file name pattern: <algorithm>_<raw_data_size>.csv inside PATH_DIR
predictions_csv = ''.join([PATH_DIR, algorithm, '_', raw_data_size, '.csv'])
df_prediction = pd.read_csv(predictions_csv)
df_prediction

Unnamed: 0,userId,movieId,predicted_rating,true_rating
0,1,1193,4.520033,5
1,1,3408,4.334860,4
2,1,919,4.775469,4
3,1,2797,4.091292,4
4,1,720,4.366757,3
...,...,...,...,...
200011,6040,2791,3.528225,4
200012,6040,3751,3.475510,4
200013,6040,541,4.870984,4
200014,6040,1077,3.759067,5


In [6]:
# all movie ids vs titles

# movies.dat is read as a '::'-delimited file without a header row; only the
# first two fields are kept and named movieId / title.
# A multi-character separator is only supported by pandas' Python parsing
# engine, so it is requested explicitly instead of relying on the automatic
# fallback, which emits a ParserWarning.
# NOTE(review): no encoding is passed, so pandas' default is used - confirm it
# matches how movies.dat was saved.
df_movies = pd.read_csv(
  PATH_DIR + 'movies.dat',
  sep='::',
  engine='python',
  usecols=[0, 1],
  names=['movieId', 'title'],
)

# movieId -> title lookup used when turning recommendations into titles
all_movies_dict = \
  pd.Series(df_movies['title'].values, index=df_movies['movieId']).to_dict()

  


In [0]:
# distinct user ids present in the test predictions, in order of first appearance

all_users = df_prediction['userId'].drop_duplicates().tolist()

In [0]:
# results as per predicted ratings in test dataset

# Work on an explicit copy: adding a column to a slice of df_prediction
# triggers pandas' SettingWithCopyWarning.
df_prediction1 = df_prediction[['userId', 'movieId', 'predicted_rating']].copy()

# Pair every movie with its predicted rating. zip() over the two columns
# replaces the slow row-wise apply(axis=1). It also keeps movieId an integer
# (a row-wise apply upcasts a mixed int/float row to float); the title lookup
# converts ids with int(), so both forms resolve to the same key.
df_prediction1['predicted_movies'] = list(
  zip(df_prediction1['movieId'], df_prediction1['predicted_rating']))

df_prediction2 = df_prediction1[['userId', 'predicted_movies']]

# one row per user holding the list of that user's (movieId, predicted_rating) pairs
df_prediction_formatted = df_prediction2 \
  .groupby('userId')['predicted_movies'].apply(list).reset_index(name='recommendation')

# second name for the same object: the sort below is visible through both names
df_prediction_sorted = df_prediction_formatted

# highest predicted rating first; sorted() is stable, so pairs with equal
# ratings keep their original relative order
df_prediction_sorted['recommendation'] = \
  df_prediction_sorted['recommendation'].apply(
    lambda pairs: sorted(pairs, key=lambda pair: pair[1], reverse=True))

# userId -> list of (movieId, predicted_rating) pairs, best prediction first
sorted_reco_by_userid = pd.Series(
  df_prediction_sorted['recommendation'].values,
  index=df_prediction_sorted['userId']).to_dict()

In [0]:
def get_top_n_recommendations(user, n=10):
  """Return the titles of the first `n` entries recommended for `user`.

  Reads the module-level `sorted_reco_by_userid` (user id -> list of
  (movie id, rating) pairs) and maps each movie id, converted with int(),
  to its title through `all_movies_dict`.

  Raises KeyError if `user` or a movie id is missing from those dicts.
  """
  titles = []
  for movie_id, _rating in sorted_reco_by_userid[user][:n]:
    titles.append(all_movies_dict[int(movie_id)])
  return titles

In [0]:
# results as per true ratings in test dataset

# Work on an explicit copy: adding a column to a slice of df_prediction
# triggers pandas' SettingWithCopyWarning.
df_true1 = df_prediction[['userId', 'movieId', 'true_rating']].copy()

# Pair every movie with its true rating; zip() over the two columns replaces
# the slow row-wise apply(axis=1).
df_true1['movies'] = list(zip(df_true1['movieId'], df_true1['true_rating']))

df_true2 = df_true1[['userId', 'movies']]

# one row per user holding the list of that user's (movieId, true_rating) pairs
df_true_formatted = df_true2 \
  .groupby('userId')['movies'].apply(list).reset_index(name='recommendation')

# second name for the same object: the sort below is visible through both names
df_true_sorted = df_true_formatted

# highest true rating first; sorted() is stable, so pairs with equal ratings
# keep their original relative order
df_true_sorted['recommendation'] = \
  df_true_sorted['recommendation'].apply(
    lambda pairs: sorted(pairs, key=lambda pair: pair[1], reverse=True))

# userId -> list of (movieId, true_rating) pairs, best rated first
sorted_true_ratings_by_userid = pd.Series(
  df_true_sorted['recommendation'].values,
  index=df_true_sorted['userId']).to_dict()

In [0]:
def get_top_n_true_ratings(user, n=10):
  """Return the titles of the first `n` entries in `user`'s true-rating list.

  Reads the module-level `sorted_true_ratings_by_userid` (user id -> list of
  (movie id, rating) pairs) and maps each movie id, converted with int(),
  to its title through `all_movies_dict`.

  Raises KeyError if `user` or a movie id is missing from those dicts.
  """
  titles = []
  for movie_id, _rating in sorted_true_ratings_by_userid[user][:n]:
    titles.append(all_movies_dict[int(movie_id)])
  return titles

In [0]:
# per-user precision and recall of the top-n recommendations
#
# Both title lists come from the same rows of the test set and are cut to the
# same n, so whenever a user's titles are unique the two denominators below
# coincide and precision equals recall for that user.

top_n_recommendations_by_user = {}
user_out = []

for user in all_users:
  # top n recommendations (n = 10 by default), also kept for later inspection
  top_n_reco_for_user = get_top_n_recommendations(user)
  top_n_recommendations_by_user[user] = top_n_reco_for_user

  # top n titles by true rating for the same user
  top_n_true_ratings_for_user = get_top_n_true_ratings(user)

  # compare the two lists as sets of titles
  recommended = set(top_n_reco_for_user)
  relevant = set(top_n_true_ratings_for_user)
  true_positives = recommended & relevant
  false_positives = recommended - relevant
  false_negatives = relevant - recommended

  # precision = TP / (TP + FP), recall = TP / (TP + FN)
  hits = len(true_positives)
  precision_for_user = hits / float(hits + len(false_positives))
  recall_for_user = hits / float(hits + len(false_negatives))

  # one output row per user: [userId, precision, recall]
  user_out.append([user, precision_for_user, recall_for_user])

In [13]:
# per-user metrics collected above, as a table with one row per user

metric_columns = ['userId', 'precision_user', 'recall_user']
df_out = pd.DataFrame(data=user_out, columns=metric_columns)
df_out

Unnamed: 0,userId,precision_user,recall_user
0,1,0.9,0.9
1,2,0.6,0.6
2,3,1.0,1.0
3,4,1.0,1.0
4,5,0.5,0.5
...,...,...,...
6035,6036,0.4,0.4
6036,6037,0.3,0.3
6037,6038,1.0,1.0
6038,6039,0.7,0.7


In [14]:
# titles of the top n recommendations stored for user id 1 (filled in by the per-user loop)

top_n_recommendations_by_user[1]

["Schindler's List (1993)",
 'Wizard of Oz, The (1939)',
 'Star Wars: Episode IV - A New Hope (1977)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'To Kill a Mockingbird (1962)',
 'Apollo 13 (1995)',
 'Wallace & Gromit: The Best of Aardman Animation (1996)',
 'Erin Brockovich (2000)',
 'Big (1988)',
 'Secret Garden, The (1993)']

In [15]:
# EVALUATION

from math import sqrt

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# mae: mean absolute error between true and predicted ratings
mae = mean_absolute_error(df_prediction['true_rating'], df_prediction['predicted_rating'])
print('mae', mae)

# rmse: square root of the mean squared error
rmse = sqrt(mean_squared_error(df_prediction['true_rating'], df_prediction['predicted_rating']))
print('rmse', rmse)

# precision: unweighted mean of the per-user values
precision = df_out['precision_user'].mean()
print('avg precision', precision)

# recall: unweighted mean of the per-user values
recall = df_out['recall_user'].mean()
print('avg recall', recall)

# f measure: harmonic mean of the averaged precision and recall.
# When both averages are exactly 0 the plain formula divides zero by zero
# (NaN plus a RuntimeWarning for the NumPy floats returned by .mean()),
# so report 0.0 in that case. A NaN average still propagates as NaN.
pr_sum = precision + recall
if pr_sum == 0:
  f_measure = 0.0
else:
  f_measure = (2.0 * precision * recall) / pr_sum
print('f measure', f_measure)

mae 0.6843450103638451
rmse 0.872773145208243
avg precision 0.6828807947019897
avg recall 0.6828807947019897
f measure 0.6828807947019897
