In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import accuracy
from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise.model_selection import KFold
from surprise import CoClustering
from collections import deque
import random
import pickle

In [None]:
# Read the headerless movie catalogue with explicit column names, then index it by 'Id'.
movie_titles = pd.read_csv(
    'movie_titles.csv',
    header=None,
    names=['Id', 'Year', 'Name'],
    encoding='ISO-8859-1',
)
movie_titles = movie_titles.set_index('Id')

In [None]:
# Parse 'probe.txt' into one long frame with columns Cust_Id, Rating, Movie.
# Lines whose 'Rating' is missing are treated as movie markers of the form "<movie id>:";
# every row between two markers is attributed to the preceding marker's movie.
probe_rows = pd.read_csv(
    'probe.txt',
    header=None,
    names=['Cust_Id', 'Rating'],
    usecols=[0, 1],
    low_memory=False,
)
probe_rows.index = np.arange(0, len(probe_rows))
# NOTE(review): the first row of the file is discarded here — presumably a header or
# junk line; confirm against the actual file, otherwise the first movie marker is lost.
probe_rows = probe_rows[1:len(probe_rows)]

# (row label, movie id) for every marker row; the trailing ':' is stripped from the id.
marker_rows = probe_rows.loc[probe_rows['Rating'].isna(), 'Cust_Id'].reset_index()
markers = [[row_label, int(marker[:-1])] for row_label, marker in marker_rows.values]

# Pair each marker with the one that follows it; the last marker wraps around to the first.
following = markers[1:] + markers[:1]

per_movie_frames = []
for (start_label, movie_id), (next_label, _next_movie) in zip(markers, following):
    # The wrap-around pair (last marker in the file) has no upper bound: slice to the end.
    stop_label = next_label - 1 if start_label < next_label else None
    per_movie_frames.append(
        probe_rows.loc[start_label + 1:stop_label].assign(Movie=movie_id)
    )

rating = pd.concat(per_movie_frames)
# Drop the intermediates so only `rating` stays in the kernel namespace.
del per_movie_frames, probe_rows, marker_rows, markers, following
del start_label, movie_id, next_label, _next_movie, stop_label
print('Shape Cust_Id-Ratings:\t{}'.format(rating.shape))
rating.sample(5)

In [None]:
# Keep the three rating columns under the names used downstream.
ratings = rating[['Cust_Id', 'Movie', 'Rating']].rename(
    columns={'Cust_Id': 'userId', 'Movie': 'movieId', 'Rating': 'rating'}
)

# Rebuild a fresh frame (new default index) with columns itemID, userID, rating.
ratings_dict = {
    target: list(ratings[source])
    for target, source in (
        ('itemID', 'movieId'),
        ('userID', 'userId'),
        ('rating', 'rating'),
    )
}

df = pd.DataFrame(ratings_dict)
df.shape

In [None]:
# Count how many rows of `df` belong to each user ID.
customers = df['userID']
ratings_count = {}
for customer in customers:
    ratings_count[customer] = ratings_count.get(customer, 0) + 1

In [None]:
# Distribution of the number of ratings per user.
# The dict view is materialised into a list so seaborn receives a plain sequence,
# and the figure is labelled so it can be read on its own.
fig, ax = plt.subplots()
sns.histplot(list(ratings_count.values()), ax=ax)
ax.set(
    title='Ratings per user',
    xlabel='Number of ratings by a user',
    ylabel='Number of users',
);

In [None]:
def precision_recall_at_k(model, k=30, threshold=1.5):
    """Evaluate ``model`` on the global ``testset`` and print its metrics.

    Calls ``model.test(testset)``, passes the predictions to ``accuracy.rmse`` and
    ``accuracy.mae``, then prints user-averaged precision@k, recall@k and the F1
    score derived from the two rounded averages.

    Parameters
    ----------
    model : object exposing ``test(testset)`` that returns an iterable of
        ``(uid, iid, true_rating, estimated_rating, details)`` tuples.
    k : int
        Number of highest-estimated items per user counted as "recommended".
    threshold : float
        A rating (true or estimated) at or above this value counts as relevant.

    Returns
    -------
    None. Results are printed.
    """
    predictions = model.test(testset)

    # Group (estimated, true) rating pairs by user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Highest estimated ratings first, so the first k entries are the recommendations.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in top_k)

        # Previously this value was computed but never stored, leaving `precisions`
        # empty and making the average below divide by zero.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    # Average over users; fall back to 0.0 when there are no users at all.
    precision = round(sum(precisions.values()) / len(precisions), 3) if precisions else 0.0
    recall = round(sum(recalls.values()) / len(recalls), 3) if recalls else 0.0

    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    print('Precision: ', precision)
    print('Recall: ', recall)

    # F1 is undefined when both precision and recall are 0; report 0.0 instead of crashing.
    if precision + recall != 0:
        f1_score = round((2 * precision * recall) / (precision + recall), 3)
    else:
        f1_score = 0.0
    print('F_1 score: ', f1_score)

In [None]:
# Build the evaluation set from the probe ratings.
# `Dataset.load_from_df` returns a Dataset object, which `model.test()` cannot iterate;
# `test()` expects a list of (user, item, rating) tuples, so the full data is turned
# into a trainset and then converted back into a testset list.
# NOTE(review): rating_scale=(0, 5) is kept as-is; confirm it matches the scale the
# pickled models were trained with, and that the user/item ID types here match the raw
# IDs used at training time (otherwise every user/item looks unknown to the models).
reader = Reader(rating_scale = (0,5))
probe_data = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)
testset = probe_data.build_full_trainset().build_testset()

In [None]:
# Load the pickled model stored in 'user_user_model.pkl'.
# The closing quote was a typographic ’ (a SyntaxError); the file is now also closed
# deterministically via the context manager.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
filename = 'user_user_model.pkl'
with open(filename, 'rb') as model_file:
    sim_user_user = pickle.load(model_file)

In [None]:
# precision_recall_at_k runs model.test(testset) itself, so the separate
# sim_user_user.test(testset) call (whose result was discarded) only doubled the
# prediction work and has been removed.
precision_recall_at_k(sim_user_user)

In [None]:
# Load the pickled model stored in 'svd_model.pkl'.
# The closing quote was a typographic ’ (a SyntaxError); the file is now also closed
# deterministically via the context manager.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
filename = 'svd_model.pkl'
with open(filename, 'rb') as model_file:
    svd = pickle.load(model_file)

In [None]:
# precision_recall_at_k runs model.test(testset) itself, so the separate
# svd.test(testset) call (whose result was discarded) only doubled the
# prediction work and has been removed.
precision_recall_at_k(svd)

In [None]:
# Load the pickled model stored in 'cocluster_model.pkl'.
# The closing quote was a typographic ’ (a SyntaxError); the file is now also closed
# deterministically via the context manager.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
filename = 'cocluster_model.pkl'
with open(filename, 'rb') as model_file:
    CoCluster = pickle.load(model_file)

In [None]:
# precision_recall_at_k runs model.test(testset) itself, so the separate
# CoCluster.test(testset) call (whose result was discarded) only doubled the
# prediction work and has been removed.
precision_recall_at_k(CoCluster)