# Install data

In [1]:
!wget --no-check-certificate 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
!unzip './ml-latest-small.zip'

--2021-04-19 22:14:06--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2021-04-19 22:14:07 (2.88 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ./ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


# Import packages

In [2]:
import pandas as pd
import numpy as np
from random import shuffle
from tqdm.notebook import tqdm
from collections import Counter
import warnings 
warnings.filterwarnings("ignore")

# Code

In [3]:
ratings_full = pd.read_csv('/content/ml-latest-small/ratings.csv')
ratings_full['date'] = pd.to_datetime(1e9*ratings_full['timestamp']).dt.date
ratings_full = ratings_full[['userId', 'movieId', 'rating', 'date']]

In [4]:
sorted_movies = sorted(ratings_full['movieId'].unique())
map_id_to_index = {movieId:index for index, movieId in enumerate(sorted_movies)}
ratings_full['movie'] = ratings_full['movieId'].apply(lambda movieId : map_id_to_index[movieId])
ratings_full = ratings_full.rename(columns={"movieId": "initial_movieId", "movie": "movieId"})

ratings_full = ratings_full[['userId', 'movieId', 'rating', 'date']]

In [5]:
ratings_full[:10]

Unnamed: 0,userId,movieId,rating,date
0,1,0,4.0,2000-07-30
1,1,2,4.0,2000-07-30
2,1,5,4.0,2000-07-30
3,1,43,5.0,2000-07-30
4,1,46,5.0,2000-07-30
5,1,62,3.0,2000-07-30
6,1,89,5.0,2000-07-30
7,1,97,4.0,2000-07-30
8,1,124,5.0,2000-07-30
9,1,130,5.0,2000-07-30


In [12]:
def recommender(userId, training_data, training, rating_matrix, ratings_full, limit=5, thresh_sim=20):
    mask_movies = [m for m,r in training]
    mask_users = np.array(list(training_data.keys())) - 1 # convert to indices

    ratings_table = pd.DataFrame(rating_matrix[mask_users][:, mask_movies], index=mask_users+1)
    ratings_table['sim'] = ((ratings_table.values > 0)*np.abs(5 - (ratings_table.values - rating_matrix[userId-1][mask_movies]))).sum(axis=1)

    similar_users = list(ratings_table[ratings_table['sim'] >= thresh_sim].index)

    condition_1 = ~ratings_full['movieId'].isin(mask_movies)
    condition_2 = ratings_full['userId'].isin(similar_users)
    possible_recommendations = ratings_full[condition_1 & condition_2]['movieId']
    most_common_recommendations = Counter(possible_recommendations).most_common(limit)
    most_common_recommendations = [m for m,c in most_common_recommendations]

    if len(most_common_recommendations) == limit:
        return most_common_recommendations
    else:
        n_add_movies = limit - len(most_common_recommendations)
        popular_movies = Counter(ratings_full['movieId'].tolist()).most_common(n_add_movies)
        popular_movies = [m for m,c in popular_movies]

        return most_common_recommendations + popular_movies

In [13]:
def load_users_ratings(ratings_full=ratings_full):
    ratings_frame = ratings_full.copy()
    users_ratings = {}
    for _, row in ratings_frame.iterrows():
        if int(row["userId"]) not in users_ratings:
            users_ratings[int(row["userId"])] = {}
        users_ratings[int(row["userId"])][int(row["movieId"])] = row["rating"]
    return users_ratings

In [14]:
def computeAP(relevant, predicted):
    ap = 0.0
    good_predictions = 0.0
    for i, item in enumerate(predicted):
        if item in relevant:
            good_predictions += 1
            ap += 1.0/(i+1) * good_predictions/(i+1)
    return ap

In [15]:
def compute_rating_matrix(ratings_full):
    X = []
    num_movies = len(ratings_full.movieId.unique())
    for userId in sorted(ratings_full.userId.unique()):
        watched_movies = ratings_full[ratings_full['userId'] == userId]['movieId'].tolist()
        user_ratings = ratings_full[ratings_full['userId'] == userId]['rating'].tolist()

        row = np.zeros(num_movies)
        row[watched_movies] = user_ratings
        X.append(row)
    return np.array(X)

In [16]:
def computeMAP(ratings_full=ratings_full, relevant_treshold=3.0, topN=5):
    k_cross = 5
    total_aps = 0.0
    total = 0
    users_ratings = load_users_ratings(ratings_full)
    rating_matrix = compute_rating_matrix(ratings_full)
    training_data = {user: user_ratings for user, user_ratings in users_ratings.items() if user < 0.8*len(users_ratings)}
    test_data = {user: user_ratings for user, user_ratings in users_ratings.items() if user not in training_data}
    # recommender.train(training_data)
    for userId, user_ratings in tqdm(test_data.items()):
        user_items = list(user_ratings.items())
        shuffle(user_items)
        parts = [
        user_items[int(k*(len(user_items)/k_cross)):int((k+1)*(len(user_items)/k_cross)) if k < k_cross-1 else len(user_items)]
            for k in range(k_cross)
        ]
        for i in range(k_cross):
            test, training = parts[i], [rat for part in parts[:i]+parts[i+1:] for rat in part]
            relevant = [movieId for (movieId, rating) in test if rating >= relevant_treshold]
            # user_profile = recommender.create_user_profile(dict(training))
            predicted = recommender(userId, training_data, training, rating_matrix, ratings_full, limit=topN)
            if relevant:
                total_aps += computeAP(relevant, predicted)
                total += 1
    return total_aps/total

In [17]:
map = computeMAP()

print('MAP result :', round(map, 4))

HBox(children=(FloatProgress(value=0.0, max=123.0), HTML(value='')))


MAP result : 0.4229
