In [None]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.append('../')

import os
from abc import ABC
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import scipy.special
import seaborn as sns
import sklearn
import sklearn.metrics
import sklearn.model_selection
import sklearn.neighbors
from tqdm.auto import tqdm

from src.models.recommender import RecommenderSystem
from src.util.plot import Plot
# Register tqdm's `progress_apply` on pandas objects (used by later cells).
tqdm.pandas()


# Plotting helper imported from src.util.plot.
plot = Plot()

# Input data sets and model output, as relative paths one level above
# the working directory.
RATINGS_PATH = Path('..') / 'data' / 'ratings_small.csv'
MOVIES_PATH = Path('..') / 'data' / 'movies_metadata.csv'
OUTPUT_PATH = Path('..') / 'models' / 'cb_knn.pickle'

# data loading and overview

In [None]:
movies_metadata = pd.read_csv(MOVIES_PATH)
# A few records carry a non-numeric "id". Coerce those to NaN and then
# actually drop them, so "id" becomes a clean integer key that can be
# matched against the integer "movieId" column of the ratings.
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')
movies_metadata = (
    movies_metadata
    .dropna(subset=['id'])
    .astype({'id': 'int64'})
)

ratings = pd.read_csv(RATINGS_PATH)

In [None]:
# Preview the first rows of the movie metadata table.
movies_metadata.head()

In [None]:
# Preview a few ratings joined with the title of the rated movie.
pd.merge(
    ratings,
    movies_metadata[['id', 'title']],
    left_on='movieId',
    right_on='id',
).head()

# user-user collaborative KNN
(Item-item KNN is not used here: there is not enough data for it.)

In [None]:
# Hold out 1000 ratings for evaluation; the fixed random_state makes the
# split reproducible across runs.
train_ratings, test_ratings = sklearn.model_selection.train_test_split(
    ratings, test_size=1000, random_state=42,
)

In [None]:
# Preview the first rows of the training split.
train_ratings.head()

In [None]:
class KNNrecommender(ABC):
    """
    Nearest-neighbour collaborative-filtering recommender.

    The ratings table is turned into a sparse matrix with one row per
    ``row_name`` id and one column per ``col_name`` id; the raw ids are used
    directly as matrix indices, so they must be non-negative integers.
    A ``sklearn.neighbors.NearestNeighbors`` index is fitted on the rows.

    Parameters
    ----------
    data
        Table (e.g. a pandas DataFrame) indexable by column name whose
        columns expose ``.to_list()``.
    value_name : str
        Column holding the rating values.
    row_name : str
        Column whose ids become matrix rows ('userId' gives user-user
        filtering, anything else is treated as item-item filtering).
    col_name : str
        Column whose ids become matrix columns.
    metric : str
        Distance metric passed to ``NearestNeighbors``.
    """

    # Number of neighbours requested when ranking movies in `predict`.
    # The queried row itself is normally among them, so one fewer
    # neighbour actually contributes recommendations.
    N_NEIGHBORS = 5

    def __init__(self, data, value_name='rating', row_name='userId',
                 col_name='movieId', metric='euclidean',):
        super().__init__()
        self.row_name = row_name
        values = data[value_name].to_list()
        row, column = data[row_name].to_list(), data[col_name].to_list()

        # Assemble in COO form, then convert once to CSR: the prediction
        # methods slice single rows repeatedly, which CSR supports directly.
        self.sparse_ratings = scipy.sparse.coo_matrix(
            (values, (row, column))
            ).tocsr()
        self.knn = sklearn.neighbors.NearestNeighbors(
            n_neighbors=self.N_NEIGHBORS, metric=metric)
        self.knn.fit(self.sparse_ratings)

    def predict(self, user_id: int, top_k: int = 5) -> np.ndarray:
        """
        Predicts ranking of movies to watch for a user.

        For each nearest neighbour, the neighbour's ``top_k`` best rated
        movies that the user has not rated yet are collected.

        Parameters
        ----------
        user_id : int
            User's id from the data set.
        top_k : int, optional
            Maximum number of movies taken from each neighbour (default 5).

        Returns
        -------
        numpy.ndarray
            Movie ids, grouped by neighbour (closest neighbour first) and,
            within a neighbour, best rated first. The same movie may appear
            more than once if several neighbours recommend it.

        Raises
        ------
        IndexError
            If ``user_id`` lies outside the fitted rating matrix.
        """
        # ids coming from DataFrame rows are often floats; sparse indexing
        # needs exact integers
        user_id = int(user_id)
        n_rows = self.sparse_ratings.shape[0]
        feature_vector = self.sparse_ratings.getrow(user_id)
        seen = feature_vector.toarray()[0] != 0

        # never ask for more neighbours than there are fitted rows
        _, indexes = self.knn.kneighbors(
            feature_vector, min(self.N_NEIGHBORS, n_rows)
            )
        # drop the queried user itself (it is not guaranteed to come first
        # when another row is at distance 0 as well)
        neighbours = [i for i in indexes[0] if i != user_id]
        neighbours = neighbours[:self.N_NEIGHBORS - 1]

        recommended = []
        for i in neighbours:
            neighbour = self.sparse_ratings.getrow(i).toarray()[0]
            # movies rated by the neighbour but not yet by the user
            cols_to_consider = np.flatnonzero(~seen & (neighbour != 0))
            # descending order of the neighbour's ratings, keep the best ones
            best_first = np.argsort(neighbour[cols_to_consider])[::-1]
            recommended.append(cols_to_consider[best_first[:top_k]])

        if not recommended:
            return np.array([], dtype=int)
        return np.concatenate(recommended)

    def predict_score(self, user_id: int, movie_id: int,
                      n_neighbors: int = 100) -> float:
        """
        Predicts score for a given movie that a user would give.

        The score is the average of the ratings given to the movie by the
        nearest neighbours, weighted by a softmax over negative distances.

        Parameters
        ----------
        user_id : int
            User's id from the data set.
        movie_id : int
            Movie's id from the data set.
        n_neighbors : int, optional
            Number of nearest neighbours to consult (default 100); capped
            at the number of fitted rows.

        Returns
        -------
        float
            Weighted average of the neighbours' ratings, or ``nan`` when
            the user or movie is unknown to the training data or none of
            the neighbours rated the movie.
        """
        user_id, movie_id = int(user_id), int(movie_id)
        # the 2 lines below make sure that this method works
        # for both user-user and item-item filtering
        j = movie_id if (self.row_name == 'userId') else user_id
        row = user_id if (self.row_name == 'userId') else movie_id

        n_rows, n_cols = self.sparse_ratings.shape
        # ids outside the matrix never occurred in the training set
        if not (0 <= row < n_rows and 0 <= j < n_cols):
            return np.nan
        # early stop if the movie was not present in training set
        ratings_column = self.sparse_ratings.getcol(j).toarray()[:, 0]
        if ratings_column.sum() == 0:
            return np.nan

        # search for nearest neighbours in training set
        distances, indexes = self.knn.kneighbors(
            self.sparse_ratings.getrow(row), min(n_neighbors, n_rows)
            )
        # ratings given to the movie by each neighbour (0 means "not rated")
        users_alike_ratings = ratings_column[indexes[0]]
        rated = users_alike_ratings != 0
        if not rated.any():
            return np.nan

        # create weights based on calculated distance: closer neighbours
        # get larger weights and the weights sum to 1
        weights = scipy.special.softmax(-distances[0][rated])
        # calculate weighted rating for a movie
        predicted_rating = (weights * users_alike_ratings[rated]).sum()
        return predicted_rating

In [None]:
# Fit the user-user recommender on the training split using Euclidean distance.
user_user_RS = KNNrecommender(
    data=train_ratings,
    value_name='rating',
    row_name='userId',
    col_name='movieId',
    metric='euclidean',
)

In [None]:
# Example: recommended movie ids for the user with id 1.
user_user_RS.predict(1)

In [None]:
# Example: predicted rating of movie 1028 for user 128.
user_user_RS.predict_score(user_id=128, movie_id=1028)

In [None]:
# Display the held-out ratings used for evaluation.
test_ratings

In [None]:
# Smoke test: predicted ratings for the first five held-out rows.
test_ratings.head(5).progress_apply(
    lambda record: user_user_RS.predict_score(record.userId, record.movieId),
    axis=1,
)

In [None]:
# Work on an explicit copy: the frame returned by `train_test_split` may be
# flagged by pandas as a copy of `ratings`, and adding columns to it can
# then trigger SettingWithCopyWarning.
test_ratings = test_ratings.copy()

# Fit one recommender per distance metric and store its predictions for
# every held-out rating in a `preds_<metric>` column.
for metric in ['euclidean', 'cosine', 'manhattan']:
    user_user_RS = KNNrecommender(
        data=train_ratings,
        value_name='rating',
        row_name='userId',
        col_name='movieId',
        metric=metric,
    )
    # A row-wise apply upcasts the ids to float (the row also holds the
    # float rating), so convert them back to integers before indexing.
    test_ratings['preds_' + metric] = test_ratings.progress_apply(
        lambda x: user_user_RS.predict_score(int(x.userId), int(x.movieId)),
        axis=1,
    )

In [None]:
# For every metric: RMSE over the rows that received a prediction, plus a
# scatter plot of predicted against true ratings.
for metric in ['euclidean', 'cosine', 'manhattan']:
    pred_col = 'preds_' + metric
    # rows without a prediction (NaN) are excluded from the error
    tmp_ratings = test_ratings[['rating', pred_col]].dropna()
    squared_error = (tmp_ratings['rating'] - tmp_ratings[pred_col]).pow(2)
    RMSE = squared_error.mean()**0.5
    title = (
        'correlation of predicted and ground truth ratings\n'
        f'based on {metric} distance, RMSE={RMSE:1.2f}'
    )
    sns.relplot(data=test_ratings, x='rating', y=pred_col).set(title=title)

In [None]:
# Save the evaluation table under the project's data directory, using a
# relative path like the rest of the notebook. The previous absolute
# `/content/drive/...` (Colab-mounted Google Drive) target does not exist
# on other machines, so the export failed there.
results_path = Path('../data/KNNresults.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
test_ratings.to_csv(results_path, index=False)

In [None]:
import pickle

In [None]:
# Retrain one recommender per distance metric and persist each of them.
# NOTE: pickle stores the class by reference, so `KNNrecommender` has to be
# defined under the same name when a file is read back with `pickle.load`.
models_dir = Path('../models')
# make sure the target directory exists before writing into it
models_dir.mkdir(parents=True, exist_ok=True)

for metric in ['euclidean', 'cosine', 'manhattan']:
    user_user_RS = KNNrecommender(
        data=train_ratings,
        value_name='rating',
        row_name='userId',
        col_name='movieId',
        metric=metric,
    )

    with open(models_dir / f'KNN_{metric}_metric.pickle', 'wb') as handle:
        pickle.dump(user_user_RS, handle, protocol=pickle.HIGHEST_PROTOCOL)