In [None]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.append('../')

import os
from abc import ABC
from typing import List, Tuple

import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import seaborn as sns
import sklearn
import sklearn.metrics
import sklearn.model_selection
import sklearn.neighbors
from tqdm.auto import tqdm
from src.models.recommender import RecommenderSystem
from src.util.plot import Plot
from pathlib import Path
tqdm.pandas()


plot = Plot()

RATINGS_PATH = Path('../data/ratings_small.csv')
MOVIES_PATH = Path('../data/movies_metadata.csv')
OUTPUT_PATH = Path('../models/cb_knn.pickle')

# data loading and overview

In [None]:
movies_metadata = pd.read_csv(MOVIES_PATH)
# there are 3 bad records in columm "id", removing them:
movies_metadata.id = pd.to_numeric(movies_metadata.id, errors='coerce')

ratings = pd.read_csv(RATINGS_PATH)

In [None]:
movies_metadata.head()

In [None]:
# some dataset records + titles
ratings.merge(movies_metadata[['id', 'title']], left_on='movieId', right_on='id').head()

# user-user collaborative KNN
(not enough data for item-item KNN)

In [42]:
train_ratings, test_ratings = sklearn.model_selection.train_test_split(ratings,
        test_size=1000,
        random_state=42
        )

In [43]:
train_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
24079,173,107,3.0,875357448
22729,159,1911,1.0,1183518395
21590,150,1784,2.5,1114308739
41186,295,1517,4.0,1100129122
10926,73,4713,4.5,1409205919


In [66]:
class KNNrecommender(ABC):
    def __init__(self, data, value_name='rating', row_name='userId',
                 col_name='movieId', metric='euclidean',):
        super(KNNrecommender, self).__init__()
        self.row_name = row_name
        values = data[value_name].to_list()
        row, column = data[row_name].to_list(), data[col_name].to_list()
        
        self.column_mapping_true_to_internal = {k:v for v,k in enumerate(list(set(column)))}
        self.column_mapping = [i for i, k in enumerate(self.column_mapping_true_to_internal)]
        
        column = [self.column_mapping_true_to_internal[i] for i in column]

        self.sparse_ratings = scipy.sparse.coo_matrix((values, (row, column)))
        self.knn = sklearn.neighbors.NearestNeighbors(n_neighbors=5,
                                                      metric=metric)
        self.knn.fit(self.sparse_ratings)

    def predict(self, user_id: int) -> List[int]:
        """
        Predicts ranking of movies to watch for a user.

        Parameters
        ----------
        user_id : int
            User's id from the data set.

        Returns
        -------
        List[int]
            List of movies ids. Best recommendations first.
        """
        feature_vector = self.sparse_ratings.getrow(user_id)
        distances, indexes = self.knn.kneighbors(feature_vector, 5)
        recommended = []

        for i in indexes[0][1:]:
            K = 5
            neighbour = self.sparse_ratings.getrow(i).toarray()[0]
            # search for 5 best rated movies by neighbout among unseen ones
            cols_to_consider = (
                (feature_vector.toarray() == 0) & (neighbour != 0)
                )[0]
            cols_to_consider = np.arange(
                len(cols_to_consider)
                )[cols_to_consider]
            # get the indexes of the best guesses
            Kbest_from_cols_to_consider = np.argsort(
                neighbour[cols_to_consider]
                )[::-1][-K:]
            Kbest_idxs_from_neighbour = \
                cols_to_consider[Kbest_from_cols_to_consider]

            recommended.append(Kbest_idxs_from_neighbour)
        mapped_ids = [self.column_mapping[i] for i in np.concatenate(recommended)]
        return np.array(mapped_ids)

    def predict_score(self, user_id: int, movie_id: int) -> float:
        """
        Predicts score for a given movie that a user would give.

        Parameters
        ----------
        user_id : int
            User's id from the data set.
        movie_id : int
            Movie's id from the data set.

        Returns
        -------
        float
            Predicted movie's score in range [0.5, 5]
        """
        # the 2 lines below make sure that this method works
        # for both user-user and item-item filtering
        j = movie_id if (self.row_name == 'userId') else user_id
        j = self.column_mapping_true_to_internal[j]
        row = user_id if (self.row_name == 'userId') else movie_id
        # early stop if the movie was not present in training set
        if self.sparse_ratings.getcol(j).toarray().sum() == 0:
            return np.nan
        # search for nearest neighbours in training set
        distances, indexes = self.knn.kneighbors(
            self.sparse_ratings.getrow(row), 100
            )
        # make a list of "movie_id" movie ratings abong neighbours
        users_alike_ratings = np.stack(
            [self.sparse_ratings.getrow(i).getcol(j).toarray()[0, 0]
             for i in indexes[0]
             ])
        if sum(users_alike_ratings != 0) == 0:
            return np.nan

        # create weights based on calculated distance
        weights = scipy.special.softmax(
            -distances[0][users_alike_ratings != 0]
            )
        # calculate weighted rating for a movie
        predicted_rating = \
            (weights * users_alike_ratings[users_alike_ratings != 0]).sum()
        return predicted_rating

    def predict_scores(self, user_id: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        Predicts scores for all the movies, that a user would give.
        Parameters
        ----------
        user_id : int
            User's id from the data set.
        Returns
        -------
        Tuple[np.ndarray, np.ndarray]:
            Ranked movies with their scores.
        """
        #j = movie_id if (self.row_name == 'userId') else user_id
        row = user_id #if (self.row_name == 'userId') else movie_id

        # if self.sparse_ratings.getcol(j).toarray().sum() == 0:
        #     return np.nan

        distances, indexes = self.knn.kneighbors(
            self.sparse_ratings.getrow(row), 100
            )

        
        predicted_ratings = []
        
        for j in range(self.sparse_ratings.shape[1]):
            users_alike_ratings = np.stack(
                [self.sparse_ratings.getrow(i).getcol(j).toarray()[0, 0]
                 for i in indexes[0]
                 ])
            if sum(users_alike_ratings != 0) == 0:
                predicted_ratings.append(np.nan)
                continue

            # create weights based on calculated distance
            weights = scipy.special.softmax(
                -distances[0][users_alike_ratings != 0]
                )
            # calculate weighted rating for a movie
            predicted_rating = \
                (weights * users_alike_ratings[users_alike_ratings != 0]).sum()
            predicted_ratings.append(predicted_rating)
        predicted_ratings = np.array(predicted_ratings)
        # print(predicted_ratings)
        
        movies = np.arange(self.sparse_ratings.shape[1])

        ranking = pd.DataFrame(
            zip(movies, predicted_ratings),
            columns=['movie', 'rating']
        )

        ranking = ranking.sort_values(
            by='rating',
            ascending=False
        )

        movies = ranking['movie'].values
        ratings = ranking['rating'].values
        
        mapped_ids = [self.column_mapping[i] for i in movies]

        return mapped_ids, ratings

In [67]:
user_user_RS = KNNrecommender(data=train_ratings, value_name='rating', row_name='userId', col_name='movieId', metric='euclidean')

In [68]:
user_user_RS.predict(1)

array([ 909,  808,  658,  263, 1374, 1812, 1628, 1481,   86,   63,  931,
        808,  216, 1124, 1047])

In [69]:
user_user_RS.predict_scores(123)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9029.0), HTML(value='')))




KeyboardInterrupt: 

In [18]:
m, r =user_user_RS.predict_scores(128)
m.shape, r.shape

TypeError: cannot unpack non-iterable float object

In [None]:
test_ratings

In [None]:
test_ratings[:5].progress_apply(lambda x: user_user_RS.predict_score(x.userId, x.movieId), axis=1)

In [None]:
for metric in ['euclidean', 'cosine', 'manhattan']:
    user_user_RS = KNNrecommender(data=train_ratings, value_name='rating', row_name='userId', col_name='movieId', metric=metric)
    test_ratings['preds_'+metric] = test_ratings.progress_apply(lambda x: user_user_RS.predict_score(x.userId, x.movieId), axis=1)

In [None]:
for metric in ['euclidean', 'cosine', 'manhattan']:
    tmp_ratings = test_ratings[['rating', 'preds_'+metric]].dropna()
    RMSE = (tmp_ratings['rating'] - tmp_ratings['preds_'+metric]).pow(2).mean()**0.5
    sns.relplot(data=test_ratings, x='rating', y='preds_'+metric).set(title=f'correlation of predicted and ground truth ratings\nbased on {metric} distance, RMSE={RMSE:1.2f}')

In [None]:
test_ratings.to_csv('/content/drive/MyDrive/DANsem3/SR/KNNresults.csv', index=False)

In [None]:
import pickle

In [None]:
for metric in ['euclidean', 'cosine', 'manhattan']:
    user_user_RS = KNNrecommender(data=train_ratings, value_name='rating', row_name='userId', col_name='movieId', metric=metric)
    
    with open(f'../models/KNN_{metric}_metric.pickle', 'wb') as handle:
        pickle.dump(user_user_RS, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # with open(f'models/KNN_{metric}_metric.pickle', 'rb') as handle:
    #     b = pickle.load(handle)

## Tests

In [None]:
model = KNNrecommender(data=train_ratings, value_name='rating', row_name='userId', col_name='movieId', metric='cosine')

In [None]:
_, scores = model.predict_scores(2)
plot.histogram(
    x=scores,
    title=f'Recommended movie ratings for user_id: {2}',
    xaxis_title='Rating',
    yaxis_title='Count'
)

In [None]:
jaccard_ranking = get_jaccard_ranking(model, test_ratings)
plot.histogram(
    x=jaccard_ranking,
    title='Jaccard index per each user ranking (liked_movie >= 3.5)',
    xaxis_title='Jaccard index',
    yaxis_title='Count'
)

In [None]:
classification_metrics = get_classification_ranking_metrics(model, test_ratings)
plot.bar(
    classification_metrics,
    title='Classification metrics',
    xaxis_title='Metrics',
    yaxis_title='Score'
)

In [None]:
r2_score = get_r2_score(model, test_ratings)
print(f'\n\nr2 score: {r2_score:.3f}')