In [26]:
# imports
import pandas as pd
import numpy as np
from IPython.core.debugger import set_trace
import torch
import matplotlib.pyplot as plt
import subprocess
from abc import ABC, abstractmethod
from typing import Iterable
from functools import reduce

# Export notebook as python script to the ../python-code - folder

In [8]:
subprocess.run("jupyter nbcbonvert --output-dir='../python-code' --to python User_Based_Neighborhood.ipynb --TemplateExporter.exclude_markdown=True --TemplateExporter.exclude_input_prompt=True", shell=True)

CompletedProcess(args="jupyter nbcbonvert --output-dir='../python-code' --to python User_Based_Neighborhood.ipynb --TemplateExporter.exclude_markdown=True --TemplateExporter.exclude_input_prompt=True", returncode=1)

In [25]:
class Neighborhood_Model(ABC):
    """
    Abstract base class for all neighborhood based models. The 'predict', and the 'compute_similarity' - functions need to be implemented by inheriting classes.
    """
    
    def __init__(self, rmh):
        """
        Params:
            rmh (Rating_Matrix_Handler): A Rating_Matrix_Handler object that provides the relevant rating matrices as well as test indices.
        """
        super().__init__()
        self.rmh_ = rmh
    
    def build_lookups(self) -> None:
        """
        Map users and items to numerical values for further indexing.
        """
        self.userid_lookup_ = {username: i+1 for i, username in enumerate(self.train_rating_matrix["username"])}
        self.itemid_lookup_ = {item: i+1 for i, item in enumerate(list(self.train_rating_matrix.columns))}
        # Reverse the two calculates mappings for bidirectional lookup
        self.username_lookup = {user_id: username for username, user_id in self.userid_lookup_}
        self.itemname_lookup = {item_id: itemname for itemname, item_id in self.itemid_lookup_}
        
    def calculate_items_rated_by_user(self) -> None:
        """
        Calculate a dictionary containing usernames as keys and a numpy array of rated items as key.
        """
        self.items_rated_by_user = {}
        users = set(self.rmh.final_rating_matrix_w_usernames["username"])
        for u in users:
            # Calculate the item-indices that are non-na for each user
            self.items_rated_by_user[u] =  np.argwhere(~pd.isna(self.rmh.final_rating_matrix_w_usernames[self.rmh.final_rating_matrix_w_usernames["username"] == u].values))[:,0] 
        for rated_items in self.calculate_items_rated_by_user.values():
            # Delete the first entry, as its the username which will not be used for similarity computation
            rated_items = rated_items[1:]
    
    def compute_mean_ratings(self, for_users:bool=True) -> None:
        """
        Compute the mean rating for users/items depending on the for_users - flag as a dictionary with users/items as key and the average rating as value.

        Params:
            for_users (bool, optional): If set to True, calculate user-rating means. If set to False, calculate item-rating means. Defaults to True.
        """
        self.mean_ratings = {}
        
        if for_users:
            for user in self.calculate_items_rated_by_user.keys():
                self.mean_ratings[user] = np.nanmean(self.rmh.final_rating_matrix_w_usernames[self.final_rating_matrix_w_usernames["username"] == user].values[0][1:])
        else:
            # Exclude the username column
            for item in self.train_rating_matrix.columns[1:]:
                self.mean_ratings[item] = np.nanmean(self.train_rating_matrix[item].values)
    
    def compute_mutual_objects(self, iterable1:Iterable, iterable2:Iterable) -> set:
        """
        Computes the mutual objects of two iterables.

        Args:
            iterable1 (Iterable): First iterable object.
            iterable2 (Iterable): Second iterable object.

        Returns:
            set: The mutual objects of the first and second iterable object.
        """
        return set(iterable1).intersection(set(iterable2))

    @abstractmethod
    def predict(self, user:str, item:str) -> int:
        pass
    
    @abstractmethod
    def compute_similarity(self, object1, object2) -> float:
        """
        Compute the similarity between the ratings of both objects. As the similarity is only computed over mutual ratings of both objects, the dimension of both rating vectors must be equal.

        Args:
            ratings1 (np.array): First object.
            ratings2 (np.array): Second object.

        Returns:
            float: Similarity score of both rating vectors.
        """
        pass
    
    @abstractmethod
    def compute_similarity_matrix(self) -> np.array:
        pass

    def evaluate(self, task:str="Conviction") -> float:
        """
        Evaluate the performance of the model on the test dataset for a specific task.

        Args:
            task (str, optional): The task that the model should be evaluated on. Can either be "Conviction" (columns with ratings 0 & 1) or "Weight" (columns with rating [0-6]). Defaults to "Conviction".

        Returns:
            float: RMSE if task is "Weight" and mean accuracy if task is "Conviction".
        """
        if task=="Conviction":
            test_eval_indices = np.array([idx for idx in self.rmh.test_eval_indices if idx[1] % 2 == 1])
        else:
            test_eval_indices = np.array([idx for idx in self.rmh.test_eval_indices if idx[1] % 2 == 1])

In [None]:
class User_Neighborhood_Pearson_Centered(Neighborhood_Model):
    """
    A user - based neighborhood model that takes into account rating bias by centering the raw data for each user and applying the Pearson Correlation Coefficient for predicting the similarity of user-pairs. 
    """
    def predict(self, user, item):
        super().predict(user, item)
    
    def compute_similarity(self, user1:str, user2:str) -> float:
        """
        Compute the Pearson Correlation Coefficient for the two rating vectors.

        Args:
            user1 (str): First user.
            user2 (str): Second user.

        Returns:
            float: Pearson Correlation Coefficient of both rating vectors.
        """

        # Get rated items of both users
        rated_items1 = self.items_rated_by_user[user1]
        rated_items2 = self.items_rated_by_user[user2]
     
        # Get mutual rated items
        mutual_rated_items = self.compute_mutual_objects(rated_items1, rated_items2)
        # If there are no mutual rated items, return 0 for the Pearson Correlation Coefficient
        if len(mutual_rated_items) == 0:
            return 0.0
        
        # Get mean rating of both users
        mean_rating1 = self.mean_ratings[user1]
        mean_rating2 = self.mean_ratings[user2]
        
        # Variable holding the difference between actual rating and mean rating for both users, as this value needs to be calculated multiple times
        diffs = []
        for i, item in enumerate(mutual_rated_items):
           r_u1 = self.rmh.final_rating_matrix_w_usernames[self.rmh.final_rating_matrix_w_usernames["username"] == user1][item]
           r_u2 = self.rmh.final_rating_matrix_w_usernames[self.rmh.final_rating_matrix_w_usernames["username"] == user2][item]
           diffs.append(tuple(r_u1 - mean_rating1, r_u2 - mean_rating2))
        
        # Calculate the nominator and denominator of the Pearson Correlation Coefficient
                       