In [2]:
# imports
import pandas as pd
import numpy as np
from IPython.core.debugger import set_trace
import torch
import matplotlib.pyplot as plt
import subprocess
from abc import ABC, abstractmethod

# Export notebook as python script to the ../python-code - folder

In [8]:
subprocess.run("jupyter nbcbonvert --output-dir='../python-code' --to python User_Based_Neighborhood.ipynb --TemplateExporter.exclude_markdown=True --TemplateExporter.exclude_input_prompt=True", shell=True)

CompletedProcess(args="jupyter nbcbonvert --output-dir='../python-code' --to python User_Based_Neighborhood.ipynb --TemplateExporter.exclude_markdown=True --TemplateExporter.exclude_input_prompt=True", returncode=1)

In [13]:
class Neighborhood_Model(ABC):
    """
    Abstract base class for all neighborhood based models. The 'predict', and the 'compute_similarity' - functions need to be implemented by inheriting classes.
    """
    
    def __init__(self, train_rating_matrix:pd.DataFrame, test_rating_matrix:pd.DataFrame, test_indices:np.array):
        """
        Params:
            train_rating_matrix (pd.DataFrame): The rating matrix that the model is supposed to calculate the neighborhoods on.
            test_rating_matrix (pd.DataFrame): The test rating matrix that contains the true values.
            test_indices (np.array): The indices of the test dataset that the model is supposed to predict on.
        """
        super().__init__()
        self.rating_matrix_, self.test_rating_matrix_, self.test_indices_ = train_rating_matrix, test_rating_matrix, test_indices
    
    def build_lookups(self) -> None:
        """
        Map users and items to numerical values for further indexing.
        """
        self.userid_lookup_ = {username: i+1 for i, username in enumerate(self.train_rating_matrix["username"])}
        self.itemid_lookup_ = {item: i+1 for i, item in enumerate(list(self.train_rating_matrix.columns))}
        # Reverse the two calculates mappings for bidirectional lookup
        self.username_lookup = {user_id: username for username, user_id in self.userid_lookup_}
        self.itemname_lookup = {item_id: itemname for itemname, item_id in self.itemid_lookup_}
        
    def calculate_items_rated_by_user(self) -> None:
        """
        Calculate a dictionary containing usernames as keys and a numpy array of rated items as key.
        """
        self.items_rated_by_user = {}
        users = set(self.train_rating_matrix["username"])
        for u in users:
            # Calculate the item-indices that are non-na for each user
            self.items_rated_by_user[u] =  np.argwhere(~pd.isna(self.train_rating_matrix[self.train_rating_matrix["username"] == u].values))[:,0]
        # Delete the first entry, as its the username which will not be used for similarity computation 
        for rated_items in self.calculate_items_rated_by_user.values():
            rated_items = rated_items[1:]
        
    @abstractmethod
    def predict(self, user:str, item:str) -> int:
        pass
    
    @abstractmethod
    def compute_similarity(self, user1:str, user2:str) -> float:
        pass

In [None]:
class User_Neighborhood_Pearson_Centered(Neighborhood_Model):
    """
    A user - based neighborhood model that takes into account rating bias by centering the raw data for each user and applying the Pearson Correlation Coefficient for predicting the similarity of user-pairs. 
    """
    