In [16]:
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
import os
import sys

sys.path.append('../')

ROOT = os.getcwd()
WORK_DIR = os.path.dirname(ROOT)

# ***User-Item Profile***

In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

rated_train = pd.read_csv(os.path.join(WORK_DIR, 'ml-100k/ua.base'), sep='\t', names=r_cols, encoding='latin-1')
rated_test = pd.read_csv(os.path.join(WORK_DIR, 'ml-100k/ua.test'), sep='\t', names=r_cols, encoding='latin-1')

print('Number of traing rates:', rated_train.shape[0])
print('Number of test rates:', rated_test.shape[0])

Number of traing rates: 90570
Number of test rates: 9430


In [8]:
X_train = rated_train.to_numpy()
X_test = rated_test.to_numpy()

In [5]:
rated_train

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
90565,943,1047,2,875502146
90566,943,1074,4,888640250
90567,943,1188,3,888640250
90568,943,1228,3,888640275


# ***Modeling***

In [70]:
from typing import Optional, Any
from scipy import sparse


class CollaborativeFiltering:
    
    def __init__(self, K: int, uuCF: bool=True, distance_function: Any = cosine_similarity) -> None:
        self.K = K
        self.uuCF=uuCF
        self.distance_function = distance_function
    
    @staticmethod
    def normalization(X):
        return (X - X.min()) / (X.max() - X.min())
    
    @staticmethod
    def standardization(X):
        return (X - X.mean()) / X.std()
    
    def normalize(self, X_train: np) -> np.array:
        users = X_train[:, 0]
        items = X_train[:, 1]
        
        self.n_users = np.unique(users).shape[0]
        self.n_items = np.unique(items).shape[0]
        
        self.normalized_X_train = X_train.copy()

        
        for u in range(1, self.n_users):
            
            ids = np.where(users==u)[0].astype(np.int32)
            items = X_train[ids, 1]
            ratings = X_train[ids, 2]
            
            mean_ratings = np.mean(ratings)
            
            if np.isnan(mean_ratings):
                mean_ratings=0
                
            self.normalized_X_train[ids, 2] = self.standardization(ratings)

        
    def similarity(self) -> np.array:
        self.similarity_score = self.distance_function(self.normalized_X_train.T, self.normalized_X_train.T)
    
    def fit(self, X_train: np) -> None:
        self.X_train = X_train
        
        self.normalize(X_train)
        self.similarity()

    
    def predict(self, user: int, item: int) -> np.array:
        ids = np.where(self.X_train[:, 1] == item)[0].astype(np.int32)
        
        users = self.X_train[ids, 0].astype(np.int32)
        sim = self.similarity_score[users, user]
        
        if not self.uuCF:
            items = (self.X_train[ids, 1]).astype(np.int32)
            sim = self.similarity_score[items, item]
            
        ids_k_sim_users = np.argsort(sim)[-self.K: ]
        nearest_sim_users = sim[ids_k_sim_users]
        
        most_rated_item_by_users = self.X_train[item, users[ids_k_sim_users]]
        
        if not self.uuCF:
            most_rated_item_by_users = self.X_train[item, items[ids_k_sim_users]]
            
        return most_rated_item_by_users * nearest_sim_users / (np.sum(np.abs(nearest_sim_users)) + 1e-8)

    
    def recommend(self, user: int):
        ids = np.where(self.X_train[:, 0] == user)[0]
        items = self.X_train[ids, 1].tolist()
        recommended_items = []
        
        for item in range(self.n_items):
            
            if item not in items:
                rating = self.predict(user, item)
                
                if rating > 0:
                    recommended_items.append(item)
                    
        return recommended_items
    
    def export_recommendation(self):  
        print('Recommendation Sytem: ')
        for user in range(self.n_users):
            recommended_items = self.recommend(user)
            
            if self.uuCF:
                print(f'Recommend items: {recommended_items} to user {user}')
            
            else:
                print(f'Recommend items: {user} to user {recommended_items}')
    
    

In [71]:
cf = CollaborativeFiltering(K=30, uuCF=True)
cf.fit(X_train)

ValueError: row index exceeds matrix dimensions

In [68]:
cf.similarity_score

array([[ 1.        ,  0.6868806 , -0.00932452,  0.86643289],
       [ 0.6868806 ,  1.        , -0.09874128,  0.78938768],
       [-0.00932452, -0.09874128,  1.        , -0.02326853],
       [ 0.86643289,  0.78938768, -0.02326853,  1.        ]])

In [69]:
cf.export_recommendation()

Recommendation Sytem: 


  if rating > 0:


IndexError: index 6 is out of bounds for axis 0 with size 4