In [36]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
class CF(object):
    """Neighbourhood-based collaborative filtering over (user, item, rating) rows.

    ``Y_data`` is a 2-D NumPy array whose columns are read positionally:
    column 0 = user id, column 1 = item id, column 2 = rating. Ids are used
    directly as matrix indices, so they must be non-negative integers.

    With ``uuCF`` falsy the first two columns are swapped once in ``__init__``.
    From then on every method that says "user" is really working on items
    (and vice versa), which is how the same code serves item-item CF.

    Attributes set by ``fit()`` / ``refresh()``:
        mu        -- per-"user" mean rating, shape (n_users,)
        Ybar_data -- float copy of ``Y_data`` with column 2 mean-centred
        Ybar      -- CSR matrix of centred ratings, shape (n_items, n_users)
        S         -- similarity matrix returned by ``dist_func``
    """
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        # Item-item CF is implemented by swapping the user and item columns.
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k  # number of nearest neighbours used for one prediction
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1

    def add(self, new_data):
        """
        Append new rating rows to Y_data.

        For simplicity, suppose that there is no new user or item
        (n_users / n_items are not recomputed). Call refresh() afterwards
        to rebuild the normalized matrix and the similarities.

        NOTE(review): new_data is appended as-is. In item-item mode Y_data
        was column-swapped in __init__, so new_data presumably has to be
        supplied already swapped -- confirm with callers.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)

    def normalize_Y(self):
        """
        Mean-centre every user's ratings and build the sparse matrix Ybar.

        Sets self.mu, self.Ybar_data and self.Ybar. Users with no ratings
        (or whose mean is NaN) get a mean of 0.
        """
        users = self.Y_data[:, 0].astype(np.int64)
        items = self.Y_data[:, 1].astype(np.int64)
        ratings = self.Y_data[:, 2].astype(np.float64)

        # Group row indices by user with one stable sort instead of scanning
        # every rating once per user. The stable sort keeps each user's rows
        # in their original order, so the means match a per-user np.where.
        order = np.argsort(users, kind="stable")
        sorted_users = users[order]
        user_range = np.arange(self.n_users)
        starts = np.searchsorted(sorted_users, user_range, side="left")
        stops = np.searchsorted(sorted_users, user_range, side="right")

        # Work on a float copy: copying an integer Y_data as-is would silently
        # truncate the centred (fractional) ratings when they are written back.
        self.Ybar_data = self.Y_data.astype(np.float64)
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            rows = order[starts[n]:stops[n]]
            if rows.size == 0:
                continue  # user without ratings: mean stays 0
            m = np.mean(ratings[rows])
            if np.isnan(m):
                m = 0  # avoid propagating NaN into mu
            self.mu[n] = m
            self.Ybar_data[rows, 2] = ratings[rows] - m

        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important
        # for both memory and computing efficiency. For example, if #user = 1M,
        # #item = 100k, then shape of the rating matrix would be (100k, 1M),
        # you may not have enough memory to store this. Then, instead, we store
        # nonzeros only, and, of course, their locations.
        self.Ybar = sparse.coo_matrix(
            (self.Ybar_data[:, 2], (items, users)),
            shape=(self.n_items, self.n_users)).tocsr()

    def similarity(self):
        """Compute the user-user similarity matrix S from the columns of Ybar."""
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)

    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity()

    def fit(self):
        """Prepare the model for prediction; equivalent to refresh()."""
        self.refresh()

    def __pred(self, u, i, normalized = 1):
        """
        Predict the rating of user u for item i.

        If normalized is truthy the mean-centred prediction is returned;
        otherwise the mean rating of u (self.mu[u]) is added back.
        """
        # Step 1: find all rows that rate item i
        ids = np.where(self.Y_data[:, 1] == i)[0]
        # Step 2: the users behind those rows
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: similarity between the current user and those users
        sim = self.S[u, users_rated_i]
        # Step 4: keep the k most similar users (argsort is ascending)
        a = np.argsort(sim)[-self.k:]
        nearest_s = sim[a]
        # Centred ratings the neighbours gave to item i (1 x k sparse row)
        r = self.Ybar[i, users_rated_i[a]]
        # Similarity-weighted average; 1e-8 avoids dividing by 0 when no
        # neighbour exists or all similarities are 0.
        prediction = r.dot(nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
        if normalized:
            return prediction
        return prediction + self.mu[u]

    def pred(self, u, i, normalized = 1):
        """
        Predict the rating of user u for item i.

        Arguments are always given as (user, item); in item-item mode they
        are swapped here to match the swapped columns of Y_data.
        If normalized is truthy the mean-centred prediction is returned.
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)

    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which
        have not been rated by u yet.

        In item-item mode u is an item id and the returned list holds the
        users it should be recommended to (see print_recommendation).
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        # A set gives O(1) membership tests inside the loop over all items.
        items_rated_by_u = set(self.Y_data[ids, 1].tolist())
        return [i for i in range(self.n_items)
                if i not in items_rated_by_u and self.__pred(u, i) > 0]

    def recommend2(self, u):
        """
        Alias of recommend(u), kept for backward compatibility.
        (It used to be a line-for-line copy of that method.)
        """
        return self.recommend(u)

    def print_recommendation(self):
        """
        print all items which should be recommended for each user
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print('    Recommend item(s):', recommended_items, 'for user', u)
            else:
                print('    Recommend item', u, 'for user(s) : ', recommended_items)

In [37]:
# Load the example ratings: a space-separated file read without a header row,
# with columns named (user_id, item_id, rating).
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv('ex.dat', sep = ' ', names = r_cols, encoding='latin-1')
# CF indexes the columns positionally, so hand it the raw 2-D array.
Y_data = ratings.values

# User-user CF (uuCF = 1) using the 2 most similar users per prediction.
rs = CF(Y_data, k = 2, uuCF = 1)
rs.fit()

rs.print_recommendation()

Recommendation: 
    Recommend item(s): [2] for user 0
    Recommend item(s): [1] for user 1
    Recommend item(s): [] for user 2
    Recommend item(s): [4] for user 3
    Recommend item(s): [4] for user 4
    Recommend item(s): [0, 3, 4] for user 5
    Recommend item(s): [1] for user 6


In [50]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

class CF(object):
    """Neighbourhood-based collaborative filtering over a pandas DataFrame.

    ``Y_data`` is read positionally: column 0 = user id, column 1 = item id,
    column 2 = score. Ids are used directly as matrix indices, so they must
    be non-negative integers.
    NOTE(review): this class previously looked up 'user_id' and 'cluster'
    by name for the counts only; it is assumed those are columns 0 and 1 of
    the input frame -- confirm against the data file.

    With ``uuCF`` falsy the first two columns are swapped once in ``__init__``
    so that the same code performs item-item CF.

    Attributes set by ``fit()`` / ``normalize_Y()`` / ``similarity()``:
        mu        -- per-"user" mean score, shape (n_users,)
        Ybar_data -- copy of ``Y_data`` with column 2 mean-centred
        Ybar      -- CSR matrix of centred scores, shape (n_items, n_users)
        S         -- similarity matrix returned by ``dist_func``
    """
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        # DataFrames need .iloc for positional column selection; plain
        # Y_data[:, [1, 0, 2]] raises on a DataFrame.
        self.Y_data = Y_data if uuCF else Y_data.iloc[:, [1, 0, 2]]
        self.k = k  # number of nearest neighbours used for one prediction
        self.dist_func = dist_func
        self.Ybar_data = None

        # number of users and items. Remember to add 1 since id starts from 0.
        # Positional access keeps this consistent with the column swap above
        # and with every other method of the class.
        self.n_users = int(np.max(self.Y_data.iloc[:, 0].to_numpy())) + 1
        self.n_items = int(np.max(self.Y_data.iloc[:, 1].to_numpy())) + 1

    def add(self, new_data):
        """
        Append new score rows to Y_data.

        For simplicity, suppose that there is no new user or item
        (n_users / n_items are not recomputed). new_data may be a DataFrame
        with the same columns or an array-like with the same column order.
        Call refresh() afterwards.
        """
        if not isinstance(new_data, pd.DataFrame):
            new_data = pd.DataFrame(new_data, columns=self.Y_data.columns)
        # pd.concat keeps Y_data a DataFrame; np.concatenate would turn it
        # into an ndarray and break every later .iloc access.
        self.Y_data = pd.concat([self.Y_data, new_data], axis=0, ignore_index=True)

    def normalize_Y(self):
        """
        Mean-centre every user's scores and build the sparse matrix Ybar.

        Sets self.mu, self.Ybar_data and self.Ybar. Users with no rows
        (or whose mean is NaN) get a mean of 0.
        """
        users = self.Y_data.iloc[:, 0].to_numpy().astype(np.int64)
        items = self.Y_data.iloc[:, 1].to_numpy().astype(np.int64)
        scores = self.Y_data.iloc[:, 2].to_numpy().astype(np.float64)

        # Group row positions by user with one stable sort instead of scanning
        # the whole frame once per user.
        order = np.argsort(users, kind="stable")
        sorted_users = users[order]
        user_range = np.arange(self.n_users)
        starts = np.searchsorted(sorted_users, user_range, side="left")
        stops = np.searchsorted(sorted_users, user_range, side="right")

        centred = scores.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            rows = order[starts[n]:stops[n]]
            if rows.size == 0:
                continue  # user without rows: mean stays 0
            m = np.mean(scores[rows])
            if np.isnan(m):
                m = 0  # avoid propagating NaN into mu
            self.mu[n] = m
            centred[rows] = scores[rows] - m

        # Replace the whole score column in one go so an integer column
        # becomes float instead of being written into element-wise.
        self.Ybar_data = self.Y_data.copy()
        self.Ybar_data[self.Ybar_data.columns[2]] = centred

        # Sparse (n_items x n_users) matrix: rows are items, columns are users.
        self.Ybar = sparse.coo_matrix(
            (centred, (items, users)),
            shape=(self.n_items, self.n_users)).tocsr()

    def similarity(self):
        """Compute the user-user similarity matrix S from the columns of Ybar."""
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)

    def refresh(self):
        """Re-run normalization and similarity, e.g. after add()."""
        self.normalize_Y()
        self.similarity()

    def fit(self):
        """Prepare the model for prediction; equivalent to refresh()."""
        self.refresh()

    def __pred(self, u, i, normalized = 1):
        """
        Predict the rec_score of user u for item i.

        If normalized is truthy the mean-centred prediction is returned;
        otherwise the mean score of u (self.mu[u]) is added back.
        """
        # Steps 1-2: rows that involve item i and the users behind them.
        # Plain arrays are used on purpose: indexing a Series with integer
        # positions would be treated as a label lookup.
        ids = np.where(self.Y_data.iloc[:, 1].to_numpy() == i)[0]
        users_done_i = self.Y_data.iloc[ids, 0].to_numpy().astype(np.int32)
        # Step 3: find similarity btw the current user and others
        # who already done i
        sim = self.S[u, users_done_i]
        # Step 4: keep the k most similar users (argsort is ascending)
        a = np.argsort(sim)[-self.k:]
        nearest_s = sim[a]
        # Centred scores the neighbours gave to item i (1 x k sparse row)
        r = self.Ybar[i, users_done_i[a]]
        # Similarity-weighted average; 1e-8 avoids dividing by 0 when no
        # neighbour exists or all similarities are 0.
        prediction = r.dot(nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
        if normalized:
            return prediction
        return prediction + self.mu[u]

    def pred(self, u, i, normalized = 1):
        """
        Predict the rec_score of user u for item i.

        Arguments are always given as (user, item); in item-item mode they
        are swapped here to match the swapped columns of Y_data.
        """
        if self.uuCF:
            return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)

    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which
        have not been done by u yet.

        Requires normalize_Y() and similarity() (or fit()) to have run.
        """
        ids = np.where(self.Y_data.iloc[:, 0].to_numpy() == u)[0]
        # A set gives O(1) membership tests inside the loop over all items.
        items_done_by_u = set(self.Y_data.iloc[ids, 1].tolist())
        return [i for i in range(self.n_items)
                if i not in items_done_by_u and self.__pred(u, i) > 0]

In [51]:
# NOTE(review): this import rebinds the name CF, shadowing any CF class
# defined earlier in the notebook. The InvalidIndexError shown in this
# cell's output presumably comes from the imported module's code, which is
# not visible here -- confirm.
from src.models.collaborative_filtering import CF
y_data = pd.read_csv("src/data/y_data.csv")
# User-user CF (uuCF = 1) with k = 2 neighbours.
cf = CF(Y_data=y_data, k = 2, uuCF = 1)

# Run the two fitting steps separately.
cf.normalize_Y()
cf.similarity()

InvalidIndexError: (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]), 1)

In [44]:
# Inline, step-by-step version of the normalization, run directly on `cf`.
# It overwrites cf.mu, cf.Ybar_data and cf.Ybar.
users = cf.Y_data["user_id"]
cf.mu = np.zeros((cf.n_users,))
cf.Ybar_data = cf.Y_data.copy()
for n in range(cf.n_users):
    # positional row indices of the rows belonging to user n
    ids = np.where(users == n)[0]
    # column 1 of those rows (not used further in this cell)
    item_ids = cf.Y_data.iloc[ids, 1] 
    # column 2 of those rows: the scores to be mean-centred
    rec_score = cf.Y_data.iloc[ids, 2]

    # mean score of user n
    m = np.mean(rec_score) 
    if np.isnan(m):
        m = 0 # to avoid empty array and nan value
    cf.mu[n] = m
    # normalize: subtract the user's mean from each of their scores
    cf.Ybar_data.iloc[ids, 2] = rec_score - cf.mu[n]
# Sparse (n_items x n_users) matrix: column 1 gives the row index, column 0
# the column index. It is left in COO format here (no .tocsr() call).
cf.Ybar = sparse.coo_matrix((cf.Ybar_data.iloc[:, 2],
    (cf.Ybar_data.iloc[:, 1], cf.Ybar_data.iloc[:, 0])), (cf.n_items, cf.n_users))


In [46]:
# Print the stored entries of the normalized matrix after converting it to CSR.
print(cf.Ybar.tocsr())

  (0, 34)	-1.4159899238102955
  (0, 40)	-2.629684014115868
  (0, 50)	1.3310367171675281
  (0, 102)	1.7078283940545078
  (0, 118)	-1.4281967982602168
  (0, 120)	0.25079214503712066
  (0, 157)	-1.1428572003024984
  (0, 176)	0.0631580053973666
  (0, 178)	0.6400067719110929
  (0, 179)	0.8153547020135425
  (0, 184)	0.6307655577118867
  (0, 196)	0.28000276830120807
  (0, 198)	0.329824438368008
  (0, 200)	0.31333275098966684
  (0, 202)	-0.01333334695842403
  (0, 203)	1.6774136350251507
  (0, 205)	-0.08888896819145575
  (0, 207)	-0.6817759632620637
  (0, 212)	0.7846169823787292
  (0, 213)	0.7857141830005823
  (0, 215)	-0.2631585472322122
  (0, 216)	-0.6000007719142504
  (0, 217)	1.5492772914406383
  (0, 223)	-0.2777973146780508
  (0, 224)	-0.08998088140272475
  :	:
  (45, 118)	-0.09486371761356693
  (45, 178)	-0.026659973735603693
  (45, 198)	-0.0035088915737482296
  (45, 200)	0.3133327448402996
  (45, 205)	-0.08888898032416392
  (45, 223)	-0.2777770863729243
  (46, 34)	-1.215554203088384
  (4

In [39]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class CollaborativeFiltering:
    """Minimal holder for a rating matrix and its column-wise similarities.

    Ybar is stored as given; similarity() passes its transpose to
    dist_func, so the resulting matrix compares the columns of Ybar.
    """
    def __init__(self, Ybar):
        self.Ybar = Ybar
        self.dist_func = cosine_similarity

    def similarity(self):
        """Store dist_func(Ybar.T) in self.S (one row/column per column of Ybar)."""
        # Transposing turns the columns of Ybar into the rows that
        # dist_func compares pairwise.
        self.S = self.dist_func(self.Ybar.T)

# Example usage: a small dense 3x3 matrix; the printed S is the 3x3 cosine
# similarity between its columns.
# NOTE: this rebinds the notebook-level names `Ybar` and `cf`.
Ybar = np.array([[1, 0, 3], [4, 5, 6], [7, 8, 9]])
cf = CollaborativeFiltering(Ybar)
cf.similarity()
print(cf.S)

[[1.         0.99162307 0.98692754]
 [0.99162307 1.         0.96320759]
 [0.98692754 0.96320759 1.        ]]
