**Import Libraries + Load Data**

In [None]:
import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [None]:
# Read the user/product interaction table from df_user_product.csv into a DataFrame.
data =  pd.read_csv('df_user_product.csv')

In [None]:
# Show column dtypes and non-null counts of the loaded table.
data.info ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1173849 entries, 0 to 1173848
Data columns (total 4 columns):
Unnamed: 0    1173849 non-null int64
user_id       1173848 non-null object
product_id    1173848 non-null float64
count         1173848 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 35.8+ MB


In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# `columns=` already selects the axis, so the former extra `axis = 0` was
# redundant and misleading. errors='ignore' keeps the cell re-runnable: since
# `data` is reassigned, a second execution would otherwise raise KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')
data.head(10)

Unnamed: 0,user_id,product_id,count
0,000021E7765634E8A1A1E27DE21E7DF3,765942.0,2.0
1,000021E7765634E8A1A1E27DE21E7DF3,3037477.0,1.0
2,00003D04C11A33F674B8F64CB1BAA004,2249761.0,1.0
3,00003D04C11A33F674B8F64CB1BAA004,2796826.0,1.0
4,00005F933267D65FF8159DC1A6C70925,2569826.0,1.0
5,0000C89FF2311CE11E9D9FB5B0C38BD1,948229.0,2.0
6,0000C89FF2311CE11E9D9FB5B0C38BD1,1438849.0,1.0
7,0000C89FF2311CE11E9D9FB5B0C38BD1,1945467.0,2.0
8,0000C89FF2311CE11E9D9FB5B0C38BD1,2038106.0,1.0
9,0000C89FF2311CE11E9D9FB5B0C38BD1,2202496.0,1.0


In [None]:
# Start with an empty frame that has a single 'user_id' column.
user_unique = pd.DataFrame(columns=['user_id'])

In [None]:
# Fill the lookup frame with every distinct user_id, in order of first appearance,
# then report its size and dtypes.
user_unique['user_id'] = pd.unique(data['user_id'])
user_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224596 entries, 0 to 224595
Data columns (total 1 columns):
user_id    224595 non-null object
dtypes: object(1)
memory usage: 1.7+ MB


In [None]:
# Persist the unique-user table. With default arguments to_csv also writes the
# row index as an unnamed first column.
user_unique.to_csv('user_unique.csv')

In [None]:
# Reload the table from disk; the saved row index comes back as the
# 'Unnamed: 0' column, which the next cell uses to derive numeric ids.
user_unique = pd.read_csv('user_unique.csv')
user_unique.head(10)

Unnamed: 0.1,Unnamed: 0,user_id
0,0,000021E7765634E8A1A1E27DE21E7DF3
1,1,00003D04C11A33F674B8F64CB1BAA004
2,2,00005F933267D65FF8159DC1A6C70925
3,3,0000C89FF2311CE11E9D9FB5B0C38BD1
4,4,0000E78EE6184DBD861594816DF3F8A9
5,5,00012E7446373FB0EDE068F62CA61966
6,6,00013E628C86B5C235B93C887E229F8D
7,7,00018586640750B168513AE56676296E
8,8,000197013B33988F264E1BB963722974
9,9,0001A5658302FF8EB7C40557CC983BCB


In [None]:
# Numeric user id = saved row index + 1, i.e. ids start at 1 rather than 0.
user_unique['ID'] = user_unique['Unnamed: 0'] + 1

In [None]:
# Preview the first 20 users together with their new numeric ID.
user_unique.head(20)

Unnamed: 0.1,Unnamed: 0,user_id,ID
0,0,000021E7765634E8A1A1E27DE21E7DF3,1
1,1,00003D04C11A33F674B8F64CB1BAA004,2
2,2,00005F933267D65FF8159DC1A6C70925,3
3,3,0000C89FF2311CE11E9D9FB5B0C38BD1,4
4,4,0000E78EE6184DBD861594816DF3F8A9,5
5,5,00012E7446373FB0EDE068F62CA61966,6
6,6,00013E628C86B5C235B93C887E229F8D,7
7,7,00018586640750B168513AE56676296E,8
8,8,000197013B33988F264E1BB963722974,9
9,9,0001A5658302FF8EB7C40557CC983BCB,10


In [None]:
# Row counts of the interaction table and of the unique-user table.
len_df, len_us = len(data), len(user_unique)

In [None]:
# Convert user_id into a number to make later computation easier.
#
# One vectorised lookup replaces the former nested Python loop, which compared
# every row of `data` against every row of `user_unique` through .iloc
# (len(data) * len(user_unique) comparisons).
# - drop_duplicates keeps the first occurrence, matching the old "first match
#   wins, then break" behaviour.
# - Rows whose user_id is missing (NaN) or absent from `user_unique` get NaN.
id_lookup = (
    user_unique
    .dropna(subset=['user_id'])
    .drop_duplicates(subset='user_id')
    .set_index('user_id')['ID']
)
data['ID'] = data['user_id'].map(id_lookup)

In [None]:
# Inspect the first 20 rows to check the newly added ID column.
data.head(20)

In [None]:
# Hold out 30% of the interactions for evaluation; the fixed random_state keeps
# the split reproducible. train_test_split is imported in the import cell at
# the top of the notebook.
X_train, X_test = train_test_split(data, test_size=0.3, random_state=68)

In [None]:
# Size, dtypes and non-null counts of the training split.
X_train.info()

In [None]:
# Size, dtypes and non-null counts of the test split.
X_test.info()

In [None]:
# Summary statistics of the training split, later passed to CF:
# number of distinct users, number of distinct products, and the mean 'count'.
# NOTE(review): these are distinct-value counts, not max id + 1 — confirm they
# are valid matrix dimensions for the ids actually fed to CF.
n_user = X_train['user_id'].nunique()
n_product = X_train['product_id'].nunique()
m_rate = X_train['count'].mean()
m_rate

**CLASS CF**

*Input của class CF là ma trận Utility Y_data được lưu dưới dạng một matrix với 3 columns, k là số lượng các điểm lân cận được sử dụng để dự đoán kết quả. dist_func là hàm đo similarity giữa hai vectors, là cosine_similarity được lấy từ sklearn.metrics.pairwise. Biến uuCF thể hiện việc đang sử dụng User-user CF (1) hay Item-item CF (0).*

In [None]:
class CF(object):
    """Neighbourhood-based collaborative filtering (user-user or item-item).

    Parameters
    ----------
    Y_data : numpy.ndarray
        Utility data, one rating per row: column 0 = user id, column 1 = item id,
        column 2 = rating. Ids are used directly as matrix indices, so they must
        be integer-valued and lie in ``[0, n_users)`` / ``[0, n_products)``.
    k : int
        Number of nearest neighbours used for a prediction.
    n_users, n_products : int
        Number of users / products, i.e. the dimensions of the utility matrix.
    m_rating : float
        Baseline rating subtracted from every rating during normalisation
        (treated as 0 when it is NaN).
    dist_func : callable
        Similarity function called as ``dist_func(A, A)``; must return a square
        similarity matrix.
    uuCF : int
        1 for user-user CF, 0 for item-item CF. In item-item mode the first two
        columns of ``Y_data`` are swapped, so internally "user" means item and
        vice versa.
    """
    def __init__(self, Y_data, k, n_users , n_products, m_rating, dist_func = cosine_similarity, uuCF = 1 ):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        # For item-item CF, swapping the first two columns is enough to obtain
        # the transposed utility matrix.
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k # number of neighbor points
        self.dist_func = dist_func
        self.Ybar_data = None
        # Use the constructor arguments (the previous code silently read the
        # notebook globals `n_user` / `n_product`). Because the columns are
        # swapped in item-item mode, the two dimensions swap as well.
        if uuCF:
            self.n_users, self.n_items = n_users, n_products
        else:
            self.n_users, self.n_items = n_products, n_users
        self.m_rating = m_rating

    def add(self, new_data):
        """Append new rating rows to the stored data.

        ``new_data`` is concatenated as-is, so it must use the same column
        layout as ``self.Y_data`` (already swapped when ``uuCF`` is 0).
        Call ``refresh()`` afterwards to update the model.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)

    # Compute the normalized utility matrix
    def normalize_Y(self):
        """Centre all ratings on ``m_rating`` and build the sparse matrix ``Ybar``.

        Sets ``self.mu`` (baseline per user), ``self.Ybar_data`` (float copy of
        ``Y_data`` with centred ratings) and ``self.Ybar`` (CSR matrix of shape
        ``(n_items, n_users)``).
        """
        baseline = self.m_rating
        if np.isnan(baseline):
            baseline = 0 # avoid propagating NaN into every rating
        # The baseline is now actually stored in `mu`; before, `mu` stayed all
        # zeros and the "normalised" ratings were just the raw ratings.
        self.mu = np.full((self.n_users,), baseline, dtype=np.float64)
        # Float copy so that centred ratings are not truncated for integer input.
        # The baseline is identical for every user, so one vectorised
        # subtraction replaces a full scan of the data per user.
        self.Ybar_data = self.Y_data.astype(np.float64)
        self.Ybar_data[:, 2] -= baseline

        ################################################
        # Store the rating matrix as a sparse matrix: only the non-zero entries
        # and their locations are kept, which matters for both memory and speed
        # when the number of users/items is large.
        rows = self.Ybar_data[:, 1].astype(np.int64)
        cols = self.Ybar_data[:, 0].astype(np.int64)
        self.Ybar = sparse.coo_matrix(
            (self.Ybar_data[:, 2], (rows, cols)),
            shape=(self.n_items, self.n_users),
        ).tocsr()

    def similarity(self):
        """Compute the similarity matrix ``S`` between the columns of ``Ybar``."""
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)

    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity()

    def fit(self):
        """Train the model; equivalent to ``refresh()``."""
        self.refresh()

    # Predict function
    def __pred(self, u, i, normalized = 1):
        """
        Predict the rating of user u for item i, in the internal
        (possibly swapped) column layout.

        Returns the normalized prediction when ``normalized`` is truthy,
        otherwise the prediction with the baseline ``mu[u]`` added back.
        """
        # Ids often arrive as floats taken from a ratings array; indices must be ints.
        u, i = int(u), int(i)
        # Step 1: find all rows that rate i
        ids = np.where(self.Y_data[:, 1] == i)[0]
        # Step 2: the users who made those ratings
        users_rated_i = self.Y_data[ids, 0].astype(np.int64)
        # Step 3: similarity between the current user and the users who rated i
        sim = self.S[u, users_rated_i]
        # Step 4: keep the k most similar users
        a = np.argsort(sim)[-self.k:]
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How each of the 'near' users rated item i (1 x k sparse row)
        r = self.Ybar[i, users_rated_i[a]]
        # Similarity-weighted average; 1e-8 avoids dividing by 0
        score = r.dot(nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
        if normalized:
            return score
        return score + self.mu[u]


    def pred(self, u, i, normalized = 1):
        """
        Predict the rating of user u for item i.

        ``u`` is always a user id and ``i`` an item id, regardless of ``uuCF``.
        Returns the normalized prediction when ``normalized`` is truthy,
        otherwise the prediction with the baseline added back.
        """
        # Fixed: the flag was previously forwarded under the undefined name
        # `normalize`, which raised NameError on every call.
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)

    def recommend(self, u, normalized = 1):
        """
        Determine all items should be recommended for user u. (uuCF =1)
        or all users who might have interest on item u (uuCF = 0)
        The decision is made based on all i such that the normalized
        prediction for (u, i) is > 0, considering only items which
        have not been rated by u yet.

        ``normalized`` is accepted for interface compatibility but is not used:
        the decision always relies on the normalized prediction.
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        # A set gives O(1) membership tests inside the loop over all items.
        items_rated_by_u = set(self.Y_data[ids, 1].tolist())
        recommended_items = []
        for i in range(self.n_items):
            if i in items_rated_by_u:
                continue
            if self.__pred(u, i) > 0:
                recommended_items.append(i)

        return recommended_items

    def print_recommendation(self):
        """
        print all items which should be recommended for each user
        """
        print ('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print ('    Recommend item(s):', recommended_items, 'to user', u)
            else:
                print ('    Recommend item', u, 'to user(s) : ', recommended_items)

*Khi có dữ liệu mới, cập nhật Utility matrix bằng cách thêm các hàng này vào cuối Utility Matrix. Để cho đơn giản, giả sử rằng không có users hay items mới, cũng không có ratings nào bị thay đổi.*

In [None]:
# CF reads column 0 as user, column 1 as item and column 2 as rating, so select
# exactly those three numeric columns. The previous `X_train.values` also
# carried the hashed user_id strings, in column 0.
# NOTE(review): CF uses these ids directly as matrix indices. 'ID' starts at 1
# and product_id holds raw catalogue numbers, so both presumably still need
# re-indexing to 0..n-1 before fitting — TODO confirm.
Y_data = X_train[['ID', 'product_id', 'count']].to_numpy(dtype=np.float64)



In [None]:
# Build the recommender with 30 neighbours; uuCF = 0 selects item-item mode.
# fit() normalises the data and computes the similarity matrix.
rs = CF(Y_data, k = 30, n_users = n_user, n_products = n_product , m_rating= m_rate, uuCF = 0 )
rs.fit()


In [None]:
# Evaluate on the held-out split. `rate_test` was never defined before (NameError);
# build it as [user, item, rating] rows, the argument order rs.pred expects.
rate_test = X_test[['ID', 'product_id', 'count']].to_numpy(dtype=np.float64)
n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    # ids come out of a float array; cast so they can be used as matrix indices
    pred = rs.pred(int(rate_test[n, 0]), int(rate_test[n, 1]), normalized = 0)
    SE += (pred - rate_test[n, 2])**2

RMSE = np.sqrt(SE/n_tests)
# Label the result after the mode the model was actually built with
# (the old message always said "User-user" even for an item-item model).
cf_label = 'User-user CF' if rs.uuCF else 'Item-item CF'
print (cf_label + ', RMSE =', RMSE)