<a href="https://colab.research.google.com/github/PhanTheMinhChau/TTNTCB/blob/main/collaborativefiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip /content/ml-100k.zip

In [134]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

#Khởi tạo class collaborativefiltering
class CF(object):
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) hoặc item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1

    def add(self, new_data):
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)

    #Hàm chuẩn hóa
    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            ids = np.where(users == n)[0].astype(np.int32)
            item_ids = self.Y_data[ids, 1]
            ratings = self.Y_data[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
              self.mu[n] = m
            self.Ybar_data[ids, 2] = ratings - self.mu[n]
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    #Hàm tính khoảng cách tương đồng
    def similarity(self):
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)


    def refresh(self):
        self.normalize_Y()
        self.similarity()

    def fit(self):
        self.refresh()

    #Hàm dự đoán rating
    def pred(self, u, i, normalized = 1):
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)

    def __pred(self, u, i, normalized = 1):
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        sim = self.S[u, users_rated_i]
        a = np.argsort(sim)[-self.k:]
        nearest_s = sim[a]
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]

    #Hàm in ra t bộ phim mà người dùng có id u có thể thích nhất
    def recommend(self, u, t):
        ids = np.where(self.Y_data[:, 1] == u)[0]
        items_rated_by_u = self.Y_data[ids, 0].tolist()
        recommended_items = []
        for i in range(self.n_users):
            if i not in items_rated_by_u:
                rating = self.__pred(i, u)
                if rating > 0:
                    recommended_items.append([i,rating])
        sorted_data = sorted(recommended_items, key=lambda x: x[1], reverse=True)
        top_3_elements = sorted_data[:t]
        print(str(t) + " bộ phim mà người dùng "+ str(u) +" có thể thích nhất là:")
        return top_3_elements

In [36]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1') #đọc dữ liệu tập train
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1') #đọc dữ liệu tập test

rate_train = ratings_base.values
rate_test = ratings_test.values

rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

#User-user CF

In [179]:
rs = CF(rate_train, k = 30, uuCF = 1)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    SE += (pred - rate_test[n, 2])**2

RMSE = np.sqrt(SE/n_tests)
print('User-user CF, RMSE =', RMSE)

User-user CF, RMSE = 1.0369740376881258


In [180]:
rs.recommend2(108,10)

10 bộ phim mà người dùng 108 có thể thích nhất là:


[[336, 4.048851989600761],
 [821, 3.935895959308213],
 [123, 3.909549250284672],
 [27, 3.8711046308637274],
 [52, 3.8475861964416747],
 [515, 3.828508477897678],
 [136, 3.827088678055541],
 [699, 3.8242032770130514],
 [659, 3.823208842743754],
 [358, 3.818204012907152]]

#Item-item CF

In [177]:
rs = CF(rate_train, k = 30, uuCF = 0)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    if str(pred) == "nan":
      n_tests = n_tests - 1
    else:
      SE += (pred - rate_test[n, 2])**2

RMSE = np.sqrt(SE/n_tests)
print('Item-item CF, RMSE =', RMSE)

Item-item CF, RMSE = 1.0159981337543753


In [178]:
rs.recommend2(108,10)

10 bộ phim mà người dùng 108 có thể thích nhất là:


[[234, 4.430914678227091],
 [683, 4.403141724927008],
 [140, 4.376049598882261],
 [469, 4.354735429921141],
 [257, 4.343761619439394],
 [240, 4.335488358825304],
 [587, 4.32800717033914],
 [23, 4.293723648338789],
 [270, 4.289102753905997],
 [509, 4.283191888096937]]