<a href="https://colab.research.google.com/github/PhanTheMinhChau/TTNTCB/blob/main/ContentbasedFiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip /content/ml-100k.zip

In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import Ridge
from sklearn import linear_model

def get_items_rated_by_user(rate_matrix, user_id):
    y = rate_matrix[:,0]
    ids = np.where(y == user_id +1)[0]
    item_ids = rate_matrix[ids, 1] - 1
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)

class Contentbased:
    def __init__(self, Y, X_train, n_users, n_items, lamda = 1):
        self.Y = Y
        self.lamda = lamda
        self.X_train = X_train
        self.n_users = n_users
        self.n_items = n_items

    def fit(self):
        from sklearn.feature_extraction.text import TfidfTransformer
        transformer = TfidfTransformer(smooth_idf=True, norm ='l2') #Chuyển feature vector ở dạng binary thành tfidf
        tfidf = transformer.fit_transform(self.X_train.tolist()).toarray() #Kết quả tfidf thu được là ma trận mà mỗi hàng sẽ là feature vector của một bộ phim
        #tìm các hệ số của Ridge Regression cho mỗi user:
        d = tfidf.shape[1] # data dimension
        W = np.zeros((d, self.n_users))
        b = np.zeros((1, self.n_users))
        for n in range(self.n_users):
            ids, scores = get_items_rated_by_user(self.Y, n)
            clf = Ridge(alpha= self.lamda, fit_intercept  = True)
            Xhat = tfidf[ids, :]
            clf.fit(Xhat, scores)
            W[:, n] = clf.coef_
            b[0, n] = clf.intercept_
        #Sau khi tính được các hệ số W và b, ratings cho mỗi items được dự đoán bằng cách tính:
        self.Yhat = tfidf.dot(W) + b

    def RMSE(self, Data_test):
        se = 0
        cnt = 0
        for n in range(self.n_users):
            ids, scores_truth = get_items_rated_by_user(Data_test, n)
            scores_pred = self.Yhat[ids, n]
            e = scores_truth - scores_pred
            se += (e*e).sum(axis = 0)
            cnt += e.size
        return np.sqrt(se/cnt)

    def recommend(self, user_id, top):
        a = np.zeros((self.n_items,))
        recommended_items = []
        items_rated_by_user, score = get_items_rated_by_user(self.Y, user_id)
        for i in range(self.n_items):
            if i not in items_rated_by_user:
                a[i] = self.Yhat[i, user_id]
        if len(a) < top:
            recommended_items = np.argsort(a)[-len(a):]
        else:
            recommended_items = np.argsort(a)[-top:]
        print(str(top) + " bộ phim mà user có id " + str(user_id) + " có thể thích nhất là:")
        return recommended_items


In [25]:
u_cols =  ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
 encoding='latin-1')
n_users = users.shape[0]

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.values
rate_test = ratings_test.values

i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

n_items = items.shape[0]

X0 = items.values
X_train_counts = X0[:, -19:]


In [41]:
X_train_counts

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

In [26]:
items

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
cb = Contentbased(rate_train, X_train_counts, n_users= n_users, n_items = n_items, lamda=7)
cb.fit()

1.024520792639543

In [30]:
items[items['movie id'].isin(cb.recommend(1,5))]

5 bộ phim mà user có id 1 có thể thích nhất là:


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
159,160,Glengarry Glen Ross (1992),01-Jan-1992,,http://us.imdb.com/M/title-exact?Glengarry%20G...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
205,206,Akira (1988),01-Jan-1988,,http://us.imdb.com/M/title-exact?Akira%20(1988),0,0,1,1,0,...,0,0,0,0,0,0,1,1,0,0
718,719,Canadian Bacon (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Canadian%20Ba...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
847,848,"Murder, My Sweet (1944)",01-Jan-1944,,"http://us.imdb.com/M/title-exact?Murder,%20My%...",0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1481,1482,"Gate of Heavenly Peace, The (1995)",10-May-1996,,http://us.imdb.com/M/title-exact?Gate%20of%20H...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
cb.RMSE(Data_test=rate_test)

1.24520792639543