In [1]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import Ridge

### Thông tin tập dữ liệu: 
- u.data: Chứa toàn bộ các ratings của 943 users cho 1682 movies. Mỗi user rate ít nhất 20 movies. Thông tin về thời gian rate cũng được cho nhưng chúng ta không sử dụng trong bài viết này.

- ua.base, ua.test, ub.base, ub.test: là hai cách chia toàn bộ dữ liệu ra thành hai tập con, một cho training, một cho test.

- u.user: Chứa thông tin về users, bao gồm: id, tuổi, giới tính, nghề nghiệp, zipcode (vùng miền), vì những thông tin này cũng có thể ảnh hưởng tới sở thích của các users. Tuy nhiên, trong bài viết này, chúng ta sẽ không sử dụng các thông tin này, trừ thông tin về id để xác định các user khác nhau.

- u.genre: Chứa tên của 19 thể loại phim. Các thể loại bao gồm: unknown, Action, Adventure, Animation, Children's, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western

- u.item: thông tin về mỗi bộ phim. 

In [2]:
u_cols =  ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols, encoding='latin-1')
users

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


#### Dữ liệu tập ua.base và ua.test

In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base_ua = pd.read_csv('ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test_ua = pd.read_csv('ua.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base_ua.to_numpy()
rate_test = ratings_test_ua.to_numpy()

print (rate_train)
print (rate_test)


[[        1         1         5 874965758]
 [        1         2         3 876893171]
 [        1         3         4 878542960]
 ...
 [      943      1188         3 888640250]
 [      943      1228         3 888640275]
 [      943      1330         3 888692465]]
[[        1        20         4 887431883]
 [        1        33         4 878542699]
 [        1        61         4 878542420]
 ...
 [      943       570         1 888640125]
 [      943       808         4 888639868]
 [      943      1067         2 875501756]]


#### Dữ liệu tập ub.base và ub.test

In [4]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base_ub = pd.read_csv('ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test_ub = pd.read_csv('ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train_b = ratings_base_ub.to_numpy()
rate_test_b = ratings_test_ub.to_numpy()

print (rate_train_b)
print (rate_test_b)

[[        1         1         5 874965758]
 [        1         2         3 876893171]
 [        1         3         4 878542960]
 ...
 [      943      1188         3 888640250]
 [      943      1228         3 888640275]
 [      943      1330         3 888692465]]
[[        1        17         3 875073198]
 [        1        47         4 875072125]
 [        1        64         5 875072404]
 ...
 [      943       595         2 875502597]
 [      943       685         4 875502042]
 [      943      1011         2 875502560]]


In [5]:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv('u.item', sep='|', names= i_cols, encoding='latin-1')

items 

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- Chỉ cần lấy 19 giá trị nhị phân

In [6]:
X0 = items.to_numpy()
X_train_counts = X0[:, -19:]
X_train_counts.shape

(1682, 19)

In [7]:

transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
tfidf = transformer.fit_transform(X_train_counts.tolist()).toarray()

In [8]:
tfidf= tfidf.astype('int')

In [9]:
def get_items_rated_by_user(rate_matrix, user_id):
    """
    in each line of rate_matrix, we have infor: user_id, item_id, rating (scores), time_stamp
    we care about the first three values
    return (item_ids, scores) rated by user user_id
    """
    y = rate_matrix[:,0] # all users
    # item indices rated by user_id
    # we need to +1 to user_id since in the rate_matrix, id starts from 1 
    # while index in python starts from 0
    ids = np.where(y == user_id +1)[0]
    item_ids = rate_matrix[ids, 1] - 1 # index starts from 0 
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)

In [10]:



d = tfidf.shape[1]# data dimension
W = np.zeros((d, users.shape[0]))
b = np.zeros((1, users.shape[0]))

for n in range(users.shape[0]):    
    ids, scores = get_items_rated_by_user(rate_train, n)
    clf = Ridge(alpha=0.01, fit_intercept  = True)
    Xhat = tfidf[ids, :]
    
    clf.fit(Xhat, scores) 
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_

In [11]:
Yhat = tfidf.dot(W) + b

In [12]:
n = 10
np.set_printoptions(precision=2) # 2 digits after . 
ids, scores = get_items_rated_by_user(rate_test, n)
Yhat[n, ids]
print('Rated movies ids :', ids )
print('True ratings     :', scores)
print('Predicted ratings:', Yhat[ids, n])

Rated movies ids : [ 37 109 110 226 424 557 722 724 731 739]
True ratings     : [3 3 4 3 4 3 5 3 3 4]
Predicted ratings: [3.39 3.39 3.39 3.39 3.4  3.39 3.39 3.4  3.39 3.39]


In [13]:
def evaluate_MSE(Yhat, rates, W, b):
    se = 0
    cnt = 0
    for n in range (users.shape[0]):
        ids, scores_truth = get_items_rated_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred 
        se += (e*e).sum(axis = 0)
        cnt += e.size 
    return np.sqrt(se/cnt)

print ('MSE for training:', evaluate_MSE(Yhat, rate_train, W, b))
print ('MSE for test    :', evaluate_MSE(Yhat, rate_test, W, b))

MSE for training: 0.9884902920946139
MSE for test    : 1.0601818875913767


#### Làm tương tự cho ub.base

In [14]:
for n in range(users.shape[0]):    
    ids, scores = get_items_rated_by_user(rate_train_b, n)
    clf = Ridge(alpha=0.01, fit_intercept  = True)
    Xhat = tfidf[ids, :]
    
    clf.fit(Xhat, scores) 
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_

In [15]:
Yhat = tfidf.dot(W) + b

In [16]:
n = 20
np.set_printoptions(precision=2) # 2 digits after . 
ids, scores = get_items_rated_by_user(rate_test_b, n)
Yhat[n, ids]
print('Rated movies ids :', ids )
print('True ratings     :', scores)
print('Predicted ratings:', Yhat[ids, n])

Rated movies ids : [ 97 183 297 298 456 589 595 741 838 927]
True ratings     : [5 4 5 1 1 1 3 3 1 3]
Predicted ratings: [2.68 2.68 2.68 2.68 2.68 2.68 2.68 2.68 2.68 2.68]


In [17]:
print ('MSE for training:', evaluate_MSE(Yhat, rate_train_b, W, b))
print ('MSE for test    :', evaluate_MSE(Yhat, rate_test_b, W, b))

MSE for training: 0.986172193904518
MSE for test    : 1.085433935121025


### Dường như ua.base và ub.base có độ lỗi gần bằng nhau 