In [12]:
import pandas as pd
from google.colab import files
import numpy as np


In [5]:
uploaded = files.upload()

In [8]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('/content/drive/MyDrive/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [9]:
n_users = users.shape[0]
n_users

943

In [11]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('/content/drive/MyDrive/ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('/content/drive/MyDrive/ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.to_numpy()
rate_test = ratings_test.to_numpy()

rate_train, rate_test

(array([[        1,         1,         5, 874965758],
        [        1,         2,         3, 876893171],
        [        1,         3,         4, 878542960],
        ...,
        [      943,      1188,         3, 888640250],
        [      943,      1228,         3, 888640275],
        [      943,      1330,         3, 888692465]]),
 array([[        1,        20,         4, 887431883],
        [        1,        33,         4, 878542699],
        [        1,        61,         4, 878542420],
        ...,
        [      943,       570,         1, 888640125],
        [      943,       808,         4, 888639868],
        [      943,      1067,         2, 875501756]]))

In [14]:
## Builing items profile
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('/content/drive/MyDrive/ml-100k/u.item', sep='|', names=i_cols,
 encoding='latin-1')
n_items = items.shape[0]
n_items

1682

In [16]:
X0 = items.to_numpy()
X_train_count = X0[:,-19:]

X0, X_train_count

(array([[1, 'Toy Story (1995)', '01-Jan-1995', ..., 0, 0, 0],
        [2, 'GoldenEye (1995)', '01-Jan-1995', ..., 1, 0, 0],
        [3, 'Four Rooms (1995)', '01-Jan-1995', ..., 1, 0, 0],
        ...,
        [1680, 'Sliding Doors (1998)', '01-Jan-1998', ..., 0, 0, 0],
        [1681, 'You So Crazy (1994)', '01-Jan-1994', ..., 0, 0, 0],
        [1682, 'Scream of Stone (Schrei aus Stein) (1991)', '08-Mar-1996',
         ..., 0, 0, 0]], dtype=object),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 1, 1, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=object))

In [19]:
## Import Library TFIDF
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf = True, norm = 'l2')
tfidf = transformer.fit_transform(X_train_count.tolist()).toarray()

tfidf

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.53676706, 0.65097024, ..., 0.53676706, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [20]:
def get_items_rated_by_user(rate_matrix, user_id):
    """
    in each line of rate_matrix, we have infor: user_id, item_id, rating (scores), time_stamp
    we care about the first three values
    return (item_ids, scores) rated by user user_id
    """
    y = rate_matrix[:,0] # all users
    # item indices rated by user_id
    # we need to +1 to user_id since in the rate_matrix, id starts from 1
    # while index in python starts from 0
    ids = np.where(y == user_id +1)[0]
    item_ids = rate_matrix[ids, 1] - 1 # index starts from 0
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)

In [21]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

d = tfidf.shape[1] # data dimension
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

for n in range(n_users):
    ids, scores = get_items_rated_by_user(rate_train, n)
    clf = Ridge(alpha=0.01, fit_intercept  = True)
    Xhat = tfidf[ids, :]

    clf.fit(Xhat, scores)
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_

In [22]:
# predicted scores
Yhat = tfidf.dot(W) + b

In [23]:
n = 10
np.set_printoptions(precision=2) # 2 digits after .
ids, scores = get_items_rated_by_user(rate_test, n)
Yhat[n, ids]
print('Rated movies ids :', ids )
print('True ratings     :', scores)
print('Predicted ratings:', Yhat[ids, n])

Rated movies ids : [ 37 109 110 226 424 557 722 724 731 739]
True ratings     : [3 3 4 3 4 3 5 3 3 4]
Predicted ratings: [3.18 3.13 3.42 3.09 3.35 5.2  4.01 3.35 3.42 3.72]


In [27]:
def evaluate(Yhat, rates, W, b):
    se = 0
    cnt = 0
    for n in range(n_users):
        ids, scores_truth = get_items_rated_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred
        se += (e*e).sum(axis = 0)
        cnt += e.size
    return np.sqrt(se/cnt)

print('RMSE for training:', evaluate(Yhat, rate_train, W, b))
print('RMSE for test    :', evaluate(Yhat, rate_test, W, b))

RMSE for training: 0.908980456282672
RMSE for test    : 1.2703282700393035
