In [1]:
import math
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from sklearn.neighbors import NearestNeighbors

In [2]:
# GoodBooks-10K Data
df_books = pd.read_csv('books.csv', usecols=['book_id', 'title'])
df_ratings = pd.read_csv('ratings.csv', usecols=['user_id', 'book_id', 'rating'])

In [3]:
num_users = len(df_ratings.user_id.unique())
num_items = len(df_ratings.book_id.unique())
print('There are {} unique users and {} unique movies in this data set'.format(num_users, num_items))

There are 53424 unique users and 10000 unique movies in this data set


In [4]:
# get rating frequency
df_books_cnt = pd.DataFrame(df_ratings.groupby('book_id').size(), columns=['count'])
df_books_cnt.head(3000)

Unnamed: 0_level_0,count
book_id,Unnamed: 1_level_1
1,22806
2,21850
3,16931
4,19088
5,16604
...,...
2996,435
2997,562
2998,459
2999,483


In [5]:
df_ratings_cnt_tmp = pd.DataFrame(df_ratings.groupby('rating').size(), columns=['count'])
df_ratings_cnt_tmp

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
1,124195
2,359257
3,1370916
4,2139018
5,1983093


In [6]:
# there are a lot more counts in rating of zero
total_cnt = num_users * num_items
rating_zero_cnt = total_cnt - df_ratings.shape[0]
# append counts of zero rating to df_ratings_cnt
df_ratings_cnt = df_ratings_cnt_tmp.append(
    pd.DataFrame({'count': rating_zero_cnt}, index=[0.0]),
    verify_integrity=True,
).sort_index()
df_ratings_cnt

Unnamed: 0,count
0.0,528263521
1.0,124195
2.0,359257
3.0,1370916
4.0,2139018
5.0,1983093


In [7]:
# add log count
df_ratings_cnt['log_count'] = np.log(df_ratings_cnt['count'])
df_ratings_cnt

Unnamed: 0,count,log_count
0.0,528263521,20.085106
1.0,124195,11.729608
2.0,359257,12.791793
3.0,1370916,14.13099
4.0,2139018,14.575857
5.0,1983093,14.500168


In [8]:
# filter data
popularity_thres = 100
popular_books = list(set(df_books_cnt.query('count >= @popularity_thres').index))
df_ratings_drop_books = df_ratings[df_ratings.book_id.isin(popular_books)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping unpopular books: ', df_ratings_drop_books.shape)
print('Number of popular book', len(popular_books))

shape of original ratings data:  (5976479, 3)
shape of ratings data after dropping unpopular books:  (5935973, 3)
Number of popular book 9511


In [9]:
# get number of ratings given by every user
df_users_cnt = pd.DataFrame(df_ratings_drop_books.groupby('user_id').size(), columns=['count'])
df_users_cnt.head()

Unnamed: 0_level_0,count
user_id,Unnamed: 1_level_1
1,117
2,62
3,89
4,134
5,100


In [10]:
# filter data
ratings_thres = 120
active_users = list(set(df_users_cnt.query('count >= @ratings_thres').index))
df_ratings_drop_users = df_ratings_drop_books[df_ratings_drop_books.user_id.isin(active_users)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping both unpopular movies and inactive users: ', df_ratings_drop_users.shape)
print('number of active user: ', len(active_users))

shape of original ratings data:  (5976479, 3)
shape of ratings data after dropping both unpopular movies and inactive users:  (2600373, 3)
number of active user:  18753


In [11]:
# pivot and create book-user matrix
book_user_mat = df_ratings_drop_users.pivot(index='book_id', columns='user_id', values='rating')
book_user_mat_standardized = (book_user_mat - np.mean(book_user_mat, axis=0)).fillna(0)
book_user_mat_bool = ~np.isnan(book_user_mat)

In [47]:
book_user_mat

user_id,4,7,9,10,22,23,24,27,28,31,...,53390,53400,53401,53403,53409,53411,53413,53419,53422,53424
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,3.0,,,4.0,5.0,4.0,...,5.0,,5.0,,4.0,,,4.0,4.0,4.0
2,5.0,,4.0,,,,3.0,,,3.0,...,,,5.0,4.0,4.0,,,5.0,5.0,5.0
3,,,4.0,,3.0,,,,,,...,5.0,,4.0,,,,,3.0,,4.0
4,4.0,,,5.0,,4.0,4.0,,5.0,,...,5.0,,,,,,,3.0,,5.0
5,4.0,3.0,5.0,5.0,,,4.0,,3.0,,...,1.0,,,,,,,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,,,,,,,,,,...,,,,,,,,,,
9996,,,,,,,,,,,...,,,,,,,,,,
9998,,,,,,,,,,,...,,,,,,,,,,
9999,,,,,,,,,,,...,,,,,,,,,,


In [53]:
book_id_to_idx = {
    book_id: idx for idx, book_id in enumerate(book_user_mat.index)
}

In [58]:
user_id_to_idx = {
    user_id: idx for idx, user_id in enumerate(book_user_mat.columns)
}

In [12]:
# convert to sparse matrix
book_user_mat_sparse = csr_matrix(book_user_mat_standardized.values)
book_user_mat_bool_sparse = csr_matrix(book_user_mat_bool.values)
del book_user_mat_standardized
del book_user_mat_bool

In [13]:
# square the matrix for weight calculation
x_squared = book_user_mat_sparse.power(2)

In [14]:
# only take the value where rating is not nan for weight calculation
weight_left = x_squared.dot(book_user_mat_bool_sparse.T)
del x_squared

In [15]:
# calculate weight matrix
weight_left = weight_left.toarray()
weight = np.sqrt(weight_left*weight_left.T)
del weight_left

In [16]:
ADJUST_THRESHOLD = 15

# calculate the adjust matrix
confidence_matrix = np.dot(book_user_mat_bool_sparse, book_user_mat_bool_sparse.T.astype(int))
adjusted_matrix = confidence_matrix / ADJUST_THRESHOLD
adjusted_matrix[adjusted_matrix > 1] = 1
adjusted_matrix = adjusted_matrix.toarray()
del confidence_matrix
del book_user_mat_bool_sparse

In [17]:
# calculate product matrix
prod = book_user_mat_sparse.dot(book_user_mat_sparse.T)
prod = prod.toarray()

In [18]:
# calculate similarity matrix
similarity_matrix = (prod/weight) * adjusted_matrix
del prod
del weight
del adjusted_matrix

  similarity_matrix = (prod/weight) * adjusted_matrix


In [19]:
# replace invalid value with -1
similarity_matrix[np.isnan(similarity_matrix)] = -1

# replace similarities larger than 1 with 1
similarity_matrix[(similarity_matrix > 1) & (similarity_matrix < 1.01)] = 0

# distance
distance_matrix = 1 - similarity_matrix

In [20]:
distance_matrix

array([[0.        , 0.600407  , 1.05268761, ..., 1.22350967, 1.31825116,
        0.9600385 ],
       [0.600407  , 0.        , 1.26039175, ..., 0.70887133, 1.43762811,
        0.98310372],
       [1.05268761, 1.26039175, 0.        , ..., 1.13307415, 0.99697735,
        1.19517543],
       ...,
       [1.22350967, 0.70887133, 1.13307415, ..., 0.        , 2.        ,
        2.        ],
       [1.31825116, 1.43762811, 0.99697735, ..., 2.        , 0.        ,
        2.        ],
       [0.9600385 , 0.98310372, 1.19517543, ..., 2.        , 2.        ,
        0.        ]])

In [21]:
# create mapper from book title to index
# transform matrix to scipy sparse matrix
#book_user_mat_sparse = csr_matrix(book_user_mat.values)

In [22]:
NUM_NEIGHTBORS = 900

In [23]:
%env JOBLIB_TEMP_FOLDER=/tmp
# define model
model_knn = NearestNeighbors(metric='precomputed', algorithm='brute', n_neighbors=NUM_NEIGHTBORS, n_jobs=-1)
# fit
model_knn.fit(distance_matrix)

env: JOBLIB_TEMP_FOLDER=/tmp


NearestNeighbors(algorithm='brute', metric='precomputed', n_jobs=-1,
                 n_neighbors=900)

In [24]:
distances, indices = model_knn.kneighbors(distance_matrix, n_neighbors=NUM_NEIGHTBORS)

In [25]:
del distance_matrix
del similarity_matrix

In [26]:
distances

array([[0.        , 0.10514008, 0.15361455, ..., 0.74866674, 0.74871946,
        0.74877593],
       [0.        , 0.12084236, 0.15180762, ..., 0.75233135, 0.75233361,
        0.75244985],
       [0.        , 0.1115017 , 0.15552748, ..., 0.72831454, 0.72833685,
        0.72874472],
       ...,
       [0.        , 0.09348925, 0.14828318, ..., 0.93333333, 0.93333333,
        0.93333333],
       [0.        , 0.46039548, 0.51517195, ..., 0.93333333, 0.93333333,
        0.93333333],
       [0.        , 0.45646257, 0.45839179, ..., 0.93333333, 0.93333333,
        0.93333333]])

In [27]:
indices

array([[   0, 5880, 1354, ..., 1209, 5710, 8618],
       [   1, 3270, 3730, ..., 1952, 1720, 4223],
       [   2,  991, 1607, ...,  200, 7805, 9032],
       ...,
       [9485, 8458, 7402, ..., 6093, 5233, 8098],
       [9486,   54,  100, ...,  964,  971, 5747],
       [9487,   69,  407, ..., 1952, 1903, 7001]], dtype=int64)

# Offline phase

In [None]:
similarities = 1 - distances

In [None]:
k = 10

In [60]:
def predict_rating_for_user(user_id):
    # get index of user from user id
    user_idx = user_id_to_idx[user_id]
    
    # get rating array of this user
    user_ratings = np.array(book_user_mat.iloc[:, user_idx])
    
    # calculate has rating array
    user_has_ratings_1d = ~np.isnan(user_ratings)
    
    user_ratings_matrix = user_ratings[indices]
    
    user_has_ratings = ~np.isnan(user_ratings_matrix)
    del user_ratings_matrix
    
    # filter all simiarity value that has no ratings
    similarities_filtered = similarities * user_has_ratings
    del user_has_ratings
    
    # get the indexes of top k largest similarity value for each row
    top_k = (-similarities_filtered).argpartition(k, axis=1)[:, :k]
    
    rows = np.arange(0, similarities.shape[0])[:,None]
    
    # apply the index to both the simialrities and the indices
    top_k_similarity = similarities_filtered[rows, top_k]
    top_k_similarity[top_k_similarity < 0] = 0
    
    indices_filtered = indices[rows, top_k]
    
    # predict the rating
    predicted_ratings = np.nansum((user_ratings[indices_filtered] * top_k_similarity), axis = 1) / np.sum(top_k_similarity, axis = 1)
    
    return predicted_ratings

In [62]:
predicted = predict_rating_for_user(4)

  predicted_ratings = np.nansum((user_ratings[indices_filtered] * top_k_similarity), axis = 1) / np.sum(top_k_similarity, axis = 1)


In [65]:
user_ratings = np.array(book_user_mat.iloc[:, 4])
for idx, i in enumerate(predicted):
    if user_ratings[idx] != 0:
        print("user rating: ", user_ratings[idx], " predicted: ", i)
    else:
        print(i)

4.861953297284746
4.9524991621176335
2.9720278822963224
4.585141128100335
3.887844820079221
4.999999999999999
3.647072153746423
3.723140517692385
2.957075282897866
4.815216792905265
4.386060241269762
nan
4.114150541027166
3.6427945395473493
4.669957836766041
4.0
5.0
4.892826543546362
3.4455309948379624
nan
4.953115368001825
1.4397818413256198
4.912830849831398
4.950424699912299
4.941739999954407
3.1200646912337846
4.953894095087951
3.5939380966422
4.0
nan
4.806165960613045
3.9335800318583196
3.189664217974772
2.6548433203824144
4.587234300118806
4.888864282563524
4.176597648132348
nan
4.414337933149335
3.685069382753994
4.534407963300742
4.819624204012116
4.661001096003769
3.125084635391226
3.6154174451833736
4.0
4.906965991619074
4.0843528394413156
3.174774631757596
4.586110664064612
2.4541074632628646
3.2232062415024982
2.580166473465156
4.104566223552446
3.814594307333138
3.279836038168285
3.0000000000000004
4.1092096747526154
4.487409587436078
nan
3.0
4.0
nan
3.497718593633897
4.05