In [1]:
import math
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from sklearn.neighbors import NearestNeighbors

In [2]:
# GoodBooks-10K Data
df_books = pd.read_csv('books.csv', usecols=['book_id', 'title'])
df_ratings = pd.read_csv('ratings.csv', usecols=['user_id', 'book_id', 'rating'])

In [3]:
num_users = len(df_ratings.user_id.unique())
num_items = len(df_ratings.book_id.unique())
print('There are {} unique users and {} unique movies in this data set'.format(num_users, num_items))

There are 53424 unique users and 10000 unique movies in this data set


In [4]:
# get rating frequency
df_books_cnt = pd.DataFrame(df_ratings.groupby('book_id').size(), columns=['count'])
df_books_cnt.head(3000)

Unnamed: 0_level_0,count
book_id,Unnamed: 1_level_1
1,22806
2,21850
3,16931
4,19088
5,16604
...,...
2996,435
2997,562
2998,459
2999,483


In [5]:
df_ratings_cnt_tmp = pd.DataFrame(df_ratings.groupby('rating').size(), columns=['count'])
df_ratings_cnt_tmp

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
1,124195
2,359257
3,1370916
4,2139018
5,1983093


In [6]:
# there are a lot more counts in rating of zero
total_cnt = num_users * num_items
rating_zero_cnt = total_cnt - df_ratings.shape[0]
# append counts of zero rating to df_ratings_cnt
df_ratings_cnt = df_ratings_cnt_tmp.append(
    pd.DataFrame({'count': rating_zero_cnt}, index=[0.0]),
    verify_integrity=True,
).sort_index()
df_ratings_cnt

Unnamed: 0,count
0.0,528263521
1.0,124195
2.0,359257
3.0,1370916
4.0,2139018
5.0,1983093


In [7]:
# add log count
df_ratings_cnt['log_count'] = np.log(df_ratings_cnt['count'])
df_ratings_cnt

Unnamed: 0,count,log_count
0.0,528263521,20.085106
1.0,124195,11.729608
2.0,359257,12.791793
3.0,1370916,14.13099
4.0,2139018,14.575857
5.0,1983093,14.500168


In [8]:
# filter data
popularity_thres = 150
popular_books = list(set(df_books_cnt.query('count >= @popularity_thres').index))
df_ratings_drop_books = df_ratings[df_ratings.book_id.isin(popular_books)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping unpopular books: ', df_ratings_drop_books.shape)
print('Number of popular book', len(popular_books))

shape of original ratings data:  (5976479, 3)
shape of ratings data after dropping unpopular books:  (5707246, 3)
Number of popular book 7698


In [9]:
# get number of ratings given by every user
df_users_cnt = pd.DataFrame(df_ratings_drop_books.groupby('user_id').size(), columns=['count'])
df_users_cnt.head()

Unnamed: 0_level_0,count
user_id,Unnamed: 1_level_1
1,117
2,55
3,84
4,133
5,95


In [10]:
# filter data
ratings_thres = 110
active_users = list(set(df_users_cnt.query('count >= @ratings_thres').index))
df_ratings_drop_users = df_ratings_drop_books[df_ratings_drop_books.user_id.isin(active_users)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping both unpopular movies and inactive users: ', df_ratings_drop_users.shape)
print('number of active user: ', len(active_users))

shape of original ratings data:  (5976479, 3)
shape of ratings data after dropping both unpopular movies and inactive users:  (3117897, 3)
number of active user:  23892


In [11]:
# pivot and create book-user matrix
book_user_mat = df_ratings_drop_users.pivot(index='book_id', columns='user_id', values='rating')
book_user_mat_standardized = (book_user_mat - np.mean(book_user_mat, axis=0)).fillna(0)
book_user_mat_bool = ~np.isnan(book_user_mat)
del book_user_mat

In [12]:
# convert to sparse matrix
book_user_mat_sparse = csr_matrix(book_user_mat_standardized.values)
book_user_mat_bool_sparse = csr_matrix(book_user_mat_bool.values)
del book_user_mat_standardized
del book_user_mat_bool

In [13]:
# square the matrix for weight calculation
x_squared = book_user_mat_sparse.power(2)

In [14]:
# only take the value where rating is not nan for weight calculation
weight_left = x_squared.dot(book_user_mat_bool_sparse.T)
del x_squared

In [15]:
# calculate weight matrix
weight_left = weight_left.toarray()
weight = np.sqrt(weight_left*weight_left.T)
del weight_left

In [16]:
ADJUST_THRESHOLD = 15

# calculate the adjust matrix
confidence_matrix = np.dot(book_user_mat_bool_sparse, book_user_mat_bool_sparse.T.astype(int))
adjusted_matrix = confidence_matrix / ADJUST_THRESHOLD
adjusted_matrix[adjusted_matrix > 1] = 1
adjusted_matrix = adjusted_matrix.toarray()
del confidence_matrix
del book_user_mat_bool_sparse

In [17]:
# calculate product matrix
prod = book_user_mat_sparse.dot(book_user_mat_sparse.T)
prod = prod.toarray()

In [18]:
# calculate similarity matrix
similarity_matrix = (prod/weight) * adjusted_matrix
del prod
del weight
del adjusted_matrix

  similarity_matrix = (prod/weight) * adjusted_matrix


In [19]:
# replace invalid value with -1
similarity_matrix[np.isnan(similarity_matrix)] = -1

# replace similarities larger than 1 with 1
similarity_matrix[(similarity_matrix > 1) & (similarity_matrix < 1.01)] = 0

# distance
distance_matrix = 1 - similarity_matrix

In [20]:
distance_matrix

array([[0.        , 0.59723035, 1.06924055, ..., 1.25853958, 1.2517489 ,
        1.06381259],
       [0.59723035, 0.        , 1.27246855, ..., 1.26544528, 0.77519131,
        1.01558694],
       [1.06924055, 1.27246855, 0.        , ..., 1.22700853, 0.6278526 ,
        0.65446889],
       ...,
       [1.25853958, 1.26544528, 1.22700853, ..., 0.        , 2.        ,
        2.        ],
       [1.2517489 , 0.77519131, 0.6278526 , ..., 2.        , 0.        ,
        1.06666667],
       [1.06381259, 1.01558694, 0.65446889, ..., 2.        , 1.06666667,
        0.        ]])

In [21]:
# create mapper from book title to index
# book_to_idx = {
#     book: i for i, book in 
#     enumerate(list(df_books.set_index('book_id').loc[book_user_mat_standardized.index].title))
# }
# transform matrix to scipy sparse matrix
#book_user_mat_sparse = csr_matrix(book_user_mat.values)

In [22]:
%env JOBLIB_TEMP_FOLDER=/tmp
# define model
model_knn = NearestNeighbors(metric='precomputed', algorithm='brute', n_neighbors=distance_matrix.shape[0], n_jobs=-1)
# fit
model_knn.fit(distance_matrix)

env: JOBLIB_TEMP_FOLDER=/tmp


NearestNeighbors(algorithm='brute', metric='precomputed', n_jobs=-1,
                 n_neighbors=7689)

In [23]:
distances, indices = model_knn.kneighbors(distance_matrix, n_neighbors=distance_matrix.shape[0])

In [24]:
distances

array([[0.        , 0.25505275, 0.29916935, ..., 2.        , 2.        ,
        2.        ],
       [0.        , 0.10321537, 0.1576765 , ..., 2.        , 2.        ,
        2.        ],
       [0.        , 0.11650387, 0.14389397, ..., 2.        , 2.        ,
        2.        ],
       ...,
       [0.        , 0.26666667, 0.29675243, ..., 2.        , 2.        ,
        2.        ],
       [0.        , 0.20112675, 0.33541924, ..., 2.        , 2.        ,
        2.        ],
       [0.        , 0.35231291, 0.3688868 , ..., 2.        , 2.        ,
        2.        ]])

In [25]:
indices

array([[   0,  506, 1353, ..., 3240, 3214, 4951],
       [   1, 3262, 3732, ..., 3214, 3240, 6588],
       [   2,  991, 2016, ..., 3240, 3214, 6588],
       ...,
       [7686, 3188,  914, ..., 3946, 3954, 7688],
       [7687,  184,  283, ..., 4655, 4661, 7686],
       [7688, 1563,  729, ..., 4422, 4429, 7686]], dtype=int64)

In [28]:
count2 = 0
for i in range(distance_matrix.shape[0]):
    count = (distances[i] < 0.5).sum()
    if (count < 6):
        count2 = count2 + 1
        
print(count2)

1026
