# User-Based Collaborative Filtering for Book Recommendation

In [232]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
df_books_final = pd.read_csv('df_books_final.csv')
df_train_interactions = pd.read_csv('train_interactions.csv')
df_test_interactions = pd.read_csv('test_interactions.csv')

In [234]:
df_books_final.head(3)

Unnamed: 0,book_id,title,description,average_rating,ratings_count,text_reviews_count,top_popular_shelves,author_ids,format_Audio,format_Digital,...,lang_tha,lang_tr,lang_tur,lang_ukr,lang_vi,lang_vie,lang_zh,length_long,length_medium,length_short
0,1882090,"Behave Yourself, Bethany Brant",A preacher's daughter with lots of curiosity a...,-1.02863,-0.059069,-0.136278,"[{'count': '1', 'name': 'mrs-withdrawn'}, {'co...",['151369'],False,False,...,False,False,False,False,False,False,False,False,True,False
1,166120,Ghosthunters And The Incredibly Revolting Ghost,A $2.99 value-priced edition of one of our bes...,-0.690046,-0.059069,-0.126186,"[{'count': '9', 'name': 'owned'}, {'count': '9...",['15873'],False,False,...,False,False,False,False,False,False,False,False,True,False
2,8608741,Attack of the Chicken Nugget Man: A National T...,Third-grader Chris Robb just can't seem to do ...,-0.597705,-0.059069,-0.102639,"[{'count': '2', 'name': 'books-i-wrote'}, {'co...",['2868520'],False,False,...,False,False,False,False,False,False,False,False,True,False


In [235]:
df_train_interactions.head(5)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment,Unnamed: 8,Unnamed: 9
0,0,2497,0,359079.0,4,2014-07-16 19:28:57+00:00,,2802.0,0.406046
1,0,5556,4,0.0,2809,-0.224400,,,
2,0,1330,4,0.0,2888,0.808300,,,
3,0,177,4,0.0,2907,0.874500,,,
4,1,92062,4,,2549,0.527973,,,


In [34]:
user_item_interactions = df_train_interactions[['user_id', 'book_id', 'rating']]
user_item_interactions

Unnamed: 0,user_id,book_id,rating
0,0,2497,0
1,0,5556,4
2,0,1330,4
3,0,177,4
4,1,92062,4
...,...,...,...
885096,34192,72240,5
885097,34192,34391,4
885098,34192,782,3
885099,34192,84,4


In [236]:
# number of neighbors of user
N_NEIGHBORS = 10

# number of recommendations
N_RECOMMENDATIONS = 5

In [237]:
def read_ratings(df):
    """
    Read the raw data of the book ratings.

    Returns a list of tuples:
    (user id, book_id, rating)
    """

    data = []
    for _, row in df.iterrows():
        user_id = row['user_id']
        book_id = row['book_id']
        rating = row['rating']
        data.append((user_id, book_id, rating))

    return data

In [238]:
ratings = read_ratings(df_train_interactions)
ratings = pd.DataFrame(data=ratings, columns=['user', 'book', 'rating'])
ratings.head()

Unnamed: 0,user,book,rating
0,0,2218,5
1,0,3922,4
2,0,762,4
3,0,18839,5
4,0,2395,4


In [239]:
ratings.head(3)

Unnamed: 0,user,book,rating
0,0.0,2497.0,0.0
1,0.0,5556.0,4.0
2,0.0,1330.0,4.0


In [240]:
ratings[ratings < 0].notna().sum()

user      0
book      0
rating    0
dtype: int64

In [89]:
# ratings_raw = ratings.copy()
ratings = ratings.pivot(index='user', columns='book', values='rating')

In [241]:
ratings = df_train_interactions.pivot(index='user_id', columns='book_id', values='rating')

In [242]:
ratings.shape

(885101, 3)

In [243]:
ratings.head(10)

book,5,50,93,236,244,302,314,330,881,903,...,35297101,35429280,35440603,35479935,35546694,35616438,35757340,35757419,36131198,36311957
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,,,,,,,,,,,...,,,,,,,,,,
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,5.0,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,3.0,,,,,,,,,,...,,,,,,,,,,
7.0,,,,,,,,,,,...,,,,,,,,,,
8.0,,,,,,,,,,,...,,,,,,,,,,
9.0,,,,,,,,,,,...,,,,,,,,,,
10.0,4.0,,,,,,,,,,...,,,,,,,,,,
12.0,,,,,,,,,,,...,,,,,,,,,,


#### Creating Affinity Matrix Based on Ratings

In [22]:
from scipy.sparse import csr_matrix

In [27]:
user_book_matrix = ratings.fillna(0)

In [28]:
user_book_matrix.head(10)

book,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,57048.0,57049.0,57050.0,57051.0,57052.0,57053.0,57054.0,57055.0,57056.0,57057.0
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
affinity_matrix = csr_matrix(user_book_matrix.values)

### Similarity between users

In [244]:
def pearson_similarity(v1, v2):
    """
    Compute the Pearson correlation between to ratings vectors.

    pd.corr() function can handle missing data.

    parameters:
    - v1, v2: pd.Series, ratings vectors

    returns:
    - float

    """

    pearson = v1.corr(v2)
    # pearson = user_book_matrix.T.corr(method='pearson')

    return pearson

In [245]:
def compute_similarities(user_id, ratings_matrix):
    """
    Compute the similarity of a given user with all the other users in the dataset.

    Remove the similarity value for the given user from the result.

    returns:
        - a pd.Series with the user id's as index, and similarity as series values
    """

    # get ratings of user to re-use in the similarity computation
    ratings_user = ratings_matrix.loc[user_id,:]

    # calculate the similarity between the given user and the other users
    similarities = ratings_matrix.apply(
        lambda row: pearson_similarity(ratings_user, row),
        axis=1)

    similarities = similarities.to_frame(name='similarity')

    # find most similar users to the given user
    similarities = similarities.sort_values(by='similarity', ascending=False)

    # drop the similarity of the user (should be ~1 anyways)
    similarities = similarities.drop(user_id)

    return similarities

In [270]:
def compute_similarities(ratings_matrix):
    """
    Compute the similarity matrix between all users in the dataset using a CSR matrix.
    
    Args:
        ratings_matrix: The ratings matrix in CSR format (sparse matrix).
    
    Returns:
        A pandas DataFrame with users as both row and column indices, and similarity as values.
    """
    # Get the number of users
    num_users = ratings_matrix.shape[0]
    
    # Initialize an empty similarity matrix (size: num_users x num_users)
    similarity_matrix = np.zeros((num_users, num_users))
    
    # Iterate through each pair of users
    for user_id in range(num_users):
        # Extract the ratings of the given user (row in the CSR matrix)
        user_ratings = ratings_matrix[user_id].toarray().flatten()

        for other_user_id in range(user_id + 1, num_users):  # To avoid redundant calculations
            # Extract the ratings of the other user (row in the CSR matrix)
            other_user_ratings = ratings_matrix[other_user_id].toarray().flatten()

            # Compute the Pearson similarity between the two users
            similarity = pearson_similarity(user_ratings, other_user_ratings)

            # Assign the computed similarity to both [user_id, other_user_id] and [other_user_id, user_id]
            similarity_matrix[user_id, other_user_id] = similarity
            similarity_matrix[other_user_id, user_id] = similarity

    # Convert the similarity matrix to a pandas DataFrame
    similarity_df = pd.DataFrame(similarity_matrix, index=range(num_users), columns=range(num_users))
    
    return similarity_df

def pearson_similarity(user_ratings, other_user_ratings):
    """
    Compute Pearson similarity between two rating vectors.
    
    Args:
        user_ratings: A 1D numpy array of ratings for the given user.
        other_user_ratings: A 1D numpy array of ratings for the other user.
        
    Returns:
        A Pearson similarity value between -1 and 1.
    """
    # Only consider the ratings that are not zero (i.e., non-missing ratings)
    mask = (user_ratings > 0) & (other_user_ratings > 0)
    
    # Extract the ratings that are common (both users rated the same item)
    common_ratings_user = user_ratings[mask]
    common_ratings_other_user = other_user_ratings[mask]
    
    if len(common_ratings_user) > 1:
        # Calculate Pearson correlation
        return np.corrcoef(common_ratings_user, common_ratings_other_user)[0, 1]
    else:
        # If not enough common ratings, return NaN or 0
        return np.nan


In [63]:
user_item_interactions

Unnamed: 0,user_id,book_id,rating,user_idx,item_idx
0,0,2497,0,0,2358
1,0,5556,4,0,5222
2,0,1330,4,0,1270
3,0,177,4,0,169
4,1,92062,4,1,82304
...,...,...,...,...,...
885096,34192,72240,5,34192,64111
885097,34192,34391,4,34192,31343
885098,34192,782,3,34192,753
885099,34192,84,4,34192,79


In [247]:
vec = affinity_matrix[0].toarray().flatten()

In [285]:
book_ratings = ratings[39343].dropna()
user_ids = book_ratings.index
user_vectors = ratings.loc[user_ids]

In [240]:
user_ids

Index([1.0, 9747.0], dtype='float64', name='user')

In [244]:
user_vectors

book,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,57048.0,57049.0,57050.0,57051.0,57052.0,57053.0,57054.0,57055.0,57056.0,57057.0
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
9747.0,,,,,,,,,,,...,,,,,,,,,,


In [260]:
user1 = user_vectors.iloc[0]
user9747 = user_vectors.iloc[1]

In [267]:
user1.corr(user9747)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


nan

In [268]:
# Select the two user vectors
user1_vector = user_vectors.iloc[0]
user9747_vector = user_vectors.iloc[1]

# Drop NaN values from both user vectors to get only the common ratings
common_ratings = user1_vector.dropna().index.intersection(user9747_vector.dropna().index)

    # all the items a user has not rated, that can be recommended
    all_items = ratings.loc[user_id,:]
    unrated_items = all_items.loc[all_items.isnull()]

    # convert the index with item ids into Series values
    unrated_items = unrated_items.index.to_series(name='item_ids').reset_index(drop=True)
    print(unrated_items)
    print('User {} has {} unrated items.'.format(user_id, len(unrated_items)))

    # compute user similarities
    similarities = compute_similarities(user_id, ratings)

    # generate predictions for unseen items based on the user similarity data
    predictions = unrated_items.apply(lambda d: predict_rating(d, ratings, similarities, N=n_neighbors))
    predicitions_df = pd.DataFrame(predictions.tolist(), columns=['book_id', 'predictions', 'number_of_similar_users'])

    # sort items by highest predicted rating and number of similar users
    predicitions_df = predicitions_df.sort_values(by=['predictions', 'number_of_similar_users'], ascending=False)

    # recommend top N items
    recommends = predicitions_df.head(n_recomm)

    return recommends['book_id']

Not enough overlapping ratings to calculate Pearson correlation.


In [286]:
sim_scores = compute_similarities(2, affinity_matrix)
sim_values = sim_scores[sim_scores.notna()]

0               5
1              50
2              93
3             236
4             244
           ...   
14633    35616438
14634    35757340
14635    35757419
14636    36131198
14637    36311957
Name: item_ids, Length: 14638, dtype: int64
User 0 has 14638 unrated items.


  c /= stddev[:, None]
  c /= stddev[None, :]


In [48]:
sim_values.describe()

count    4904.000000
mean        0.128089
std         0.698966
min        -1.000000
25%        -0.500000
50%         0.238271
75%         0.790569
max         1.000000
dtype: float64

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

In [279]:
userid = 11
sim_scores = pd.DataFrame(user_similarity_matrix)[userid]
nonzero_idx = np.nonzero(pd.DataFrame(user_similarity_matrix)[userid])[0]
nonzero_idx = nonzero_idx[nonzero_idx != userid]
sim_scores[sim_scores.index.isin(nonzero_idx)].describe()
# sim_scores
# nonzero_idx

count    1372.000000
mean        0.071419
std         0.066693
min         0.005656
25%         0.037759
50%         0.054378
75%         0.081319
max         0.779266
Name: 11, dtype: float64

In [280]:
len(df_train_interactions[df_train_interactions['user_id'] == 2])

Unnamed: 0,user_id,book_id,rating
0,1,39343,2
1,1,21648,0
2,2,7794,4


In [58]:
# test similaritiy scores
items_to_recommend = get_collaborative_filtering_recommendations(user_similarity_matrix, 2, k, sim_threshold)
items_to_recommend[np.nonzero(items_to_recommend)[0]]

NameError: name 'user_similarity_matrix' is not defined

In [123]:
items_to_recommend = get_collaborative_filtering_recommendations(user_similarity_matrix, 2, k, sim_threshold)
print('Number of recommendations:', len(items_to_recommend))
items_to_recommend

Number of recommendations: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_interactions['user_id'] = encoder.fit_transform(user_item_interactions['user_id'])
  self._set_arrayXarray(i, j, x)


Unnamed: 0,book_id,average_rating,ratings_count,text_reviews_count,top_popular_shelves,author_ids,title_tfidf,description_word2vec,format_Audio,format_Digital,...,lang_tr,lang_tur,lang_ukr,lang_vi,lang_vie,lang_yid,lang_zh,length_long,length_medium,length_short


In [51]:
def read_titles(df):
    """
    Read the mapping of book id -> book title

    Returns a dictionary
    {book id -> book title}
    """
    data = df.set_index("book_id")["title"].to_dict()
    return data

In [285]:
# book_titles = read_titles(df_books_final)

In [84]:
def recommend(user_id, ratings, book_titles, n_neighbors=10, n_recomm=5):
    """

    Recommend N books for a given user based on ratings data.

    1. get the ratings of the user
    2. get the books that the user has not rated
    3. compute the similarities between the user and the other users
    4. generate book ratings predictions for the user based on the similarities with other users
    5. find the N books with the highest predicted ratings

    parameters:
    - user_id: int, user to generate recommendations for
    - ratings: pd.DataFrame, user-book ratings
    - book_titles: dict, mapping of (book id -> book title)
    - n_neighbors: int: the number of neighbors to use to generate rating predictions
    - n_recomm: int, number of books to recommend

    returns:
    - pd.DataFrame with [book_id, rating, book title]

    """

    # get the ratings of the user
    ratings_user = ratings.loc[user_id, :]

    # all the items a user has not rated, that can be recommended
    all_items = ratings.loc[user_id,:]
    unrated_items = all_items.loc[all_items.isnull()]

    # convert the index with item ids into Series values
    unrated_items = unrated_items.index.to_series(name='item_ids').reset_index(drop=True)
    print(unrated_items)
    print('User {} has {} unrated items.'.format(user_id, len(unrated_items)))

    # compute user similarities
    similarities = compute_similarities(user_id, ratings)
    # print(similarities.columns)
    sorted = similarities.sort_values(by="similarity", ascending=False)
    # print(sorted)
    # return similarities

    # generate predictions for unseen items based on the user similarity data
    predictions = unrated_items.apply(lambda d: predict_rating(d, ratings, similarities, N=n_neighbors))
    return(predictions)

    # sort items by highest predicted rating and number of similar users
    predicitions_df = predicitions_df.sort_values(by=['predictions', 'number_of_similar_users'], ascending=False)

    # recommend top N items
    recommends = predicitions_df.head(n_recomm)

    # reformat the result
    recommends = recommends.to_frame(name='predicted_rating')
    recommends = recommends.rename_axis('book_id')
    recommends = recommends.reset_index()

    # recommends['name'] = recommends.book_id.apply(lambda d: book_titles[d])

    # return recommends

In [85]:
recommends = recommend(1, affinity_matrix, book_titles={}, n_neighbors=N_NEIGHBORS, n_recomm=N_RECOMMENDATIONS)
recommends

AttributeError: 'csr_matrix' object has no attribute 'loc'

### Recommend for users in test set

In [21]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_scores =  tfidf.fit_transform(df_books_final['description'])
tfidf_scores.shape

(59828, 149225)

In [22]:
# cosine similarity matrix directly for the sparse TF-IDF matrix
tfidf_cos_sim = cosine_similarity(tfidf_scores, dense_output=False)

In [23]:
print(df_books_final.index[df_books_final['book_id'] == 46677])

Index([59785], dtype='int64')


In [24]:
book_index = df_books_final.index[df_books_final['book_id'] == 343002][0]
similarity_scores = tfidf_cos_sim[book_index].toarray().flatten()
index = df_books_final.index[df_books_final['book_id'] == 46677][0]
print(similarity_scores[index])
index = df_books_final.index[df_books_final['book_id'] == 867248][0]
print(similarity_scores[index])
index = df_books_final.index[df_books_final['book_id'] == 6669717][0]
print(similarity_scores[index])
index = df_books_final.index[df_books_final['book_id'] == 22013040][0]
print(similarity_scores[index])
index = df_books_final.index[df_books_final['book_id'] == 402114][0]
print(similarity_scores[index])
print("break")
book_index = df_books_final.index[df_books_final['book_id'] == 1852][0]
similarity_scores = tfidf_cos_sim[book_index].toarray().flatten()
index = df_books_final.index[df_books_final['book_id'] == 46677][0]
print(similarity_scores[index])
index = df_books_final.index[df_books_final['book_id'] == 867248][0]
print(similarity_scores[index])
index = df_books_final.index[df_books_final['book_id'] == 6669717][0]
print(similarity_scores[index])
index = df_books_final.index[df_books_final['book_id'] == 22013040][0]
print(similarity_scores[index])
index = df_books_final.index[df_books_final['book_id'] == 402114][0]
print(similarity_scores[index])


0.02697621518236352
0.0
0.03242136805362899
0.002300617889906126
0.011088978479916148
break
0.03906784168008337
0.015002949668787525
0.037975677305821874
0.007384900175627537
0.011830229507892809


In [55]:
from statistics import mean 
import warnings
warnings.filterwarnings('ignore')

unique_users = df_test_interactions.user_id.unique()
first1000_users = unique_users[:1000]
results = []
# 1000 users
# set thresold 
# plot 
for user in first1000_users:
    recommends = recommend(user, ratings, n_neighbors=N_NEIGHBORS, n_recomm=N_RECOMMENDATIONS)
    test_books = df_test_interactions[df_test_interactions['user_id'] == user]['book_id']
    similarity_scores_list = []
    max_similarity_scores_list = []

    for test_book in test_books:
        similarity_scores_list_temp=[]
        test_book_index = df_books_final.index[df_books_final['book_id'] == test_book][0]
        similarity_scores = tfidf_cos_sim[test_book_index].toarray().flatten()

        for recommended_book in recommends:
            index = df_books_final.index[df_books_final['book_id'] == recommended_book][0]
            similarity_scores_list_temp.append(similarity_scores[index])
        
        
        max_similarity_score = max(similarity_scores_list_temp) if similarity_scores_list_temp else None
        max_similarity_scores_list.append(max_similarity_score)
        similarity_scores_list.extend(similarity_scores_list_temp)

    results.append({
        "user_id": user,
        "books_id_read": test_books.tolist(),
        "recommendations": recommends.tolist(),
        "similarity_scores_list": similarity_scores_list,
        "mean_similarity_score": mean(similarity_scores_list),
        "max_similarity_score": max_similarity_scores_list
    })

output_df = pd.DataFrame(results)
output_df

User 0 has 14638 unrated items.
User 1 has 14635 unrated items.
User 2 has 14638 unrated items.
User 3 has 14633 unrated items.
User 4 has 14638 unrated items.
User 5 has 14633 unrated items.
User 6 has 14638 unrated items.
User 7 has 14641 unrated items.
User 8 has 14632 unrated items.
User 9 has 14635 unrated items.
User 10 has 14637 unrated items.
User 11 has 14641 unrated items.
User 12 has 14638 unrated items.
User 13 has 14640 unrated items.
User 14 has 14634 unrated items.
User 15 has 14636 unrated items.
User 16 has 14637 unrated items.
User 17 has 14639 unrated items.
User 18 has 14640 unrated items.
User 19 has 14637 unrated items.
User 20 has 14641 unrated items.
User 21 has 14632 unrated items.
User 22 has 14640 unrated items.
User 23 has 14642 unrated items.
User 24 has 14638 unrated items.
User 25 has 14641 unrated items.
User 26 has 14637 unrated items.
User 27 has 14640 unrated items.
User 28 has 14637 unrated items.
User 29 has 14642 unrated items.
User 30 has 14641 un

Unnamed: 0,user_id,books_id_read,recommendations,similarity_scores_list,mean_similarity_score,max_similarity_score
0,0,"[343002, 1852]","[46677, 867248, 6669717, 22013040, 402114]","[0.02697621518236352, 0.0, 0.03242136805362899...",0.018405,"[0.03242136805362899, 0.03906784168008337]"
1,1,"[1248128, 30119]","[14118, 46677, 867248, 6669717, 22013040]","[0.007197123322763255, 0.0051736136925291925, ...",0.058451,"[0.007197123322763255, 0.5127871656196451]"
2,2,"[74595, 2711313]","[14118, 867248, 6669717, 22013040, 402114]","[0.01044065185339305, 0.0, 0.02950585563320733...",0.009337,"[0.029505855633207338, 0.018733342147202403]"
3,3,"[30119, 157993]","[14118, 46677, 867248, 6669717, 22013040]","[0.012281415239966146, 0.02696491648561693, 0....",0.066646,"[0.5127871656196451, 0.0517584584988947]"
4,4,"[13023, 157993]","[14118, 46677, 867248, 6669717, 22013040]","[0.00340888102818777, 0.02332300161231656, 0.0...",0.015926,"[0.02332300161231656, 0.0517584584988947]"
...,...,...,...,...,...,...
995,995,"[24178, 78411]","[14118, 46677, 867248, 6669717, 22013040]","[0.0426236394204663, 0.02526529716731194, 0.00...",0.019403,"[0.0426236394204663, 0.028494710116081803]"
996,996,"[267972, 275000]","[14118, 46677, 867248, 6669717, 22013040]","[0.02398428888624259, 0.020906982781366244, 0....",0.010759,"[0.02561423259626288, 0.007984469981907565]"
997,997,"[25618438, 23754884]","[14118, 46677, 867248, 6669717, 22013040]","[0.0, 0.015102355213715836, 0.0, 0.02371667758...",0.006078,"[0.023716677584827726, 0.006018297839707198]"
998,998,"[140225, 5]","[14118, 46677, 867248, 6669717, 22013040]","[0.011927089756086478, 0.01132274443756136, 0....",0.012094,"[0.029593441500958117, 0.019991562840876765]"


In [59]:
output_df['num_exact_matches'] = output_df.apply(
    lambda row: sum(1 for book_id in row['recommendations'] if book_id in row['books_id_read']),
    axis=1
)
output_df

Unnamed: 0,user_id,books_id_read,recommendations,similarity_scores_list,mean_similarity_score,max_similarity_score,num_exact_matches
0,0,"[343002, 1852]","[46677, 867248, 6669717, 22013040, 402114]","[0.02697621518236352, 0.0, 0.03242136805362899...",0.018405,"[0.03242136805362899, 0.03906784168008337]",0
1,1,"[1248128, 30119]","[14118, 46677, 867248, 6669717, 22013040]","[0.007197123322763255, 0.0051736136925291925, ...",0.058451,"[0.007197123322763255, 0.5127871656196451]",0
2,2,"[74595, 2711313]","[14118, 867248, 6669717, 22013040, 402114]","[0.01044065185339305, 0.0, 0.02950585563320733...",0.009337,"[0.029505855633207338, 0.018733342147202403]",0
3,3,"[30119, 157993]","[14118, 46677, 867248, 6669717, 22013040]","[0.012281415239966146, 0.02696491648561693, 0....",0.066646,"[0.5127871656196451, 0.0517584584988947]",0
4,4,"[13023, 157993]","[14118, 46677, 867248, 6669717, 22013040]","[0.00340888102818777, 0.02332300161231656, 0.0...",0.015926,"[0.02332300161231656, 0.0517584584988947]",0
...,...,...,...,...,...,...,...
995,995,"[24178, 78411]","[14118, 46677, 867248, 6669717, 22013040]","[0.0426236394204663, 0.02526529716731194, 0.00...",0.019403,"[0.0426236394204663, 0.028494710116081803]",0
996,996,"[267972, 275000]","[14118, 46677, 867248, 6669717, 22013040]","[0.02398428888624259, 0.020906982781366244, 0....",0.010759,"[0.02561423259626288, 0.007984469981907565]",0
997,997,"[25618438, 23754884]","[14118, 46677, 867248, 6669717, 22013040]","[0.0, 0.015102355213715836, 0.0, 0.02371667758...",0.006078,"[0.023716677584827726, 0.006018297839707198]",0
998,998,"[140225, 5]","[14118, 46677, 867248, 6669717, 22013040]","[0.011927089756086478, 0.01132274443756136, 0....",0.012094,"[0.029593441500958117, 0.019991562840876765]",0


### Evaluation

In [57]:
output_df['mean_similarity_score'].describe()

count    1000.000000
mean        0.018003
std         0.018727
min         0.000000
25%         0.008647
50%         0.012214
75%         0.016738
max         0.112852
Name: mean_similarity_score, dtype: float64

In [60]:
total_books = 2000
total_hits = output_df['num_exact_matches'].sum()
print(total_hits)
accuracy_of_exact_hits = total_hits/total_books * 100 # checking for accuracy of recommendation. hits/total_books
print(f"The accuracy of the recommendation system in terms of exact match is {accuracy_of_exact_hits:.2f}%")

0
The accuracy of the recommendation system in terms of exact match is 0.00%
