# User-Based Collaborative Filtering for Book Recommendation

In [16]:
import numpy as np
import pandas as pd

In [34]:
df_books_final = pd.read_csv('df_books_final.csv')
df_train_interactions = pd.read_csv('train_interactions.csv')
df_val_interactions = pd.read_csv('val_interactions.csv')
df_test_interactions = pd.read_csv('test_interactions.csv')

In [18]:
df_books_final.head(3)

Unnamed: 0,book_id,average_rating,ratings_count,text_reviews_count,top_popular_shelves,author_ids,title_tfidf,description_word2vec,format_Audio,format_Digital,...,lang_tr,lang_tur,lang_ukr,lang_vi,lang_vie,lang_yid,lang_zh,length_long,length_medium,length_short
0,287141,0.670613,-0.046597,-0.09424,"[{'count': '4', 'name': 'history'}, {'count': ...",['3041852'],"(0, 10604)\t0.42579859675890475\n (0, 3492)...",[ 6.95598602e-01 4.52605486e-01 5.65332353e-...,False,False,...,False,False,False,False,False,False,False,False,True,False
1,6066812,0.921068,-0.042117,-0.098249,"[{'count': '9', 'name': 'favorites'}, {'count'...",['19158'],"(0, 16191)\t0.3157176733108245\n (0, 30033)...",[ 0.9027284 0.82348305 0.47872484 0.423247...,False,False,...,False,False,False,False,False,False,False,False,True,False
2,89378,1.505462,0.064108,0.651555,"[{'count': '8', 'name': 'pets'}, {'count': '8'...",['5411'],"(0, 11832)\t0.8332686465030811\n (0, 7249)\...",[ 1.0797411e+00 5.8800226e-01 3.7105912e-01 ...,False,False,...,False,False,False,False,False,False,False,False,False,True


In [19]:
df_train_interactions.head(5)

Unnamed: 0,user_id,book_id,rating,n_votes,review_age,sentiment
0,1,39343,2,0.0,3434,0.8511
1,1,21648,0,,3435,0.406046
2,2,7794,4,0.0,3046,-0.4215
3,3,8637,3,,3343,0.40686
4,3,50719,4,,3367,0.527973


In [20]:
# number of neighbors of user
N_NEIGHBORS = 10

# number of recommendations
N_RECOMMENDATIONS = 5

In [21]:
def read_ratings(df):
    """
    Read the raw data of the book ratings.

    Returns a list of tuples:
    (user id, book_id, rating)
    """

    data = []
    for _, row in df.iterrows():
        user_id = row['user_id']
        book_id = row['book_id']
        rating = row['rating']
        data.append((user_id, book_id, rating))

    return data

In [22]:
ratings = read_ratings(df_train_interactions)
ratings = pd.DataFrame(data=ratings, columns=['user', 'book', 'rating'])
ratings.head()

Unnamed: 0,user,book,rating
0,1.0,39343.0,2.0
1,1.0,21648.0,0.0
2,2.0,7794.0,4.0
3,3.0,8637.0,3.0
4,3.0,50719.0,4.0


In [23]:
# number of unique books
ratings.book.nunique()

47682

In [24]:
# number of unique users
ratings.user.nunique()

15398

### Transforming the user rating data to wide format

In [25]:
ratings_raw = ratings.copy()
ratings = ratings.pivot(index='user', columns='book', values='rating')

In [26]:
ratings.shape

(15398, 47682)

In [27]:
ratings.head()

book,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,57048.0,57049.0,57050.0,57051.0,57052.0,57053.0,57054.0,57055.0,57056.0,57057.0
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,


### Similarity between users

In [28]:
def pearson_similarity(v1, v2):
    """
    Compute the Pearson correlation between to ratings vectors.

    pd.corr() function can handle missing data.

    parameters:
    - v1, v2: pd.Series, ratings vectors

    returns:
    - float

    """

    pearson = v1.corr(v2)

    return pearson

In [29]:
def compute_similarities(user_id, ratings_matrix):
    """
    Compute the similarity of a given user with all the other users in the dataset.

    Remove the similarity value for the given user from the result.

    returns:
        - a pd.Series with the user id's as index, and similarity as series values
    """

    # get ratings of user to re-use in the similarity computation
    ratings_user = ratings_matrix.loc[user_id,:]

    # calculate the similarity between the given user and the other users
    similarities = ratings_matrix.apply(
        lambda row: pearson_similarity(ratings_user, row),
        axis=1)

    similarities = similarities.to_frame(name='similarity')

    # find most similar users to the given user
    similarities = similarities.sort_values(by='similarity', ascending=False)

    # drop the similarity of the user (should be ~1 anyways)
    similarities = similarities.drop(user_id)

    return similarities

### Predict Book Ratings

In [30]:
def predict_rating(item_id, ratings, similarities, N=10):
    """
    Predict the rating of a given item by a user, given the ratings of similar users.
    Takes the N users with the highest similarity measure, AND who have rated the given item.
    Returns the average rating of the most similar users who previously rated the item.

    parameters:
    - item_id: int, item that needs a rating prediction
    - ratings: pd.DataFrame
    - similarities: pd.DataFrame
    - N: int, number of neighbors to use for rating prediction

    returns:
    - a float representing the predicted rating for the given item

    """

    # get the ratings of all users for the specific item
    users_ratings = ratings.loc[:, item_id]

    # only keep users who rated the given item, otherwise you won't be able to generate a prediction based on the users ratings
    most_similar_users_who_rated_item = similarities.loc[~users_ratings.isnull()]

    # keep N users with highest similarities to given user who also rated the given item
    N_most_similar_users = most_similar_users_who_rated_item.head(N)

    # find ratings item for most similar users:
    ratings_for_item = ratings.loc[N_most_similar_users.index, item_id]

    # predict the rating of the item by averaging the ratings of that item of the most similar users
    return ratings_for_item.mean()

### Recommend Book

In [31]:
def read_titles(df):
    """
    Read the mapping of book id -> book title

    Returns a dictionary
    {book id -> book title}
    """
    data = df.set_index("book_id")["title"].to_dict()
    return data

In [None]:
# book_titles = read_titles(df_books_final)

In [None]:
def recommend(user_id, ratings, book_titles, n_neighbors=10, n_recomm=5):
    """

    Recommend N books for a given user based on ratings data.

    1. get the ratings of the user
    2. get the books that the user has not rated
    3. compute the similarities between the user and the other users
    4. generate book ratings predictions for the user based on the similarities with other users
    5. find the N books with the highest predicted ratings

    parameters:
    - user_id: int, user to generate recommendations for
    - ratings: pd.DataFrame, user-book ratings
    - book_titles: dict, mapping of (book id -> book title)
    - n_neighbors: int: the number of neighbors to use to generate rating predictions
    - n_recomm: int, number of books to recommend

    returns:
    - pd.DataFrame with [book_id, rating, book title]

    """

    # get the ratings of the user
    ratings_user = ratings.loc[user_id, :]

    # all the items a user has not rated, that can be recommended
    all_items = ratings.loc[user_id,:]
    unrated_items = all_items.loc[all_items.isnull()]

    # convert the index with item ids into Series values
    unrated_items = unrated_items.index.to_series(name='item_ids').reset_index(drop=True)
    print('User {} has {} unrated items.'.format(user_id, len(unrated_items)))

    # compute user similarities
    similarities = compute_similarities(user_id, ratings)

    # generate predictions for unseen items based on the user similarity data
    predictions = unrated_items.apply(lambda d: predict_rating(d, ratings, similarities, N=n_neighbors))

    # sort items by highest predicted rating
    predictions = predictions.sort_values(ascending=False)

    # recommend top N items
    recommends = predictions.head(n_recomm)

    # reformat the result
    recommends = recommends.to_frame(name='predicted_rating')
    recommends = recommends.rename_axis('book_id')
    recommends = recommends.reset_index()

    # recommends['name'] = recommends.book_id.apply(lambda d: book_titles[d])

    return recommends

In [None]:
recommends = recommend(1, ratings, book_titles, n_neighbors=N_NEIGHBORS, n_recomm=N_RECOMMENDATIONS)
recommends