# User-Based Collaborative Filtering

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

## Read Data

In [2]:
df_books_final = pd.read_csv('df_books_final.csv')
df_train_interactions = pd.read_csv('train_interactions.csv')
df_test_interactions = pd.read_csv('test_interactions.csv')

In [3]:
df_books_final.head(3)

Unnamed: 0,book_id,title,description,average_rating,ratings_count,text_reviews_count,top_popular_shelves,author_ids,format_Audio,format_Digital,...,lang_tha,lang_tr,lang_tur,lang_ukr,lang_vi,lang_vie,lang_zh,length_long,length_medium,length_short
0,1882090,"Behave Yourself, Bethany Brant",A preacher's daughter with lots of curiosity a...,-1.02863,-0.059069,-0.136278,"[{'count': '1', 'name': 'mrs-withdrawn'}, {'co...",['151369'],False,False,...,False,False,False,False,False,False,False,False,True,False
1,166120,Ghosthunters And The Incredibly Revolting Ghost,A $2.99 value-priced edition of one of our bes...,-0.690046,-0.059069,-0.126186,"[{'count': '9', 'name': 'owned'}, {'count': '9...",['15873'],False,False,...,False,False,False,False,False,False,False,False,True,False
2,8608741,Attack of the Chicken Nugget Man: A National T...,Third-grader Chris Robb just can't seem to do ...,-0.597705,-0.059069,-0.102639,"[{'count': '2', 'name': 'books-i-wrote'}, {'co...",['2868520'],False,False,...,False,False,False,False,False,False,False,False,True,False


In [4]:
df_train_interactions.head(5)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment
0,0,157993,5,2016-07-18 19:34:48+00:00,,3004,0.574139
1,0,359079,4,2014-07-16 19:28:57+00:00,,3737,0.527973
2,0,41684,4,2014-07-16 13:45:50+00:00,,3738,0.527973
3,0,180617,4,2012-08-06 19:55:50+00:00,,4446,0.527973
4,0,13023,3,2012-01-30 10:17:30+00:00,,4636,0.40686


## Creating Affinity Matrix Based on Ratings

In [6]:
# number of neighbors of user
N_NEIGHBORS = 10

# number of recommendations
N_RECOMMENDATIONS = 5

In [7]:
def read_ratings(df):
    """
    Read the raw data of the book ratings.

    Returns a list of tuples:
    (user id, book_id, rating)
    """

    data = []
    for _, row in df.iterrows():
        user_id = row['user_id']
        book_id = row['book_id']
        rating = row['rating']
        data.append((user_id, book_id, rating))

    return data

In [8]:
ratings = read_ratings(df_train_interactions)
ratings = pd.DataFrame(data=ratings, columns=['user', 'book', 'rating'])
ratings.head()

Unnamed: 0,user,book,rating
0,0,157993,5
1,0,359079,4
2,0,41684,4
3,0,180617,4
4,0,13023,3


In [9]:
ratings = ratings.pivot(index='user', columns='book', values='rating')

In [10]:
ratings.shape

(20798, 14645)

In [11]:
ratings.head(10)

book,5,50,93,236,244,302,314,330,881,903,...,35297101,35429280,35440603,35479935,35546694,35616438,35757340,35757419,36131198,36311957
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,5.0,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,3.0,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,3.0,,,,,,,,,,...,,,,,,,,,,
7,,4.0,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,4.0,,,,,,,,,,...,,,,,,,,,,


## Similarity between users

In [12]:
def pearson_similarity(v1, v2):
    """
    Compute the Pearson correlation between to ratings vectors.

    pd.corr() function can handle missing data.

    parameters:
    - v1, v2: pd.Series, ratings vectors

    returns:
    - float

    """

    pearson = v1.corr(v2)

    return pearson

In [13]:
def compute_similarities(user_id, ratings_matrix):
    """
    Compute the similarity of a given user with all the other users in the dataset.

    Remove the similarity value for the given user from the result.

    returns:
        - a pd.Series with the user id's as index, and similarity as series values
    """

    # get ratings of user to re-use in the similarity computation
    ratings_user = ratings_matrix.loc[user_id,:]

    # calculate the similarity between the given user and the other users
    similarities = ratings_matrix.apply(
        lambda row: pearson_similarity(ratings_user, row),
        axis=1)

    similarities = similarities.to_frame(name='similarity')

    # find most similar users to the given user
    similarities = similarities.sort_values(by='similarity', ascending=False)

    # drop the similarity of the user (should be ~1 anyways)
    similarities = similarities.drop(user_id)

    return similarities

## Predict Book Rating 

In [14]:
def predict_rating(item_id, ratings, similarities, N=10):
    """
    Predict the rating of a given item by a user, given the ratings of similar users.
    Takes the N users with the highest similarity measure, AND who have rated the given item.
    Returns the average rating of the most similar users who previously rated the item.

    parameters:
    - item_id: int, item that needs a rating prediction
    - ratings: pd.DataFrame
    - similarities: pd.DataFrame
    - N: int, number of neighbors to use for rating prediction

    returns:
    - a float representing the predicted rating for the given item

    """

    # get the ratings of all users for the specific item
    users_ratings = ratings.loc[:, item_id]

    # only keep users who rated the given item, otherwise you won't be able to generate a prediction based on the users ratings
    most_similar_users_who_rated_item = similarities.loc[~users_ratings.isnull()]

    # keep N users with highest similarities to given user who also rated the given item
    N_most_similar_users = most_similar_users_who_rated_item.head(N)

    # find ratings item for most similar users:
    ratings_for_item = ratings.loc[N_most_similar_users.index, item_id]

    # predict the rating of the item by averaging the ratings of that item of the most similar users
    return item_id, ratings_for_item.mean(), len(ratings_for_item)

## Recommend function

In [15]:
def recommend(user_id, ratings, n_neighbors=10, n_recomm=5):
    """

    Recommend N books for a given user based on ratings data.

    1. get the ratings of the user
    2. get the books that the user has not rated
    3. compute the similarities between the user and the other users
    4. generate book ratings predictions for the user based on the similarities with other users
    5. find the N books with the highest predicted ratings

    parameters:
    - user_id: int, user to generate recommendations for
    - ratings: pd.DataFrame, user-book ratings
    - book_titles: dict, mapping of (book id -> book title)
    - n_neighbors: int: the number of neighbors to use to generate rating predictions
    - n_recomm: int, number of books to recommend

    returns:
    - pd.DataFrame with [book_id, rating, book title]

    """

    # get the ratings of the user
    ratings_user = ratings.loc[user_id, :]

    # all the items a user has not rated, that can be recommended
    all_items = ratings.loc[user_id,:]
    unrated_items = all_items.loc[all_items.isnull()]

    # convert the index with item ids into Series values
    unrated_items = unrated_items.index.to_series(name='item_ids').reset_index(drop=True)
    print('User {} has {} unrated items.'.format(user_id, len(unrated_items)))

    # compute user similarities
    similarities = compute_similarities(user_id, ratings)

    # generate predictions for unseen items based on the user similarity data
    predictions = unrated_items.apply(lambda d: predict_rating(d, ratings, similarities, N=n_neighbors))
    predicitions_df = pd.DataFrame(predictions.tolist(), columns=['book_id', 'predictions', 'number_of_similar_users'])

    # sort items by highest predicted rating and number of similar users
    predicitions_df = predicitions_df.sort_values(by=['predictions', 'number_of_similar_users'], ascending=False)

    # recommend top N items
    recommends = predicitions_df.head(n_recomm)

    return recommends['book_id']

## Similarity Matrix

In [16]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_scores =  tfidf.fit_transform(df_books_final['description'])
tfidf_scores.shape

(59828, 149225)

In [17]:
# cosine similarity matrix directly for the sparse TF-IDF matrix
tfidf_cos_sim = cosine_similarity(tfidf_scores, dense_output=False)

## Recommend for users in test set

In [None]:
from statistics import mean 
import warnings
warnings.filterwarnings('ignore')

unique_users = df_test_interactions.user_id.unique()
first1000_users = unique_users[:1000]
results = []

for user in first1000_users:
    recommends = recommend(user, ratings, n_neighbors=N_NEIGHBORS, n_recomm=N_RECOMMENDATIONS)
    test_books = df_test_interactions[df_test_interactions['user_id'] == user]['book_id']
    similarity_scores_list = []
    max_similarity_scores_list = []

    for test_book in test_books:
        similarity_scores_list_temp=[]
        test_book_index = df_books_final.index[df_books_final['book_id'] == test_book][0]
        similarity_scores = tfidf_cos_sim[test_book_index].toarray().flatten()

        for recommended_book in recommends:
            index = df_books_final.index[df_books_final['book_id'] == recommended_book][0]
            similarity_scores_list_temp.append(similarity_scores[index])
        
        
        max_similarity_score = max(similarity_scores_list_temp) if similarity_scores_list_temp else None
        max_similarity_scores_list.append(max_similarity_score)
        similarity_scores_list.extend(similarity_scores_list_temp)

    results.append({
        "user_id": user,
        "books_id_read": test_books.tolist(),
        "recommendations": recommends.tolist(),
        "similarity_scores_list": similarity_scores_list,
        "mean_similarity_score": mean(similarity_scores_list),
        "max_similarity_score": max_similarity_scores_list
    })

output_df = pd.DataFrame(results)
output_df

User 0 has 14638 unrated items.
User 1 has 14635 unrated items.
User 2 has 14638 unrated items.
User 3 has 14633 unrated items.
User 4 has 14638 unrated items.
User 5 has 14633 unrated items.
User 6 has 14638 unrated items.
User 7 has 14641 unrated items.
User 8 has 14632 unrated items.
User 9 has 14635 unrated items.
User 10 has 14637 unrated items.
User 11 has 14641 unrated items.
User 12 has 14638 unrated items.
User 13 has 14640 unrated items.
User 14 has 14634 unrated items.
User 15 has 14636 unrated items.
User 16 has 14637 unrated items.
User 17 has 14639 unrated items.
User 18 has 14640 unrated items.
User 19 has 14637 unrated items.
User 20 has 14641 unrated items.
User 21 has 14632 unrated items.
User 22 has 14640 unrated items.
User 23 has 14642 unrated items.
User 24 has 14638 unrated items.
User 25 has 14641 unrated items.
User 26 has 14637 unrated items.
User 27 has 14640 unrated items.
User 28 has 14637 unrated items.
User 29 has 14642 unrated items.
User 30 has 14641 un

Unnamed: 0,user_id,books_id_read,recommendations,similarity_scores_list,mean_similarity_score,max_similarity_score
0,0,"[343002, 1852]","[46677, 867248, 6669717, 22013040, 402114]","[0.02697621518236352, 0.0, 0.03242136805362899...",0.018405,"[0.03242136805362899, 0.03906784168008337]"
1,1,"[1248128, 30119]","[14118, 46677, 867248, 6669717, 22013040]","[0.007197123322763255, 0.0051736136925291925, ...",0.058451,"[0.007197123322763255, 0.5127871656196451]"
2,2,"[74595, 2711313]","[14118, 867248, 6669717, 22013040, 402114]","[0.01044065185339305, 0.0, 0.02950585563320733...",0.009337,"[0.029505855633207338, 0.018733342147202403]"
3,3,"[30119, 157993]","[14118, 46677, 867248, 6669717, 22013040]","[0.012281415239966146, 0.02696491648561693, 0....",0.066646,"[0.5127871656196451, 0.0517584584988947]"
4,4,"[13023, 157993]","[14118, 46677, 867248, 6669717, 22013040]","[0.00340888102818777, 0.02332300161231656, 0.0...",0.015926,"[0.02332300161231656, 0.0517584584988947]"
...,...,...,...,...,...,...
995,995,"[24178, 78411]","[14118, 46677, 867248, 6669717, 22013040]","[0.0426236394204663, 0.02526529716731194, 0.00...",0.019403,"[0.0426236394204663, 0.028494710116081803]"
996,996,"[267972, 275000]","[14118, 46677, 867248, 6669717, 22013040]","[0.02398428888624259, 0.020906982781366244, 0....",0.010759,"[0.02561423259626288, 0.007984469981907565]"
997,997,"[25618438, 23754884]","[14118, 46677, 867248, 6669717, 22013040]","[0.0, 0.015102355213715836, 0.0, 0.02371667758...",0.006078,"[0.023716677584827726, 0.006018297839707198]"
998,998,"[140225, 5]","[14118, 46677, 867248, 6669717, 22013040]","[0.011927089756086478, 0.01132274443756136, 0....",0.012094,"[0.029593441500958117, 0.019991562840876765]"


In [59]:
output_df['num_exact_matches'] = output_df.apply(
    lambda row: sum(1 for book_id in row['recommendations'] if book_id in row['books_id_read']),
    axis=1
)
output_df

Unnamed: 0,user_id,books_id_read,recommendations,similarity_scores_list,mean_similarity_score,max_similarity_score,num_exact_matches
0,0,"[343002, 1852]","[46677, 867248, 6669717, 22013040, 402114]","[0.02697621518236352, 0.0, 0.03242136805362899...",0.018405,"[0.03242136805362899, 0.03906784168008337]",0
1,1,"[1248128, 30119]","[14118, 46677, 867248, 6669717, 22013040]","[0.007197123322763255, 0.0051736136925291925, ...",0.058451,"[0.007197123322763255, 0.5127871656196451]",0
2,2,"[74595, 2711313]","[14118, 867248, 6669717, 22013040, 402114]","[0.01044065185339305, 0.0, 0.02950585563320733...",0.009337,"[0.029505855633207338, 0.018733342147202403]",0
3,3,"[30119, 157993]","[14118, 46677, 867248, 6669717, 22013040]","[0.012281415239966146, 0.02696491648561693, 0....",0.066646,"[0.5127871656196451, 0.0517584584988947]",0
4,4,"[13023, 157993]","[14118, 46677, 867248, 6669717, 22013040]","[0.00340888102818777, 0.02332300161231656, 0.0...",0.015926,"[0.02332300161231656, 0.0517584584988947]",0
...,...,...,...,...,...,...,...
995,995,"[24178, 78411]","[14118, 46677, 867248, 6669717, 22013040]","[0.0426236394204663, 0.02526529716731194, 0.00...",0.019403,"[0.0426236394204663, 0.028494710116081803]",0
996,996,"[267972, 275000]","[14118, 46677, 867248, 6669717, 22013040]","[0.02398428888624259, 0.020906982781366244, 0....",0.010759,"[0.02561423259626288, 0.007984469981907565]",0
997,997,"[25618438, 23754884]","[14118, 46677, 867248, 6669717, 22013040]","[0.0, 0.015102355213715836, 0.0, 0.02371667758...",0.006078,"[0.023716677584827726, 0.006018297839707198]",0
998,998,"[140225, 5]","[14118, 46677, 867248, 6669717, 22013040]","[0.011927089756086478, 0.01132274443756136, 0....",0.012094,"[0.029593441500958117, 0.019991562840876765]",0


## Evaluation

In [57]:
output_df['mean_similarity_score'].describe()

count    1000.000000
mean        0.018003
std         0.018727
min         0.000000
25%         0.008647
50%         0.012214
75%         0.016738
max         0.112852
Name: mean_similarity_score, dtype: float64

In [60]:
total_books = 2000
total_hits = output_df['num_exact_matches'].sum()
print(total_hits)
accuracy_of_exact_hits = total_hits/total_books * 100 # checking for accuracy of recommendation. hits/total_books
print(f"The accuracy of the recommendation system in terms of exact match is {accuracy_of_exact_hits:.2f}%")

0
The accuracy of the recommendation system in terms of exact match is 0.00%
