# Content Based Recommender

First we load in the books data

In [29]:
# Loading the Data
import pandas as pd
import glob
import os
import numpy as np
import re

directory = './dataset'
csv_files = glob.glob(os.path.join(directory, 'book*.csv'))
dataframes = [pd.read_csv(file) for file in csv_files]
book_data = pd.concat(dataframes, ignore_index=True)
print(book_data.head())

        Id                                               Name  \
0  1900511                                         Barbarossa   
1  1900512  Collector's Guide to German World War II: Comb...   
2  1900514                               Images of Barbarossa   
3  1900520        Romania After 2000: Five New Romanian Plays   
4  1900521           Global Foreigners: An Anthology of Plays   

                  Authors        ISBN  Rating  PublishYear  PublishMonth  \
0      Christopher Ailsby  1840138009     3.0         2007             4   
1      Christopher Ailsby  0781802253     0.0         1994             7   
2      Christopher Ailsby  0711028257     3.5         2001             1   
3  Daniel Charles Gerould  0595436560     4.0         2007             9   
4        Saviana Stănescu  1905422423     4.6         2006            12   

   PublishDay                                    Publisher RatingDist5  ...  \
0           1                               New Line Books         5:0  .

We then load in the ratings data

In [30]:
rating_df = pd.DataFrame()
for i, file in enumerate(os.listdir(directory)):
    if re.match(r'user_rating_*',file):
        data = pd.read_csv(directory+'/'+file)
        rating_df = pd.concat([rating_df, data], axis=0, join='outer')

# Delete any rows that refer to a users with no ratings
rating_df = rating_df.drop(rating_df[rating_df['Rating'] == "This user doesn't have any rating"].index)

# Replace textual ratings with numerical ratings
ratings = {'did not like it': 1, 'it was ok': 2, 'liked it': 3, 'really liked it': 4, 'it was amazing': 5}
for textual, numerical in ratings.items():
    rating_df['Rating'] = rating_df['Rating'].mask(rating_df['Rating'] == textual, numerical)

# Visualise the data and its shape
print(rating_df.head())
print(rating_df.shape)

# Print the number of unique user ID's and book names in the dataframe
print(f"Number of unique users with ratings: {rating_df['ID'].nunique()}")
print(f"Number of unique books in ratings data: {rating_df['Name'].nunique()}")

     ID                                   Name Rating
580   5                      The Secret Garden      5
581   5         Remarkable Women of California      5
582   5                       The Forsyte Saga      5
583   5  The Crystal Cave (Arthurian Saga, #1)      4
584   5      Me Before You (Me Before You, #1)      5
(342591, 3)
Number of unique users with ratings: 4118
Number of unique books in ratings data: 102744


There are missing values in our books data so we remove the rows with missing publisher and description since we need these features. We merge the identical features pagesNumber and PagesNumber. We then combine Name, Authors, PublishYear, Publisher, Description into one column named Content.

In [31]:
# Drop duplicates and missing values
book_data = book_data.drop_duplicates()
print(book_data.isna().sum())
book_data.dropna(subset=['Publisher', 'Description'], inplace=True)
book_data['PagesNumber'] = book_data['pagesNumber'].combine_first(book_data['PagesNumber'])
book_data.drop(columns=['pagesNumber'], inplace=True)
book_data['Content'] = book_data[['Name', 'Authors', 'PublishYear', 'Publisher', 'Description']].astype(str).apply(lambda x: ' '.join(x), axis=1)
book_data

Id                             0
Name                           0
Authors                        0
ISBN                        5922
Rating                         0
PublishYear                    0
PublishMonth                   0
PublishDay                     0
Publisher                  17823
RatingDist5                    0
RatingDist4                    0
RatingDist3                    0
RatingDist2                    0
RatingDist1                    0
RatingDistTotal                0
CountsOfReview                 0
Language                 1598399
PagesNumber               834966
Description               679010
pagesNumber              1015232
Count of text reviews    1440501
dtype: int64


Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,...,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,PagesNumber,Description,Count of text reviews,Content
0,1900511,Barbarossa,Christopher Ailsby,1840138009,3.00,2007,4,1,New Line Books,5:0,...,3:1,2:0,1:0,total:1,0,,192.0,"On 22 June 1941, Adolf Hitler launched Operati...",,Barbarossa Christopher Ailsby 2007 New Line Bo...
2,1900514,Images of Barbarossa,Christopher Ailsby,0711028257,3.50,2001,1,25,Ian Allan Ltd,5:0,...,3:2,2:1,1:0,total:8,0,,256.0,"On 22 June 1941, Adolf Hitler launched Operati...",,Images of Barbarossa Christopher Ailsby 2001 I...
3,1900520,Romania After 2000: Five New Romanian Plays,Daniel Charles Gerould,0595436560,4.00,2007,9,1,Martin E. Segal Theatre Center Publications,5:1,...,3:1,2:0,1:0,total:6,0,,226.0,The first anthology of new Romanian Drama publ...,,Romania After 2000: Five New Romanian Plays Da...
4,1900521,Global Foreigners: An Anthology of Plays,Saviana Stănescu,1905422423,4.60,2006,12,7,Seagull Books,5:4,...,3:1,2:0,1:0,total:5,0,,320.0,"In Waxing West, Daniella, newly arrived in the...",,Global Foreigners: An Anthology of Plays Savia...
5,1900525,Diary of a Clone,Saviana Stănescu,092338961X,4.80,2003,1,1,Meeting Eyes Bindery,5:4,...,3:0,2:0,1:0,total:5,0,,66.0,Poetry. Translation. DIARY OF A CLONE is a sma...,,Diary of a Clone Saviana Stănescu 2003 Meeting...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1850303,1499980,The O'Brien Book of Irish Fairy Tales & Legends,Una Leavy,0862784824,4.24,1996,9,10,O'Brien Press,5:39,...,3:8,2:4,1:0,total:95,1,,96.0,Irish fairy tales and legends are full of ench...,1.0,The O'Brien Book of Irish Fairy Tales & Legend...
1850305,1499988,Irish Folk and Fairy Tales Omnibus Edition,Michael Scott,0751508861,4.19,1989,24,8,Sphere,5:140,...,3:42,2:13,1:4,total:311,10,,637.0,"Here, collected in one volume, are tales and l...",10.0,Irish Folk and Fairy Tales Omnibus Edition Mic...
1850306,1499990,Robin Hood: The Shaping of the Legend,Jeffrey L. Singman,0313301018,3.00,1998,23,7,Praeger,5:0,...,3:1,2:0,1:0,total:1,0,,224.0,Among the narrative traditions of the Middle A...,0.0,Robin Hood: The Shaping of the Legend Jeffrey ...
1850308,1499992,Competing on Value,Mack Hanan,0814450369,3.50,1991,22,4,Amacom,5:2,...,3:2,2:2,1:0,total:8,1,,220.0,Presents a new approach to selling that emphas...,1.0,Competing on Value Mack Hanan 1991 Amacom Pres...


# Preprocess Content

We preprocess our content column using tokenization and word stemming. This ensures that we get the most information possible from the data and helps with the vectorization process.

In [32]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

ps = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [ps.stem(word) for word in tokens]
    return ' '.join(tokens)

In [33]:
book_data['Content'] = book_data['Content'].apply(preprocess_text)
book_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1163326 entries, 0 to 1850309
Data columns (total 21 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Id                     1163326 non-null  int64  
 1   Name                   1163326 non-null  object 
 2   Authors                1163326 non-null  object 
 3   ISBN                   1160256 non-null  object 
 4   Rating                 1163326 non-null  float64
 5   PublishYear            1163326 non-null  int64  
 6   PublishMonth           1163326 non-null  int64  
 7   PublishDay             1163326 non-null  int64  
 8   Publisher              1163326 non-null  object 
 9   RatingDist5            1163326 non-null  object 
 10  RatingDist4            1163326 non-null  object 
 11  RatingDist3            1163326 non-null  object 
 12  RatingDist2            1163326 non-null  object 
 13  RatingDist1            1163326 non-null  object 
 14  RatingDistTotal   

# Vectorize

We load the vectorizer from the genre classifier and vectorize our content column.

In [34]:
import joblib

vectorizer = joblib.load('tfidf_vectorizer.pkl')

In [35]:
from scipy.sparse import vstack

def process_in_chunks(df, chunk_size=10000):
    num_chunks = len(df) // chunk_size + (1 if len(df) % chunk_size != 0 else 0)
    sparse_matrices = []

    for i in range(num_chunks):
        chunk = df.iloc[i*chunk_size : (i+1)*chunk_size]
        sparse_matrix = vectorizer.transform(chunk['Content'])
        sparse_matrices.append(sparse_matrix)

    combined_sparse_matrix = vstack(sparse_matrices)
    return combined_sparse_matrix

book_vectors = process_in_chunks(book_data, chunk_size=10000)
book_vectors = book_vectors.astype('float32')

# Genre Prediction

We now use our genre classifier to assign each book with a genre.

In [36]:
from tensorflow import keras

model = keras.models.load_model('./dense32model8972.h5')

model.summary()



Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 32)                320032    
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 batch_normalization_2 (Bat  (None, 32)                128       
 chNormalization)                                                
                                                                 
 dense_5 (Dense)             (None, 10)                330       
                                                                 
Total params: 320490 (1.22 MB)
Trainable params: 320426 (1.22 MB)
Non-trainable params: 64 (256.00 Byte)
_________________________________________________________________


In [37]:
genre_pred = model.predict(book_vectors)



In [38]:
genres = ['fantasy', 'science', 'crime', 'history', 'horror', 'thriller', 'psychology', 'romance', 'sports', 'travel']
genre_pred.shape
predicted_classes = np.argmax(genre_pred, axis=1)
predicted_genres = [genres[class_idx] for class_idx in predicted_classes]
book_data['Genre'] = predicted_genres
book_data.head()

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,...,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,PagesNumber,Description,Count of text reviews,Content,Genre
0,1900511,Barbarossa,Christopher Ailsby,1840138009,3.0,2007,4,1,New Line Books,5:0,...,2:0,1:0,total:1,0,,192.0,"On 22 June 1941, Adolf Hitler launched Operati...",,barbarossa christoph ailsbi 2007 new line book...,fantasy
2,1900514,Images of Barbarossa,Christopher Ailsby,0711028257,3.5,2001,1,25,Ian Allan Ltd,5:0,...,2:1,1:0,total:8,0,,256.0,"On 22 June 1941, Adolf Hitler launched Operati...",,imag of barbarossa christoph ailsbi 2001 ian a...,fantasy
3,1900520,Romania After 2000: Five New Romanian Plays,Daniel Charles Gerould,0595436560,4.0,2007,9,1,Martin E. Segal Theatre Center Publications,5:1,...,2:0,1:0,total:6,0,,226.0,The first anthology of new Romanian Drama publ...,,romania after 2000 : five new romanian play da...,sports
4,1900521,Global Foreigners: An Anthology of Plays,Saviana Stănescu,1905422423,4.6,2006,12,7,Seagull Books,5:4,...,2:0,1:0,total:5,0,,320.0,"In Waxing West, Daniella, newly arrived in the...",,global foreign : an antholog of play saviana s...,psychology
5,1900525,Diary of a Clone,Saviana Stănescu,092338961X,4.8,2003,1,1,Meeting Eyes Bindery,5:4,...,2:0,1:0,total:5,0,,66.0,Poetry. Translation. DIARY OF A CLONE is a sma...,,diari of a clone saviana stănescu 2003 meet ey...,fantasy


# Combined Data (Finding the Overlap)

Next, we merge both the ratings and books dataframes on their 'Name' column (i.e., the name of the book). This is required as the original rating data is only connected to book names, but we also want to connect them to book ID's so that we can build one user profile that can be used for evaluation by both the content-based and collaborative filtering models. 

Initially, we attempted to merge the data directly, but there was a key issue with our approach. A user with userID = 5 had rated a book called 'The Secret Garden' in the ratings dataframe. However, multiple books shared this book name and each book had a seperate ID. Thus, this rating of 'The Secret Garden' was copied to each unique book that shared this name. Furthermore, we were unable to know which of these BookID's refer to the actual book that user 5 rated. Thus, for the merged dataframe we decided to remove all instances of ratings where the book Name is not unique in the books dataframe. To do this, we must ensure that each 'UserID' and book 'Name' pair is associated with only a single unique 'bookID'. To ensure that there are no cases where a (UserID, Name) pair is associated with multiple BookID's, we need to first remove all book names that aren't unique in the original books dataframe prior to merging, as seen below.

In [39]:
# For each book name, get a count for the number of times it appears in the book_data.
book_name_counts = book_data['Name'].value_counts()
# Seperate the unique book names.
unique_names = book_name_counts[book_name_counts == 1].index

# Make a copy of the dataframe which retains ONLY the books with unique names.
book_df = book_data[book_data['Name'].isin(unique_names)].copy()

# Print the number of entries that were removed from the original books dataframe.
print(f"{book_data.shape[0] - book_df.shape[0]} rows were removed from the books dataframe.")
print(book_df.shape)


187248 rows were removed from the books dataframe.
(976078, 22)


Since both the 'rating_df' and 'book_df' dataframes have a 'Rating' value, we must drop one of them. We have chosen to drop the overall 'Rating' of a book, as it is less helpful when creating user profiles and can still be easily retrieved from the original dataframe using the BookID.

In [40]:
# Drop the rating column from book_df.
book_df.drop('Rating', axis=1, inplace=True)

Finally, we merge the two dataframes and also drop all unecessary columns so that we can see a clear connection between a user rating and the book ID of the rated book. We also rename the 'Id' column from the books dataframe to 'BookID, and the 'ID' column from the ratings dataframe to 'UserID' to increase clarity.

Lastly, we print the head of the data, aswell as the shape.

In [41]:
merged_df = pd.merge(rating_df, book_df, on='Name', how='inner')

colsToDrop = ['Authors', 'PublishYear', 'PublishMonth', 'PublishDay', 'Publisher', 'RatingDist5', 'RatingDist4', 
              'RatingDist3', 'RatingDist2', 'RatingDist1', 'RatingDistTotal', 'CountsOfReview', 'Language',
              'Content', 'Description', 'Count of text reviews', 'PagesNumber', 'Genre']

merged_df.drop(colsToDrop, axis=1, inplace=True)

merged_df.rename(columns={'Id':'BookID', 'ID':'UserID'}, inplace=True)

print(merged_df.head())
print(merged_df.shape)
# print(f"There are {merged_df['BookID'].nunique()} books that are comming in both dataframes.")

   UserID                                   Name Rating   BookID        ISBN
0       5  The Crystal Cave (Arthurian Saga, #1)      4  1293575  0449206440
1     139  The Crystal Cave (Arthurian Saga, #1)      4  1293575  0449206440
2     338  The Crystal Cave (Arthurian Saga, #1)      4  1293575  0449206440
3     853  The Crystal Cave (Arthurian Saga, #1)      5  1293575  0449206440
4    7268  The Crystal Cave (Arthurian Saga, #1)      4  1293575  0449206440
(56302, 5)


# Finding Users to Evaluate

Next, we will use the merged dataframe to find 3 users that will be used to evaluate the performance of our models. The goal is to find users who have enough ratings to provide meaningful information about their likes & dislikes, while ensuring that there is a reasonable split between liked and disliked books. For likes, we will set the threshold at 4 or more stars, i.e., a user 'likes' a book if they have rated it at 4 or above.

To obtain meaninful information, we will only evaluate users that 'like' (rating >= 4) less than half of the total books that they've rated. This ensures that there is a useful amount of information on both the user's likes and dislikes. However, we will also see how the model performs for users with different rating habits. In particular, we will see how the model performs for three specific users:
- 20+ ratings and 10 liked books
- 80+ ratings and 40 liked books
- 326+ ratings and 163 liked books

We aim to see the value provide by the three models for the three chosen users with respect to the number of books that they have indicated as a 'like', as well as the total amount of feedback that they have provided.

In [42]:
# Get a count of the number of books that each user has rated.
ratings_count = merged_df.groupby('UserID').size()

# Get a dataframe that only contains books that users like (i.e., rating at 4 or above)
likes_df = merged_df[merged_df['Rating'] >= 4]
# Get a count of the number of books that each user 'likes'. 
likes_count = likes_df.groupby('UserID').size()

# Find users with three differing numbers of liked books.
users_with_10_likes = likes_count[likes_count == 10].index
users_with_40_likes = likes_count[likes_count == 40].index
users_with_160_likes = likes_count[likes_count >= 160].index

# Initialise a list to hold the ID's of the users that are chosen for evaluation.
users_to_evaluate = []

# Out of all users with 10 liked books,
for user_id in users_with_10_likes:
    # If they have rated more than 20 books,
    if (ratings_count[user_id] > 2*likes_count[user_id]):
        # Add them to the users_to_evaluate list, and break (as we need only one user of this type).
        users_to_evaluate.append(user_id)
        break

# Out of all users with 40 liked books,
for user_id in users_with_40_likes:
    # If they have rated more than 80 books,
    if (ratings_count[user_id] > 2*likes_count[user_id]):
        # Add them to the users_to_evaluate list, and break (as we need only one user of this type).
        users_to_evaluate.append(user_id)
        break

# Out of all users with >= 160 liked books,
for user_id in users_with_160_likes:
    # If they have rated more than double their liked amount,
    if (ratings_count[user_id] > 2*likes_count[user_id]):
        # Add them to the users_to_evaluate list, and break (as we need only one user of this type).
        users_to_evaluate.append(user_id)
        break

# Print information about the chosen users.
print("The following users will be evaluated: \n")
for user in users_to_evaluate:
    print(f"User {user}, who liked {likes_count[user]} books and rated {ratings_count[user]} in total.")

The following users will be evaluated: 

User 63, who liked 10 books and rated 27 in total.
User 3422, who liked 40 books and rated 99 in total.
User 3259, who liked 163 books and rated 358 in total.


# User Profiles

For the cold start problem we do not have any user ratings to form a user profile. Instead we must use the user preferences to form a user profile. We collect the user preferences from our interface and then select the highest rated books that match each of the genres and authors that the user prefers. We filter these results by their ideal book length (within 100 pages). We then combine all of the content from those books into 1 document and vectorize it, this is the user profile.

In [43]:
def generate_user_profile(user_pref, df, vectorizer):
    matches_dfs = []
    for genre in user_pref['Genres']:
        genre_matches = df[df['Genre'] == genre]
        genre_matches = genre_matches[abs(genre_matches['PagesNumber'] - user_pref['Length']) <= 100].sort_values(by=['Rating'], ascending=False).head(10)
        matches_dfs.append(genre_matches)
    for author in user_pref['Authors']:
        author_matches = df[df['Authors'].str.contains(author, case=False, na=False)]
        author_matches = author_matches[abs(author_matches['PagesNumber'] - user_pref['Length']) <= 100].sort_values(by=['Rating'], ascending=False).head(10)
        matches_dfs.append(author_matches)
    matches = pd.concat(matches_dfs, ignore_index=True)
    # Drop duplicate rows if necessary
    matches = matches.drop_duplicates().reset_index(drop=True)

    document = ' '.join(matches['Content'])
    return vectorizer.transform([document]).toarray()

kid_pref = {
    'Genres': ['fantasy', 'sports', 'science'],
    'Authors': ['J.K Rowling', 'Enid Blyton', 'Suzanne Collins', 'Andy Griffiths', 'Roald Dahl', 'Jeff Kinney'],
    'Length': 100
}
ya_pref = {
    'Genres': ['fantasy', 'romance', 'thriller'],
    'Authors': ['J.K Rowling', 'Suzanne Collins', 'John Green', 'Veronica Roth', 'James Dashner', 'Karen M. McManus'],
    'Length': 300
}
crime_pref = {
    'Genres': ['crime', 'thriller', 'horror', 'romance', 'psychology'],
    'Authors': ['Sarah J. Maas', 'Agatha Christie', 'Karen M. McManus', 'Ian Rankin', 'Lee Child', 'Richard Osman', 'Kate Atkinson'],
    'Length': 500
}
classics_pref = {
    'Genres': ['fantasy', 'history', 'romance', 'thriller'],
    'Authors': ['Agatha Christie', 'Enid Blyton', 'Charles Dickens', 'Roald Dahl', 'Jane Austen', 'William Shakespeare', 'J.D. Salinger', 'George Orwell'],
    'Length': 350
}
nonfiction_pref = {
    'Genres': ['science', 'history', 'psychology', 'sports', 'travel'],
    'Authors': ['Tom Holland', 'David McCullough', 'William L. Shirer', 'A.J. Liebling', 'David Attenborough'],
    'Length': 500
}
kid_profile = generate_user_profile(kid_pref, book_data, vectorizer)
ya_profile = generate_user_profile(ya_pref, book_data, vectorizer)
crime_profile = generate_user_profile(crime_pref, book_data, vectorizer)
classics_profile = generate_user_profile(classics_pref, book_data, vectorizer)
nonfiction_profile = generate_user_profile(nonfiction_pref, book_data, vectorizer)

Once we have ratings data we can use the user's 'liked' books (rating > 4) as their user profile. This allows us to evaluate the model. We take the 3 users chosen earlier and create user profiles for them using half of their liked books, combining the content columns into one document and then vectorizing.

In [70]:
from sklearn.model_selection import train_test_split

def user_profile_from_liked_books(user_id, likes_df, book_data, vectorizer):
    all_liked = likes_df[likes_df['UserID'] == user_id].copy()
    train_liked, test_liked = train_test_split(all_liked, test_size=0.5, random_state=42)
    all_content = []
    for bookID in train_liked['BookID']:
        all_content.append(book_data[book_data['Id'] == bookID]['Content'].iloc[0])
    document = ' '.join(all_content)
    
    return vectorizer.transform([document]).toarray(), train_liked, test_liked

user_profiles = {}
test_sets = {}
train_sets = {}
for user in users_to_evaluate:
    user_profile, train_liked, test_liked = user_profile_from_liked_books(user, likes_df, book_data, vectorizer)
    user_profiles[user] = user_profile
    test_sets[user] = test_liked
    train_sets[user] = train_liked

We can then compute the Jaccard similarity between our user profile and any book vector.

In [59]:
def jaccard_similarity(vec1, vec2):
    intersection = np.sum(np.minimum(vec1, vec2))
    union = np.sum(np.maximum(vec1, vec2))
    return intersection / union if union != 0 else 0

We get the top n recommendations by going through every book and computing the Jaccard similarity. We ignore the books used in the user profile and we ignore books with equal or less reviews than our threshold. We multilply the similarity by the similarity weight and add the normalized rating by (1 - similarity weight). This ensures that we are recommending books that other users have read and liked. We sort the books by similarity and take the top N.

In [73]:
import re

def get_top_n(book_data, book_vectors, user_profile, train_liked, threshold, n, similarity_weight):
    max_review_count = max(book_data['CountsOfReview'])
    similarities = {}
    for index, vector in enumerate(book_vectors):
        bookID = book_data['Id'].iloc[index]
        if bookID in train_liked['BookID'].values:
            continue
        review_count = book_data['CountsOfReview'].iloc[index]
        rating = book_data['Rating'].iloc[index]
        if review_count <= threshold:
            continue
        similarity = jaccard_similarity(user_profile, vector.toarray())
        normalized_reviews = (review_count)/(max_review_count)
        normalized_rating = (rating*normalized_reviews)/5
        # weight the similarity by their review count
        similarities[index] = (similarity_weight * similarity) + ((1-similarity_weight) * normalized_rating)
        # similarities[index] = similarity

    sorted_books = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:n]
    sorted_book_names = []
    for book in sorted_books:
        name = book_data.iloc[book[0]]['Name']
        bookID = book_data.iloc[book[0]]['Id']
        sorted_book_names.append({"Name": name, "BookID": bookID})
    return sorted_book_names

# Evaluation

We can then evaluate our model on precision, recall and f1_score. We compute TP, FP, TN and FN by checking if our recommendations are in the users 'test' set that we kept out of their profile. If we recommend a book to the user and they like it then that is a TP. If they don't like a recommendation then that is a FP. If we don't recommend a book to the user and they don't like it then that is a TN. If we don't recommend a book to a user but they do like it then that is a FN.

We assume that if a user has not rated a book that they don't like it since we have no idea about their preferences. However, in reality they would like some of those books. This means that our TN value will be very high as almost all of the books we don't recommend will be TN. Therefore, accuracy is not a useful metric to use.

In [78]:
def evaluate_recommendations(top_n, n, test_liked, total_books):
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for book in top_n:
        if book['BookID'] in test_liked['BookID'].values:
            TP += 1
        else:
            FP += 1
    FN = len(test_liked) - TP
    TN = total_books - n - FP
    accuracy = (TP + TN) / (total_books) if (total_books) != 0 else 0
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    return accuracy, precision, recall, f1_score

We evaluate the model for each user with the different parameters to determine the best ones.

In [82]:
import itertools
for user in users_to_evaluate:
    ns = [10, 30, 50]
    thresholds = [0, 500, 1000]
    weights = [1, 0.9, 0.8]
    for params in itertools.product(ns, thresholds, weights):
        top_n = get_top_n(book_data, book_vectors, user_profiles[user], train_sets[user], params[1], params[0], params[2])
        accuracy, precision, recall, f1_score = evaluate_recommendations(top_n, params[0], test_sets[user], len(book_data)-len(train_sets[user]))
        print("For user", user)
        print("With n =", params[0], "and threshold =", params[1], "and similarity weight =", params[2])
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("f1_score:", f1_score)

For user 63
With n = 10 and threshold = 0 and similarity weight = 1
Accuracy: 0.9999828078406562
Precision: 0.0
Recall: 0.0
f1_score: 0
For user 63
With n = 10 and threshold = 0 and similarity weight = 0.9
Accuracy: 0.9999828078406562
Precision: 0.0
Recall: 0.0
f1_score: 0
For user 63
With n = 10 and threshold = 0 and similarity weight = 0.8
Accuracy: 0.9999828078406562
Precision: 0.0
Recall: 0.0
f1_score: 0
For user 63
With n = 10 and threshold = 500 and similarity weight = 1
Accuracy: 0.9999828078406562
Precision: 0.0
Recall: 0.0
f1_score: 0
For user 63
With n = 10 and threshold = 500 and similarity weight = 0.9
Accuracy: 0.9999828078406562
Precision: 0.0
Recall: 0.0
f1_score: 0
For user 63
With n = 10 and threshold = 500 and similarity weight = 0.8
Accuracy: 0.9999828078406562
Precision: 0.0
Recall: 0.0
f1_score: 0
For user 63
With n = 10 and threshold = 1000 and similarity weight = 1
Accuracy: 0.9999828078406562
Precision: 0.0
Recall: 0.0
f1_score: 0
For user 63
With n = 10 and thr

The paramater values that produce the best precision, recall and f1 score are:
N = 50, threshold = 1000 and similarity weight = 0.9.

We ignore accuracy since the true negative value is very high so it makes accuracy an irrelevant metric. The actual precision, recall and f1 scores are very low. This is becuase we have over 1 million books in the dataset and only 5-81 liked books in each users test set. This means that even recommending 1 book that the user likes is a significant achievement because we can't know if the user likes books that they haven't rated, however it is fair to assume that they would like some of the recommendations (had they rated it) if we recommend at least 1 that they already like.

We now do a final evaluation of the model with the chosen paramaters and average the results from our 3 users. We use a higher N (N = 100) since we want to see the performance of the model, a higher N will improve the chances of recommending a book that the user likes but may reduce precision and recall slightly if we do recommend a book that they like. This is ok for the evaluation step, as it allows us to see if the recommender is recommending any books that the user has already liked. In the actual implementation of the model we will choose a smaller N as we need to take into account the usability for the user, 100 recommendations is not user-friendly.

In [83]:
sum_accuracy = 0
sum_precision = 0
sum_recall = 0
sum_f1 = 0
for user in users_to_evaluate:
    n = 100
    threshold = 1000
    similarity_weight = 0.9
    top_n = get_top_n(book_data, book_vectors, user_profiles[user], train_sets[user], threshold, n, similarity_weight)
    accuracy, precision, recall, f1_score = evaluate_recommendations(top_n, n, test_sets[user], len(book_data)-len(train_sets[user]))
    sum_accuracy += accuracy
    sum_precision += precision
    sum_recall += recall
    sum_f1 += f1_score
print("Overall:")
print("Accuracy:", sum_accuracy/len(users_to_evaluate))
print("Precision:", sum_precision/len(users_to_evaluate))
print("Recall:", sum_recall/len(users_to_evaluate))
print("f1_score:", sum_f1/len(users_to_evaluate))

Overall:
Accuracy: 0.9998292201122778
Precision: 0.006666666666666667
Recall: 0.020731707317073172
f1_score: 0.009218559218559219
