# Content-Based Filtering

## Load libraries & methods

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

## Read Data

In [3]:
df_books = pd.read_csv('df_books_final.csv')

In [None]:
train_interactions = pd.read_csv('train_interactions.csv')

## Cosine Similarity

### Similarity Matrix Using Book Metadata

We tried to create a similarity matrix using the book metadata (vectorized title & description using Word2Vec, genre, format, and other feature engineered columns). We experimented calculating cosine similarity using both the dense and the sparse matrix representations, but ran into memory issues for both methods as the matrix was too large. Thus, we proceeded with finding similarities between books using TF-IDF of the book descriptions.

In [None]:
# # Concatenate 'title' and 'description' columns into a new column 'title_description'
# df_books['title_description'] = df_books['title'] + " " + df_books['description']

# from gensim.models import Word2Vec

# book_descriptions = df_books['title_description']

# # Tokenize book descriptions
# tokenized_descriptions = [description.split() for description in book_descriptions]

# # Train Word2Vec model
# model = Word2Vec(tokenized_descriptions, vector_size=100, window=5, min_count=1, workers=4)

# # To get a single vector for an entire description, average the word vectors
# def get_vector(text):
#     words = text.split()
#     word_vectors = np.array([model.wv[word] for word in words if word in model.wv])
#     return np.mean(word_vectors, axis=0) if word_vectors.size else np.zeros(model.vector_size)

# # Vectorize descriptions
# descriptions_vectors = np.array([get_vector(description) for description in book_descriptions])

# # Create a DataFrame with 100 new columns for the word vectors
# vector_columns = pd.DataFrame(descriptions_vectors, columns=[f'vector_{i+1}' for i in range(descriptions_vectors.shape[1])])
# # Concatenate the vector columns with the original df_books DataFrame
# df_books_with_vectors = pd.concat([df_books, vector_columns], axis=1)

# df_books_with_vectors = df_books_with_vectors.drop(columns=['top_popular_shelves'])

# df_books_with_vectors.head(3)

# # convert the string representation of lists into actual lists and then extract the first element as an integer
# df_books_with_vectors['author_id_int'] = df_books_with_vectors['author_ids'].apply(lambda x: int(ast.literal_eval(x)[0]) if x else None)
# # type int64
# df_books_with_vectors['author_id_int'] = df_books_with_vectors['author_id_int'].astype('Int64')

# df_books_final = df_books_with_vectors.drop(columns=['author_ids', 'title', 'description', 'title_description'])

# bool_columns = df_books_final.select_dtypes(include=['bool']).columns
# df_books_final[bool_columns] = df_books_final[bool_columns].astype(int)
# print(df_books_final.head(3))


# # Select only the numeric columns (int, float)
# df_books_numeric = df_books_final.select_dtypes(include=[int, float])

# # Convert the DataFrame to a sparse matrix
# from scipy.sparse import csr_matrix
# df_books_sparse = csr_matrix(df_books_numeric.values)


# books_cos_sim = cosine_similarity(df_books_final, dense_output = False)

### Vectorization of Book Description

In [3]:
#Calculation of TFIDF scores for 'description'
tfidf = TfidfVectorizer(stop_words='english')
tfidf_scores =  tfidf.fit_transform(df_books['description'])
tfidf_scores.shape

(59828, 149225)

### Cosine Similarity Matrix

In [4]:
# compute cosine similarity matrix directly for the TF-IDF matrix
tfidf_cos_sim = cosine_similarity(tfidf_scores, dense_output=False)

## Generating Recommendations Using Training Data

In [10]:
# Sort by user_id, review_age (ascending for lowest first), and rating (descending for highest first)
train_interactions_sorted = train_interactions.sort_values(
    by=['user_id', 'review_age', 'rating'], ascending=[True, True, False]
)

### Defining User's History of Books Read

We initially wanted to account for all books in the reading history, but generating recommendations for each book in the history took too much time. Thus, to account for a user's reading history when generating recommendations, we took the 3 most recent and most highly rated books from the users reading history.

In [11]:
# Group by user_id and select top 3 books based on the sorted order
user_reading_history = (
    train_interactions_sorted.groupby('user_id')
    .head(3)  # Take top 3 rows per user
    .groupby('user_id')['book_id']
    .apply(list)  # Aggregate book_ids into a list
    .reset_index()
    .rename(columns={'book_id': 'books_id_read'})
)

In [12]:
user_reading_history.head(5)

Unnamed: 0,user_id,books_id_read
0,0,"[157993, 359079, 41684]"
1,1,"[34524, 236093, 17131769]"
2,2,"[24213, 5, 194755]"
3,3,"[134371, 10444, 42359]"
4,4,"[8144, 5, 6680753]"


We then filtered out the book with the highest similarity with the other books in the top 3 to generate book recommendations for each user. 

In [13]:
# Function to compute cumulative similarity scores and find the reference book
def get_reference_book(books_id_read, books_df, similarity_matrix):
    cumulative_scores = {}
    
    # For each book in the user's reading history
    for book_id in books_id_read:
        try:
            # Get the index of the book in df_books to match the similarity matrix
            book_index = books_df.index[books_df['book_id'] == book_id][0]
            
            # Compute similarity scores with all other books in books_id_read
            similarity_scores = [
                similarity_matrix[book_index, books_df.index[books_df['book_id'] == other_book_id][0]]
                for other_book_id in books_id_read if other_book_id != book_id
            ]
            
            # Calculate cumulative similarity score for the current book
            cumulative_scores[book_id] = sum(similarity_scores)
          
        except IndexError:
            # Skip books that may not be in the similarity matrix
            continue
    # Find the book with the highest cumulative similarity score
    reference_book = max(cumulative_scores, key=cumulative_scores.get) if cumulative_scores else None
    return reference_book

In [14]:
# Apply the function to find the reference book for each user
user_reading_history['reference_book'] = user_reading_history['books_id_read'].apply(
    lambda books_id_read: get_reference_book(books_id_read, df_books, tfidf_cos_sim)
)

user_reading_history.head(5)

Unnamed: 0,user_id,books_id_read,reference_book
0,0,"[157993, 359079, 41684]",41684
1,1,"[34524, 236093, 17131769]",34524
2,2,"[24213, 5, 194755]",24213
3,3,"[134371, 10444, 42359]",42359
4,4,"[8144, 5, 6680753]",8144


In [None]:
# Function to get top N most similar books for a reference book
def get_top_n_similar_books(reference_book_id, books_df, similarity_matrix, top_n=5):
    # Get the index of the reference book in books_df
    try:
        ref_book_index = books_df.index[books_df['book_id'] == reference_book_id][0]
    except IndexError:
        # Return an empty list if the reference_book_id is not found in books_df
        return []
    
    # Get similarity scores for all books with respect to the reference book
    similarity_scores = similarity_matrix[ref_book_index].toarray().flatten()
    
    # Create a list of (book_id, similarity_score) tuples, excluding the reference book itself
    similar_books = [
        (books_df.iloc[i]['book_id'], similarity_scores[i])
        for i in range(len(similarity_scores)) if books_df.iloc[i]['book_id'] != reference_book_id
    ]
    
    # Sort by similarity score in descending order and take top N
    top_similar_books = sorted(similar_books, key=lambda x: x[1], reverse=True)[:top_n]
    
    # Extract book IDs of the top similar books
    top_book_ids = [book_id for book_id, _ in top_similar_books]
    return top_book_ids

In [None]:
# test case: Recommend books for sample user 7

user_id = 7
user_7_data = user_reading_history[user_reading_history['user_id'] == user_id]

# Ensure user 7 has a reference book
if not user_7_data.empty:
    reference_book_id = user_7_data['reference_book'].values[0]
    
    # Get the top 5 recommendations for user 7's reference book
    recommendations_user_7 = get_top_n_similar_books(reference_book_id, df_books, tfidf_cos_sim, top_n=5)
    
    print(f"Top 5 recommendations for User {user_id} (Reference Book {reference_book_id}):")
    print(recommendations_user_7)
else:
    print(f"User {user_id} not found in user_reading_history.")


Top 5 recommendations for User 7 (Reference Book 32929):
[1099989, 232381, 94559, 1099991, 32932]


We recommend 5 books for each of the first 1000 users to evaluate the recommendations. 

In [None]:
# Top 1000 rows of user_reading_history
top_1000_user_reading_history = user_reading_history.head(1000)

In [24]:
# Apply the function to each user in user_reading_history to get recommendations
top_1000_user_reading_history['recommendations'] = top_1000_user_reading_history['reference_book'].apply(
    lambda ref_book: get_top_n_similar_books(ref_book, df_books, tfidf_cos_sim, top_n=5)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_1000_user_reading_history['recommendations'] = top_1000_user_reading_history['reference_book'].apply(


## Evaluation

In [17]:
test_interactions = pd.read_csv('test_interactions.csv')
test_interactions.head(3)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment
0,0,343002,5,2012-01-25 00:26:06+00:00,,4641,0.574139
1,0,1852,4,2012-01-21 18:36:06+00:00,,4644,0.527973
2,1,1248128,4,2009-05-04 15:32:21+00:00,,5636,0.527973


In [18]:
# Group by user_id and select top 3 books based on the sorted order
user_test = (
    test_interactions.groupby('user_id')
    .head(3)  # Take top 3 rows per user
    .groupby('user_id')['book_id']
    .apply(list)  # Aggregate book_ids into a list
    .reset_index()
    .rename(columns={'book_id': 'books_id_read'})
)

In [None]:
user_test_top_1000 = user_test.head(1000)

In [21]:
# Function to calculate the average similarity score among all actual read books and recommended books
def calculate_avg_similarity(user_id, actual_books_read, recommendations, books_df, similarity_matrix):
    all_similarity_scores = []
    
    for actual_book_id in actual_books_read:
        try:
            # Get the index of the actual book in books_df
            actual_book_index = books_df.index[books_df['book_id'] == actual_book_id][0]
        except IndexError:
            # Skip if the actual book is not in books_df
            all_similarity_scores.append(None)
            continue
        
        # Calculate similarity between each recommended book and the actual books read
        for recommended_book_id in recommendations:
            try:
                recommended_book_index = books_df.index[books_df['book_id'] == recommended_book_id][0]
                similarity_score = similarity_matrix[actual_book_index, recommended_book_index]
                all_similarity_scores.append(similarity_score)
            except IndexError:
                # Skip if the recommended book is not in books_df
                continue
    
    return np.average(all_similarity_scores)

In [26]:
# Merge the top_3_user_reading_history and user_test_top_3 dataframes to align recommendations with actual reads
merged_df = pd.merge(
    top_1000_user_reading_history[['user_id', 'recommendations']],
    user_test_top_1000[['user_id', 'books_id_read']],
    on='user_id'
)

merged_df['avg_similarity_score'] = merged_df.apply(
    lambda row: calculate_avg_similarity(
        row['user_id'],
        row['books_id_read'],
        row['recommendations'],
        df_books,
        tfidf_cos_sim
    ),
    axis=1
)

# Show the results
merged_df[['user_id', 'books_id_read', 'recommendations', 'max_similarity_scores','avg_similarity_score']]

Unnamed: 0,user_id,books_id_read,recommendations,max_similarity_scores,avg_similarity_score
0,0,"[343002, 1852]","[1150470, 1009934, 3426335, 9046908, 29363115]","[0.04252608615023137, 0.047140118602314714]",0.020710
1,1,"[1248128, 30119]","[16190512, 1343087, 26109114, 1847543, 842002]","[0.01034420398048235, 0.013824423369103476]",0.005388
2,2,"[74595, 2711313]","[12197986, 38686, 1068933, 1956182, 70453]","[0.022517120971567876, 0.005899712413665775]",0.007727
3,3,"[30119, 157993]","[2830856, 3200616, 822392, 42331, 370495]","[0.020098251059419647, 0.022648563485515742]",0.006475
4,4,"[13023, 157993]","[19290672, 823598, 18051057, 18051056, 3017558]","[0.03238304484039952, 0.016681591975954726]",0.017383
...,...,...,...,...,...
995,995,"[24178, 78411]","[30139, 30137, 838059, 838073, 30132]","[0.04874861866795203, 0.016531585364157587]",0.024292
996,996,"[267972, 275000]","[1252480, 847692, 1252452, 1252496, 446848]","[0.024311902137199932, 0.021052906385195583]",0.012646
997,997,"[25618438, 23754884]","[17190400, 16054810, 18594577, 1329131, 6871059]","[0.023090530125603085, 0.007369528119377128]",0.007771
998,998,"[140225, 5]","[1222825, 7063752, 334013, 1121411, 2956698]","[0.05729717480779167, 0.014608194798619899]",0.035362


### Read in recommendation & similarity scores from CSV (merged_df exceeds local memory limit)

In [27]:
merged_df = pd.read_csv("1000user_recommendations(content).csv")

### Counting the number of matches in recommendations and actually read books for every user recommended

In [28]:
merged_df.head(3)

Unnamed: 0,user_id,recommendations,books_id_read,max_similarity_scores,avg_similarity_score,num_exact_match,avg_max_similarity_scores
0,0,"[1150470, 1009934, 3426335, 9046908, 29363115]","[343002, 1852]","[0.04252608615023137, 0.047140118602314714]",0.02071,0,0.044833
1,1,"[16190512, 1343087, 26109114, 1847543, 842002]","[1248128, 30119]","[0.01034420398048235, 0.013824423369103476]",0.005388,0,0.012084
2,2,"[12197986, 38686, 1068933, 1956182, 70453]","[74595, 2711313]","[0.022517120971567876, 0.005899712413665775]",0.007727,0,0.014208


In [10]:
merged_df['avg_similarity_score'].describe()

count    1000.000000
mean        0.027905
std         0.054445
min         0.000000
25%         0.007529
50%         0.012594
75%         0.020113
max         0.395172
Name: avg_similarity_score, dtype: float64

We initially considered using a threshold similarity score to evaluate the accuracy of the model. However, setting the threshold too low would give us a high accuracy even when recommendations are likely to be bad, but setting the threshold too high would not be favourable either. Thus, as there was no benchmark or research showing what threshold we should use, we ultimately decided to evaluate our model based off the average similarity score of the predicted books with the books in the test set.

As seen in the table above, similarities between recommendations and actually read books are generally quite low with the median average score at 0.0126. This indicates that the recommendations provided to each user are not that similar to previously read books. However, this may also be due to an absence of relevant books in the training data due to memory constraints in our local machines. Furthermore, we only generated recommendations for the first 1000 users in the dataset and thus, our results might not be representative of the population in our entire dataset.

### Accuracy

Hit Rate: Check if the 5 recommendations for each user are actually read in the test set

Validity of recommendation: Calculate the average similarity score between the recommendations and read books 

In [24]:
# Initialize lists to hold true/false positives and negatives
all_y_true = []
all_y_pred = []
total_hits = 0

# Calculate TP, FP, FN for each user
for _, row in merged_df.iterrows():
    recommendations = list(map(int, row['recommendations'].strip("[]").split(", ")))
    books_read = list(map(int, row['books_id_read'].strip("[]").split(", ")))
    
    tp = len(set(recommendations) & set(books_read))  # True Positives: in both lists
    fp = len(set(recommendations) - set(books_read))  # False Positives: recommended but not read
    fn = len(set(books_read) - set(recommendations))  # False Negatives: read but not recommended

    total_hits += tp
    
    # Append results to the lists
    # True positives are marked as 1 in both y_true and y_pred
    all_y_true.extend([1] * tp)
    all_y_pred.extend([1] * tp)
    
    # False positives are marked as 0 in y_true and 1 in y_pred
    all_y_true.extend([0] * fp)
    all_y_pred.extend([1] * fp)
    
    # False negatives are marked as 1 in y_true and 0 in y_pred
    all_y_true.extend([1] * fn)
    all_y_pred.extend([0] * fn)

In [25]:
# hit rate = number of relevant recommendations/ total recommendations
total_users = 1000
accuracy_of_exact_hits = total_hits/total_users * 100 # checking for accuracy of recommendation. hits/total_books
print(f"The accuracy of the recommendation system in terms of exact match is {accuracy_of_exact_hits:.2f}%")

The accuracy of the recommendation system in terms of exact match is 1.10%


### Precision, Recall, F1-Score, Confusion Matrix

In [26]:
#Precision@K
precision = precision_score(all_y_true,  all_y_pred)
print(f"Precision@K: {precision}")

#Recall@K
recall = recall_score( all_y_true,  all_y_pred)
print(f"Recall@K: {recall}")

#F1 Score
f1 = f1_score(all_y_true, all_y_pred)
print(f"F1 Score: {f1}")

#Confusion Matrix
conf_matrix = confusion_matrix(all_y_true, all_y_pred)
print("Confusion Matrix:\n", conf_matrix)

Precision@K: 0.0022
Recall@K: 0.0055
F1 Score: 0.003142857142857143
Confusion Matrix:
 [[   0 4989]
 [1989   11]]
