In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from gensim.models import Word2Vec
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df_books = pd.read_csv('df_books_final.csv')

combine test and training
for each user id inside, recommend some books,
compare that recommendation with what he actually read 

### using df_books_final as main books dataset (already sampled 70%)

In [3]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_scores =  tfidf.fit_transform(df_books['description'])
tfidf_scores.shape

(59828, 149225)

In [4]:
# cosine similarity matrix directly for the sparse TF-IDF matrix
tfidf_cos_sim = cosine_similarity(tfidf_scores, dense_output=False)

# recommendation on training set

In [5]:
train_interactions = pd.read_csv('train_interactions.csv')

In [10]:
train_interactions.head(10)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment
0,0,157993,5,2016-07-18 19:34:48+00:00,,3004,0.574139
1,0,359079,4,2014-07-16 19:28:57+00:00,,3737,0.527973
2,0,41684,4,2014-07-16 13:45:50+00:00,,3738,0.527973
3,0,180617,4,2012-08-06 19:55:50+00:00,,4446,0.527973
4,0,13023,3,2012-01-30 10:17:30+00:00,,4636,0.40686
5,0,22917,5,2012-01-25 00:30:02+00:00,,4641,0.574139
6,0,46306,5,2012-01-25 00:26:44+00:00,,4641,0.574139
7,1,34524,4,2014-12-30 02:41:38+00:00,,3571,0.527973
8,1,236093,3,2014-11-11 15:56:15+00:00,,3619,0.40686
9,1,17131769,3,2014-05-20 22:49:29+00:00,,3794,0.40686


In [6]:
# Sort by user_id, review_age (ascending for lowest first), and rating (descending for highest first)
train_interactions_sorted = train_interactions.sort_values(
    by=['user_id', 'review_age', 'rating'], ascending=[True, True, False]
)

In [7]:
# Group by user_id and select top 5 books based on the sorted order
user_reading_history = (
    train_interactions_sorted.groupby('user_id')
    .head(3)  # Take top 5 rows per user
    .groupby('user_id')['book_id']
    .apply(list)  # Aggregate book_ids into a list
    .reset_index()
    .rename(columns={'book_id': 'books_id_read'})
)

In [8]:
user_reading_history.head(5)

Unnamed: 0,user_id,books_id_read
0,0,"[157993, 359079, 41684]"
1,1,"[34524, 236093, 17131769]"
2,2,"[24213, 5, 194755]"
3,3,"[134371, 10444, 42359]"
4,4,"[8144, 5, 6680753]"


In [11]:
# Function to compute cumulative similarity scores and find the reference book
def get_reference_book(books_id_read, books_df, similarity_matrix):
    cumulative_scores = {}
    
    # For each book in the user's reading history
    for book_id in books_id_read:
        try:
            # Get the index of the book in df_books to match the similarity matrix
            book_index = books_df.index[books_df['book_id'] == book_id][0]
            
            # Compute similarity scores with all other books in books_id_read
            similarity_scores = [
                similarity_matrix[book_index, books_df.index[books_df['book_id'] == other_book_id][0]]
                for other_book_id in books_id_read if other_book_id != book_id
            ]
            
            # Calculate cumulative similarity score for the current book
            cumulative_scores[book_id] = sum(similarity_scores)
            print(cumulative_scores)
        except IndexError:
            print(f"Error")
            print(book_id)
            # Skip books that may not be in the similarity matrix
            continue
    # Find the book with the highest cumulative similarity score
    # print(cumulative_scores)
    # print(
    #     max(cumulative_scores.items(), key = lambda x: x[1])
    # )
    # print('---------')
    reference_book = max(cumulative_scores, key=cumulative_scores.get) if cumulative_scores else None
    return reference_book

In [12]:
# Apply the function to find the reference book for each user
user_reading_history['reference_book'] = user_reading_history['books_id_read'].apply(
    lambda books_id_read: get_reference_book(books_id_read, df_books, tfidf_cos_sim)
)

user_reading_history.head(5)

{157993: 0.035170646167702754}
{157993: 0.035170646167702754, 359079: 0.06303543963739376}
{157993: 0.035170646167702754, 359079: 0.06303543963739376, 41684: 0.0702621617546453}
{157993: 0.035170646167702754, 359079: 0.06303543963739376, 41684: 0.0702621617546453}
(41684, 0.0702621617546453)
---------
{34524: 0.0543940020513642}
{34524: 0.0543940020513642, 236093: 0.02295328811729306}
{34524: 0.0543940020513642, 236093: 0.02295328811729306, 17131769: 0.03144071393407114}
{34524: 0.0543940020513642, 236093: 0.02295328811729306, 17131769: 0.03144071393407114}
(34524, 0.0543940020513642)
---------
{24213: 0.020550954530531594}
{24213: 0.020550954530531594, 5: 0.009902639691331809}
{24213: 0.020550954530531594, 5: 0.009902639691331809, 194755: 0.010648314839199787}
{24213: 0.020550954530531594, 5: 0.009902639691331809, 194755: 0.010648314839199787}
(24213, 0.020550954530531594)
---------
{134371: 0.015301324466147133}
{134371: 0.015301324466147133, 10444: 0.017894935705416684}
{134371: 0.0

Unnamed: 0,user_id,books_id_read,reference_book
0,0,"[157993, 359079, 41684]",41684
1,1,"[34524, 236093, 17131769]",34524
2,2,"[24213, 5, 194755]",24213
3,3,"[134371, 10444, 42359]",42359
4,4,"[8144, 5, 6680753]",8144


In [19]:
# def get_top_n_similar_books(reference_book_id, books_df, similarity_matrix, top_n=5):
#     # Get the index of the reference book in books_df
#     try:
#         ref_book_index = books_df.index[books_df['book_id'] == reference_book_id][0]
#     except IndexError:
#         # Return an empty list if the reference_book_id is not found in books_df
#         return []

#     # Get similarity scores for all books with respect to the reference book
#     similarity_scores = similarity_matrix[ref_book_index].toarray().flatten()

#     # Create a list of (book_id, similarity_score) tuples, excluding the reference book itself
#     similar_books = [
#         (books_df.iloc[i]['book_id'], books_df.iloc[i]['title'], similarity_scores[i])
#         for i in range(len(similarity_scores))
#         if books_df.iloc[i]['book_id'] != reference_book_id
#     ]

#     # Sort by similarity score in descending order and take top N
#     top_similar_books = sorted(similar_books, key=lambda x: x[2], reverse=True)[:top_n]

#     # Extract titles of the top similar books
#     top_book_titles = [title for _, title, _ in top_similar_books]
#     return top_book_titles

def get_top_n_similar_books(reference_book_id, books_df, similarity_matrix, top_n=5):
    # Get the index of the reference book in books_df
    try:
        ref_book_index = books_df.index[books_df['book_id'] == reference_book_id][0]
    except IndexError:
        # Return an empty list if the reference_book_id is not found in books_df
        return []

    # Get similarity scores for all books with respect to the reference book
    similarity_scores = similarity_matrix[ref_book_index].toarray().flatten()

    # Create a list of (book_id, title, similarity_score) tuples, excluding the reference book itself
    similar_books = [
        (books_df.iloc[i]['book_id'], books_df.iloc[i]['title'], similarity_scores[i])
        for i in range(len(similarity_scores))
        if books_df.iloc[i]['book_id'] != reference_book_id
    ]

    # Sort by similarity score in descending order and take top N
    top_similar_books = sorted(similar_books, key=lambda x: x[2], reverse=True)[:top_n]

    # Return top similar books with ID, title, and similarity score
    return top_similar_books


In [20]:
# # test case: Recommend books for sample user 7
# sample_user_id = 7  # Replace with an actual user ID from user_reading_history
# recommendations = get_top_n_similar_books(sample_user_id, df_books, tfidf_cos_sim, top_n=3)
# print(f"Top recommendations for User {sample_user_id}: {recommendations}")

# Select user 7's reference book
user_id = 7
user_7_data = user_reading_history[user_reading_history['user_id'] == user_id]

# Ensure user 7 has a reference book
if not user_7_data.empty:
    reference_book_id = user_7_data['reference_book'].values[0]
    
    # Get the top 5 recommendations for user 7's reference book
    recommendations_user_7 = get_top_n_similar_books(reference_book_id, df_books, tfidf_cos_sim, top_n=5)
    
    print(f"Top 5 recommendations for User {user_id} (Reference Book {reference_book_id}):")
    print(recommendations_user_7)
else:
    print(f"User {user_id} not found in user_reading_history.")


Top 5 recommendations for User 7 (Reference Book 32929):
[(1099989, 'Goodnight Moon', 1.0), (232381, 'Goodnight Moon', 0.9712501318713651), (94559, 'Goodnight Moon Big Book', 0.9363356669838709), (1099991, 'Goodnight Moon', 0.9206638259338205), (32932, 'Over the Moon: A Collection of First Books: Goodnight Moon, The Runaway Bunny, and My World', 0.5257615189781027)]


In [24]:
# top 500 rows of user_reading_history
top_3_user_reading_history = user_reading_history.head(3)
top_3_user_reading_history

Unnamed: 0,user_id,books_id_read,reference_book
0,0,"[157993, 359079, 41684]",41684
1,1,"[34524, 236093, 17131769]",34524
2,2,"[24213, 5, 194755]",24213


In [26]:
# Apply the function to each user in user_reading_history to get recommendations
top_3_user_reading_history['recommendations'] = top_3_user_reading_history['reference_book'].apply(
    lambda ref_book: get_top_n_similar_books(ref_book, df_books, tfidf_cos_sim, top_n=5)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_3_user_reading_history['recommendations'] = top_3_user_reading_history['reference_book'].apply(


In [27]:
top_3_user_reading_history

Unnamed: 0,user_id,books_id_read,reference_book,recommendations
0,0,"[157993, 359079, 41684]",41684,"[(1150470, The Jungle Book, 1.0000000000000002..."
1,1,"[34524, 236093, 17131769]",34524,"[(16190512, Sam and Charlie, 0.268201691210449..."
2,2,"[24213, 5, 194755]",24213,"[(12197986, Alice's Adventures in Wonderland a..."


this code below recommends book for entire user reading history

In [18]:
# # Apply the function to each user in user_reading_history to get recommendations
# user_reading_history['recommendations'] = user_reading_history['reference_book'].apply(
#     lambda ref_book: get_top_n_similar_books(ref_book, df_books, tfidf_cos_sim, top_n=5)
# )

# # Display the recommendations
# user_reading_history[['user_id', 'reference_book', 'recommendations']].head()

KeyboardInterrupt: 

In [None]:
# # books each user has read 
# def user_profiling(user_books, books_df, similarity_matrix):
#     vectors = []
#     for book_id in user_books:
#         try:
#             # get index of the book in df_books_sampled to match the similarity matrix
#             book_index = books_df.index[books_df['book_id'] == book_id][0]
#             vectors.append(similarity_matrix[book_index].toarray().flatten())
#         except IndexError:
#             continue

#     if vectors:
#         user_profile_vector = np.mean(vectors, axis=0)
#     else:
#         user_profile_vector = np.zeros(similarity_matrix.shape[1])  # Handle edge cases
#     return user_profile_vector

In [None]:
# def recommend_books_for_user(user_id, books_df, similarity_matrix, top_n=5):
#     # Retrieve the list of books read by the user
#     user_books = user_reading_history[user_reading_history['user_id'] == user_id]['books_id_read'].values[0]
    
#     # Calculate user profile vector based on reading history
#     user_profile_vector = user_profiling(user_books, books_df, similarity_matrix)
#     scores = []
    
#     # Calculate similarity between user profile and all books in df_books_sampled
#     for idx, book_id in enumerate(books_df['book_id']):
#         if book_id not in user_books:  # Exclude books the user has already read
#             similarity_score = np.dot(user_profile_vector, similarity_matrix[idx].toarray().flatten())
#             scores.append((book_id, similarity_score))
    
#     # similarity score in descending order and get top 5 recommendations
#     top_recommendations = sorted(scores, key=lambda x: x[1], reverse=True)[:top_n]
    
#     # Return titles of recommended books
#     recommended_books = [books_df[books_df['book_id'] == book_id]['title'].values[0] for book_id, _ in top_recommendations]
#     return recommended_books

In [None]:
# # recommended books for each user
# user_reading_history['recommendations'] = user_reading_history['user_id'].apply(
#     lambda user_id: recommend_books_for_user(user_id, df_books, tfidf_cos_sim, top_n=5)
# )

In [None]:
# # Generate user profile vectors for all users at once
# def get_user_profile_vectors(user_reading_history, books_df, similarity_matrix):
#     user_profiles = []
#     for user_books in user_reading_history['books_id_read']:
#         user_profiles.append(user_profiling(user_books, books_df, similarity_matrix))
#     return np.array(user_profiles)

# # Get similarity scores for all books for each user at once
# def recommend_books_batch(user_profile_vectors, similarity_matrix, books_df, user_reading_history, top_n=5):
#     all_recommendations = []
#     # Matrix multiplication to get scores for all books for each user
#     scores_matrix = np.dot(user_profile_vectors, similarity_matrix.T)
    
#     for user_index, user_books in enumerate(user_reading_history['books_id_read']):
#         # Mask out books already read by the user
#         mask = np.isin(books_df['book_id'], user_books)
#         scores_matrix[user_index][mask] = -np.inf
        
#         # Get top recommendations
#         top_book_indices = np.argpartition(scores_matrix[user_index], -top_n)[-top_n:]
#         top_books = books_df.iloc[top_book_indices]['title'].values
#         all_recommendations.append(top_books)
    
#     return all_recommendations

# # Generate user profile vectors for all users at once
# user_profile_vectors = get_user_profile_vectors(user_reading_history, df_books, tfidf_cos_sim)

# # Get recommendations for each user
# user_recommendations = recommend_books_batch(user_profile_vectors, tfidf_cos_sim, df_books, user_reading_history, top_n=5)
# user_reading_history['recommendations'] = user_recommendations

### Using testing set

In [None]:
test_interactions = pd.read_csv('test_interactions.csv')
test_interactions.head(3)

In [None]:
# filter for those books that exist in df_book_sampled, and only for users that are in training for evaluation
test_interactions_filtered = test_interactions[
    test_interactions['book_id'].isin(df_books_sampled['book_id']) &
    test_interactions['user_id'].isin(train_interactions['user_id'])
]

In [None]:
# collate books read by each user
test_aggregated = test_interactions_filtered.groupby('user_id')['book_id'].apply(list).reset_index()
test_aggregated.columns = ['user_id', 'books_read_in_test']

In [None]:
# merge test_aggregated with user_reading_history on user_id
merged_data = pd.merge(user_reading_history, test_aggregated, on='user_id', how='inner')

In [None]:
def evaluate_recommendations(merged_data, books_df, similarity_matrix, threshold=0.2):
    user_scores = []

    for _, row in merged_data.iterrows():
        user_id = row['user_id']
        actual_books = row['books_read_in_test']
        recommended_books = row['books_recommendation']

        total_similarity = 0
        count = 0

        for actual_book in actual_books:
            for recommended_book in recommended_books:
                # get similarity from similarity_matrix
                idx_actual = books_df[books_df['book_id'] == actual_book].index[0]
                idx_recommended = books_df[books_df['book_id'] == recommended_book].index[0]
                similarity = similarity_matrix[idx_actual, idx_recommended]
                
                # Accumulate total similarity and count for weighted average
                total_similarity += similarity
                count += 1

        # average similarity score for the user's recommendations
        avg_similarity = total_similarity / count if count > 0 else 0

        # check if average similarity meets the threshold
        user_scores.append({
            'user_id': user_id,
            'avg_similarity': avg_similarity,
            'meets_threshold': avg_similarity >= threshold
        })

    return pd.DataFrame(user_scores)

# ignore

test

In [None]:
# similarity_threshold = 0.5

# evaluation_results = []

# # For each user, compare the similarity between recommended and actual book
# for user_id, user_df in filtered_train_interactions.groupby('user_id'):
#     # Get the recommended book_id for the user (assuming only 1 recommendation)
#     recommended_book_id = user_df['user_recommendations'].iloc[0][0]  # since it's a list
    
#     # Get the actual book_id that the user read in the test set
#     actual_book_id = filtered_test_interactions[filtered_test_interactions['user_id'] == user_id]['book_id'].iloc[0]
    
#     # Get the indices of the books in the similarity matrix
#     recommended_index = df_books_final_with_desc_sampled[df_books_final_with_desc_sampled['book_id'] == recommended_book_id].index[0]
#     actual_index = df_books_final_with_desc_sampled[df_books_final_with_desc_sampled['book_id'] == actual_book_id].index[0]
    
#     # Get the similarity score between the recommended and actual book
#     similarity_score = tfidf_cos_sim[recommended_index, actual_index]
    
#     # Check if the similarity score is above the threshold
#     if similarity_score >= similarity_threshold:
#         evaluation_results.append((user_id, recommended_book_id, actual_book_id, similarity_score, True))
#     else:
#         evaluation_results.append((user_id, recommended_book_id, actual_book_id, similarity_score, False))

# # Convert the results to a DataFrame for easier analysis
# evaluation_df = pd.DataFrame(evaluation_results, columns=['user_id', 'recommended_book_id', 'actual_book_id', 'similarity_score', 'correct'])

# # Calculate the accuracy (proportion of correct recommendations)
# accuracy = evaluation_df['correct'].mean()
# print(f"Recommendation accuracy: {accuracy:.2f}")

In [None]:
# Define a function to calculate similarity score between two book_ids
def calculate_similarity(book_id_1, book_id_2, books_df, similarity_matrix):
    try:
        index_1 = books_df.index[books_df['book_id'] == book_id_1].tolist()[0]
        index_2 = books_df.index[books_df['book_id'] == book_id_2].tolist()[0]
        similarity_score = similarity_matrix[index_1, index_2]
        return similarity_score
    except IndexError:
        return None  # none if the book_id doesnt exist in the similarity matrix

similarity_threshold = 0.15 #threshold for evaluation
results = []

for _, row in filtered_test_interactions.iterrows():
    user_id = row['user_id']
    actual_book_id = row['book_id']

    # Get the recommended book_id from your user_recommendations dictionary (assuming 1 recommendation per user)
    recommended_books = user_recommendations.get(user_id, [])
    
    # check if recommendations exist for the user
    if recommended_books:
        recommended_book_id = recommended_books[0]  # take the first recommendation
        
        # similarity between actual and recommended book
        similarity = calculate_similarity(actual_book_id, recommended_book_id, df_books_final_with_desc_sampled, tfidf_cos_sim)
        
        # store the results 
        if similarity is not None:
            success = 1 if similarity >= similarity_threshold else 0
            results.append({
                'user_id': user_id,
                'actual_book_id': actual_book_id,
                'recommended_book_id': recommended_book_id,
                'similarity': similarity,
                'success': success
            })
        else:
            results.append({
                'user_id': user_id,
                'actual_book_id': actual_book_id,
                'recommended_book_id': recommended_book_id,
                'similarity': None,
                'success': 0
            })

evaluation_df = pd.DataFrame(results)
success_rate = evaluation_df['success'].mean()
print(f"Recommendation Success Rate: {success_rate * 100:.2f}%")
print(evaluation_df.head())


In [None]:
evaluation_df_sorted = evaluation_df.sort_values(by='similarity', ascending=False)
print(evaluation_df_sorted.head())

In [None]:
evaluation_df_sorted.shape