# Import Libraries

In [None]:
import os
import pickle
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

## Load Preprocessed data

In [None]:
# Read the preprocessed ratings table from the CSV in the working directory
book_ratings = pd.read_csv(filepath_or_buffer='processed.csv')

# Popularity Based Recommendation System

In [None]:
# Build the popularity table in a single chain:
#   1. per 'Book-Title', compute 'count' (number of ratings) and 'mean' (average rating)
#   2. keep the 50 titles with the largest 'count' (most popular)
#   3. order those 50 by 'mean', highest-rated first
#   4. turn 'Book-Title' back into a regular column
top_books = (
    book_ratings
    .groupby('Book-Title')['Book-Rating']
    .agg(['count', 'mean'])
    .nlargest(50, columns=['count'])
    .sort_values('mean', ascending=False)
    .reset_index()
)

## Top 50 Books vs. Number of Ratings

In [None]:
# Horizontal bar chart of the 50 most-rated books.
# `top_books` is ordered by average rating, so sort a copy by 'count' to make
# the bars run from the most-rated book down to the least-rated one.
books_by_count = top_books.sort_values('count', ascending=False)

plt.figure(figsize=(10, 12))

# X-axis: 'count' (number of ratings) Y-axis: 'Book-Title' (book names)
plt.barh(books_by_count["Book-Title"], books_by_count["count"], color='skyblue')
plt.xlabel('Count')
plt.ylabel('Book Title')
plt.title('Top 50 Books by Count')
# barh draws the first row at the bottom; invert the Y-axis so the book with
# the highest count ends up at the top
plt.gca().invert_yaxis()

# Display the plot
plt.show()

## Save the Top 50 Books Data

In [None]:
# Ensure an 'artifacts' directory exists inside the parent of the current
# working directory; the processed data is stored there
parent_dir = os.path.split(os.getcwd())[0]
target_dir = os.path.join(parent_dir, 'artifacts')
os.makedirs(target_dir, exist_ok=True)

In [None]:
# Drop the 'count' column as it's no longer needed for final output
# Convert the result to a NumPy array for further processing or analysis
top_books = top_books.drop(columns='count').to_numpy()

# Save the top 50 books NumPy array to a pickle file for later use.
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
with open(os.path.join(target_dir, 'top_50_books.pkl'), 'wb') as pkl_file:
    pickle.dump(top_books, pkl_file)

# Collaborative Filtering Based Recommender System

In [None]:
# Restrict the analysis to users who contributed many ratings.
# Per 'User-ID', count the 'Book-Rating' entries and flag the users with
# more than 50 of them (active users)
x = book_ratings.groupby('User-ID')['Book-Rating'].count().gt(50)

# User-IDs for which the flag is True
active_users = x[x].index

# Report how many active users there are
print('Unique users : ', len(active_users))

In [None]:
# Keep only the rating rows whose 'User-ID' belongs to an active user
is_active_row = book_ratings['User-ID'].isin(active_users)
filtered_user = book_ratings.loc[is_active_row]

# Number of distinct users remaining after the filter
filtered_user['User-ID'].nunique()

In [None]:
# Per 'Book-Title', count the 'Book-Rating' entries and flag the titles with
# 40 or more of them (treated as famous/popular books)
y = book_ratings.groupby('Book-Title')['Book-Rating'].count().ge(40)

# Titles for which the flag is True
famous_books = y[y].index

# Report how many popular books there are
print('Unique books : ', len(famous_books))

In [None]:
# Filter the filtered_user DataFrame to include only the famous books
final_ratings = filtered_user[filtered_user['Book-Title'].isin(famous_books)]
# Drop fully duplicated rows, if any. drop_duplicates() returns a new
# DataFrame, so the result must be assigned back to take effect.
final_ratings = final_ratings.drop_duplicates()
final_ratings.shape

# Item-Based Recommender System (Book)

In [None]:
# Build the book-by-user rating matrix from final_ratings:
#   rows    -> 'Book-Title'
#   columns -> 'User-ID'
#   cells   -> 'Book-Rating' given by that user to that book
# Cells with no rating come out as NaN and are replaced with 0.
pivot_table = final_ratings.pivot_table(
    values='Book-Rating',
    index='Book-Title',
    columns='User-ID',
).fillna(0)
# Show the resulting matrix
pivot_table

In [None]:
# Pairwise cosine similarity between the rows of pivot_table (one row per book)
similarity_scores = cosine_similarity(X=pivot_table)
# Show the resulting similarity matrix
similarity_scores

In [None]:
# Inspect the dimensions of the similarity matrix
similarity_scores.shape

In [None]:
# Wrap the similarity matrix in a DataFrame whose rows are labelled with the
# book titles from pivot_table, so a book's scores can be looked up by title
similarity_data = pd.DataFrame(data=similarity_scores, index=pivot_table.index)

# Define the recommendation function
def recommend_book(book_name: str):
    """Print the titles of the five books most similar to ``book_name``.

    Similarities are read from the module-level ``similarity_data`` frame,
    whose rows are labelled with book titles and whose columns are in the
    same order as those rows.

    Args:
        book_name: Title to look up; must be a row label of ``similarity_data``.

    Raises:
        KeyError: If ``book_name`` is not a row label of ``similarity_data``.
    """
    if book_name not in similarity_data.index:
        raise KeyError(f"'{book_name}' is not present in the similarity data")

    # Row position of the queried book, used to exclude it explicitly.
    # Skipping "whatever sorted first" is not safe: another book can tie with
    # the top score and push the queried book out of first place.
    own_position = similarity_data.index.get_loc(book_name)

    # (position, score) pairs ordered by score, highest first; the sort is
    # stable, so tied books keep their original row order
    ranked = sorted(
        enumerate(similarity_data.loc[book_name]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Top 5 similar books excluding the book itself
    similar_items = [
        position for position, _ in ranked if position != own_position
    ][:5]

    # Print the titles of the recommended books
    for position in similar_items:
        print(similarity_data.index[position])

In [None]:
# Example: print the books most similar to this title
recommend_book("Harry Potter and the Chamber of Secrets (Book 2)")

In [None]:
# Build book-level metadata to store alongside the recommender.
# 'User-ID' is dropped because only per-book information is kept from here on.
book_data = final_ratings.drop(columns=['User-ID'])

# Average 'Book-Rating' per title, exposed as the column 'avg_ratings'
mean_ratings = (
    book_data
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index(name='avg_ratings')
)

# One row per title, with the average rating attached and the individual
# 'Book-Rating' column removed
book_data = (
    book_data
    .drop_duplicates(subset=['Book-Title'])
    .merge(mean_ratings, on='Book-Title')
    .drop(columns='Book-Rating')
)

## Save the processed data

In [None]:
# Save the processed metadata and the similarity DataFrame as pickle files.
# Context managers make sure each file handle is flushed and closed.
with open(os.path.join(target_dir, 'book_data.pkl'), 'wb') as pkl_file:
    pickle.dump(book_data, pkl_file)

with open(os.path.join(target_dir, 'similarity_data.pkl'), 'wb') as pkl_file:
    pickle.dump(similarity_data, pkl_file)

# User-Based Recommender System

In [None]:
# Build the user-by-book rating matrix from final_ratings:
#   rows    -> 'User-ID' (one per user)
#   columns -> 'Book-Title' (one per book)
#   cells   -> 'Book-Rating' the user gave the book (NaN when not rated)
pivot = final_ratings.pivot_table(
    values='Book-Rating',
    index='User-ID',
    columns='Book-Title',
)

In [None]:
# Pearson correlation between users.
# DataFrame.corr() correlates columns, so the matrix is transposed first to
# put users in the columns (books become the rows).
user_similarity = pivot.transpose().corr(method='pearson')
user_similarity.shape

In [None]:
def recommend_books_by_user(user: int, n=5, min_similarity: float = 0.3) -> list:
    """Recommend books to ``user`` based on the ratings of similar users.

    Uses two module-level frames: ``user_similarity`` (user-by-user
    correlation matrix) and ``pivot`` (user-by-book rating matrix, NaN where
    a user has not rated a book).

    Args:
        user: Identifier of the target user; must be a label of ``user_similarity``.
        n: Maximum number of titles to return.
        min_similarity: Only users whose similarity to ``user`` is strictly
            greater than this value are taken into account.

    Returns:
        Up to ``n`` book titles the user has not rated yet, ordered by
        descending weighted score. Empty when no user is similar enough.

    Raises:
        KeyError: If ``user`` is not present in ``user_similarity``.
    """
    if user not in user_similarity.index:
        raise KeyError(f"user {user!r} is not present in the similarity matrix")

    # Drop the target user from similarity matrix to avoid self-comparison
    other_users = user_similarity.drop(index=user)

    # Similarity to the target user, restricted to sufficiently similar users
    similar_users = other_users.loc[other_users[user] > min_similarity, user]
    if similar_users.empty:
        return []

    # Books the target user has already rated
    already_rated = pivot.loc[user].dropna().index

    # Ratings of the similar users, without books none of them rated
    neighbour_ratings = pivot.loc[similar_users.index].dropna(axis=1, how='all')

    # Focus only on books the target user hasn't rated
    candidates = neighbour_ratings.drop(columns=already_rated, errors='ignore')

    weighted_score = {}  # book title -> weighted score
    for book in candidates:
        # Ratings from the similar users who actually rated this book
        ratings = candidates[book].dropna()
        weights = similar_users.loc[ratings.index]

        # Weighted score: mean of (similarity * rating) over those users
        weighted_score[book] = (weights.to_numpy() * ratings.to_numpy()).mean()

    # Titles ordered by weighted score, highest first (stable for ties)
    ranked_titles = sorted(weighted_score, key=weighted_score.get, reverse=True)

    # Return top N recommended books
    return ranked_titles[:n]

In [None]:
# Example: request up to 10 recommendations for user 269566
recommend_books_by_user(269566,10)

## Save the user-item matrix

In [10]:
# Show the current working directory (target_dir is built from its parent)
os.getcwd()

'/home/kudsit/VRS_PRJ/BookCompass-RecommendationSystem/src'

In [None]:
# Persist the user-item matrix (`pivot`); the context manager closes the file.
# NOTE(review): the file is called 'user_similarity.pkl' but the object written
# is `pivot`, not `user_similarity`. The name is kept so existing readers of
# this file keep working - confirm which object is actually intended.
with open(os.path.join(target_dir, 'user_similarity.pkl'), 'wb') as pkl_file:
    pickle.dump(pivot, pkl_file)

----