# Book Recommendation Algorithm



In [1]:
# Import Python Libraries (Lau Chien Yi & Ooi Jin Kun)

In this step, we import the Python libraries 'pandas', 'numpy' and 'scipy.stats'. These libraries are used for data processing and calculations.

In addition, we import 'seaborn' for visualization.

In [None]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sb

# Define tunable parameters
# Minimum number of ratings a book needs to stay a candidate in the
# item-based section (compared against rating_count).
target_vote_number = 100
# How many books the item-based section keeps in its final result list.
numberOfResult = 5

import warnings
# Suppresses every Python warning for the rest of the session, which also
# hides pandas deprecation / chained-assignment warnings.
warnings.filterwarnings('ignore')

# Download and Read Data

### Rating Dataset :

In [None]:
ratings = pd.read_csv("ratings.csv",nrows=90000)

In [None]:
ratings.head()

In [None]:
# Get ratings dataset information
ratings.info()

In [None]:
# Number of users
print('The ratings dataset has', ratings['user_id'].nunique(), 'unique users')

# Number of books
print('The ratings dataset has', ratings['book_id'].nunique(), 'unique books')

# Number of ratings
print('The ratings dataset has', ratings['rating'].nunique(), 'unique ratings')

# List of unique ratings
print('The unique ratings are', sorted(ratings['rating'].unique()))

### Book Dataset :


In [None]:
books = pd.read_csv("books.csv",nrows=90000)

In [None]:
books.head()

In [None]:
books.info()

In [None]:
# Filter data
books = books[['book_id','title','authors','original_publication_year']]

books.head()


In [None]:
# Get books dataset information
books.info()

# Data Preprocessing


### Change Variable type

In [None]:
# Convert the text columns from the generic object dtype to pandas' string dtype
text_columns = ['title', 'authors']
books = books.astype(dict.fromkeys(text_columns, 'string'))
books.info()

In [None]:
### Check Missing Value

In [None]:
ratings.isna().sum()

In [None]:
books.isna().sum()

In [None]:
# Drop missing values in original_publication_year column in books dataset
books.dropna(axis=0, inplace = True)
books.isna().sum()

### Merge Dataset by book_id

Using 'book_id' as the matching key, we append the book information to the ratings dataset and name the result 'ratings_books'. The book title and the book ratings are then available in the same dataset.

In [None]:
# Merge ratings and books datasets
ratings_books = pd.merge(ratings, books, on='book_id', how='inner')

ratings_books.head()

In [None]:
ratings_books.info()

In [None]:
# Number of users
print('The ratings dataset has', ratings_books['user_id'].nunique(), 'unique users')
# Number of books
print('The ratings dataset has', ratings_books['book_id'].nunique(), 'unique books')
# Number of ratings
print('The ratings dataset has', ratings_books['rating'].nunique(), 'unique ratings')
# List of unique ratings
print('The unique ratings are', sorted(ratings_books['rating'].unique()))

# User-Based Collaborative Filtering (Lau Chien Yi)

Definition:

User-Based Collaborative Filtering makes recommendations based on user product
interactions in the past. The assumption behind the algorithm is that similar
users like similar products.
    

### Exploratory Data Analysis (EDA)

In [None]:
# Aggregate by books
agg_ratings_books = ratings_books.groupby('book_id').agg(mean_rating = ('rating', 'mean'),number_of_ratings = ('rating', 'count')).reset_index()

agg_ratings_books.info()

In [None]:
# Check popular based on the number of ratings they have received
agg_ratings_books.sort_values(by='number_of_ratings', ascending=False).head(10)

In [None]:
# Visualization
sb.jointplot(x='mean_rating', y='number_of_ratings', data=agg_ratings_books)

### Create User-Book Matrix (pivot table)
Transform the dataset into a matrix format. The rows of the matrix are users, and the columns of the matrix are books. The value of the matrix is the user rating of the books if there is a rating. Otherwise, it shows ‘NaN’.

In [None]:
# Create user-book matrix: one row per user_id, one column per book_id,
# cells hold the rating and stay NaN where the user did not rate the book
user_book_matrix = pd.pivot_table(ratings_books, values='rating', index='user_id', columns='book_id')
user_book_matrix.head()

### Data Normalization

In [None]:
# Normalize user-book matrix by centring each user's ratings on that user's
# own mean rating, so a cell shows whether the book was rated above or below
# the user's average
user_mean_rating = user_book_matrix.mean(axis=1)
matrix_norm = user_book_matrix.sub(user_mean_rating, axis=0)

# negative value = book rated below the user's average rating
# positive value = book rated above the user's average rating
matrix_norm.head()

### Pearson Correlation

In [None]:
# User similarity matrix: Pearson correlation between every pair of users
# (users become columns after the transpose, and corr() compares columns)
user_sim = matrix_norm.transpose().corr(method='pearson')

# positive value = similar user (same book preference)
# negative value = dissimilar user (opposite book preference)
user_sim.head()

### Identify Similar User (Given Scenario)

In [None]:
# Make a copy
user_similarity = user_sim.copy()

# Pick a target user
target_userID = 35

# Remove target user ID from the user_similarity matrix
user_similarity.drop(index=target_userID, inplace=True)

user_similarity.head()

In [None]:
# Number of similar users to get (Top 10 most similar user for target user)
n = 10

# User similarity threshold (to make sure the Pearson correlation >0.3)
user_similarity_threshold = 0.3

# Get top n similar users
top_similar_users = user_similarity[user_similarity[target_userID]>user_similarity_threshold][target_userID].sort_values(ascending=False)[:n]

print(f'Top {n} similar users for user {target_userID} :\n', top_similar_users)

# this code calculates and prints the top similar users for the specified
# target user, considering the user similarity threshold. These similar users
# can be used in collaborative filtering recommendation systems to suggest
# books that users with similar preferences enjoyed.

### Input a userID to identify the similar users

In [None]:
def get_similar_user(userID, user_similarity_threshold=0.3):
    """Return the users whose similarity to ``userID`` exceeds a threshold.

    Reads the module-level ``user_sim`` matrix (user_id x user_id similarity)
    without modifying or copying it as a whole.

    Parameters
    ----------
    userID : hashable
        Label of the target user in ``user_sim``.
    user_similarity_threshold : float, default 0.3
        Only users with a similarity strictly greater than this are returned.

    Returns
    -------
    pandas.Series
        Index = similar user ids, values = similarity to ``userID``, sorted
        from most to least similar. The target user itself is excluded, and
        pairs with a NaN similarity never pass the threshold.

    Raises
    ------
    KeyError
        If ``userID`` is not present in ``user_sim``.
    """
    if userID not in user_sim.index:
        raise KeyError(f"userID {userID!r} not found in the user similarity matrix")

    # Only the target user's column is needed, so avoid copying the full matrix
    similarity_to_user = user_sim[userID].drop(index=userID)
    similar_user = similarity_to_user[similarity_to_user > user_similarity_threshold]
    return similar_user.sort_values(ascending=False)

In [None]:
# Keep asking until the answer is an integer user ID that exists in the
# similarity matrix. Checking user_sim (instead of the raw ratings frame)
# guarantees get_similar_user() can find the user, and keeps this cell
# runnable even after the name `ratings` has been rebound further down.
prompt = "Enter userID: "
while True:
    try:
        userID = int(input(prompt))
    except ValueError:
        # Non-numeric answer: ask again instead of aborting the cell
        userID = None
    if userID is not None and userID in user_sim.index:
        break
    prompt = "Enter valid userID: "

sim_users = get_similar_user(userID)
print(f'\nSimilar users for user {userID} :\n', sim_users)

### Books that have been read by target user

In [None]:
# Row of the target user, keeping only the books that user has rated
is_target_user = matrix_norm.index == target_userID
target_userid_read = matrix_norm.loc[is_target_user].dropna(axis='columns', how='all')
target_userid_read.head()

### Books that similar user read

In [None]:
# Rows of the top similar users, keeping only books at least one of them rated
is_similar_user = matrix_norm.index.isin(top_similar_users.index)
similar_user_books = matrix_norm.loc[is_similar_user].dropna(axis='columns', how='all')
similar_user_books.head()

### Remove the books that have been read by target user from the similar_user_books

In [None]:
# Exclude every book the target user has already rated; books in that list
# which no similar user rated are simply skipped (errors='ignore')
already_read = target_userid_read.columns
similar_user_books = similar_user_books.drop(columns=already_read, errors='ignore')

similar_user_books.head()

### User Based Recommended Result


In [None]:
# Score every candidate book: the mean, over the similar users who rated it,
# of (user similarity * mean-centred rating). mul() aligns the similarity
# Series with the matrix rows on user_id, and mean() skips the NaN cells of
# users who did not rate the book, so no explicit Python loop is needed.
weighted_ratings = similar_user_books.mul(top_similar_users, axis=0)
book_score = (
    weighted_ratings.mean(axis=0)
    .rename_axis('book_id')
    .reset_index(name='book_score')
)

# Attach title / author / year to every scored book
ranked_book_score = pd.merge(book_score, books, on='book_id', how='inner')

# Sort the books by score, best first
ranked_book_score = ranked_book_score.sort_values(by='book_score', ascending=False)

# m = number of book recommendations
m = 10
ranked_book_score.head(m)

In [None]:
# Average rating for the target user
avg_rating = user_book_matrix[user_book_matrix.index == target_userID].T.mean()[target_userID]

print(f'The average book rating for user {target_userID} is {avg_rating:.2f}')

In [None]:
# Calcuate the predicted rating
ranked_book_score['predicted_rating'] = ranked_book_score['book_score'] + avg_rating

ranked_book_score.head(m)

In [None]:
## Book recommendation
# Strip the scoring columns and give the remaining columns display names.
# errors='ignore' keeps the cell re-runnable: on a second run the score
# columns are already gone and rename() simply finds nothing left to rename.
ranked_book_score = (
    ranked_book_score
    .drop(columns=['book_score', 'predicted_rating'], errors='ignore')
    .rename(columns={'book_id':'Book ID','title':'Title','authors':'Author','original_publication_year':'Publish Year'})
)
print(f'Top {m} book recommendations for user {target_userID}:')
ranked_book_score.head(m)

In [None]:
def recommender_system(userID):
    """Recommend books to ``userID`` with user-based collaborative filtering.

    Steps:
      1. ``get_similar_user(userID)`` supplies the similar users and their
         similarity to the target user.
      2. Candidate books are those rated by at least one similar user and not
         rated by the target user (both read from ``matrix_norm``).
      3. Each candidate's score is the mean, over the similar users who rated
         it, of similarity * mean-centred rating.

    Parameters
    ----------
    userID : hashable
        Label of the target user in ``matrix_norm`` / the similarity matrix.

    Returns
    -------
    pandas.DataFrame
        Columns ``Book ID``, ``Title``, ``Author``, ``Publish Year``, ordered
        from highest to lowest score. Empty when no candidate book exists.
        Books without a row in ``books`` are left out by the inner merge.
    """
    # Find similar users (index = user_id, values = similarity)
    similar_user = get_similar_user(userID)

    # Narrow down the books
    target_userid_read = matrix_norm[matrix_norm.index == userID].dropna(axis=1, how='all')
    similar_user_books = matrix_norm[matrix_norm.index.isin(similar_user.index)].dropna(axis=1, how='all')
    similar_user_books = similar_user_books.drop(columns=target_userid_read.columns, errors='ignore')

    # Prediction: mul() aligns similarities with the matrix rows on user_id and
    # mean() skips the NaN cells of users who did not rate a given book
    weighted_ratings = similar_user_books.mul(similar_user, axis=0)
    book_score = (
        weighted_ratings.mean(axis=0)
        .rename_axis('book_id')
        .reset_index(name='book_score')
    )

    ranked_book_score = pd.merge(book_score, books, on='book_id', how='inner')
    ranked_book_score = ranked_book_score.sort_values(by='book_score', ascending=False)

    # The score is only needed for ranking; callers receive the book details
    ranked_book_score = ranked_book_score.drop(columns=['book_score'])
    return ranked_book_score.rename(columns={'book_id':'Book ID','title':'Title','authors':'Author','original_publication_year':'Publish Year'})

In [None]:
# Keep asking until the answer is an integer user ID that exists in the
# similarity matrix. Checking user_sim (instead of the raw ratings frame)
# guarantees recommender_system() can find the user, and keeps this cell
# runnable even after the name `ratings` has been rebound further down.
prompt = "Enter user ID to whom you want to recommend : "
while True:
    try:
        userID = int(input(prompt))
    except ValueError:
        # Non-numeric answer: ask again instead of aborting the cell
        userID = None
    if userID is not None and userID in user_sim.index:
        break
    prompt = "Enter valid userID: "

recommendation = recommender_system(userID)
print(f'\nBook recommendations for user {userID}:')
print('Total of Books: ', recommendation.shape[0])
recommendation

In [None]:
# Show only the first ten rows of `recommendation`; recommender_system()
# returns the rows already sorted from highest to lowest score.
print(f'\nTop 10 book recommendations for user {userID}:')
recommendation.head(10)

# Item-Based collaborative filtering ( Ooi Jin Kun )


Definition:

Item-based collaborative filtering is a technique used in recommender systems to provide personalized recommendations to users based on their interactions and preferences with items (such as products, movies, articles, etc.). It focuses on establishing relationships between items rather than users. The core idea behind item-based collaborative filtering is that if a user has shown a positive preference for one item, they are likely to have similar preferences for items that are closely related to it.


### Read user_id from user

In [None]:
# Keep asking until the answer is an integer user_id that has at least one
# rating in ratings_books. The matrices below are built from ratings_books,
# so validating against it (rather than the raw `ratings` frame) prevents a
# KeyError for users whose rated books were all removed by the merge, and the
# cell stays runnable after the name `ratings` has been rebound further down.
valid_user_ids = set(ratings_books['user_id'].unique().tolist())
prompt = 'Enter user_id : '
while True:
    try:
        target_user_id = int(input(prompt))
    except ValueError:
        # Non-numeric answer: ask again instead of aborting the cell
        target_user_id = None
    if target_user_id in valid_user_ids:
        break
    prompt = 'Enter valid user_id : '


### Find user-books matrix

In [None]:
# Rows are users, columns are books, cells hold the rating (NaN when unrated)
ratings_books_matrix = pd.pivot_table(ratings_books, values='rating', index='user_id', columns=['book_id'])
ratings_books_matrix.head()

### Find book-user matrix

In [None]:
# Same data with books as rows and users as columns
books_ratings_matrix = ratings_books_matrix.T
books_ratings_matrix.head()

### Find the information of user_id

In [None]:
# Ratings of the chosen user for every book, highest first (unrated books are NaN)
books_ratings_user = books_ratings_matrix.loc[:, target_user_id]
rating_column_label = f"user_id {target_user_id}'s rating"
books_ratings_user.sort_values(ascending=False).to_frame(name=rating_column_label)

### Use user's highest rated books to recommend other related books

In [None]:
source_user_books_rating = ratings_books.loc[ratings_books['user_id'] == target_user_id].sort_values(by = 'rating',ascending = False)
pd.DataFrame(source_user_books_rating.head(10))

In [None]:
top_rated_books_id = source_user_books_rating['book_id'].tolist()[0]
top_rated_books_id

In [None]:
ratings = ratings_books_matrix[top_rated_books_id]
pd.DataFrame(ratings).rename(columns={top_rated_books_id: f"{top_rated_books_id}'s rating"})

In [None]:
similar_books = ratings_books_matrix.corrwith(ratings)
similar_books = pd.DataFrame(similar_books, columns=['correlation'])
similar_books

### Identify the most correlated books

In [None]:
# Highest correlation first
sorted_similar_books = similar_books[['correlation']].sort_values('correlation', ascending=False)
sorted_similar_books

### Eliminate the source books

In [None]:
# Remove the source book by its id rather than by position: other books can
# tie with it at correlation 1.0 (or its own correlation can be NaN and sort
# last), so it is not guaranteed to be the first row. Dropping by label also
# makes the cell safe to re-run.
sorted_similar_books = sorted_similar_books.drop(index=top_rated_books_id, errors='ignore')
sorted_similar_books

## Ensure the identified books are popular
A higher number of votes means a book is more popular.

In [None]:
#Get number of rating for each books
rating_votes = pd.DataFrame(ratings_books.groupby('book_id')['rating'].count())
rating_votes=rating_votes.rename(columns={'rating': 'rating_count'})
rating_votes

In [None]:
similar_books_ratings = sorted_similar_books.join(rating_votes['rating_count']).sort_values(by = 'correlation', ascending = False)
similar_books_ratings

## Get the books that have higher votes and have higher correlation

In [None]:
similar_popular_books = similar_books_ratings.loc[similar_books_ratings['rating_count']>=target_vote_number].dropna()
similar_popular_books

In [None]:
#Find target_user's rated books
target_user = ratings_books.loc[ratings_books['user_id'] == target_user_id].sort_values(by= 'rating', ascending= False)

#trim the result exist in user's rating
similar_popular_books = similar_popular_books[~similar_popular_books.index.isin(target_user['book_id'].tolist())]
similar_popular_books = similar_popular_books.sort_values(by='correlation', ascending = False)
similar_popular_books

In [None]:
#make it a list
most_similar_popular_books = similar_popular_books[:numberOfResult]
most_similar_popular_books_list = most_similar_popular_books.index.to_list()
most_similar_popular_books_list

### Show target book's information

In [None]:
# Details of the book the recommendations are based on
target_search_books = books[books['book_id'].eq(top_rated_books_id)]
target_search_books

### Show books correlation table

In [None]:
# Candidate books (index = book_id) with their correlation to the source book
# and their rating_count, sorted by correlation, highest first
similar_popular_books

## Item Based Recommended Result


In [None]:
# Item-based recommended result, listed in recommendation order.
# isin() alone returns the rows in the order they appear in `books`, which
# discards the correlation ranking held by most_similar_popular_books_list,
# so the selected rows are re-sorted by their position in that list.
recommendation_rank = {book_id: rank for rank, book_id in enumerate(most_similar_popular_books_list)}
most_similar_popular_books_df = (
    books.loc[books['book_id'].isin(most_similar_popular_books_list)]
    .sort_values(by='book_id', key=lambda book_ids: book_ids.map(recommendation_rank))
)
most_similar_popular_books_df