# CONTENT BASED BOOK RECOMMENDATION SYSTEM

In [2]:
#Required Libraries
import pandas as pd
import numpy as np 

In [3]:
#Read the Dataset
books = pd.read_csv("Book.csv")

In [4]:
books.head()

Unnamed: 0,ID,title,author,rating,voters,price,currency,description,publisher,page_count,generes,ISBN,language,published_date
0,0,Attack on Titan: Volume 13,Hajime Isayama,4.6,428,43.28,SAR,NO SAFE PLACE LEFT At great cost to the Garris...,Kodansha Comics,192.0,none,9780000000000.0,English,31-Jul-14
1,1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23,26.15,SAR,Determined to make a new start in her quaint h...,Kensington Publishing Corp.,288.0,"Fiction , Mystery &amp, Detective , Cozy , Gen...",9780000000000.0,English,1-Jul-07
2,2,The Art of Super Mario Odyssey,Nintendo,3.9,9,133.85,SAR,Take a globetrotting journey all over the worl...,Dark Horse Comics,368.0,"Games &amp, Activities , Video &amp, Electronic",9780000000000.0,English,5-Nov-19
3,3,Getting Away Is Deadly: An Ellie Avery Mystery,Sara Rosett,4.0,10,26.15,SAR,"With swollen feet and swelling belly, pregnant...",Kensington Publishing Corp.,320.0,none,9780000000000.0,English,1-Mar-09
4,4,"The Painted Man (The Demon Cycle, Book 1)",Peter V. Brett,4.5,577,28.54,SAR,The stunning debut fantasy novel from author P...,HarperCollins UK,544.0,"Fiction , Fantasy , Dark Fantasy",9780000000000.0,English,8-Jan-09


In [5]:
#Drop unnecessary data
books_cleaned = books.drop(columns=['voters','price','currency','publisher','page_count','published_date'])

# Remove duplicate rows
books_cleaned = books_cleaned.drop_duplicates()

books_cleaned.head()

Unnamed: 0,ID,title,author,rating,description,generes,ISBN,language
0,0,Attack on Titan: Volume 13,Hajime Isayama,4.6,NO SAFE PLACE LEFT At great cost to the Garris...,none,9780000000000.0,English
1,1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,Determined to make a new start in her quaint h...,"Fiction , Mystery &amp, Detective , Cozy , Gen...",9780000000000.0,English
2,2,The Art of Super Mario Odyssey,Nintendo,3.9,Take a globetrotting journey all over the worl...,"Games &amp, Activities , Video &amp, Electronic",9780000000000.0,English
3,3,Getting Away Is Deadly: An Ellie Avery Mystery,Sara Rosett,4.0,"With swollen feet and swelling belly, pregnant...",none,9780000000000.0,English
4,4,"The Painted Man (The Demon Cycle, Book 1)",Peter V. Brett,4.5,The stunning debut fantasy novel from author P...,"Fiction , Fantasy , Dark Fantasy",9780000000000.0,English


In [6]:
books_cleaned.head(1)['description']

0    NO SAFE PLACE LEFT At great cost to the Garris...
Name: description, dtype: object

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfv = TfidfVectorizer(min_df=3, max_features=None,
                    strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
                     ngram_range=(1,3),
                     stop_words = 'english')
    
#Filling NaN with empty string
books_cleaned['description'] = books_cleaned['description'].fillna('')

In [8]:
tfv_matrix = tfv.fit_transform(books_cleaned['description'])

In [9]:
tfv_matrix

<1299x52308 sparse matrix of type '<class 'numpy.float64'>'
	with 395385 stored elements in Compressed Sparse Row format>

In [10]:
tfv_matrix.shape

(1299, 52308)

In [11]:
from sklearn.metrics.pairwise import sigmoid_kernel

#compute the sigmoid kernel
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

In [12]:
sig[0]

array([0.76160218, 0.76159417, 0.76159417, ..., 0.76159422, 0.76159416,
       0.76159416])

In [13]:
indices = pd.Series(books_cleaned.index, index=books_cleaned['title']).drop_duplicates()

In [14]:
indices

title
Attack on Titan: Volume 13                                                                                                     0
Antiques Roadkill: A Trash 'n' Treasures Mystery                                                                               1
The Art of Super Mario Odyssey                                                                                                 2
Getting Away Is Deadly: An Ellie Avery Mystery                                                                                 3
The Painted Man (The Demon Cycle, Book 1)                                                                                      4
                                                                                                                            ... 
Twas The Nightshift Before Christmas: Festive hospital diaries from the author of million-copy hit This is Going to Hurt    1294
Why We Sleep: The New Science of Sleep and Dreams                                          

In [15]:
indices['The Art of Super Mario Odyssey']

title
The Art of Super Mario Odyssey       2
The Art of Super Mario Odyssey     301
The Art of Super Mario Odyssey     446
The Art of Super Mario Odyssey     659
The Art of Super Mario Odyssey    1070
The Art of Super Mario Odyssey    1248
dtype: int64

In [16]:
sig[2]

array([0.76159417, 0.76159417, 0.76160218, ..., 0.76159418, 0.76159416,
       0.76159416])

In [17]:
list(enumerate(sig[indices['How to Understand Business Finance: Edition 2']]))

[(0,
  array([0.76159422, 0.7615942 , 0.76159418, ..., 0.76160218, 0.76159416,
         0.76159417])),
 (1,
  array([0.76159422, 0.7615942 , 0.76159418, ..., 0.76160218, 0.76159416,
         0.76159417])),
 (2,
  array([0.76159422, 0.7615942 , 0.76159418, ..., 0.76160218, 0.76159416,
         0.76159417])),
 (3,
  array([0.76159422, 0.7615942 , 0.76159418, ..., 0.76160218, 0.76159416,
         0.76159417])),
 (4,
  array([0.76159422, 0.7615942 , 0.76159418, ..., 0.76160218, 0.76159416,
         0.76159417])),
 (5,
  array([0.76159422, 0.7615942 , 0.76159418, ..., 0.76160218, 0.76159416,
         0.76159417])),
 (6,
  array([0.76159422, 0.7615942 , 0.76159418, ..., 0.76160218, 0.76159416,
         0.76159417])),
 (7,
  array([0.76159422, 0.7615942 , 0.76159418, ..., 0.76160218, 0.76159416,
         0.76159417])),
 (8,
  array([0.76159422, 0.7615942 , 0.76159418, ..., 0.76160218, 0.76159416,
         0.76159417]))]

In [20]:
result = sorted(list(enumerate(sig[indices['The Art of Super Mario Odyssey']])), key=lambda x: x[1].all(), reverse=True)

[(0,
  array([0.76159417, 0.76159417, 0.76160218, ..., 0.76159418, 0.76159416,
         0.76159416])),
 (1,
  array([0.76159417, 0.76159417, 0.76160218, ..., 0.76159418, 0.76159416,
         0.76159416])),
 (2,
  array([0.76159417, 0.76159417, 0.76160218, ..., 0.76159418, 0.76159416,
         0.76159416])),
 (3,
  array([0.76159417, 0.76159417, 0.76160218, ..., 0.76159418, 0.76159416,
         0.76159416])),
 (4,
  array([0.76159417, 0.76159417, 0.76160218, ..., 0.76159418, 0.76159416,
         0.76159416])),
 (5,
  array([0.76159417, 0.76159417, 0.76160218, ..., 0.76159418, 0.76159416,
         0.76159416]))]

In [23]:
    
def give_rec_based_on_genre(user_genre, sig=sig):
    # Get indices of books related to the user's specified genre
    genre_indices = books_cleaned[books_cleaned['generes'].str.contains(user_genre, case=False)].index
    
    # Calculate the average similarity scores for books in the specified genre
    avg_scores = np.mean(sig[genre_indices], axis=0)
    
    # Enumerate and sort the books based on the average similarity scores
    sorted_books = sorted(list(enumerate(avg_scores)), key=lambda x: x[1], reverse=True)
    
    # Get the top 5 recommendations
    top_recommendations = sorted_books[:5]
    
    # Books indices
    books_indices = [i[0] for i in top_recommendations]
    
    # Recommended Books
    return books_cleaned['title'].iloc[books_indices]

# Example usage:
user_genre = input("Enter your preferred genre or interest: ")
recommended_books = give_rec_based_on_genre(user_genre)

print("Recommended Books:")
print(recommended_books)

Enter your preferred genre or interest: drama
Recommended Books:
38     The Complete Works of William Shakespeare: All...
111    The Complete Works of William Shakespeare: All...
186    The Complete Works of William Shakespeare: All...
251    The Complete Works of William Shakespeare: All...
290    The Complete Works of William Shakespeare: All...
Name: title, dtype: object
