Dataset:

https://www.kaggle.com/datasets/arpansri/books-summary/data


Importing Libraries

In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Load the Dataset

In [16]:
df = pd.read_csv("C:\\Users\\vaiju\\Documents\\Spring_2025\\DSA\\BooksSummary\\books_summary.csv")  
print(df.head())


   Unnamed: 0                            book_name  \
0           0          The Highly Sensitive Person   
1           1  Why Has Nobody Told Me This Before?   
2           2                 The Midnight Library   
3           3                      Brave New World   
4           4                                 1984   

                                           summaries categories  
0   is a self-assessment guide and how-to-live te...    science  
1   is a collection of a clinical psychologist’s ...    science  
2   tells the story of Nora, a depressed woman in...    science  
3   presents a futuristic society engineered perf...    science  
4   is the story of a man questioning the system ...    science  


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5194 entries, 0 to 5200
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5194 non-null   int64 
 1   title       5194 non-null   object
 2   summary     5194 non-null   object
 3   categories  5194 non-null   object
dtypes: int64(1), object(3)
memory usage: 202.9+ KB


In [29]:
df.dtypes

Unnamed: 0     int64
title         object
summary       object
categories    object
dtype: object

Data Preprocessing

In [33]:
df.rename(columns={'Unnamed: 0': 'ID'}, inplace=True)

print(df.head())

   ID                                title  \
0   0          The Highly Sensitive Person   
1   1  Why Has Nobody Told Me This Before?   
2   2                 The Midnight Library   
3   3                      Brave New World   
4   4                                 1984   

                                             summary categories  
0   is a self-assessment guide and how-to-live te...    science  
1   is a collection of a clinical psychologist’s ...    science  
2   tells the story of Nora, a depressed woman in...    science  
3   presents a futuristic society engineered perf...    science  
4   is the story of a man questioning the system ...    science  


In [35]:
df.rename(columns={'book_name': 'title', 'summaries': 'summary'}, inplace=True)

df.head()


Unnamed: 0,ID,title,summary,categories
0,0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,3,Brave New World,presents a futuristic society engineered perf...,science
4,4,1984,is the story of a man questioning the system ...,science


In [43]:
df = df[['title', 'summary']].dropna()
print(df.head())


                                 title  \
0          The Highly Sensitive Person   
1  Why Has Nobody Told Me This Before?   
2                 The Midnight Library   
3                      Brave New World   
4                                 1984   

                                             summary  
0   is a self-assessment guide and how-to-live te...  
1   is a collection of a clinical psychologist’s ...  
2   tells the story of Nora, a depressed woman in...  
3   presents a futuristic society engineered perf...  
4   is the story of a man questioning the system ...  


Transform the text data to vectors

In [48]:
# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the book summaries
tfidf_matrix = tfidf.fit_transform(df['summary'])
print(tfidf_matrix.shape)


(5194, 4790)


Compute Similarity

In [51]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


Recommendation Function

In [65]:
def get_recommendations(query, df, tfidf, cosine_sim):
    # Transform the query using the same TF-IDF vectorizer
    query_tfidf = tfidf.transform([query])
    
    # Compute cosine similarity between the query and the dataset
    sim_scores = cosine_similarity(query_tfidf, tfidf_matrix)
    
    # Get the indices of the top 5 most similar books
    sim_scores = sim_scores.flatten()
    top_indices = sim_scores.argsort()[-10:][::-1]
    
    # Filter out duplicate recommendations
    seen_titles = set()
    recommendations = []
    for i in top_indices:
        title = df.iloc[i]['title']
        if title not in seen_titles:
            recommendations.append((title, sim_scores[i]))
            seen_titles.add(title)
        if len(recommendations) >= 5:
            break
    
    return recommendations

# Example query
query = "I love fantasy books with magical worlds and strong female protagonists."
recommendations = get_recommendations(query, df, tfidf, cosine_sim)
for title, score in recommendations:
    print(f'Title: {title}, Similarity Score: {score:.2f}')


Title: Lean In, Similarity Score: 0.19
Title: What They Don’t Teach You At Harvard Business School, Similarity Score: 0.18


In [69]:
# Example query
query = "I love and enjoy reading self help books. They develop my personality"
recommendations = get_recommendations(query, df, tfidf, cosine_sim)
for title, score in recommendations:
    print(f'Title: {title}, Similarity Score: {score:.2f}')


Title: How To Read A Book, Similarity Score: 0.21
Title: How to Take Smart Notes, Similarity Score: 0.20
Title: The Charisma Myth, Similarity Score: 0.18
