In [176]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import nltk

In [177]:
# Make sure the NLTK stop-word corpus is available locally before it is used
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smit1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [178]:
# Read the book catalogue from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='books.csv')

In [179]:
# Show the loaded frame as the cell output for a quick visual check
# (the rendered output is abbreviated for long frames).
df

Unnamed: 0,title,author,description,language,genres,characters,bookFormat,pages,publisher,publishDate,awards,setting,coverImg,price,genre1,genre2,genre3,genre4,genre5,main_genre
0,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré (Illustrator)",There is a door at the end of a silent corrido...,English,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",Paperback,870,Scholastic Inc.,2004/09/28,['Bram Stoker Award for Works for Young Reader...,['Hogwarts School of Witchcraft and Wizardry (...,https://i.gr-assets.com/images/S/compressed.ph...,630.990,Fantasy','Young Adult','Fiction','Magic','Childrens',Fantasy
1,Twilight,Stephenie Meyer,About three things I was absolutely positive.\...,English,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","['Edward Cullen', 'Jacob Black', 'Laurent', 'R...",Paperback,501,"Little, Brown and Company",2006/09/06,"['Georgia Peach Book Award (2007)', 'Buxtehude...","['Forks, Washington (United States)', 'Phoenix...",https://i.gr-assets.com/images/S/compressed.ph...,179.550,Young Adult','Fantasy','Romance','Vampires','Fiction',Children
2,The Book Thief,Markus Zusak (Goodreads Author),Librarian's note: An alternate cover edition c...,English,"['Historical Fiction', 'Fiction', 'Young Adult...","['Liesel Meminger', 'Hans Hubermann', 'Rudy St...",Hardcover,552,Alfred A. Knopf,2006/03/14,['National Jewish Book Award for Children’s an...,"['Molching (Germany)', 'Germany']",https://i.gr-assets.com/images/S/compressed.ph...,324.900,Historical Fiction','Fiction','Young Adult','Historical','Classics',Fiction
3,Animal Farm,"George Orwell, Russell Baker (Preface), C.M. W...",Librarian's note: There is an Alternate Cover ...,English,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...","['Snowball', 'Napoleon', 'Clover', 'Boxer', 'O...",Mass Market Paperback,141,Signet Classics,1996/04/28,"['Prometheus Hall of Fame Award (2011)', 'Retr...","['England', 'United Kingdom']",https://i.gr-assets.com/images/S/compressed.ph...,377.910,Classics','Fiction','Dystopia','Fantasy','Literature',Others
4,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,J.R.R. Tolkien,"This four-volume, boxed set contains J.R.R. To...",English,"['Fantasy', 'Fiction', 'Classics', 'Adventure'...","['Frodo Baggins', 'Gandalf', 'Bilbo Baggins', ...",Mass Market Paperback,1728,Ballantine Books,2012/09/25,[],['Middle-earth'],https://i.gr-assets.com/images/S/compressed.ph...,1808.325,Fantasy','Fiction','Classics','Adventure','Science Fiction Fantasy',Fantasy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23255,Heal Your Body: The Mental Causes for Physical...,Louise L. Hay,Heal Your Body is a fresh and easy step-by-ste...,English,"['Self Help', 'Health', 'Nonfiction', 'Spiritu...",[],Paperback,96,Hay House,January 1st 1984,[],[],https://i.gr-assets.com/images/S/compressed.ph...,389.880,Self Help','Health','Nonfiction','Spirituality','Psychology',Nonfiction
23256,Attracted to Fire,DiAnn Mills (Goodreads Author),Special Agent Meghan Connors' dream of one day...,English,"['Christian Fiction', 'Christian', 'Suspense',...",[],Paperback,416,Tyndale House Publishers,October 1st 2011,['HOLT Medallion by Virginia Romance Writers N...,['West Texas (United States)'],https://i.gr-assets.com/images/S/compressed.ph...,474.525,Christian Fiction','Christian','Suspense','Romance','Mystery',Romance
23257,Unbelievable,Sherry Gammon (Goodreads Author),Lilah Lopez Dreser's in town to take care of u...,English,"['Romance', 'Young Adult', 'Contemporary', 'Co...",[],Paperback,360,Wordpaintings Unlimited,April 11th 2013,[],"['Port Fare, New York (United States)']",https://i.gr-assets.com/images/S/compressed.ph...,1639.890,Romance','Young Adult','Contemporary','Contemporary Romance','Suspense',Romance
23258,Marked,Kim Richardson (Goodreads Author),--READERS FAVORITE AWARDS WINNER 2011--Sixteen...,English,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",[],Paperback,280,CreateSpace,March 18th 2011,"[""Readers' Favorite Book Award (2011)""]",[],https://i.gr-assets.com/images/S/compressed.ph...,630.135,Fantasy','Young Adult','Paranormal','Angels','Romance',Fantasy


In [180]:
# NOTE(review): this writes to 'books.csv', the same path the data was loaded
# from, so the source file is overwritten in place (row index not written).
# Confirm this is intentional; a separate output path would keep the raw data intact.
df.to_csv('books.csv', index=False)

In [183]:
# Export the frame to JSON as a list with one object per row
df.to_json(path_or_buf='books.json', orient='records')

In [None]:
# NOTE(review): no path is passed, so this returns the whole frame as CSV text
# in the cell output and writes nothing to disk — likely a leftover scratch
# cell; confirm whether it is still needed.
df.to_csv()

In [None]:
# Select relevant fields.
# The explicit .copy() makes `df` an independent frame rather than a slice of
# the loaded data, so later column assignments (e.g. df['description'] = ...)
# modify it directly instead of raising SettingWithCopyWarning.
df = df[[
    'title',
    'author',
    'description',
    'language',
    'genres',
    'publisher',
    'awards',
    'setting',
    'main_genre',
]].copy()


In [None]:
# Collect NLTK's English stop words into a set; it is used for membership
# checks when filtering the description and setting text.
stop_words = {word for word in stopwords.words('english')}

In [None]:
# Show the current state of the frame as the cell output for inspection
df

In [None]:
def preprocess_text(text):
    """Return ``text`` with English stop words removed.

    The text is split on whitespace, each token is compared
    case-insensitively against the module-level ``stop_words`` collection,
    and the surviving tokens are re-joined with single spaces (their original
    casing is kept).

    Missing or non-string values (for example the float NaN that pandas uses
    for empty CSV cells, or ``None``) yield an empty string instead of raising
    ``AttributeError``, so the function can be applied directly to a
    DataFrame column that contains gaps.

    Parameters
    ----------
    text : str or any
        Raw text to clean. Anything that is not a ``str`` is treated as
        "no text".

    Returns
    -------
    str
        The filtered text, or ``''`` when there is nothing to keep.
    """
    # NaN / None / other non-text cells have no .split(); treat them as empty.
    if not isinstance(text, str):
        return ''
    # Tokenize on whitespace and drop stop words (case-insensitive match).
    return ' '.join(token for token in text.split() if token.lower() not in stop_words)


In [None]:
# Strip stop words from both free-text columns, one column at a time
for text_column in ('description', 'setting'):
    df[text_column] = df[text_column].apply(preprocess_text)


In [None]:
# Build the text the recommendation model is trained on:
# "<description> <setting>", joined by a single space
df['combined_text'] = df['description'].add(' ').add(df['setting'])


In [None]:
# Create a TF-IDF vectorizer with default settings, then learn the vocabulary
# from the combined text and turn every book into a TF-IDF row vector
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=df['combined_text'])


In [None]:
# Pairwise cosine similarity between every pair of books; with a single
# argument the matrix is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Function to get recommendations
def recommend_books(title, num_recommendations=5):
    """Return the books most similar to ``title``.

    Relies on the module-level ``df`` (book metadata) and ``cosine_sim``
    (pairwise similarity matrix indexed by the row position of each book
    in ``df``).

    Parameters
    ----------
    title : str
        Exact title to look up. When several rows share the title, the first
        one is used as the query book.
    num_recommendations : int, default 5
        Maximum number of books to return.

    Returns
    -------
    pandas.DataFrame
        The ``title``, ``author`` and ``main_genre`` columns of the most
        similar books, most similar first. The queried book itself is never
        part of the result.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    # Locate the query book by row *position*: cosine_sim is positional, so
    # index labels must not be used to address it.
    title_mask = (df['title'] == title).to_numpy()
    if not title_mask.any():
        raise IndexError(f"No book with title {title!r} found in the dataset")
    idx = int(title_mask.argmax())  # position of the first matching row

    # Rank every book by its similarity to the query book (highest first).
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the query book by position rather than assuming it sorts first:
    # an exact duplicate can tie with it at similarity 1.0.
    limit = max(num_recommendations, 0)
    book_indices = [position for position, _ in ranked if position != idx][:limit]

    # Return the top most similar books
    return df.iloc[book_indices][['title', 'author', 'main_genre']]

In [None]:
# Example usage — swap in any title that exists in the dataset
recommended_books = recommend_books(title="The Alchemist", num_recommendations=5)
print(recommended_books)

In [None]:
# title
# author
# description
# language
# genres
# publisher
# awards
# setting
# main_genre