In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('books_10101.csv')

In [3]:
df.head(10)

Unnamed: 0,Title,Author,Rating,Genre,Publisher,PublishDate,Unnamed: 6
0,The Hunger Games,Suzanne Collins,4.33,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",Scholastic Press,09/14/08,
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPrÃ© (Illustrator)",4.5,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",Scholastic Inc.,09/28/04,
2,To Kill a Mockingbird,Harper Lee,4.28,"['Classics', 'Fiction', 'Historical Fiction', ...",Harper Perennial Modern Classics,05/23/06,
3,Pride and Prejudice,"Jane Austen, Anna Quindlen (Introduction)",4.26,"['Classics', 'Fiction', 'Romance', 'Historical...",Modern Library,10/10/00,
4,Twilight,Stephenie Meyer,3.6,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","Little, Brown and Company",9/6/06,
5,The Book Thief,Markus Zusak (Goodreads Author),4.37,"['Historical Fiction', 'Fiction', 'Young Adult...",Alfred A. Knopf,03/14/06,
6,Animal Farm,"George Orwell, Russell Baker (Preface), C.M. W...",3.95,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",Signet Classics,04/28/96,
7,The Chronicles of Narnia,"C.S. Lewis, Pauline Baynes (Illustrator)",4.26,"['Fantasy', 'Classics', 'Fiction', 'Young Adul...",HarperCollins,09/16/02,
8,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,J.R.R. Tolkien,4.6,"['Fantasy', 'Fiction', 'Classics', 'Adventure'...",Ballantine Books,09/25/12,
9,Gone with the Wind,Margaret Mitchell,4.3,"['Classics', 'Historical Fiction', 'Fiction', ...",Warner Books,4/1/99,


In [4]:
# Filter out missing values (NaN) in 'Title', 'Author', and 'Publisher' columns
df_cleaned = df.dropna(subset=['Title', 'Author', 'Publisher']).copy()
df_cleaned.head()

Unnamed: 0,Title,Author,Rating,Genre,Publisher,PublishDate,Unnamed: 6
0,The Hunger Games,Suzanne Collins,4.33,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",Scholastic Press,09/14/08,
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPrÃ© (Illustrator)",4.5,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",Scholastic Inc.,09/28/04,
2,To Kill a Mockingbird,Harper Lee,4.28,"['Classics', 'Fiction', 'Historical Fiction', ...",Harper Perennial Modern Classics,05/23/06,
3,Pride and Prejudice,"Jane Austen, Anna Quindlen (Introduction)",4.26,"['Classics', 'Fiction', 'Romance', 'Historical...",Modern Library,10/10/00,
4,Twilight,Stephenie Meyer,3.6,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","Little, Brown and Company",9/6/06,


In [5]:
df_cleaned.drop(columns=['Unnamed: 6'], inplace=True)
df_cleaned.head()

Unnamed: 0,Title,Author,Rating,Genre,Publisher,PublishDate
0,The Hunger Games,Suzanne Collins,4.33,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",Scholastic Press,09/14/08
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPrÃ© (Illustrator)",4.5,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",Scholastic Inc.,09/28/04
2,To Kill a Mockingbird,Harper Lee,4.28,"['Classics', 'Fiction', 'Historical Fiction', ...",Harper Perennial Modern Classics,05/23/06
3,Pride and Prejudice,"Jane Austen, Anna Quindlen (Introduction)",4.26,"['Classics', 'Fiction', 'Romance', 'Historical...",Modern Library,10/10/00
4,Twilight,Stephenie Meyer,3.6,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","Little, Brown and Company",9/6/06


In [6]:
# Function to clean the author names
def clean_author_name(author_name):
    return author_name.split(',')[0]

In [7]:
# Apply the clean_author_name function to the 'Author' column
df_cleaned['Author'] = df_cleaned['Author'].apply(clean_author_name)

In [8]:
df_cleaned.head()

Unnamed: 0,Title,Author,Rating,Genre,Publisher,PublishDate
0,The Hunger Games,Suzanne Collins,4.33,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",Scholastic Press,09/14/08
1,Harry Potter and the Order of the Phoenix,J.K. Rowling,4.5,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",Scholastic Inc.,09/28/04
2,To Kill a Mockingbird,Harper Lee,4.28,"['Classics', 'Fiction', 'Historical Fiction', ...",Harper Perennial Modern Classics,05/23/06
3,Pride and Prejudice,Jane Austen,4.26,"['Classics', 'Fiction', 'Romance', 'Historical...",Modern Library,10/10/00
4,Twilight,Stephenie Meyer,3.6,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","Little, Brown and Company",9/6/06


In [9]:
# Function to convert the Genre strings to lists of genres
def convert_genres(genre_str):
    try:
        genre_list = ast.literal_eval(genre_str)
        if not isinstance(genre_list, list):
            genre_list = [genre_list]
        return genre_list
    except (ValueError, SyntaxError):
        return []

In [10]:
# Convert the Genre strings to lists of genres
df['Genre'] = df['Genre'].apply(convert_genres)

In [11]:
def vectorize_text(text):
    words = text.split()
    vec = np.zeros(model.vector_size)
    count = 0
    for word in words:
        if word in model.wv:
            vec += np.array(model.wv[word])
            count += 1
    if count > 0:
        return vec / count
    return vec

In [12]:
# Create a Word2Vec model with cleaned data
df_cleaned['Text'] = df_cleaned['Title'] + " " + df_cleaned['Author'] + " " + df_cleaned['Genre'].apply(lambda x: ' '.join(x))
model = Word2Vec(sentences=df_cleaned['Text'].str.split(), vector_size=100, window=5, min_count=1, workers=4)

In [13]:
# Vectorize the Text column using .apply
df_cleaned.loc[:, 'TextVector'] = df_cleaned['Text'].apply(vectorize_text)

In [14]:
# Drop any rows containing NaN values in vectors
df_cleaned.dropna(subset=['TextVector'], inplace=True)

In [15]:
# Fill any remaining NaN values for 'TextVector' with zeros
df_cleaned['TextVector'].fillna(0, inplace=True)

In [16]:
# Create similarity matrices
title_sim_matrix = cosine_similarity(df_cleaned['TextVector'].tolist())
author_sim_matrix = cosine_similarity(df_cleaned['TextVector'].tolist())

In [17]:
# Combine the similarity matrices
combined_vectors = np.column_stack(df_cleaned['TextVector'].apply(lambda x: np.nan_to_num(x)).tolist())
combined_sim_matrix = cosine_similarity(combined_vectors)

rating_sim_matrix = 1 - np.abs(df_cleaned['Rating'].values[:, None] - df_cleaned['Rating'].values) / 5

In [18]:
def recommend_books(book_title, num_recommendations=5):
    if len(df_cleaned) == 0:
        return "Not enough books to provide recommendations."

    if book_title not in df_cleaned['Title'].values:
        return f"Book with title '{book_title}' not found in the database."

    target_index = df_cleaned[df_cleaned['Title'] == book_title].index[0]

    similar_books_indices = np.argsort(title_sim_matrix[target_index])[::-1]

    recommended_books = df_cleaned.iloc[similar_books_indices[1:num_recommendations + 1]]
    return recommended_books

In [19]:
# Check for any remaining NaN values
print(df_cleaned.isnull().sum())

Title          0
Author         0
Rating         0
Genre          0
Publisher      0
PublishDate    0
Text           0
TextVector     0
dtype: int64


In [20]:
print(df_cleaned.isnull().sum())
# Check the number of books in the dataset
num_books = len(df_cleaned)
print("Number of books in the dataset:", num_books)

Title          0
Author         0
Rating         0
Genre          0
Publisher      0
PublishDate    0
Text           0
TextVector     0
dtype: int64
Number of books in the dataset: 9744


In [21]:
# Replace "To Kill a Mockingbird" with the title of your choice for book recommendation
recommendations = recommend_books("To Kill a Mockingbird", num_recommendations=5)
print(recommendations)

                             Title             Author  Rating  \
3718                    The Pigman        Paul Zindel    3.59   
7930                    I Am David          Anne Holm    3.98   
169              Heart of Darkness      Joseph Conrad    3.43   
872   Roll of Thunder, Hear My Cry  Mildred D. Taylor    3.84   
373               A Separate Peace       John Knowles    3.58   

                                                  Genre  \
3718  ['Fiction', 'Young Adult', 'Classics', 'School...   
7930  ['Historical Fiction', 'Fiction', 'Young Adult...   
169   ['Classics', 'Fiction', 'Literature', 'Africa'...   
872   ['Historical Fiction', 'Young Adult', 'Fiction...   
373   ['Classics', 'Fiction', 'Young Adult', 'Histor...   

                        Publisher       PublishDate  \
3718                   HarperTeen              2005   
7930  HMH Books for Young Readers  January 1st 2004   
169                 Green Integer           10/1/03   
872                  Puffin Books 

In [22]:
# Replace the title of your choice for book recommendation
recommendations = recommend_books("Harry Potter and the Order of the Phoenix", num_recommendations=5)
print(recommendations)

                                        Title        Author  Rating  \
103       Harry Potter and the Goblet of Fire  J.K. Rowling    4.56   
126   Harry Potter and the Chamber of Secrets  J.K. Rowling    4.43   
93   Harry Potter and the Prisoner of Azkaban  J.K. Rowling    4.57   
105    Harry Potter and the Half-Blood Prince  J.K. Rowling    4.57   
32      Harry Potter and the Sorcerer's Stone  J.K. Rowling    4.47   

                                                 Genre  \
103  ['Fantasy', 'Young Adult', 'Fiction', 'Magic',...   
126  ['Fantasy', 'Young Adult', 'Fiction', 'Magic',...   
93   ['Fantasy', 'Young Adult', 'Fiction', 'Magic',...   
105  ['Fantasy', 'Young Adult', 'Fiction', 'Magic',...   
32   ['Fantasy', 'Fiction', 'Young Adult', 'Magic',...   

                                    Publisher PublishDate  \
103                                Scholastic    09/28/02   
126  Arthur A. Levine Books / Scholastic Inc.      6/2/99   
93                            Scholastic 