In [55]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

df = pd.read_csv("dataset/book_data.csv")
df.fillna(value="", inplace=True)


df.drop_duplicates(subset=["book_title"], inplace=True)


df["genres"] = df["genres"].apply(lambda x: x.split("|"))

df["book_authors"] = df["book_authors"].apply(lambda x: x.split("|"))

df["book_pages"] = pd.to_numeric(
    df["book_pages"].str.replace(" pages", ""), errors="coerce"
)

df["book_pages"] = (
    df["book_pages"]
    .astype(str)
    .str.replace(" pages", "")
    .replace("", np.nan)
    .astype(float)
)

# print(df.describe())
# print(df.info())
# print(df.info)
authors = [author for authors in df["book_authors"] for author in authors]
unique_authors = list(set(authors))

le = LabelEncoder()

author_labels = le.fit_transform(unique_authors)
author_dict = dict(zip(unique_authors, author_labels))


df_author = pd.DataFrame.from_dict(author_dict, orient='index', columns=['author_label'])

# write the DataFrame to an Excel file
df_author.to_excel('authors.xlsx')

# Create a new column in the dataframe with the integer labels for each author
df["author_labels"] = [
    list(map(lambda x: author_dict[x], authors)) for authors in df["book_authors"]
]

#print(df["author_labels"])

In [56]:
df['book_rating_count'].describe()

count    4.848300e+04
mean     2.802133e+04
std      1.586540e+05
min      0.000000e+00
25%      3.310000e+02
50%      2.348000e+03
75%      1.016800e+04
max      5.588580e+06
Name: book_rating_count, dtype: float64

In [57]:
df[df["book_title"] == "Gump and Co."]

Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,author_labels
50700,[Winston Groom],Forrest Gump captured our hearts in the #1 New...,,Paperback,9780000000000.0,242.0,3.27,2546,209,Gump and Co.,"[Fiction, Humor]",https://images.gr-assets.com/books/1387667059l...,[29332]


In [58]:
def recommend(inp_book_title, df_all):
    book_index = df_all.index[df['book_title'] == inp_book_title].tolist()[0]
    #print(f'Book index is {book_index}')
    
    book_authors = df.loc[book_index, 'author_labels']
    #print(f'Book authors are {book_authors}')
    
    recommended_books = df[df['author_labels'].apply(lambda x: set(x) ==set(book_authors))].sort_values('book_rating')
    #print(recommended_books)
    
    if book_index in recommended_books.index:
        recommended_books = recommended_books.drop(book_index)
        
    return recommended_books

In [59]:
book_name = 'Twilight'
books = recommend(book_name, df)
print(books.head())

            book_authors                                          book_desc  \
21501  [Stephenie Meyer]  Celebrate the tenth anniversary of Twilight! T...   
923    [Stephenie Meyer]  Bree Tanner can barely remember life before sh...   
1302   [Stephenie Meyer]  As Seattle is ravaged by a string of mysteriou...   
1363   [Stephenie Meyer]  To be irrevocably in love with a vampire is bo...   
41907  [Stephenie Meyer]  «No tengas miedo», le susurré.«Somos como una ...   

                                  book_edition            book_format  \
21501  Twilight Tenth Anniversary/Dual Edition              Hardcover   
923                              First Edition              Hardcover   
1302                             First Edition              Hardcover   
1363                             First Edition              Hardcover   
41907                                           Mass Market Paperback   

      book_isbn  book_pages  book_rating  book_rating_count  \
21501  9.78E+12       7

In [60]:
book_title = "Twilight / Life and Death"  # example book title
book_row = df[df["book_title"] == book_title]  # filter the DataFrame based on book title
author_label = book_row["author_labels"].values[0]  # get the author label as a list
author_names = []
for label in author_label:
    author_names.append(df[df["author_labels"].apply(lambda x: label in x)]["book_authors"].values[0])
print("Author Label: ", author_label)
print("Author Names: ", author_names)


Author Label:  [26503]
Author Names:  [['Stephenie Meyer']]
