In [None]:
import pandas as pd

In [None]:
books = pd.read_csv("../DATA/books_cleaned.csv")

In [None]:
books["categories"].value_counts().reset_index()

In [None]:
books["categories"].value_counts().reset_index().query("count > 50")

In [None]:
books[books["categories"] == "Juvenile Fiction"]

In [None]:
books[books["categories"] == "Juvenile Nonfiction"]

In [None]:

category_mapping = {'Fiction' : "Fiction",
 'Juvenile Fiction': "Children's Fiction",
 'Biography & Autobiography': "Nonfiction",
 'History': "Nonfiction",
 'Literary Criticism': "Nonfiction",
 'Philosophy': "Nonfiction",
 'Religion': "Nonfiction",
 'Comics & Graphic Novels': "Fiction",
 'Drama': "Fiction",
 'Juvenile Nonfiction': "Children's Nonfiction",
 'Science': "Nonfiction",
 'Poetry': "Fiction"}

books["simple_categories"] = books["categories"].map(category_mapping)

In [None]:
books.head()

In [None]:
books[~(books["simple_categories"].isna())].head()

In [None]:
from transformers import  pipeline

fiction_categories = ["Fiction", "Nonfiction"]

pipe = pipeline("zero-shot-classification",model="facebook/bart-large-mnli",device="cpu")

In [None]:
sequence = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)[0]

In [None]:
pipe(sequence,fiction_categories)

In [None]:
import numpy as np

max_index = np.argmax(pipe(sequence, fiction_categories)["scores"])
max_label = pipe(sequence, fiction_categories)["labels"][max_index]
max_label

In [None]:
def generate_predictions(sequence,categories):
  predictions = pipe(sequence,categories)
  max_index = np.argmax(predictions["scores"])
  max_label = predictions["labels"][max_index]
  return max_label

In [None]:

from tqdm import tqdm

actual_cats = []
predicted_cats = []

for i in tqdm(range(0, 300)):
  sequence = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)[i]
  predicted_cats += [generate_predictions(sequence, fiction_categories)]
  actual_cats += ["Fiction"]

In [None]:

for i in tqdm(range(0, 300)):
  sequence = books.loc[books["simple_categories"] == "Nonfiction", "description"].reset_index(drop=True)[i]
  predicted_cats += [generate_predictions(sequence, fiction_categories)]
  actual_cats += ["Nonfiction"]

In [None]:
predictions_df = pd.DataFrame({"actual_categories":actual_cats,"predicted_categories":predicted_cats})

predictions_df

In [None]:
predictions_df["correct_prediction"] = (np.where(predictions_df["actual_categories"] == predictions_df["predicted_categories"],1,0))

In [None]:
predictions_df["correct_prediction"].sum() / len(predictions_df)

In [None]:
isbns = []
predicted_cats = []

missing_cats = books.loc[books["simple_categories"].isna(),["isbn13","description"]].reset_index(drop=True)

In [None]:
for i in tqdm(range(0,len(missing_cats))):
  sequence = missing_cats["description"][i]
  predicted_cats += [generate_predictions(sequence, fiction_categories)]
  isbns += [missing_cats.loc[i, "isbn13"]]

In [39]:

missing_predicted_df = pd.DataFrame({"isbn13": isbns, "predicted_categories": predicted_cats})

missing_predicted_df

Unnamed: 0,isbn13,predicted_categories
0,9780002261982,Fiction
1,9780006280897,Nonfiction
2,9780006280934,Nonfiction
3,9780006380832,Nonfiction
4,9780006470229,Fiction
...,...,...
1458,9788125026600,Nonfiction
1459,9788171565641,Fiction
1460,9788172235222,Fiction
1461,9788173031014,Nonfiction


In [41]:
books = pd.merge(books,missing_predicted_df,on="isbn13",how="left", suffixes=('', '_new'))
books["simple_categories"] = np.where(books["simple_categories"].isna(), books["predicted_categories"], books["simple_categories"])
books = books.drop(columns=["predicted_categories"])

books.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories,predicted_categories_x,predicted_categories_y,predicted_categories_new
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction,,,
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,Fiction,Fiction,Fiction,Fiction
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction,,,
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,Nonfiction,Nonfiction,Nonfiction,Nonfiction
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",Nonfiction,Nonfiction,Nonfiction,Nonfiction


In [42]:

books[books["categories"].str.lower().isin([
  "romance",
  "science fiction",
  "scifi",
  "fantasy",
  "horror",
  "mystery",
  "thriller",
  "comedy",
  "crime",
  "historical"
])]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories,predicted_categories_x,predicted_categories_y,predicted_categories_new
24,9780006513087,0006513085,Gravity,Tess Gerritsen,Science fiction,http://books.google.com/books/content?id=KI66c...,Emma Watson a research physician has been trai...,2004.0,4.04,342.0,8024.0,Gravity,9780006513087 Emma Watson a research physician...,Nonfiction,Nonfiction,Nonfiction,Nonfiction
486,9780099410355,0099410354,Traitor,Matthew Woodring Stover,Science fiction,http://books.google.com/books/content?id=VbICO...,"From the depths of catastrophe, a glimmer of h...",2002.0,4.0,320.0,6765.0,Traitor,"9780099410355 From the depths of catastrophe, ...",Fiction,Fiction,Fiction,Fiction
489,9780099422341,0099422344,Yeats is Dead!,Joseph O'Connor,Comedy,http://books.google.com/books/content?id=DrE3I...,"In aid of Amnesty International, this is a bri...",2002.0,3.39,298.0,34.0,Yeats is Dead!: A Novel by Fifteen Irish Writers,"9780099422341 In aid of Amnesty International,...",Fiction,Fiction,Fiction,Fiction
502,9780099446729,0099446723,Blackwood Farm,Anne Rice,Horror,http://books.google.com/books/content?id=cIn8T...,"Lestat Is Back, Saviour And Demon, Presiding O...",2003.0,3.86,774.0,26145.0,Blackwood Farm,"9780099446729 Lestat Is Back, Saviour And Demo...",Fiction,Fiction,Fiction,Fiction
1101,9780261102422,0261102427,The Silmarillion,John Ronald Reuel Tolkien,Fantasy,http://books.google.com/books/content?id=22ePu...,Tolkien's Silmarillion is the core work of the...,1999.0,3.91,384.0,253.0,The Silmarillion,9780261102422 Tolkien's Silmarillion is the co...,Fiction,Fiction,Fiction,Fiction
1446,9780340837955,0340837950,Stranger in a Strange Land,Robert A. Heinlein,Science fiction,http://books.google.com/books/content?id=ZQhiP...,"Epic, entertaining, Stranger in a Strange Land...",2005.0,3.92,672.0,563.0,Stranger in a Strange Land,"9780340837955 Epic, entertaining, Stranger in ...",Fiction,Fiction,Fiction,Fiction
1450,9780345251220,0345251229,Visions from Nowhere,William Arrow,Science fiction,,"The first novel in the series, ""Return to the ...",1976.0,3.23,183.0,10.0,Visions from Nowhere,"9780345251220 The first novel in the series, ""...",Fiction,Fiction,Fiction,Fiction
2861,9780575075597,0575075597,Replay,Ken Grimwood,Fantasy,http://books.google.com/books/content?id=9vmNP...,At forty-three Jeff Winston is tired of his lo...,2005.0,4.16,272.0,412.0,Replay,9780575075597 At forty-three Jeff Winston is t...,Fiction,Fiction,Fiction,Fiction
2876,9780590254762,0590254766,"The lion, the witch and the wardrobe",Clive Staples Lewis,Fantasy,,Four English school children enter the magic l...,1995.0,4.21,189.0,860.0,"The lion, the witch and the wardrobe",9780590254762 Four English school children ent...,Nonfiction,Nonfiction,Nonfiction,Nonfiction
3305,9780739423851,0739423851,Wizard's Castle,Diana Wynne Jones,Fantasy,http://books.google.com/books/content?id=hB7hA...,Howl's moving castle - Eldest of three sisters...,2002.0,4.44,376.0,439.0,Wizard's Castle,9780739423851 Howl's moving castle - Eldest of...,Fiction,Fiction,Fiction,Fiction


In [43]:
books.to_csv("../DATA/books_with_categories.csv",index=False)