In [2]:
import pandas as pd 

books = pd.read_csv("books_cleaned.csv")


In [3]:
books['categories'].value_counts()

categories
Fiction                       2111
Juvenile Fiction               390
Biography & Autobiography      311
History                        207
Literary Criticism             124
                              ... 
Aged women                       1
Imperialism                      1
Human-animal relationships       1
Amish                            1
Human cloning                    1
Name: count, Length: 479, dtype: int64

In [4]:
books["categories"].value_counts().reset_index().query("count > 50")


Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
5,Religion,117
6,Philosophy,117
7,Comics & Graphic Novels,116
8,Drama,86
9,Juvenile Nonfiction,57


In [6]:
books[books['categories'] == "Juvenile Fiction"].shape

(390, 13)

In [7]:
books[books['categories'] == "Juvenile Nonfiction"].shape

(57, 13)

In [8]:
category_mapping = {
    'Fiction': "Fiction",
    'Juvenile Fiction': "Children's Fiction",
    'Biology & Autobiography': "Nonfiction",
    'History': "Nonfiction",
    'Literary Criticism': "Nonfiction",
    'Philosophy': "Nonfiction",
    'Religion': "Nonfiction",
    'Comics & Graphic Novels': "Fiction",
    'Drama': "Fiction",
    'Juvenile Nonfiction': "Children's Nonfiction",
    'Science': "Nonfiction",
    'Poetry': "Fiction"
}

books['simple_categories'] = books['categories'].map(category_mapping)

In [9]:
books.head(2)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,


In [11]:
books[~(books["simple_categories"].isna())].shape

(3432, 14)

In [None]:
from transformers import pipeline

fiction_categories = ["Fiction", "Nonfiction"]


pipe=pipeline("zero-shot-classification", 
              model="facebook/bart-large-mnli",
              device='mps')


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/facebook/bart-large-mnli/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/facebook/bart-large-mnli/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


In [None]:
sequence = books.loc[books['simple_categories'] == "Fiction", "description"].reset_index(drop=True)[0]

In [None]:
pipe(sequence, fiction_categories)


In [None]:
# Gettign the predicted label 
import numpy as np 

max_index = np.argmax(pipe(sequence, fiction_categories)['scores'])

max_label = pipe(sequence, fiction_categories)["labels"][max_index]
max_label


In [None]:

def generate_predictions(sequence, categories):
    predictions = pipe(sequence, categories)["labels"]
    max_index = np.argmax(predictions['scores'])
    max_label = predictions["labels"][max_index]
    return max_label




In [None]:
# Testing 
from tqdm import tqdm
# To calculate the time taken for execution

actual_cats = []
predicted_cats = []

for i in tqdm(range(0, 300)):
    sequence = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)[i]
    predicted_cat += generate_predictions(sequence, fiction_categories)
    actual_cats += ["Fiction"]

for i in tqdm(range(0, 300)):
    sequence = books.loc[books["simple_categories"] == "Nonfiction", "description"].reset_index(drop=True)[i]
    predicted_cat += generate_predictions(sequence, fiction_categories)
    actual_cats += ["Nonfiction"]


In [None]:

predictions_df = pd.DataFrame({"actual_categories": actual_cats, "predicted_categories": predicted_cats})

predictions_df.head()


In [None]:

predictions_df["correct_predictions"] = (
    np.where(predictions_df['actual_categories'] == predictions_df['predicted_categories'], 1, 0)
)



In [None]:
predictions_df["correct_prediction"].sum() / len(predictions_df)

In [None]:
isbns=[]
predicted_cats = []

missing_cats = books.loc[books["simple_categories"].isna(), ["isbn13", "description"]].reset_index(drop=True)

In [None]:
for i in tqdm(range(0, len(missing_cats))):
    sequence = missing_cats["description"][i]
    predicted_cats += [generate_predictions(sequence, fiction_categories)]
    isbns += [missing_cats["isbn13"][i]]





In [None]:
missing_predicted_df = pd.DataFrame({"isbn13": isbns, "predicted_categories": predicted_cats})
missing_predicted_df.head(2)


In [None]:
books = pd.merge(books, missing_predicted_df, on="isbns13", how="left")
books["simple_categories"] = np.where(books["simple_categories"].isna(), books["predicted_categories", books["predicted_categories"], books["simple_categories"]])
books = books.drop(columns=["predicted_categories"])




In [None]:
books[books["categories"].str.lower().isin([
    "romance",
    "science fiction",
    "scifi",
    "fantasy",
    "horror",
    "mystery",
    "thriller",
    "comedy",
    "crime", 
    "historical"
])]

In [None]:
books.to_csv("books_with_categories.csv", index=False)