In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read clean_book.csv (path is relative to the current working directory)
# into the DataFrame that every later cell works from.
books = pd.read_csv("clean_book.csv")

In [None]:
# Display the column labels of the loaded frame.
books.columns

In [None]:
# Shape of the per-category frequency table, i.e. how many distinct
# non-null values the `categories` column holds.
books['categories'].value_counts().shape

In [None]:
# Frequency table of `categories`, restricted to values that occur at
# least 50 times.
(
    books['categories']
    .value_counts()
    .reset_index()
    .query("count>=50")
)

In [None]:
# Lookup table from a raw `categories` value to the coarse label stored in
# `modified_category`. Raw values that are not listed here have no entry.
category_mapping = {
    "Fiction": "Fiction",
    "Juvenile Fiction": "Children's Fiction",
    "Biography & Autobiography": "NonFiction",
    "History": "NonFiction",
    "Literary Criticism": "NonFiction",
    "Philosophy": "NonFiction",
    "Religion": "NonFiction",
    "Comics & Graphic Novels": "Fiction",
    "Drama": "Fiction",
    "Science": "NonFiction",
    "Juvenile Nonfiction": "Children's NonFiction",
    "Poetry": "Fiction",
}

In [None]:
# Translate each raw category through category_mapping; values that are not
# keys of the mapping (including missing ones) become NaN.
books['modified_category'] = books['categories'].map(category_mapping)

In [None]:
# Shape of the subset of rows that received a label from the mapping.
books[books['modified_category'].notna()].shape

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel,Field
from enum import Enum

In [None]:
# Closed set of labels the language model is allowed to return. Members also
# subclass `str`; `.value` yields the plain label text ("Fiction" /
# "NonFiction").
class CategoryType(str, Enum):
    FICTION = "Fiction"
    NONFICTION = "NonFiction"

# Structured-output schema with a single field, `category`, constrained to
# CategoryType. It is passed to PydanticOutputParser as `pydantic_object`
# elsewhere in this notebook.
class Blueprint(BaseModel):
    category: CategoryType = Field(description="Category of the Provided Description")

In [None]:
# `load_dotenv` is imported by this notebook but was never called, so any
# credentials kept in a local .env file were never placed in the process
# environment. Load them before the client is constructed. The call is a
# no-op when no .env file exists and does not override variables that are
# already set.
# NOTE(review): presumably the Gemini client picks its API key up from the
# environment (e.g. GOOGLE_API_KEY) -- confirm against the library docs.
load_dotenv()

# Chat model used to classify descriptions.
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

# Parser that validates the model's reply against the Blueprint schema.
parser = PydanticOutputParser(pydantic_object=Blueprint)

In [None]:
# Prompt with one caller-supplied variable (`description`). The parser's
# format instructions are bound once here as a partial variable, so callers
# only need to pass the description text.
prompt = PromptTemplate(
    input_variables=['description'],
    partial_variables={'format_instructions': parser.get_format_instructions()},
    template="Provided the Description of a book Categorize it as either Fiction or NonFiction. Description : \n {description} \n {format_instructions}",
    validate_template=True,
)

In [None]:
# Compose the pipeline: fill the prompt, call the model, parse the reply
# into a Blueprint instance.
chain = prompt | model |parser

In [None]:
# Pick the first description carrying each already-known label, to use as
# manual smoke-test inputs for the chain.
example1 = (
    books.loc[books["modified_category"] == "Fiction", "description"]
    .reset_index(drop=True)[0]
)
ex2 = (
    books.loc[books["modified_category"] == "NonFiction", "description"]
    .reset_index(drop=True)[0]
)

In [None]:
# Run the chain once on the NonFiction example description.
result=chain.invoke({'description':ex2})

In [None]:
# Display the predicted label as plain text (the enum member's value).
result.category.value

In [None]:
def generate_prediction(task):
    """Classify one book description with the LLM pipeline.

    Args:
        task: Text substituted for the ``description`` variable of the
            notebook-level ``prompt``.

    Returns:
        The ``.value`` of the predicted ``category`` enum member, i.e. the
        label as a plain string.
    """
    # The pipeline is assembled on every call from the notebook-level
    # prompt, model and parser objects.
    pipeline = prompt | model | parser
    parsed = pipeline.invoke({'description': task})
    return parsed.category.value

In [None]:
# Smoke-test the helper on the NonFiction example description.
generate_prediction(ex2)

In [None]:
from tqdm import tqdm

In [None]:
# Accumulators filled by the prediction loop further down.
isbn = []
predicted_cats = []

# Rows the static mapping could not label: keep only the identifier and the
# description, re-indexed from 0.
missing_categ = (
    books
    .loc[books['modified_category'].isna(), ['isbn13', 'description']]
    .reset_index(drop=True)
)

In [None]:
# Number of rows whose original `categories` value is missing (note: this
# inspects `categories`, not `modified_category`).
books['categories'].isna().sum()

In [None]:
# Display the identifiers of the rows that still need a label.
missing_categ['isbn13']

In [None]:
# Classify every still-unlabelled book, one model call per row.
#
# Fixes relative to the previous version of this cell:
#   * each call now receives the description of the current row; before,
#     the whole `description` column was passed to the model on every
#     iteration;
#   * the accumulators are (re)created here, so re-running the cell no
#     longer appends a second copy of every prediction.
isbn = []
predicted_cats = []

for book_isbn, book_description in tqdm(
    zip(missing_categ['isbn13'], missing_categ['description']),
    total=len(missing_categ),  # zip has no length, so give tqdm the row count
):
    # Predict first: if the call raises, neither list has been extended and
    # the two stay aligned.
    predicted_cats.append(generate_prediction(book_description))
    isbn.append(book_isbn)

In [None]:
# Collect the loop results into a two-column frame: identifier and the
# predicted label.
df=pd.DataFrame({"isbn":isbn,"category":predicted_cats})

In [None]:
# Sanity check: (rows, columns) of the prediction frame.
df.shape

In [None]:
# Sanity check: number of rows that repeat an earlier (isbn, category) pair.
df.duplicated().sum()

In [None]:
# Distribution of the predicted labels.
df['category'].value_counts()

In [None]:
# Sanity check: number of identifiers collected by the loop.
len(isbn)

In [None]:
# Sanity check: number of predictions collected by the loop (should equal
# the number of identifiers).
len(predicted_cats)

In [None]:
# Attach each prediction to its unlabelled row by matching `isbn13` with
# `isbn`; the left join keeps every row of missing_categ.
new = missing_categ.merge(df, how="left", left_on="isbn13", right_on="isbn")

In [None]:
# Rows of the left frame that found no prediction in the join.
new[new['isbn'].isna()]

In [None]:
# Shape of the rows where the two key columns disagree (rows with a missing
# `isbn` also compare as unequal).
new[new['isbn'] != new['isbn13']].shape

In [None]:
# Bring the predictions back onto the full catalogue. Both frames carry a
# `description` column, so the join yields `description_x` (from books) and
# `description_y` (from new).
new_book = books.merge(new, how="left", on="isbn13")

In [None]:
# Eyeball four random rows of the merged frame (no seed is set, so the rows
# differ between runs).
new_book.sample(4)

In [None]:
# Where the static mapping produced no label, fall back to the predicted
# `category`; otherwise keep the mapped label.
label_missing = new_book['modified_category'].isna()
new_book['modified_category'] = np.where(
    label_missing,
    new_book['category'],
    new_book['modified_category'],
)

In [None]:
# Per-column count of missing values after the fill.
new_book.isna().sum()

In [None]:
# Remove the join helper columns in place.
# NOTE(review): the books-side description column is left under the
# suffixed name `description_x` -- confirm downstream readers expect that.
# NOTE: because this mutates new_book, re-running the cell raises KeyError.
new_book.drop(columns=['isbn', 'category', 'description_y'], inplace=True)

In [None]:
# Persist the enriched catalogue to new_book.csv (relative to the working
# directory) without writing the row index.
new_book.to_csv("new_book.csv",index=False)