In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma  import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

In [None]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): nothing visible in this notebook reads an API key — confirm this is still needed.
load_dotenv()


In [None]:
import pandas as pd

# Load the cleaned book table; later cells read its "isbn13", "description",
# "categories" and "tagged_description" columns.
books=pd.read_csv("books_cleaned.csv")

In [None]:
# Display the freshly loaded table as a sanity check.
books


In [None]:
# Preview the column that is exported and embedded below.
books["tagged_description"]

In [None]:
# Dump one tagged description per row (no header, no index) so the file can be
# read back as plain text by the loader in the next cell.
books["tagged_description"].to_csv(
    "tagged_description.csv",
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Read the exported descriptions back as a single text document.
raw_data = TextLoader("tagged_description.csv", encoding="utf-8").load()

# Split on newlines. chunk_size=1 is presumably chosen so that no two lines are
# ever merged into one chunk, i.e. one chunk per line of the file — TODO confirm.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1,
    chunk_overlap=0,
)
document = text_splitter.split_documents(raw_data)

In [None]:
# Inspect the first chunk produced by the splitter.
document[0]


In [None]:
# First space-delimited token of the second chunk. Later cells parse this token
# as an int and match it against books["isbn13"].
document[1].page_content.split(" ")[0]

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
# Embed every chunk with the all-MiniLM-L6-v2 sentence-transformer model and
# index the resulting vectors in a Chroma store used by the searches below.
embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
db_books = Chroma.from_documents(document, embedding=embedder)
print(db_books)

In [None]:
# Example query: fetch the ten chunks closest to the query text and show them.
query = "a book to teach children about nature"
docs = db_books.similarity_search(query, k=10)
docs

In [None]:
# Look up the book row for the top hit. The leading token of the chunk is the
# ISBN-13; a chunk that was quoted on export starts with '"', so strip the quote
# before parsing (same parsing as semantic_search_similarity) to avoid ValueError.
books.loc[books["isbn13"]==int(docs[0].page_content.strip('"').split()[0])]


In [None]:
# Same lookup with a plain boolean mask. Strip a possible leading '"' (added when
# the exported row was quoted) before converting the first token to an int.
books[books["isbn13"]==int(docs[0].page_content.strip('"').split()[0])]

In [None]:
# ISBN-13 of the top hit; the quote strip guards against chunks that start with
# '"' because the exported row was quoted.
int(docs[0].page_content.strip('"').split()[0])

In [None]:
def semantic_search_similarity(
        query:str,
        top_k : int = 10,
        initial_top_k : int = 50,

) -> pd.DataFrame:
    """Return the `top_k` books most similar to `query`, best match first.

    Args:
        query: Free-text description of the book being looked for.
        top_k: Maximum number of rows to return.
        initial_top_k: Number of candidates requested from the vector store
            (raised to `top_k` when `top_k` is larger, so the request is never
            smaller than the number of rows wanted).

    Returns:
        Rows of the global `books` frame whose `isbn13` matches a retrieved
        chunk, ordered by similarity rank and truncated to `top_k` rows.
    """
    recs=db_books.similarity_search(query,k=max(initial_top_k,top_k))

    # Remember the best (lowest) rank of each ISBN. The chunk text starts with
    # the ISBN-13, possibly preceded by a '"' when the exported row was quoted.
    rank_by_isbn={}
    for rank,rec in enumerate(recs):
        isbn=int(rec.page_content.strip('"').split()[0])
        rank_by_isbn.setdefault(isbn,rank)

    matches=books.loc[books["isbn13"].isin(list(rank_by_isbn))]

    # `isin` keeps the frame's own row order, so re-sort by similarity rank
    # before truncating; otherwise head(top_k) would drop better matches that
    # merely appear later in the table.
    ranked=matches.sort_values(
        "isbn13",
        key=lambda isbns: isbns.map(rank_by_isbn),
        kind="stable",
    )
    return ranked.head(top_k)

In [None]:
# Smoke-test the search helper with a generic query (default top_k).
semantic_search_similarity("books")

In [None]:
# Count how many books fall into each raw category.
books["categories"].value_counts().reset_index()

In [None]:
# Keep only categories with at least 50 books. The query relies on the counts
# column being called "count".
# NOTE(review): older pandas releases name that column differently — confirm the pinned version.
books["categories"].value_counts().reset_index().query("count >= 50")

In [None]:
# Collapse the most frequent raw categories into four simplified labels.
# The label spellings match the candidate labels used for zero-shot
# classification ("children's fiction", "children's nonfiction"), so that
# filters on `simple_categories` and model predictions use one vocabulary.
category_mapping={
    "Fiction":"fiction",
    "Juvenile Fiction":"children's fiction",
    "Biography & Autobiography":"nonfiction",
    "History":"nonfiction",
    "Literary Criticism":"nonfiction",
    "Religion":"nonfiction",
    "Philosophy":"nonfiction",
    "Comics & Graphic Novels":"fiction",
    "Drama":"fiction",
    "Juvenile Nonfiction":"children's nonfiction",
    "Science":"nonfiction",
    "Poetry":"fiction",
    "Literary Collections":"nonfiction",
}

In [None]:
# Map raw categories to the simplified set; categories that are not keys of
# category_mapping become NaN and are predicted by the classifier further down.
books["simple_categories"]=books["categories"].map(category_mapping)

In [None]:
# Display the table with the new simple_categories column.
books

In [None]:
# Rows that already received a simplified category from the mapping.
books[~(books["simple_categories"].isna())]

In [None]:
from transformers import pipeline
import torch

# Candidate labels for zero-shot classification (spelling fixed: the previous
# strings contained "childern" and a stray "/").
labels=["fiction","nonfiction","children's nonfiction", "children's fiction"]

# Use the first GPU when one is available and fall back to CPU (-1) otherwise,
# so the notebook also runs on machines without CUDA.
pipe = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1,
)


In [None]:
# Description of the first book whose simplified category is "fiction";
# used as a known-answer example for the classifier.
sequence=books.loc[books["simple_categories"]=="fiction","description"].reset_index(drop=True)[0]

In [None]:
# Show the example description.
sequence

In [None]:
# Four-way candidate label set; run the classifier on the example description
# and display its raw output.
labels = [
    "fiction",
    "nonfiction",
    "children's fiction",
    "children's nonfiction",
]
pipe(sequence, labels)

In [None]:
import numpy as np

# Run the (expensive) classifier once and reuse its output for both lookups.
prediction=pipe(sequence,labels)
# argmax finds the index of the maximum score
max_index=np.argmax(prediction["scores"])
max_label=prediction["labels"][max_index]

In [None]:
# From here on only the two-way fiction / nonfiction split is predicted.
labels=['fiction','nonfiction']
# Keep the value to inspect as the cell's last expression; a bare name followed
# by another statement is evaluated but never displayed.
max_label



In [None]:
def generate_prediction(
        sequence :str ,
        categories : list
      )-> str:
    """Return the highest-scoring candidate label for `sequence`.

    Args:
        sequence: Text to classify (a book description in this notebook).
        categories: Candidate labels passed to the zero-shot pipeline.

    Returns:
        The entry of the pipeline's "labels" list whose "scores" value is the
        largest.
    """
    # Run inference once: every call to `pipe` is a full model forward pass.
    result=pipe(sequence,categories)
    max_index=int(np.argmax(result["scores"]))
    return result["labels"][max_index]


In [None]:
# Description of the first book whose simplified category is "nonfiction".
sequence1=books.loc[books["simple_categories"]=="nonfiction","description"].reset_index(drop=True)[0]


In [None]:
# Classify the nonfiction example with the current `labels` and show the result.
ans=generate_prediction(sequence1,labels)
ans

In [None]:
from tqdm import tqdm
# Long-running loops can look as if they are stuck, so tqdm is used to show a
# progress bar with the percentage done, speed, items processed, etc.

In [None]:
actual_cats=[]
pred_cats=[]

# Filter the fiction descriptions once instead of re-filtering the whole table
# on every iteration.
fiction_descriptions=books.loc[books["simple_categories"]=="fiction","description"].reset_index(drop=True)

# Evaluate on up to 300 known-fiction books (fewer if the table has fewer,
# rather than failing with a KeyError).
for i in tqdm(range(0,min(300,len(fiction_descriptions)))):
    sequence=fiction_descriptions[i]
    actual_cats += ["fiction"]
    pred_cats +=[generate_prediction(sequence,labels)]

In [None]:
# Filter the nonfiction descriptions once instead of re-filtering the whole
# table on every iteration.
nonfiction_descriptions=books.loc[books["simple_categories"]=="nonfiction","description"].reset_index(drop=True)

# Evaluate on up to 300 known-nonfiction books (fewer if the table has fewer).
# NOTE: this appends to the lists created in the fiction cell, so re-running it
# on its own adds the same rows again.
for i in tqdm(range(0,min(300,len(nonfiction_descriptions)))):
    sequence=nonfiction_descriptions[i]
    actual_cats += ["nonfiction"]
    pred_cats +=[generate_prediction(sequence,labels)]


In [None]:
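# Disabled: evaluation on children's fiction, kept for reference only.
# The prediction is wrapped in a list so that `+=` appends one label instead of
# extending the list with the label's individual characters.
# for i in tqdm(range(0,300)):
#     sequence=books.loc[books["simple_categories"]=="children's fiction","description"].reset_index(drop=True)[i]
#     actual_cats += ["children's fiction"]
#     pred_cats += [generate_prediction(sequence,labels)]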


In [None]:
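# Disabled: evaluation on children's nonfiction, kept for reference only.
# The prediction is wrapped in a list so that `+=` appends one label instead of
# extending the list with the label's individual characters.
# for i in tqdm(range(0,300)):
#     sequence=books.loc[books["simple_categories"]=="children's nonfiction","description"].reset_index(drop=True)[i]
#     actual_cats += ["children's nonfiction"]
#     pred_cats += [generate_prediction(sequence,labels)]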


In [None]:
# Pair every true label with the classifier's prediction, one row per book.
predicted_categories = pd.DataFrame(
    {
        "actuals_categories": actual_cats,
        "predicted_categories": pred_cats,
    }
)


In [None]:
# Number of true labels collected; should equal the number of predictions.
len(actual_cats)

In [None]:
# Number of predictions collected.
len(pred_cats)

In [None]:
# Dump the raw list of predicted labels (one entry per evaluated book).
pred_cats

In [None]:
# Display the actual-vs-predicted table.
predicted_categories


In [None]:
# Flag each row with 1 when the prediction equals the true label, else 0.
predicted_categories["correct_prediction"] = np.where(
    predicted_categories["actuals_categories"] == predicted_categories["predicted_categories"],
    1,
    0,
)

In [None]:
# Accuracy: share of evaluated books whose predicted label matches the true one.
predicted_categories["correct_prediction"].sum()/len(predicted_categories)

In [None]:
# Books that the category mapping left without a simplified category.
books.loc[books["simple_categories"].isna()]

In [None]:
# Books still lacking a simplified category: keep only the key and the text the
# classifier needs, with a fresh 0..n-1 index for positional access.
missing_cats_books = (
    books
    .loc[books["simple_categories"].isna(), ["isbn13", "description"]]
    .reset_index(drop=True)
)

# Accumulators filled by the prediction loop in the next cell.
isbn = []
pred_cats = []

In [None]:

# Start from empty lists inside this cell so that re-running it does not append
# a second copy of every prediction (duplicate ISBNs here would duplicate book
# rows in the later merge).
isbn=[]
pred_cats=[]

# Classify every book that has no simplified category, keeping ISBN and
# prediction aligned by position.
for book_isbn,description in tqdm(
    zip(missing_cats_books["isbn13"],missing_cats_books["description"]),
    total=len(missing_cats_books),
):
    pred_cats.append(generate_prediction(description,labels))
    isbn.append(book_isbn)

In [None]:
# One row per previously uncategorised book: its ISBN and predicted label.
missing_pred_df=pd.DataFrame({"isbn13": isbn, "predicted_categories": pred_cats})

In [None]:
# Display the predictions for the uncategorised books.
missing_pred_df

In [None]:
# Attach the predictions by ISBN (left join keeps every book), then use the
# predicted label only where the mapping produced no simplified category.
books = pd.merge(books, missing_pred_df, how="left", on="isbn13")
books["simple_categories"] = np.where(
    books["simple_categories"].isna(),
    books["predicted_categories"],
    books["simple_categories"],
)

In [None]:
# Check for books that are still without a simplified category after the fill.
books.loc[books["simple_categories"].isna()]

In [None]:
# Remove the helper column now that simple_categories has been filled.
# NOTE: running this cell a second time raises KeyError because the column is gone.
books=books.drop(columns=["predicted_categories"])

In [None]:
# Display the table after the category fill.
books

In [None]:
# Show books whose raw category (case-insensitively) is exactly one of these
# genre names.
genre_names = [
    "romance",
    "science fiction",
    "scifi",
    "fantasy",
    "mystery",
    "thriller",
    "comedy",
    "crime",
    "historical",
]
books[books["categories"].str.lower().isin(genre_names)]

In [None]:
# Persist the table, including simple_categories, without the index column.
books.to_csv("book_with_categories.csv",index=False)

In [None]:
# Display the vector store object (confirms it is still in scope).
db_books
# NOTE(review): the comment below looks like a placeholder heading for a later stage — confirm.
#sentiment analysis

In [None]:
# Write the tagged descriptions once more, this time to a .txt file, with the
# same layout as the earlier .csv export (one row per book, no header/index).
books["tagged_description"].to_csv(
    "tagged_description.txt",
    sep="\t",
    header=False,
    index=False,
)