In [1]:
import pandas as pd
import numpy as np

In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from dotenv import load_dotenv

In [3]:
books = pd.read_csv(r'book-recommender\final_books.csv')

In [4]:
books.head(3)

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...
2,3,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."


In [6]:
loader = TextLoader('tagged_descripton.txt',encoding='utf-8')
raw_document = loader.load()

full_text = raw_document[0].page_content
metadata = raw_document[0].metadata

description_text = full_text.split('\n')

#final document object
documents = [Document(page_content=text.strip(),
                      metadata=metadata)
                      for text in description_text if text.strip()]




In [7]:
print(f"total documents:",len(documents))
print(documents[0])

total documents: 5197
page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and 

In [9]:
embed_model = HuggingFaceEmbeddings(
    model_name = 'all-MiniLM-L6-v2',
    model_kwargs={'device':'cpu'}
)

db_books = Chroma(
    embedding_function = embed_model
)

batch_size =250
total_documents = len(documents)

for i in range(0, total_documents,batch_size):
    batch=documents[i:i+batch_size]
    db_books.add_documents(batch)

    print(f"Processed docs {i+1}")

print("Vector database creation complete")


Processed docs 1
Processed docs 251
Processed docs 501
Processed docs 751
Processed docs 1001
Processed docs 1251
Processed docs 1501
Processed docs 1751
Processed docs 2001
Processed docs 2251
Processed docs 2501
Processed docs 2751
Processed docs 3001
Processed docs 3251
Processed docs 3501
Processed docs 3751
Processed docs 4001
Processed docs 4251
Processed docs 4501
Processed docs 4751
Processed docs 5001
Vector database creation complete


In [14]:
query = 'A book about world war'
docs=db_books.similarity_search(query,k=10)
docs

[Document(id='3d92e644-3f2a-49c0-bfda-5a6fa4d5f0e2', metadata={'source': 'tagged_descripton.txt'}, page_content="9780688085872 Despite the numerous books on World War II, until now there has been no one-volume survey that was both objective and comprehensive. Previous volumes have usually been written from an exclusively British or American point of view, or have ignored the important causes and consequences of the War. A Short History of World War II is essentially a military history, but it reaches from the peace settlements of World War I to the drastically altered postwar world of the late 1940's. Lucidly written and eminently readable, it is factual and accurate enough to satisfy professional historians. A Short History of World War II will appeal equally to the general reader, the veteran who fought in the War, and the student interested in understanding the contemporary political world."),
 Document(id='c25acf16-0a8b-409b-9344-618881977762', metadata={'source': 'tagged_descripto

In [15]:
books[books['isbn13'] == int(docs[0].page_content.split()[0].strip())]

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
3180,4149,9780688085872,688085873,A Short History of World War II,James L. Stokesbury,History,http://books.google.com/books/content?id=uDBhl...,"Despite the numerous books on World War II, un...",1980.0,3.93,416.0,454.0,A Short History of World War II,9780688085872 Despite the numerous books on Wo...


In [16]:
def retrieve(
       query:str,
       top_k:int=10, 
):
    recs = db_books.similarity_search(query,k=50)
    books_list=[]

    for i in range(0,len(recs)):
        books_list += [int(recs[i].page_content.strip('"').split()[0])]
    
    return books[books['isbn13'].isin(books_list)].head(top_k)


In [17]:
retrieve('A book about world war 2')

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
66,79,9780007162994,0007162995,If I Die in a Combat Zone,Tim O'Brien,"Vietnam War, 1961-1975",http://books.google.com/books/content?id=0qUtS...,Perhaps the best book to emerge from the Vietn...,2003.0,3.95,208.0,11.0,If I Die in a Combat Zone,9780007162994 Perhaps the best book to emerge ...
461,543,9780099273868,0099273861,The Great World,David Malouf,Australia,http://books.google.com/books/content?id=f8IiK...,"""Every city, town and village has its memorial...",1999.0,3.87,336.0,514.0,The Great World,"9780099273868 ""Every city, town and village ha..."
524,608,9780099483472,0099483475,All Quiet on the Western Front,Erich Maria Remarque,"World War, 1914-1918",,All Quiet on the Western Front is probably the...,2005.0,3.95,216.0,1018.0,All Quiet on the Western Front,9780099483472 All Quiet on the Western Front i...
541,625,9780099910107,0099910101,A Farewell to Arms,Ernest Hemingway,War,http://books.google.com/books/content?id=m68Lh...,"In 1918 Ernest Hemingway went to war, to the '...",1994.0,3.8,293.0,210197.0,A Farewell to Arms,9780099910107 In 1918 Ernest Hemingway went to...
572,670,9780140149241,0140149244,We Were the Rats,Lawson Glassop,"Tobruk, Battles of, 1941-1942",,Reissue of the famous novel based on the autho...,1991.0,3.23,275.0,13.0,We Were the Rats,9780140149241 Reissue of the famous novel base...
795,970,9780142002803,0142002801,"The Fall of Berlin, 1945",Antony Beevor,History,http://books.google.com/books/content?id=u6avD...,Chronicles the horror of Berlin's fall to the ...,2003.0,4.28,490.0,9635.0,"The Fall of Berlin, 1945",9780142002803 Chronicles the horror of Berlin'...
796,972,9780142002889,0142002887,When the Elephants Dance,Tess Uriza Holthe,Fiction,http://books.google.com/books/content?id=XKtOP...,In the final weeks of the Japanese occupation ...,2003.0,4.03,368.0,2525.0,When the Elephants Dance: A Novel,9780142002889 In the final weeks of the Japane...
859,1046,9780143037576,0143037579,Fear and Trembling,Søren Kierkegaard,Philosophy,http://books.google.com/books/content?id=acFp6...,"The perfect books for the true book lover, Pen...",2005.0,4.0,160.0,13333.0,Fear and Trembling,9780143037576 The perfect books for the true b...
876,1071,9780143104902,014310490X,Black Lamb and Grey Falcon,Rebecca West,History,http://books.google.com/books/content?id=Lfe0W...,"Written on the brink of World War II, West's c...",2007.0,4.23,1181.0,1550.0,Black Lamb and Grey Falcon: A Journey Through ...,9780143104902 Written on the brink of World Wa...
955,1184,9780192822895,0192822896,La Débâcle,Emile Zola;Elinor Dorday;Robert Lethbridge,Fiction,http://books.google.com/books/content?id=GxNnS...,Zola wrote that 'my title speaks not merely of...,2000.0,3.99,592.0,763.0,La Débâcle,9780192822895 Zola wrote that 'my title speaks...


In [19]:
#categorisation->zero-shot classification
books['categories'].value_counts().reset_index().query('count>50')


Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
5,Philosophy,117
6,Religion,117
7,Comics & Graphic Novels,116
8,Drama,86
9,Juvenile Nonfiction,57


In [20]:
category_mapping ={
    'Fiction':'Fiction',
    'Juvenile Fiction':"Children's Fiction",
    'Biography & Autobiography':'Nonfiction',
    'History':'Nonfiction',
    'Literary Criticism':'Nonfiction',
    'Philosophy':'Nonfiction',
    'Religion':'Nonfiction',
    'Comics & Graphic Novels':'Fiction',
    'Drama':'Fiction',
    'Juvenile Nonfiction':"Children's Nonfiction",
    'Science':'Nonfiction',
    'Poetry':'Fiction'
}

In [21]:
books['simple_categories']= books['categories'].map(category_mapping)

In [22]:
books

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
1,1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,
2,3,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
3,4,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,
4,5,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,6802,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222 On A Train Journey Home To North...,
5193,6803,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...,
5194,6804,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...,
5195,6805,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction


In [23]:
books[~(books['simple_categories'].isna())]

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
2,3,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
8,12,9780006482079,0006482074,Warhost of Vastmark,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,Warhost of Vastmark,9780006482079 Tricked once more by his wily ha...,Fiction
30,35,9780006646006,000664600X,Ocean Star Express,Mark Haddon;Peter Sutton,Juvenile Fiction,http://books.google.com/books/content?id=I2QZA...,Joe and his parents are enjoying a summer holi...,2002.0,3.50,32.0,1.0,Ocean Star Express,9780006646006 Joe and his parents are enjoying...,Children's Fiction
46,54,9780007121014,0007121016,Taken at the Flood,Agatha Christie,Fiction,http://books.google.com/books/content?id=3gWlx...,A Few Weeks After Marrying An Attractive Young...,2002.0,3.71,352.0,8852.0,Taken at the Flood,9780007121014 A Few Weeks After Marrying An At...,Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,6778,9781933648279,1933648279,Night Has a Thousand Eyes,Cornell Woolrich,Fiction,http://books.google.com/books/content?id=3Gk6s...,"""Cornell Woolrich's novels define the essence ...",2007.0,3.77,344.0,680.0,Night Has a Thousand Eyes,"9781933648279 ""Cornell Woolrich's novels defin...",Fiction
5188,6797,9784770028969,4770028962,Coin Locker Babies,村上龍,Fiction,http://books.google.com/books/content?id=87DJw...,Rescued from the lockers in which they were le...,2002.0,3.75,393.0,5560.0,Coin Locker Babies,9784770028969 Rescued from the lockers in whic...,Fiction
5189,6799,9788122200850,8122200850,"Cry, the Peacock",Anita Desai,Fiction,http://books.google.com/books/content?id=_QKwV...,This book is the story of a young girl obsesse...,1980.0,3.22,218.0,134.0,"Cry, the Peacock",9788122200850 This book is the story of a youn...,Fiction
5195,6805,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction


In [67]:
#Zero-shot classification
from transformers import pipeline
pipe = pipeline(
    "zero-shot-classification",
    model="typeform/distilbert-base-uncased-mnli"
)

config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/258 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


In [68]:
fiction_categories = ['Fiction',"Nonfiction"]

In [69]:
sequence = books.loc[books['simple_categories']=='Fiction','description'].reset_index(drop=True)

In [70]:
sequence

0       A NOVEL THAT READERS and critics have been eag...
1       A memorable, mesmerizing heroine Jennifer -- b...
2       Tricked once more by his wily half-brother, Ly...
3       A Few Weeks After Marrying An Attractive Young...
4       For sixty years, Jewish refugees and their des...
                              ...                        
2359    In 1934, a "sickly pathetic marmoset" named Mi...
2360    In this sequel to the popular Cooking with Fer...
2361    "Cornell Woolrich's novels define the essence ...
2362    Rescued from the lockers in which they were le...
2363    This book is the story of a young girl obsesse...
Name: description, Length: 2364, dtype: object

In [71]:
result=pipe(sequence,fiction_categories)

In [72]:
result

{'sequence': 'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst

In [73]:
max_index = np.argmax(result['scores'])
max_label = result['labels'][max_index]
max_label

'Fiction'

In [59]:
def generate_predictions(sequence,categories):
    predictions= pipe(sequence,fiction_categories)
    max_index = np.argmax(predictions['scores'])
    max_label=predictions['labels'][max_index]
    return max_label

In [35]:
#checking accuracy
from tqdm import tqdm

In [74]:
isbn=[]
predicted_cats=[]

missing_cats =books.loc[books['simple_categories'].isna(),['isbn13','description']].reset_index(drop=True)

In [75]:
missing_cats.head()

Unnamed: 0,isbn13,description
0,9780002261982,A new 'Christie for Christmas' -- a full-lengt...
1,9780006280897,Lewis' work on the nature of love divides love...
2,9780006280934,"""In The Problem of Pain, C.S. Lewis, one of th..."
3,9780006380832,Until Vasco da Gama discovered the sea-route t...
4,9780006470229,A new-cover reissue of the fourth book in the ...


In [76]:
sequences_to_classify=missing_cats['description'].tolist()
isbn=missing_cats['isbn13'].tolist()
candidate_labels = fiction_categories

actual_cats =[]
predicted_cats=[]
processed_isbn=[]

#batch processing
batch_size=100
total_items=len(sequences_to_classify)

print(f'starting zero-shot-classification for {total_items} items in batches of {batch_size}...')

for i in tqdm(range(0,total_items,batch_size)):
    batch_sequence=sequences_to_classify[i:i+batch_size]
    batch_result=pipe(batch_sequence,candidate_labels)
    
    for j,result in enumerate(batch_result):
        max_index=np.argmax(result['scores'])
        max_label=result['labels'][max_index]

        predicted_cats.append(max_label)
        processed_isbn.append(isbn[i+j])



starting zero-shot-classification for1454 items in batches of 100...


100%|██████████████████████████████| 15/15 [39:43<00:00, 158.88s/it]


In [78]:
missing_predicted_df = pd.DataFrame({"isbn13": isbn, "predicted_categories": predicted_cats})

In [79]:
missing_predicted_df

Unnamed: 0,isbn13,predicted_categories
0,9780002261982,Fiction
1,9780006280897,Fiction
2,9780006280934,Fiction
3,9780006380832,Nonfiction
4,9780006470229,Fiction
...,...,...
1449,9788125026600,Fiction
1450,9788171565641,Fiction
1451,9788172235222,Fiction
1452,9788173031014,Fiction


In [81]:

books = pd.merge(books, missing_predicted_df, on="isbn13", how="left")
books["simple_categories"] = np.where(books["simple_categories"].isna(), books["predicted_categories"], books["simple_categories"])
books = books.drop(columns = ["predicted_categories"])

In [82]:
books

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
1,1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,Fiction
2,3,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
3,4,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,Fiction
4,5,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,6802,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222 On A Train Journey Home To North...,Fiction
5193,6803,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...,Fiction
5194,6804,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...,Nonfiction
5195,6805,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction


In [84]:
books.to_csv('books_with_categories.csv',index=False)

In [85]:
### Sentiment Analysis

In [87]:
books = pd.read_csv("books_with_categories.csv")

In [88]:
books

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
1,1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,Fiction
2,3,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
3,4,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,Fiction
4,5,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,6802,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222 On A Train Journey Home To North...,Fiction
5193,6803,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...,Fiction
5194,6804,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...,Nonfiction
5195,6805,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction


In [92]:

from transformers import pipeline
classifier = pipeline("text-classification",
                      model="j-hartmann/emotion-english-distilroberta-base",
                      top_k = None)
classifier("I love this!")

Device set to use cpu


[[{'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'surprise', 'score': 0.008528688922524452},
  {'label': 'neutral', 'score': 0.005764586851000786},
  {'label': 'anger', 'score': 0.004419783595949411},
  {'label': 'sadness', 'score': 0.002092392183840275},
  {'label': 'disgust', 'score': 0.0016119893407449126},
  {'label': 'fear', 'score': 0.0004138524236623198}]]

In [93]:
classifier(books['description'][0])

[[{'label': 'fear', 'score': 0.6548405885696411},
  {'label': 'neutral', 'score': 0.16985228657722473},
  {'label': 'sadness', 'score': 0.11640921980142593},
  {'label': 'surprise', 'score': 0.02070065587759018},
  {'label': 'disgust', 'score': 0.019100677222013474},
  {'label': 'joy', 'score': 0.01516144908964634},
  {'label': 'anger', 'score': 0.003935146611183882}]]

In [94]:
classifier(books['description'][0].split("."))

[[{'label': 'surprise', 'score': 0.7296020984649658},
  {'label': 'neutral', 'score': 0.14038600027561188},
  {'label': 'fear', 'score': 0.06816228479146957},
  {'label': 'joy', 'score': 0.04794260859489441},
  {'label': 'anger', 'score': 0.009156366810202599},
  {'label': 'disgust', 'score': 0.0026284765917807817},
  {'label': 'sadness', 'score': 0.002122163539752364}],
 [{'label': 'neutral', 'score': 0.44937002658843994},
  {'label': 'disgust', 'score': 0.27359163761138916},
  {'label': 'joy', 'score': 0.10908330976963043},
  {'label': 'sadness', 'score': 0.09362746775150299},
  {'label': 'anger', 'score': 0.04047830402851105},
  {'label': 'surprise', 'score': 0.026970159262418747},
  {'label': 'fear', 'score': 0.006879047024995089}],
 [{'label': 'neutral', 'score': 0.6462159752845764},
  {'label': 'sadness', 'score': 0.24273329973220825},
  {'label': 'disgust', 'score': 0.04342271760106087},
  {'label': 'surprise', 'score': 0.028300564736127853},
  {'label': 'joy', 'score': 0.014211

In [99]:

import numpy as np

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

def calculate_max_emotion_scores(predictions):
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        sorted_predictions = sorted(prediction, key=lambda x: x["label"])
        for index, label in enumerate(emotion_labels):
            per_emotion_scores[label].append(sorted_predictions[index]["score"])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [100]:

for i in range(10):
    isbn.append(books["isbn13"][i])
    sentences = books["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

In [101]:
emotion_scores

{'anger': [0.06413359194993973,
  0.6126202344894409,
  0.06413359194993973,
  0.35148438811302185,
  0.08141235262155533,
  0.2322249710559845,
  0.5381842255592346,
  0.06413359194993973,
  0.3006700277328491,
  0.06413359194993973],
 'disgust': [0.27359163761138916,
  0.3482847511768341,
  0.10400661826133728,
  0.1507224589586258,
  0.18449543416500092,
  0.727174699306488,
  0.155854731798172,
  0.10400661826133728,
  0.2794816195964813,
  0.17792661488056183],
 'fear': [0.9281681180000305,
  0.9425276517868042,
  0.9723208546638489,
  0.3607059419155121,
  0.09504340589046478,
  0.051362793892621994,
  0.7474274635314941,
  0.4044976532459259,
  0.9155241250991821,
  0.051362793892621994],
 'joy': [0.932798445224762,
  0.7044219970703125,
  0.7672380805015564,
  0.2518811821937561,
  0.04056438058614731,
  0.043375786393880844,
  0.872565746307373,
  0.04056438058614731,
  0.04056438058614731,
  0.04056438058614731],
 'sadness': [0.6462159752845764,
  0.887939453125,
  0.54947710

In [102]:

from tqdm import tqdm

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

for i in tqdm(range(len(books))):
    isbn.append(books["isbn13"][i])
    sentences = books["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|█████████████████████████| 5197/5197 [1:24:52<00:00,  1.02it/s]


In [103]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df["isbn13"] = isbn

In [104]:
emotions_df

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,neutral,isbn13
0,0.064134,0.273592,0.928168,0.932798,0.646216,0.967158,0.729602,9780002005883
1,0.612620,0.348285,0.942528,0.704422,0.887939,0.111690,0.252546,9780002261982
2,0.064134,0.104007,0.972321,0.767238,0.549477,0.111690,0.078765,9780006178736
3,0.351484,0.150722,0.360706,0.251881,0.732684,0.111690,0.078765,9780006280897
4,0.081412,0.184495,0.095043,0.040564,0.884390,0.475881,0.078765,9780006280934
...,...,...,...,...,...,...,...,...
5192,0.148208,0.030643,0.919165,0.255172,0.853721,0.980877,0.030656,9788172235222
5193,0.064134,0.114383,0.051363,0.400262,0.883198,0.111690,0.227765,9788173031014
5194,0.009997,0.009929,0.339218,0.947779,0.375754,0.066685,0.057625,9788179921623
5195,0.064134,0.104007,0.459269,0.759456,0.951104,0.368111,0.078765,9788185300535


In [105]:
books = pd.merge(books, emotions_df, on = "isbn13")

In [106]:

books.to_csv("books_with_emotions.csv", index = False)