In [1]:
import numpy as np
import pandas as pd
import fitz     # PyMuPDF Library
from tqdm.auto import tqdm      #   Visualization of Loading Bars
import tensorflow as tf
import keras   
import re       #   A very powerful library for some basic text pre-processing!!

  from .autonotebook import tqdm as notebook_tqdm


# Loading text and pre-processing

In [2]:
def text_formatter(text: str) -> tuple[str, str]:
    """
    Performs minor formatting on text
    """
    new_text = text.replace('\n', ' ').strip()      #   All the text will be in a single line now
    
    # Remove double and triple spaces
    new_text = re.sub(r'\s{2,}', ' ', new_text)
    
    # Fix '- ', ' ?', ' ;' and '" ' symbols
    new_text = new_text.replace('- ', '-')
    new_text = new_text.replace(' ?', '?')
    new_text = new_text.replace(' "', '"')
    new_text = new_text.replace(' ;', ';')
    
    new_text = new_text.replace("\\", "")
    new_text = ' '.join(new_text.split())
    new_text = new_text.replace(r"\'s", "'s")
    new_text = re.sub(r'\d+', '', new_text)
    new_text = re.sub(r"\\'|\\|\'", '', new_text)

    new_text = new_text.replace('THE RAMAYANA OF VALMIKI', '')
    new_text = new_text.replace('OF VALMIKI', '')
    new_text = new_text.replace('RAMAYANA', '')
    new_text = new_text.replace('THE', '')

    new_text = re.sub(r'.*?VALMIKI', '', new_text)

    
    match = re.search(r'((\b\w+\b\s+){0,2})KANDA', new_text)
    if match:
        preceding_words = match.group(1).strip()
        result = f"{preceding_words} KANDA".strip()
    else:
        result = ""

    # new_text = re.sub(r'.*?KANDA', '', new_text)        #   Do u really want to use it?

    return new_text, result

In [3]:
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Takes in the path of the pdf file and returns a list of dictionaries containing all the details
    You can easily convert this output into a dataframe
    """
    doc = fitz.open(pdf_path)

    pages_and_text = []
    for page_num, page in tqdm(enumerate(doc)):
        text = page.get_text()
        if text:
            text, Kanda = text_formatter(text)

            pages_and_text.append({
                'page_num': page_num - 18,
                "page_char_cnt": len(text),
                "page_word_cnt": len(text.split()),
                "page_sentence_cnt": len(text.split(".")),
                "page_token_cnt": len(text) / 4,  # 1 token => 4 English characters
                "text": text,
                "kanda": Kanda
            })
    
    return pages_and_text

In [4]:
pdf_path = "C:/Users/91629/Desktop/6th Sem/eGyanam Technologies/Assignment 1/Ramayana.pdf"
doc = fitz.open(pdf_path)
pages_and_text = open_and_read_pdf(pdf_path=pdf_path)

1709it [01:04, 26.65it/s]


In [20]:
import random
num = random.randint(0, 1709)
pages_and_text[num]

{'page_num': 649,
 'page_char_cnt': 1862,
 'page_word_cnt': 340,
 'page_sentence_cnt': 9,
 'page_token_cnt': 465.5,
 'text': 'KISHKINDHA KANDA CHAPTER  Her I.amntationr SEEING her lord lying on the earth, pierced by that death-dealing arrow discharged by Rama, Tara, whose face resembled the moon, approaching him, embraced him. At the sight of Bali, who lay like an elephant wounded by an arrow, that monkey resembling a huge mountain or an uprooted tree, Tara poured out her heart, tom with grief, in lamentation :-"  Thou who wert full of valour in combat !  Hero !  Best of Monkeys! It is because of my recent importunities that thou wilt not now speak to me I Rise,  Lion among Monkeys and rest on a comfortable couch ! Those great monarchs, thine equals, do not sleep on the earth; or is the earth thy cherished love, since even in dying thou dost lie by her and scornest me?" Without doubt,  Warrior, thanks to thy great exploits, thou hast founded another and more glorious Kishkindha in heav

In [21]:
def simple_formatter(new_text: str) ->str:
    new_text = new_text.replace('\n', ' ').strip()      #   All the text will be in a single line now
    
    # Remove double and triple spaces
    new_text = re.sub(r'\s{2,}', ' ', new_text)
    
    # Fix '- ', ' ?', ' ;' and '" ' symbols
    new_text = new_text.replace('- ', '-')
    new_text = new_text.replace(' ?', '?')
    new_text = new_text.replace(' "', '"')
    new_text = new_text.replace(' ;', ';')
    
    new_text = new_text.replace("\\", "")
    new_text = ' '.join(new_text.split())
    new_text = new_text.replace(r"\'s", "'s")
    new_text = re.sub(r'\d+', '', new_text)
    new_text = re.sub(r"\\'|\\|\'", '', new_text)

    return new_text

In [22]:
#   Error: No text got filtered for these pages
#       However: 8/1709 => 0.4%. So, maybe you can manually add the text of these 8 pages as well..!!
cnt = 0
pages = []
for item in pages_and_text:
    if len(item['text']) == 0:
        cnt += 1
        pages.append(item['page_num'])


print(f"Count of pages: {cnt}")
print(pages)

Count of pages: 0
[]


In [23]:
df = pd.DataFrame(pages_and_text)

In [24]:
#   Adding those pages which could not come (didn't perform any filtering here)
for page in tqdm(pages):
    m = simple_formatter(doc.get_page_text(pno= page-18))
    
    # Find the index for the current page
    idx = df[df['page_num'] == page].index
    
    if not idx.empty:
        df.loc[idx, 'text'] = m
        df.loc[idx, 'page_char_cnt'] = len(m)
        df.loc[idx, 'page_word_cnt'] = len(m.split())
        df.loc[idx, 'page_sentence_cnt'] = len(m.split("."))
        df.loc[idx, 'token_cnt'] = (len(m.split(".")) / 4)

0it [00:00, ?it/s]


In [23]:
#   Kandas missing from so many pages....!!


# c = 0
# for item in df['kanda']:
#     if item == "":
#         c += 1

# print(c)

In [25]:
df.describe().T.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
page_num,1703.0,837.1,493.6,-18.0,409.5,837.0,1264.5,1690.0
page_char_cnt,1703.0,1981.76,314.63,17.0,1831.5,2002.0,2205.0,2604.0
page_word_cnt,1703.0,338.49,57.03,4.0,311.0,343.0,377.0,463.0
page_sentence_cnt,1703.0,16.19,14.42,1.0,9.0,12.0,16.0,102.0
page_token_cnt,1703.0,495.44,78.66,4.25,457.88,500.5,551.25,651.0


In [26]:
df.iloc[984:995]

Unnamed: 0,page_num,page_char_cnt,page_word_cnt,page_sentence_cnt,page_token_cnt,text,kanda
984,971,1211,187,64,302.75,GLOSSARY K.sHA. A Whip. KouMODAK!. A Weapon gi...,
985,972,1189,184,62,297.25,"GLOSSARY p PAISHA AsTRA. The Ghosdy Weapoa, be...",
986,973,1247,188,63,311.75,GLOSSARY SBATAGNI. Either I spited mace or I S...,
987,974,72,14,5,18.0,"GLOSSARY w X y YAMIYA. The Weapon of"" Dead>. Y...",
988,976,114,17,4,28.5,Translated by HARI PRASAD SHASTRI Vol. III. Y...,YUDDHA KANDA
989,977,73,11,1,18.25,Prinl~tl ia Gr•tll Brilaifl at lh# BuaLBJGR Pa...,
990,978,1586,282,30,396.5,CONTENTS BooK VI-YUDDHA KANDA CHAPTER PAGE I. ...,YUDDHA KANDA
991,979,1734,309,31,433.5,CONTENTS CHAPTER PAGE · Malyavan advises Ravan...,
992,980,1785,325,34,446.25,CONTENTS PAGE · Lanka is set on fire by the Mo...,
993,981,765,141,19,191.25,CONTENTS CHAPTER PAGE . Hanuman carries Ramas ...,


# Further text pre-processing using Spacy

In [27]:
from spacy.lang.en import English

In [28]:
nlp = English()
nlp.add_pipe('sentencizer')

for item in tqdm(pages_and_text):
    item['sentences'] = list(nlp(item['text']).sents)

    #   Make sure all datatypes are strings (the default type is SpaCy datatype)
    item['sentences'] = [str(sent) for sent in item['sentences']]

    #   Count the Sentences
    item['page_sentence_count_spacy'] = len(item['sentences'])

100%|██████████| 1703/1703 [00:05<00:00, 326.19it/s]


In [29]:
import random
num = random.randint(0, 1208)
pages_and_text[num]

{'page_num': 948,
 'page_char_cnt': 1758,
 'page_word_cnt': 283,
 'page_sentence_cnt': 87,
 'page_token_cnt': 439.5,
 'text': 'GLOSSARY to be bom of the sacred cow Sbabola for her proteclioll. See Balakanda. MllmANGA. A kind of drum. MlltGt. Daughter of Krodhavasha, mother of elephants. MRrrYu. The God of Death, anotber name for Yama. Mt:HURTA. An instant, a moment, an hour, according to the context. MUNt. A holy Sage, a pious and learned person, a tide applied to Rishis and others. MURAGA. A tamboutine. MUSHTII<AS. People cursed by Vishwamitra who assumed the lowest caste. N NABHAGA. The son of Yayati and fatber of Aja, who was Dasarathas father. NAGAS. The Serpent Race. NAHISHA. The father of King Yayati, Nahushas curious story is found in the Mahabharata and Putanas. NAIRJUTAS. A race of Demons, offs~ of Nainiti or Niritti. NARAPRISIITA. The highest heaven (from Nab-vault) in which there is no unhappiness. NAKSHATRAS. The Hindus, beside the common division of the Zodiac into twelve 

In [30]:
#   Sentence splitting with Spacy and manually are almost same (Just slightly different)
df2 = pd.DataFrame(pages_and_text)
df2.describe().T.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
page_num,1703.0,837.1,493.6,-18.0,409.5,837.0,1264.5,1690.0
page_char_cnt,1703.0,1981.76,314.63,17.0,1831.5,2002.0,2205.0,2604.0
page_word_cnt,1703.0,338.49,57.03,4.0,311.0,343.0,377.0,463.0
page_sentence_cnt,1703.0,16.19,14.42,1.0,9.0,12.0,16.0,102.0
page_token_cnt,1703.0,495.44,78.66,4.25,457.88,500.5,551.25,651.0
page_sentence_count_spacy,1703.0,16.47,12.52,1.0,11.0,13.0,17.0,77.0


In [31]:
df2.iloc[1629:1640]

Unnamed: 0,page_num,page_char_cnt,page_word_cnt,page_sentence_cnt,page_token_cnt,text,kanda,sentences,page_sentence_count_spacy
1629,1617,2100,336,15,525.0,"sun, he issued from his palace barefooted and...",,"[ sun, he issued from his palace barefooted an...",15
1630,1618,1939,323,9,484.75,UTTARA KANDA CHAPTER I llama ascmds to Heaom ...,UTTARA KANDA,[UTTARA KANDA CHAPTER I llama ascmds to Heaom...,10
1631,1619,1973,341,10,493.25,"in heaven felt a supreme delight, their desir...",,"[ in heaven felt a supreme delight, their desi...",11
1632,1620,1317,235,11,329.25,UTTARA KANDA The Lord Vishnu returned to Svarg...,UTTARA KANDA,[UTTARA KANDA The Lord Vishnu returned to Svar...,10
1633,1621,1626,265,56,406.5,"GLOSSARY (For Flowers, Trees and Weapons, see ...",,"[GLOSSARY (For Flowers, Trees and Weapons, see...",48
1634,1622,2070,361,70,517.5,GLOSSARY AoNutUNDA. A pit or hole where the sa...,,"[GLOSSARY AoNutUNDA., A pit or hole where the ...",67
1635,1623,1965,337,69,491.25,GLOSSARY ANDHAKA. A de-the son of the Sage Kas...,,"[GLOSSARY ANDHAKA., A de-the son of the Sage K...",60
1636,1624,2001,328,64,500.25,GLOSSARY ABA. A monkey leader. Aassw~MAN. A ti...,,"[GLOSSARY ABA., A monkey leader., Aassw~MAN., ...",61
1637,1625,2123,359,58,530.75,GLOSSARY AsHvAGBlVA. The son of Kashyapa. AsHv...,,"[GLOSSARY AsHvAGBlVA., The son of Kashyapa., A...",53
1638,1626,1820,309,54,455.0,GLOSSARY AYUS. The son of Pururavas and Urvash...,,"[GLOSSARY AYUS., The son of Pururavas and Urva...",51


# Perform Chunking

In [32]:
num_sentences_chunk_size = 10

def split_sentences(input_list: list[str], slice_size: int=num_sentences_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

In [33]:
for item in tqdm(pages_and_text):
    item['sentence_chunks'] = split_sentences(input_list=item['sentences'], slice_size=num_sentences_chunk_size)
    item['num_chunks'] = len(item['sentence_chunks'])

100%|██████████| 1703/1703 [00:00<00:00, 218037.23it/s]


In [34]:
random.sample(pages_and_text, k=1)

[{'page_num': 1163,
  'page_char_cnt': 2287,
  'page_word_cnt': 389,
  'page_sentence_cnt': 10,
  'page_token_cnt': 571.75,
  'text': 'YUDDHA KANDA showered rocks, stones and trees of every kind on the head of Kumbhakarna but that mighty one split and evaded that rain of trees. And he hurled himself upon that great army of monkeys btandisbing his sbarp pick and, as he rushed forward Hanuman placed himself in the way armed with the peak of a mountain, and, in futy sttuck Kumbhakama a violent blow, who, in his appalling cotpulence, appeated like a hill! Then he whose limbs were dripping with fat and streaming with blood, stumbled under the shock and the titan hurled his spear, which was as bright as lightning and like a mountain spouting fotth flames, striking Maruti between the arms, as Guha formerly struck the Krauncha Mountain with his formidable lance. His bteast pierced by that spesr, beside himself, vomiting blood, Hanuman, in fury, let fotth a terrible cry in the midst of the batt

In [35]:
df3 = pd.DataFrame(pages_and_text)
df3.describe().T.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
page_num,1703.0,837.1,493.6,-18.0,409.5,837.0,1264.5,1690.0
page_char_cnt,1703.0,1981.76,314.63,17.0,1831.5,2002.0,2205.0,2604.0
page_word_cnt,1703.0,338.49,57.03,4.0,311.0,343.0,377.0,463.0
page_sentence_cnt,1703.0,16.19,14.42,1.0,9.0,12.0,16.0,102.0
page_token_cnt,1703.0,495.44,78.66,4.25,457.88,500.5,551.25,651.0
page_sentence_count_spacy,1703.0,16.47,12.52,1.0,11.0,13.0,17.0,77.0
num_chunks,1703.0,2.12,1.28,1.0,2.0,2.0,2.0,8.0


## Splitting a chunk into it's own item

In [36]:
import re

#   'pages_and_text' is the dictionary corresponding to the dataframe.....

pages_and_chunks = []
for item in tqdm(pages_and_text):
    
    if item['page_num'] >= -1:       #   Unwanted Pages removed
        for sentence_chunk in item['sentence_chunks']:

            # 'item['sentence_chunk']' is the list of lists that we created before: Contains the chunks of sentences
            chunk_dict = {}
            chunk_dict['page_num'] = item['page_num']

            #   Join the sentences together into a paragraph-like structure aka join the list of sentences into one paragraph
            joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
            joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

            chunk_dict['sentence_chunk'] = joined_sentence_chunk

            #   Get some stats on our chunks
            chunk_dict['chunk_char_count'] = len(joined_sentence_chunk)
            chunk_dict['chunk_word_count'] = len([word for word in joined_sentence_chunk.split(" ")])
            chunk_dict['chunk_token_count'] = len(joined_sentence_chunk)/4

            pages_and_chunks.append(chunk_dict)


len(pages_and_chunks)

100%|██████████| 1703/1703 [00:00<00:00, 7931.50it/s]


3574

In [37]:
df4 = pd.DataFrame(pages_and_chunks)
df4.describe().T.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
page_num,3574.0,882.26,521.1,-1.0,415.0,907.5,1355.75,1690.0
chunk_char_count,3574.0,934.21,657.75,0.0,313.0,818.0,1495.75,2418.0
chunk_word_count,3574.0,158.86,112.01,1.0,53.0,141.0,256.0,420.0
chunk_token_count,3574.0,233.55,164.44,0.0,78.25,204.5,373.94,604.5


## Filter chunks of text for short chunks

These chunks may not contain much useful information

In [38]:
min_token_length = 30

for row in df4[df4['chunk_token_count'] <= min_token_length].sample(5).iterrows():
    print(f"Chunk token count: {row[1]['chunk_token_count']} | Text: {row[1]['sentence_chunk']}")

Chunk token count: 28.75 | Text: Chief of the gods, preserve me and thyaelf fi:om Gautama."Indra laughed and answered :  Thou of beautiful waist, !J
Chunk token count: 5.25 | Text: BOOK II AYODHYA KANDA
Chunk token count: 14.0 | Text: KsHIJpA, A plant or shrub with short branches and roots.
Chunk token count: 14.75 | Text: The monkeys, however, their faces shining like full-blown »
Chunk token count: 17.75 | Text: GLOSSARY w X y YAMIYA. The Weapon of" Dead>. YoGANDHARA. The United.z ~


In [39]:
max_token_length = 550

for row in df4[(df4['chunk_token_count'] > min_token_length) & (df4['chunk_token_count'] < max_token_length)].sample(5).iterrows():
    print(f"Chunk token count: {row[1]['chunk_token_count']} | Text: {row[1]['sentence_chunk']}")

Chunk token count: 298.75 | Text: AYODHYA KANDA have been obedient to their fathers will. I follow no new law, nor one conuary to the traditions of the royal dynasty, but tread the patb of my illustrious ancestors. I am accomplishing nought whieb has not already been accomplished in this world. He who acts in accordance witb his fathers commands does not fall from virtue."Having spoken tbus to his motber, Shri Rama addressed Lakshmana, saying :" Lakshmana, I am acquainted witb thine immeasurable love for me, tby valour and tby prowess; none can witbstand tbcc. Lakshmana, my motber endowed witb every good quality is now subject to misery and grief through ignorance of dharma and laek of resignation. Brotber, dharma is the highest good on earth, Trutb and dharma arc one. My fathers command is founded on dharma, hence it is superior to my mothers ruling. Hero, it is unwortby in one seeking tbe supreme fruit of dharma, not to fullil the promise made to his fatber, motber or a learned brahm

## Embedding our text chunks

In [9]:
from sentence_transformers import SentenceTransformer




In [10]:
embedding_model = SentenceTransformer("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)

In [48]:
pages_and_chunks_modified = []
for item in tqdm(pages_and_chunks):
    if item['chunk_token_count'] > 30:
        pages_and_chunks_modified.append(item)

len(pages_and_chunks_modified)

100%|██████████| 3574/3574 [00:00<00:00, 3575964.34it/s]


3391

In [49]:
#   Will take quite a bit time. So, better store and save these embeddings in a Vector Database
#   Took 3.5 hrs to run
meow = []
for item in tqdm(pages_and_chunks_modified):
    embeddings = embedding_model.encode(item['sentence_chunk'])
    item['embedding'] = embeddings
    meow.append(embeddings)

100%|██████████| 3391/3391 [3:21:57<00:00,  3.57s/it]  


In [50]:
meow = np.array(meow)
meow.shape

(3391, 1024)

# Save the embeddings for future use

In [53]:
df5 = pd.DataFrame(pages_and_chunks)

In [56]:
df5.iloc[1254:1260]

Unnamed: 0,page_num,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
1254,601,"KISHKINDHA KANDA all inflame my desire. I, who...",1103,193,275.75,"[0.35453555, 0.31067443, 0.7647348, 0.78929484..."
1255,601,When shall I hear the incomparable voice of Va...,1078,185,269.5,"[-0.19307077, 0.30760765, 0.48369014, 0.215577..."
1256,602,RAM A Y AN A F v:A L M I;K I he descend into h...,792,152,198.0,"[-0.54592574, -0.3410208, -0.12372417, 0.21413..."
1257,602,Hast thou perchance forgotten the greatness of...,1252,204,313.0,"[0.6150873, 0.35031578, 0.16450037, 0.06497544..."
1258,603,KISHKINDHA KANDA CHAPTER SugrifJtl sends Hanum...,1820,301,455.0,"[0.3802888, 0.48232132, 0.9606592, -0.0235045,..."
1259,604,"with apprehension and caused thy flight, Bull ...",1135,189,283.75,"[0.31381765, 0.31953374, 0.90550166, -0.162502..."


In [60]:
questions = []
for item in tqdm(pages_and_chunks_modified):
    questions.append(item['sentence_chunk'])

100%|██████████| 3391/3391 [00:00<00:00, 849228.86it/s]


In [61]:
batch_size = 128
vector_limit = 100000

questions = questions[:vector_limit]

In [62]:
pro_dict = {} 
for i in range(meow.shape[0]):
    pro_dict[i] = {
        'sentence': questions[i],
        'embedding': meow[i].tolist()  # convert the tensor to a list if necessary
    }

In [64]:
lion = pd.DataFrame.from_dict(pro_dict, orient='index')

In [65]:
lion.to_csv("C:/Users/91629/Desktop/Ramayana Embeddings.csv")

### Saved embeddings in the Vector Database as well for efficient Semantic Search

In [70]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec, PodSpec

In [71]:
pc = Pinecone(api_key='abjskdbsdkbskbskjbsdva')

In [72]:
import os
use_serverless = True

In [73]:
if use_serverless:
    spec = ServerlessSpec(cloud='aws', region='us-east-1')
else:
    spec = PodSpec(environment=environment)

In [15]:
index_name = "ramayana-embeddings"

In [76]:
#   Delete the index, if index of the same name already exists
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

In [78]:
pc.create_index(
        index_name,
        dimension=embedding_model.get_sentence_embedding_dimension(),     #   Set the dimension in this way
        metric='cosine',
        spec=spec
    )

In [16]:
# connect to index
index = pc.Index(index_name)

# view index stats
index.describe_index_stats()

NameError: name 'pc' is not defined

In [80]:
for i in tqdm(range(0, len(questions), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(questions))
    # create IDs batch
    ids = [str(x) for x in range(i, i_end)]
    # create metadata batch
    metadatas = [{'text': text} for text in questions[i:i_end]]
    xc = meow[i:i_end]    
    
    # create records list for upsert
    records = zip(ids, xc, metadatas)
    
    # upsert to Pinecone
    index.upsert(vectors=records)

100%|██████████| 27/27 [01:19<00:00,  2.95s/it]


In [81]:
# view index stats
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3391}},
 'total_vector_count': 3391}

#   Fetch the embeddings stored in the local file

In [66]:
lioness = pd.read_csv("C:/Users/91629/Desktop/Ramayana Embeddings.csv")
lioness['Embeddings'] = lioness['embedding'].apply(lambda e: np.array(eval(e)))
lioness.drop('embedding', axis=1, inplace=True)
lioness.head()

Unnamed: 0.1,Unnamed: 0,sentence,Embeddings
0,0,CHAPTER I Sltri Narada rta to V almiki tM rtor...,"[0.41220077872276306, -0.23101629316806793, 0...."
1,1,"Nony hymaa of the IUg.,-la ... -a.ut<d to him....","[-0.8503050804138184, 0.254336416721344, -0.50..."
2,2,"proportioned limbs and skin of bluish tint, on...","[0.19522586464881897, 0.6316515803337097, 0.50..."
3,3,Violmu. ·-The Incarnation~ or Dmnc Delcent cal...,"[-0.4730081856250763, -0.9195123314857483, 0.7..."
4,4,"BALA KANDA The daughter of King Janab, an inca...","[-0.19800657033920288, 0.714275598526001, 1.46..."


In [67]:
Meow = []
for i in range(lioness.shape[0]):
    Meow.append(list(lioness['Embeddings'].iloc[i]))

Meow = np.array(Meow)
Meow.shape

(3391, 1024)

In [69]:
type(Meow)

numpy.ndarray

#   Query the Embeddings from Pinecone

#### Write your query here

In [13]:
query = "Who is Dasarath?"

In [21]:
from pinecone import Pinecone

# Initialize the Pinecone client with your API key
pc = Pinecone(api_key="kjvbkjbsvjkvkjabvjkvbv")

# List all indexes to verify connection
print(pc.list_indexes())
# pinecone.init(api_key="fhjbv,jvab,jvbjkvjkvfjkvjfbkvjbfjvbdf jv dbvjb", region='us-west-2')

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 1024,
              'host': 'ramayana-embeddings-azjut3w.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'ramayana-embeddings',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}


In [22]:
# Specify your index name
index_name = "ramayana-embeddings"

# Connect to the existing index
index = pc.Index(index_name)

In [23]:
# create the query vector
xq = embedding_model.encode(query).tolist()

# now query
top_results = index.query(vector=xq, top_k=10, include_metadata=True)
top_results

{'matches': [{'id': '821',
              'metadata': {'text': 'DASARATHA. King of Koshala, father of Shri '
                                   'Rams, Lablunana Bharata and~· DBVAS. The '
                                   'Gods or s~ Ooes. Dm. A title given to '
                                   'Parvan, Shivss consort. DHANIJDA. A name '
                                   'of Kuvera. DHAIIA. W"lfe of the Sage ~· '
                                   'DHARMA. Traditionally or · course of '
                                   'conduct, or duty.'},
              'score': 0.749836683,
              'values': []},
             {'id': '3026',
              'metadata': {'text': 'DASARATHA. King of Kosbala and father of '
                                   'Rama, Lakshmana, Bbarata and Sbatrughna. '
                                   'DASHAGRIVA. Ten-necked One •, a title '
                                   'given to Ravana. DASHANANA. Ten .. faced '
                                   'One, a

#   Perform re-ranking of the top k results

In [24]:
from sentence_transformers import CrossEncoder

In [25]:
# Load the model, here we use our base sized model
reranking_model = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")

In [35]:
for match in top_scores:
    print(match)

{'id': '821',
 'metadata': {'text': 'DASARATHA. King of Koshala, father of Shri Rams, '
                      'Lablunana Bharata and~· DBVAS. The Gods or s~ Ooes. Dm. '
                      'A title given to Parvan, Shivss consort. DHANIJDA. A '
                      'name of Kuvera. DHAIIA. W"lfe of the Sage ~· DHARMA. '
                      'Traditionally or · course of conduct, or duty.'},
 'score': 0.749836683,
 'values': []}
{'id': '3026',
 'metadata': {'text': 'DASARATHA. King of Kosbala and father of Rama, '
                      'Lakshmana, Bbarata and Sbatrughna. DASHAGRIVA. '
                      'Ten-necked One •, a title given to Ravana. DASHANANA. '
                      'Ten .. faced One, a title given to Ravana. .'},
 'score': 0.715905905,
 'values': []}
{'id': '257',
 'metadata': {'text': 'Having taken leave of the King of Mithila, Shri '
                      'Dasaratha, preceded by the holy sage, went away.~~;:bL '
                      'tJ: ~·~ ~~ seuon ULtara Pha

In [41]:
for match in top_scores:
    print(match['metadata']['text'])
    # print(match['metadata'])

DASARATHA. King of Koshala, father of Shri Rams, Lablunana Bharata and~· DBVAS. The Gods or s~ Ooes. Dm. A title given to Parvan, Shivss consort. DHANIJDA. A name of Kuvera. DHAIIA. W"lfe of the Sage ~· DHARMA. Traditionally or · course of conduct, or duty.
DASARATHA. King of Kosbala and father of Rama, Lakshmana, Bbarata and Sbatrughna. DASHAGRIVA. Ten-necked One •, a title given to Ravana. DASHANANA. Ten .. faced One, a title given to Ravana. .
Having taken leave of the King of Mithila, Shri Dasaratha, preceded by the holy sage, went away.~~;:bL tJ: ~·~ ~~ seuon ULtara PhaJpanl il co.u ..
King Dasaratha thereupon duly honoured his kinsman who spent the night happily in company with the princes. The following day, rising early, King Dasaratha performing his customary devotions, proceeded to the sacrificial pavilion, escorted by the sages. At an auspicious hour in the presence of Shri Vaaishtha and other sages, Shri Ramachandra and his brothers adorned with every ornament being present

In [42]:
top_scores = top_results.matches  # Get the top 10 matches
documents = [match['metadata']['text'] for match in top_scores]

In [43]:
# Lets get the scores
final_results = reranking_model.rank(query, documents, return_documents=True, top_k=5)

In [44]:
query

'Who is Dasarath?'

In [45]:
final_results

[{'corpus_id': 1,
  'score': 0.3505557,
  'text': 'DASARATHA. King of Kosbala and father of Rama, Lakshmana, Bbarata and Sbatrughna. DASHAGRIVA. Ten-necked One •, a title given to Ravana. DASHANANA. Ten .. faced One, a title given to Ravana. .'},
 {'corpus_id': 0,
  'score': 0.3379044,
  'text': 'DASARATHA. King of Koshala, father of Shri Rams, Lablunana Bharata and~· DBVAS. The Gods or s~ Ooes. Dm. A title given to Parvan, Shivss consort. DHANIJDA. A name of Kuvera. DHAIIA. W"lfe of the Sage ~· DHARMA. Traditionally or · course of conduct, or duty.'},
 {'corpus_id': 6,
  'score': 0.20243756,
  'text': 'Thy son, resembling Shri Vishnu himself, is benevolent and generous to all. King, do this with a cheerful mind."CHAPTIR TM i!ruolws Sltri Rllln shall be instalkd To those who with joined palms were making this petition, King Dasaratha answered courteously :-" To-day, indeed, I am happy and fortunate since the peopl~ desire my aon, Shri Rama, to be proclaimed regent."Thus, in the presenc

In [48]:
for item in final_results:
    print(item['text'])

DASARATHA. King of Kosbala and father of Rama, Lakshmana, Bbarata and Sbatrughna. DASHAGRIVA. Ten-necked One •, a title given to Ravana. DASHANANA. Ten .. faced One, a title given to Ravana. .
DASARATHA. King of Koshala, father of Shri Rams, Lablunana Bharata and~· DBVAS. The Gods or s~ Ooes. Dm. A title given to Parvan, Shivss consort. DHANIJDA. A name of Kuvera. DHAIIA. W"lfe of the Sage ~· DHARMA. Traditionally or · course of conduct, or duty.
Thy son, resembling Shri Vishnu himself, is benevolent and generous to all. King, do this with a cheerful mind."CHAPTIR TM i!ruolws Sltri Rllln shall be instalkd To those who with joined palms were making this petition, King Dasaratha answered courteously :-" To-day, indeed, I am happy and fortunate since the peopl~ desire my aon, Shri Rama, to be proclaimed regent."Thus, in the presence of his subjects, the king in gracious aa:enta addressed Shri Vasishtha, Vamadeva and other sages :-" In this month of Cbitra, when the woods are beautiful wit

### Try to get the page no. of the responses as well

#   Pass to an LLM

In [46]:
from langchain.prompts.prompt import PromptTemplate
from langchain_ollama import ChatOllama

In [63]:
#   Pass the retrieved context here

# N = 3        #   Hyperaparameter to pass the top-3 results of re-ranking to an LLM
# information = [item['text'] for item in final_results[:N]]
# information = final_results[0]['text']

In [71]:
information = """
DASARATHA. King of Kosbala and father of Rama, Lakshmana, Bbarata and Sbatrughna. DASHAGRIVA. Ten-necked One •, a title given to Ravana. DASHANANA. Ten .. faced One, a title given to Ravana. .
"""

In [73]:
summary_template = """
    Given the information {information} about a prompt, I want to answer the given query from the Indian epic Ramayana: {query}.
    You can use your knowledge along with the provide information to answer the question. However, it it given that the information passed here
    is exactly correct. Despite all these, if you don't know the answer, simply say No. Don't try to generate
    wrong answers to the given query.
"""

In [74]:
summary_prompt_template = PromptTemplate(
    input_variables=["information"], template=summary_template
)

In [75]:
llm = ChatOllama(model='llama3', temperature=0.3)

In [80]:
#   Will take 4 to 5 minutes to answer. Also, your laptop may lag a bit during running this...

chain = summary_prompt_template | llm
res = chain.invoke(input= {"information": information,
                           "query": query})

In [81]:
print(query)

Who is Dasarath?


In [82]:
print(res)

content='Based on the provided information and my knowledge of the Indian epic Ramayana, I can confidently answer that Dasarath is King of Kosbala and father of Rama, Lakshmana, Bharata, and Shatrughna.' response_metadata={'model': 'llama3', 'created_at': '2024-07-28T06:36:35.6236096Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 176570674300, 'load_duration': 82261304600, 'prompt_eval_count': 170, 'prompt_eval_duration': 18199678000, 'eval_count': 50, 'eval_duration': 76052640000} id='run-b13508a2-9631-4fd2-8e30-983727fec6be-0' usage_metadata={'input_tokens': 170, 'output_tokens': 50, 'total_tokens': 220}


### Langchain Output Parsers:-

An output parser is an object in Langchain that helps us parse the output of an LLM. We are going to use it a lot as we can do a lot of cool stuffs with the help of this, such as:-
1. Convert into a JSON object
2. etc..

As of now, we are going to use a String output Parser, which is basically going to access the .content attribute of the AI message that we get.

In [5]:
from langchain_core.output_parsers import StrOutputParser

In [None]:
chain = summary_prompt_template | llm | StrOutputParser()
res = chain.invoke(input= {"information": information})

print(res)