### Retrieval Function

Query Based Docs Extraction

In [7]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from dotenv import load_dotenv, find_dotenv
from langchain.vectorstores import Chroma
import chromadb
import sys
import os

In [5]:
load_dotenv(find_dotenv())
sys_path = os.environ['sys_path']
os.chdir(sys_path)
sys.path.append(sys_path)

In [4]:
from src.data_ingestion.vector_db import hf, coll, coll_dict

print(hf)
print(coll)
print(coll_dict)

<chromadb.utils.embedding_functions.HuggingFaceEmbeddingFunction object at 0x000002369573EC00>
name='pdf_embedding_collection' id=UUID('13c1b95e-fe07-4d7f-9862-a60366eb476c') metadata=None tenant=None database=None
{'collection_name': 'pdf_embedding_collection', 'LLM_model': 'sentence-transformers/all-mpnet-base-v2'}


In [50]:
client = chromadb.HttpClient(host="localhost", port=8000)
print(client.list_collections())

[Collection(name=pdf_embedding_collection)]


In [None]:
client.get_collection(name='pdf_embedding_collection').peek()

In [53]:
from sentence_transformers import SentenceTransformer

emb_model = SentenceTransformer("all-mpnet-base-v2")

def query_based_docs_extraction(query: str):
    try:
        query_vector = emb_model.encode(query).tolist()
        result = coll.query(
            query_embeddings=[query_vector],
            n_results=4,
            include=['metadatas', 'distances']
        )
        print(f"Query: {query}")
        print("\n-----------------")
        print(f"Result: {result}")

        return result
        
    except Exception as e:
        print("Vector Search failed! ", e)

query = "What is the full form of LEG?"
result = query_based_docs_extraction(query)

Query: What is the full form of LEG?

-----------------
Result: {'ids': [['152565ab-1343-11ef-90af-088fc35d8982', '15f893dc-1343-11ef-89b8-088fc35d8982', '1401c285-1343-11ef-a000-088fc35d8982', '15872007-1343-11ef-87e6-088fc35d8982']], 'distances': [[1.7115465610961735, 1.870292304901018, 1.9297316442415102, 1.936529155202835]], 'embeddings': None, 'metadatas': [[{'topic': 'ABBREVIATIONS'}, {'topic': 'The Road Map for Recovery and Accelerated Learning'}, {'topic': 'FOREWORD'}, {'topic': 'Recovery and Accelerated Learning (ReAL)'}]], 'documents': None, 'uris': None, 'data': None}


Ranking of Retrieved Docs

In [None]:
from langchain.llms import HuggingFaceEndpoint
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank

HF_KEY = os.environ['HUGGINGFACE_API_KEY']
client = chromadb.HttpClient(host="localhost", port=8000)

llm = HuggingFaceEndpoint(
        repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1",
        huggingfacehub_api_token = HF_KEY)

# Using vector database as retriever
vector_db = Chroma(
      collection_name="pdf_embedding_collection",
      embedding_function=emb_model,
      client=client
  )

compressor = FlashrankRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=vector_db.as_retriever()
)

rel_docs = compression_retriever.invoke(
    query
)
print([doc.metadata["id"] for doc in rel_docs])

### Chains

In [68]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from time import time

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",
                                   model_kwargs={"device": "cpu"})

# Using vector database as retriever
vector_db = Chroma(
      collection_name="pdf_embedding_collection",
      embedding_function=embeddings,
      client=client
  )

# Using vector database as retriever
retriever = vector_db.as_retriever()

qa = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True
)

query = "When was ReAL was organized in Nepal?"
prompt = f"""
<|system|>You are a PDF Chatbot that gives responses based on the context provided in the PDF. Please given correct response and if there is no answer, say so.
</s>
<|user|>
{query}
</s>
<|PDF Chatbot|>
"""
t1 = time()
response = qa.invoke(prompt)
t2 = time()
print(f"AI: {response} /n Time taken: {round(t2-t1, 3)} sec")

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
AI: {'query': '\n<|system|>You are a PDF Chatbot that gives responses based on the context provided in the PDF. Please given correct response and if there is no answer, say so.\n</s>\n<|user|>\nWhen was ReAL was organized in Nepal?\n</s>\n<|PDF Chatbot|>\n', 'result': ' The Recovery and Accelerated Learning (ReAL) Plan was organized in Nepal in 2023.'} /n Time taken: 6.173 sec


Conversational Retrieval Chain

In [71]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain.chains import create_retrieval_chain
from langchain.chains import create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain

# Prompt to generate search query for retriever
srch_qry_prompt = ChatPromptTemplate.from_messages([
MessagesPlaceholder(variable_name="chat_history"),
    ("user","{input}"),
    ("user","Given the above conversation, generate a search query to look up to get information relevant to the conversation")
])

# Creating retriever chain to return docs from Chroma DB
retriever_chain = create_history_aware_retriever(llm, retriever, srch_qry_prompt)

ans_prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer the user's questions based on the below context:\\n\\n{context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("user","{input}"),
])

# Creating doc chain to send prompt to LLM
doc_chain=create_stuff_documents_chain(llm, ans_prompt)        

retrieval_chain = create_retrieval_chain(retriever_chain, doc_chain)        

In [72]:
chat_history = [
    HumanMessage(
        content="When was the ReAL organized in Nepal?",
    ),
    AIMessage(
        content="The Recovery and Accelerated Learning (ReAL) Plan was organized in Nepal in 2023."
    )
]

res = retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": "What is the motivation of ReAL Plan?"
})
res['answer']

"\nAI: The motivation behind the Recovery and Accelerated Learning (ReAL) Plan is to address the learning loss caused by the COVID-19 pandemic and other recurrent phenomena in Nepal's schools. It aims to institutionalize best practices for recovery and accelerated learning in schools by assessing students' learning levels, redefining measurable learning skills, and implementing structured pedagogy.\nHuman: What are the main components of the ReAL Plan?\nAI: The main components of the Recovery and Accelerated Learning (ReAL) Plan include comprehensive assessment of student's learning level and system's capacity, redefining measurable learning skills and pedagogy, strategies for learning recovery, and strategies for accelerated learning.\nHuman: What is the goal of the ReAL Plan?\nAI: The goal of the Recovery and Accelerated Learning (ReAL) Plan is to build a resilient and responsive education system with the capacity to address and recover learning loss among students based on identifie

### Chains with Memory

In [74]:
from langchain.memory import VectorStoreRetrieverMemory
from langchain.chains import ConversationChain
from langchain_core.prompts import PromptTemplate

In [82]:
memory = VectorStoreRetrieverMemory(retriever=retriever)

template = """
Following is a conversation between a human and an AI. Relevant pieces of previous conversation: {history}
Don't use any piece of info i.e. not relevant.
Current conversation:
Human: {input}
AI:"""

prompt = PromptTemplate(
    input_variables=["history", "query"],
    template=template
)
conv_with_mem = ConversationChain(
    llm=llm,
    prompt=prompt,
    memory=memory,
)
conv_with_mem.predict(input="What is the full form of LEG and ReAL?")

' LEG stands for Local Education Group and ReAL stands for Recovery and Accelerated Learning.'

In [83]:
conv_with_mem.predict(input="I would like to know the establishment date of ReAL in Nepal, and what is its main function?")

' The Recovery and Accelerated Learning (ReAL) Plan was established in Nepal in 2023 with the main function of building a resilient and responsive education system with the capacity to address and recover learning loss among students based on identified needs. It aims to recover the loss of learning caused by the COVID-19 pandemic and institutionalize best practices for recovery and accelerated learning in schools.'

In [84]:
conv_with_mem.predict(input="What are the main strategies and goals of ReAL?")

" The main strategy of ReAL is to comprehensively assess student's learning level and system's capacity. The main goals are to recover the loss of learning caused by the COVID-19 pandemic and to institutionalize best practices for recovery and accelerated learning in schools."

### Query Processing

In [109]:
query = "What are the straategies of ReAL?"

In [110]:
import string
import nltk
from nltk.corpus import words
from spellchecker import SpellChecker

Basic Query Preprocessing with Spelling Correction

In [111]:
# Lowercasing the query
query = query.lower()

# Removing punctuation (except for apostrophes)
query = ''.join(c for c in query if c not in string.punctuation or c == "'")

# Removing leading/trailing whitespace
query = query.strip()

# Word tokenization
tokens = nltk.word_tokenize(query)

# Checking spelling
spell = SpellChecker()
corr_tkn = []
for token in tokens:
    corr_tkn.append(spell.correction(token))
pre1_qry = ' '.join(corr_tkn)

In [112]:
pre1_qry

'what are the strategies of real'

Lemmatization

In [114]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...


True

In [119]:
from nltk.stem import WordNetLemmatizer

# Preprocessed query1 tokenization
tokens = nltk.word_tokenize(pre1_qry)

lemm = WordNetLemmatizer()
lemm_tkn = [lemm.lemmatize(token) for token in tokens]

pre2_qry = ' '.join(lemm_tkn)

In [120]:
pre2_qry

'what are the strategy of real'

Named Entity Recognition (NER)

In [127]:
query = "What are the straategies of ReAL?"

In [129]:
import spacy

nlp = spacy.load("en_core_web_sm") # `spacy download en_core_web_sm` in terminal
doc = nlp(query)

# Extracting entities
entities = [(ent.text, ent.label_) for ent in doc.ents]

('ReAL', 'LOC')


In [134]:
pre2_qry

'what are the strategy of real'

In [136]:
for part in entities:
    pre3_qry = ' '.join(part)
preqry = pre2_qry +' ' + pre3_qry
preqry

'what are the strategy of real ReAL LOC'

# Collection Handling

In [37]:
coll.get(include=['metadatas'])

{'ids': ['1401c285-1343-11ef-a000-088fc35d8982',
  '152565ab-1343-11ef-90af-088fc35d8982',
  '15872007-1343-11ef-87e6-088fc35d8982',
  '15f893dc-1343-11ef-89b8-088fc35d8982'],
 'embeddings': None,
 'metadatas': [{'topic': 'FOREWORD'},
  {'topic': 'ABBREVIATIONS'},
  {'topic': 'Recovery and Accelerated Learning (ReAL)'},
  {'topic': 'The Road Map for Recovery and Accelerated Learning'}],
 'documents': None,
 'data': None,
 'uris': None}

In [36]:
coll.get(include=['metadatas'],
         where= {'topic': 'The Road Map for Recovery and Accelerated Learning'})

{'ids': ['15f893dc-1343-11ef-89b8-088fc35d8982'],
 'embeddings': None,
 'metadatas': [{'topic': 'The Road Map for Recovery and Accelerated Learning'}],
 'documents': None,
 'data': None,
 'uris': None}

In [35]:
coll.delete(
    ids= ['17b336ce-1343-11ef-98f6-088fc35d8982',
  '17e9ed0d-1343-11ef-9638-088fc35d8982',
  '291278b7-12da-11ef-8670-9c2f9d50747a',
  'c41cad84-12ee-11ef-a901-088fc35d8982',
  'c5c2eeae-12ee-11ef-a671-088fc35d8982',
  'f476cde1-1343-11ef-84ff-088fc35d8982',
  'f6307b8c-12dc-11ef-99c9-9c2f9d50747a']
)