Reading the Documents and creating the chunks of text

In [22]:
import numpy as np
import PyPDF2

In [23]:
def load_pdf(path):
    """
        Purpose: Loading the pdf file
        Input: Path to the pdf file on disk
        Output: Returns the text of all pages concatenated in a string
                (pages are joined without a separator; a page that yields
                no text contributes an empty string)
    """
    with open(path,"rb") as f:
        reader=PyPDF2.PdfReader(f)
        # `or ""` keeps a page whose extraction yields nothing from breaking
        # the concatenation; join avoids rebuilding the string on every page.
        # The join runs inside the `with` so the file is still open while
        # pages are being read.
        return "".join(page.extract_text() or "" for page in reader.pages)

In [None]:
#Ask for the document location; drop surrounding whitespace and the quotes
#that a pasted path often carries so the value can be opened as typed
pdf_path=input("Provide path to the pdf document").strip().strip("\"'")
#Load PDF
doc_text=load_pdf(pdf_path)

In [24]:
def chunk_text(text,max_len=200):
    """
        Purpose: Splits the text into chunks made of whole sentences
        Input: String of Text, maximum length of a chunk
        Output: Returns a list of non-empty chunks (empty list for blank text).
                Sentences are split on "." only; a single sentence longer
                than max_len is kept whole as its own chunk.
    """
    parts=text.split(".")
    last=len(parts)-1
    chunks,chunk=[],""
    for i,part in enumerate(parts):
        # Re-attach only the periods that were really in the text: the
        # fragment after the final "." never had one, so it gets none.
        piece=part+"." if i<last else part
        if len(chunk)+len(piece)<=max_len:
            chunk+=piece
            continue
        # The current chunk is full: flush it, but never emit a blank chunk
        # (happens when the very first sentence is already over max_len).
        if chunk.strip():
            chunks.append(chunk.strip())
        chunk=piece
    if chunk.strip():
        chunks.append(chunk.strip())
    return chunks

In [None]:
#Split the document into chunks, dropping any that hold nothing but
#whitespace or periods: they carry no meaning but would still be embedded
#and could be returned as an answer
chunks=[c for c in chunk_text(doc_text) if c.strip(". \t\r\n")]
#Fail here with a clear message instead of building an index of nothing
if not chunks:
    raise ValueError("No text could be extracted from the document; nothing to index.")

Creating Sentence Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model by name. The same `model` object encodes
# the document chunks and each user query, so both are embedded identically.
model=SentenceTransformer("all-MiniLM-L6-v2")


In [None]:
# Embed every chunk in a single call. convert_to_numpy=True requests a NumPy
# array; its second axis (shape[1]) is used as the vector dimension when the
# index is built.
embeddings=model.encode(chunks,convert_to_numpy=True)

Saving the embeddings in the vector store

In [None]:
import faiss
# Vector width is taken from the embeddings themselves rather than hard-coded
dimension=embeddings.shape[1]
# Flat index comparing vectors by L2 distance
index=faiss.IndexFlatL2(dimension)
# Vectors are added in chunk order, so a position returned by index.search
# is used directly as a position in `chunks`
index.add(embeddings)

Create a system for retrieving the document passages most relevant to the user's input

In [None]:
def search(query,top_k=3):
    """
        Purpose: Search the sentence embeddings of the document and return the top_k similar search
        Input: Query of the user, maximum number of chunks to return
        Output: Returns up to top_k chunks in the order the index ranks them;
                fewer are returned when the index holds fewer than top_k
                vectors, and an empty list when top_k is not positive
    """
    if top_k<=0:
        return []
    # FAISS expects float32 input; asarray is a no-op when that already holds
    query_embedding=np.asarray(model.encode([query]),dtype="float32")
    _,indices=index.search(query_embedding,top_k)
    # FAISS pads missing results with -1 when it has fewer than top_k
    # vectors; chunks[-1] would silently hand back the last chunk as a
    # bogus hit, so only positions that really exist are kept.
    return [chunks[int(i)] for i in indices[0] if 0<=i<len(chunks)]

In [25]:
def generate_answer(query):
    """
        Purpose: Builds the reply to the user's query from the retrieved chunks
        Input: Query string of the user
        Output: Returns the chunks found by search for the query, joined
                into one string with a newline between consecutive chunks
    """
    return "\n".join(search(query))

In [None]:
#A continuous loop: read a question, show the retrieved context, repeat
while True:
    #Takes input from the user; surrounding whitespace is not part of the query
    user_input=input(">>You:").strip()
    #Add the stop condition for the conversation
    if user_input.lower()=="bye":
        break
    #A blank line has nothing to search for, so prompt again
    if not user_input:
        continue
    print(">>You: ",user_input)
    bot_response=generate_answer(user_input)
    print(">>Bot: ",bot_response)