Installing the dependencies

In [None]:
!pip install pypdf
!pip install -U sentence-transformers
!pip install chromadb
!pip install langchain
!pip install openai

Importing the necessary dependencies

In [58]:
import os
from tqdm.auto import tqdm
import pypdf as PyPDF
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
import urllib.request
from langchain.llms import AzureOpenAI
from langchain.chains import RetrievalQA

Downloading the sample PDF

In [12]:
def download_file(url, destination_path):
    try:
        urllib.request.urlretrieve(url, destination_path)
        print(f"File downloaded successfully and saved to: {destination_path}")
    except Exception as e:
        print(f"Error occurred while downloading the file: {e}")

Processing the document and converting it to chunks or documents.

In [13]:
def pdf_to_documents(path : str, chunk_length : int , overlap: int = 0, preprocess = None) -> list[Document]:
    '''
    Convert PDF document to text chunks with page numbers.
    Each chunk has a specified length and is prepended with the page number.
    '''
    pdf_file = open(path, 'rb')
    pdf_reader = PyPDF.PdfReader(pdf_file)
    total_pages = len(pdf_reader.pages)

    chunks = []

    for page_num in tqdm(range(total_pages)):
        page = pdf_reader.pages[page_num]
        page_text = page.extract_text()

        # Split the page text into chunks of specified length
        for i in range(0, len(page_text), chunk_length - overlap):
            chunk = page_text[i:i+chunk_length]

            if preprocess:
                chunk = preprocess(chunk)

            if chunk:
                chunks.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            'page_num' : page_num + 1,
                            }))

    pdf_file.close()
    return chunks

In [14]:
def preprocess(chunk):
    return chunk.replace("\\n", "") # sample preprocessing

In [18]:
file_url = "https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"
path_to_pdf = 'attentionIsAllYouNeedPaper.pdf'
chunk_length = 500
overlap = 150
if not os.path.exists(path_to_pdf):
  download_file(file_url, path_to_pdf)
result = pdf_to_documents(path_to_pdf, chunk_length, overlap, preprocess)

  0%|          | 0/11 [00:00<?, ?it/s]

Downloading the sentence-bert embeddings

In [19]:
sbert = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Initializing the Chroma instance

In [20]:
vectordb = Chroma(
  embedding_function = sbert,
  persist_directory = 'chroma_store'
)
vectordb.persist()

Adding all documents to the DB

In [21]:
for doc in tqdm(result):
  vectordb.add_documents([doc])

  0%|          | 0/100 [00:00<?, ?it/s]

Querying the DB

In [22]:
docs = vectordb.similarity_search(query="What optimization technique is used?", k=2)
for doc in docs:
    print(doc)

page_content='he paper, each training step took about 0.4 seconds. We\ntrained the base models for a total of 100,000 steps or 12 hours. For our big models,(described on the\nbottom line of table 3), step time was 1.0 seconds. The big models were trained for 300,000 steps\n(3.5 days).\n5.3 Optimizer\nWe used the Adam optimizer [ 17] withβ1= 0.9,β2= 0.98andϵ= 10−9. We varied the learning\nrate over the course of training, according to the formula:\nlrate =d−0.5\nmodel·min(step_num−0.5,step _num·warmup _steps−1.5) (3)\n' metadata={'page_num': 7}
page_content='by one position, ensures that the\npredictions for position ican depend only on the known outputs at positions less than i.\n3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\nwhere the query, keys, values, and output are all vectors. The output is computed as a weighted sum\nof the values, where the weight assigned to each value is computed by a compatibility functio

Adding Question Answering

In [56]:
os.environ["OPENAI_API_TYPE"] = "<OPENAI_API_TYPE>"
os.environ["OPENAI_API_KEY"] = "<OPENAI_API_KEY>"
os.environ["OPENAI_API_BASE"] = "<OPENAI_API_BASE>"
os.environ["OPENAI_API_VERSION"] = "<OPENAI_API_VERSION>"

In [61]:
qa_chain = RetrievalQA.from_chain_type(
    llm = AzureOpenAI(deployment_name="Text-Davinci"),
    retriever=vectordb.as_retriever(search_kwargs={'k': 3}),
    return_source_documents=True
)

In [62]:
out = qa_chain({'query': 'What optimization technique is used in the paper?'})
result
# result['result']
# result['source_documents']

In [63]:
result

{'query': 'What optimization technique is used in the paper?',
 'result': ' The paper uses the Adam optimizer with β1= 0.9, β2 = 0.98 and ϵ= 10−9.',
 'source_documents': [Document(page_content='he paper, each training step took about 0.4 seconds. We\ntrained the base models for a total of 100,000 steps or 12 hours. For our big models,(described on the\nbottom line of table 3), step time was 1.0 seconds. The big models were trained for 300,000 steps\n(3.5 days).\n5.3 Optimizer\nWe used the Adam optimizer [ 17] withβ1= 0.9,β2= 0.98andϵ= 10−9. We varied the learning\nrate over the course of training, according to the formula:\nlrate =d−0.5\nmodel·min(step_num−0.5,step _num·warmup _steps−1.5) (3)\n', metadata={'page_num': 7}),
  Document(page_content='by one position, ensures that the\npredictions for position ican depend only on the known outputs at positions less than i.\n3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\n

In [None]:
!rm -rf chroma_store/