Installing the dependencies

In [None]:
!pip install pypdf
!pip install -U sentence-transformers
!pip install chromadb
!pip install langchain

Importing the necessary dependencies

In [None]:
import os
import shutil
import urllib.request

import pypdf as PyPDF
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from tqdm.auto import tqdm

Defining a helper that downloads the sample PDF

In [None]:
def download_file(url, destination_path):
    """Download ``url`` and save its content to ``destination_path``.

    Errors are reported by printing a message instead of raising, so a
    notebook run is not interrupted by a failed download. The boolean return
    value lets callers detect the failure instead of discovering it later
    when the file turns out to be missing.

    Args:
        url: Address passed to ``urllib.request.urlretrieve``.
        destination_path: Local path the downloaded content is written to.

    Returns:
        True if the download completed, False if any exception occurred.
    """
    try:
        urllib.request.urlretrieve(url, destination_path)
    except Exception as e:
        print(f"Error occurred while downloading the file: {e}")
        return False

    print(f"File downloaded successfully and saved to: {destination_path}")
    return True

Processing the document: splitting the text of each PDF page into overlapping chunks, each wrapped in a LangChain Document.

In [None]:
def pdf_to_documents(path : str, chunk_length : int , overlap: int = 0, preprocess = None) -> list[Document]:
    '''
    Convert a PDF into a list of text chunks, one Document per chunk.

    The text of every page is extracted and cut into windows of at most
    `chunk_length` characters; consecutive windows on the same page share
    `overlap` characters. Chunks never span two pages. The 1-based page
    number is stored in each Document's metadata under 'page_num' (it is
    not inserted into the chunk text).

    Args:
        path: Path of the PDF file to read.
        chunk_length: Maximum number of characters per chunk (must be > 0).
        overlap: Number of characters shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_length).
        preprocess: Optional callable applied to every chunk string; chunks
            that are empty after preprocessing are dropped.

    Returns:
        The chunks as Document objects, in page order.

    Raises:
        ValueError: If `chunk_length` / `overlap` would give a non-positive
            window step.
    '''
    if chunk_length <= 0:
        raise ValueError("chunk_length must be a positive integer")
    if not 0 <= overlap < chunk_length:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_length")

    step = chunk_length - overlap
    chunks = []

    # The context manager releases the file handle even if parsing fails.
    with open(path, 'rb') as pdf_file:
        pdf_reader = PyPDF.PdfReader(pdf_file)
        total_pages = len(pdf_reader.pages)

        for page_num in tqdm(range(total_pages)):
            # Treat a page without extractable text as an empty string.
            page_text = pdf_reader.pages[page_num].extract_text() or ""

            # Slide a window of chunk_length characters over the page text.
            for i in range(0, len(page_text), step):
                chunk = page_text[i:i+chunk_length]

                if preprocess:
                    chunk = preprocess(chunk)

                if chunk:
                    chunks.append(
                        Document(
                            page_content=chunk,
                            metadata={
                                'page_num' : page_num + 1,
                                }))

                # This window already reached the end of the page; any later
                # window would only repeat a suffix of it, so stop here.
                if i + chunk_length >= len(page_text):
                    break

    return chunks

In [None]:
def preprocess(chunk):
    """Sample preprocessing: replace line breaks in a chunk with spaces.

    The previous pattern ``"\\n"`` matched a literal backslash followed by
    ``n`` rather than a newline character, so it never changed the extracted
    text. Newlines are replaced with a space (not removed) so that words on
    adjacent lines are not glued together.

    Args:
        chunk: Text of a single chunk.

    Returns:
        The chunk with every newline character replaced by a single space.
    """
    return chunk.replace("\n", " ")

In [None]:
# Fetch the sample paper to a local file.
file_url = "https://arxiv.org/pdf/1706.03762.pdf"
path_to_pdf = 'attentionIsAllYouNeedPaper.pdf'
download_file(file_url, path_to_pdf)

# Chunking parameters, in characters: window size and shared overlap.
chunk_length, overlap = 500, 150
result = pdf_to_documents(
    path=path_to_pdf,
    chunk_length=chunk_length,
    overlap=overlap,
    preprocess=preprocess,
)

File downloaded successfully and saved to: attentionIsAllYouNeedPaper.pdf


  0%|          | 0/15 [00:00<?, ?it/s]

Downloading the Sentence-BERT embedding model

In [None]:
# Embedding model used to embed both the chunks and the queries; the model
# files are downloaded when this cell runs (see the progress output below).
sbert = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Initializing the Chroma instance

In [None]:
# Vector store that embeds with `sbert` and keeps its data in ./chroma_store.
vectordb = Chroma(persist_directory='chroma_store', embedding_function=sbert)
# NOTE(review): persist() runs before any document has been added, and nothing
# in this notebook calls it again afterwards -- confirm whether that matters
# for the installed chromadb version.
vectordb.persist()

Adding all documents to the DB

In [None]:
# Add the chunks in batches rather than one call per document, so the
# embedding model receives several texts per call. Order is unchanged.
BATCH_SIZE = 32
for start in tqdm(range(0, len(result), BATCH_SIZE)):
  vectordb.add_documents(result[start:start + BATCH_SIZE])

  0%|          | 0/122 [00:00<?, ?it/s]

Querying the DB

In [None]:
# Retrieve the two stored chunks most similar to the question and show them.
docs = vectordb.similarity_search(
    query="What optimization technique is used?",
    k=2,
)
for doc in docs:
    print(doc)

page_content='m optimizer [ 20] with β1= 0.9,β2= 0.98andϵ= 10−9. We varied the learning\nrate over the course of training, according to the formula:\nlrate =d−0.5\nmodel·min(step_num−0.5, step _num·warmup _steps−1.5) (3)\nThis corresponds to increasing the learning rate linearly for the first warmup _steps training steps,\nand decreasing it thereafter proportionally to the inverse square root of the step number. We used\nwarmup _steps = 4000 .\n5.4 Regularization\nWe employ three types of regularization during traini' metadata={'page_num': 7}
page_content='to the inverse square root of the step number. We used\nwarmup _steps = 4000 .\n5.4 Regularization\nWe employ three types of regularization during training:\n7' metadata={'page_num': 7}


In [None]:
# Remove the on-disk Chroma store created above. shutil works on every
# platform (no POSIX shell needed); ignore_errors mirrors `rm -f`, so a
# missing directory is not an error.
shutil.rmtree('chroma_store', ignore_errors=True)