In [41]:
import os
pdf_path = "Documents/budget_speech.pdf"
print(pdf_path)

Documents/budget_speech.pdf


In [77]:
# Requires !pip install PyMuPDF, see: https://github.com/pymupdf/pymupdf
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        if page_number <= 2:  # Skip the first 2 pages
         continue
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 5,  # adjust page numbers since our PDF starts on page 5
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts


58it [00:00, 684.23it/s]


[{'page_number': -2,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -1,
  'page_char_count': 1465,
  'page_word_count': 272,
  'page_sentence_count_raw': 16,
  'page_token_count': 366.25,
  'text': 'Budget 2023-2024    Speech of  Nirmala Sitharaman  Minister of Finance  February 1, 2023  Hon’ble Speaker,     I present the Budget for 2023-24. This is the first Budget in Amrit  Kaal.  Introduction  1.  This Budget hopes to build on the foundation laid in the previous  Budget, and the blueprint drawn for India@100. We envision a prosperous  and inclusive India, in which the fruits of development reach all regions and  citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and  Scheduled Tribes.   2.  In the 75th year of our Independence, the world has recognised the  Indian economy as a ‘bright star’. Our current year’s economic growth is  estimated to be at 7 per cent. It is notabl

In [104]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 31,
  'page_char_count': 1701,
  'page_word_count': 387,
  'page_sentence_count_raw': 21,
  'page_token_count': 425.25,
  'text': '33        Personal Income Tax  145.  Now, I come to what everyone is waiting for -- personal income tax. I  have five major announcements to make in this regard. These primarily  benefit our hard-working middle class.  146.  The first one concerns rebate. Currently, those with income up to   ` 5 lakh do not pay any income tax in both old and new tax regimes. I  propose to increase the rebate limit to ` 7 lakh in the new tax regime. Thus,  persons in the new tax regime, with income up to ` 7 lakh will not have to  pay any tax.   147.  The  second  proposal  relates  to  middle-class  individuals.   I had introduced, in the year 2020, the new personal income tax regime  with six income slabs starting from ` 2.5 lakh. I propose to change the tax  structure in this regime by reducing the number of slabs to five and  increasing the tax exemption

In [107]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-2,0,1,1,0.0,
1,-1,1465,272,16,366.25,Budget 2023-2024 Speech of Nirmala Sithara...
2,0,1811,323,15,452.75,2 profile is because of several accompl...
3,1,1536,294,18,384.0,3 9. The economy has become a lot more...
4,2,2061,345,15,515.25,4 1) Economic Empowerment of Women: Dee...


In [108]:
# Get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,55.0,55.0,55.0,55.0,55.0
mean,25.0,1609.02,304.22,13.2,402.25
std,16.02,432.11,77.93,4.12,108.03
min,-2.0,0.0,1.0,1.0,0.0
25%,11.5,1459.5,260.0,11.0,364.88
50%,25.0,1660.0,301.0,14.0,415.0
75%,38.5,1829.0,346.0,16.0,457.25
max,52.0,2291.0,452.0,22.0,572.75


In [109]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [112]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 55/55 [00:00<00:00, 601.15it/s]


In [113]:

df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,55.0,55.0,55.0,55.0,55.0,55.0
mean,25.0,1609.02,304.22,13.2,402.25,12.76
std,16.02,432.11,77.93,4.12,108.03,4.22
min,-2.0,0.0,1.0,1.0,0.0,0.0
25%,11.5,1459.5,260.0,11.0,364.88,10.5
50%,25.0,1660.0,301.0,14.0,415.0,13.0
75%,38.5,1829.0,346.0,16.0,457.25,15.5
max,52.0,2291.0,452.0,22.0,572.75,21.0


In [118]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 22

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 55/55 [00:00<00:00, 307582.29it/s]


In [119]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
random.sample(pages_and_texts, k=1)

[{'page_number': 28,
  'page_char_count': 1768,
  'page_word_count': 337,
  'page_sentence_count_raw': 15,
  'page_token_count': 442.0,
  'text': '30          Direct Taxes  132.  I now come to my direct tax proposals. These proposals aim to  maintain continuity and stability of taxation, further simplify and rationalise  various provisions to reduce the compliance burden, promote the  entrepreneurial spirit and provide tax relief to citizens.  133.  It has been the constant endeavour of the Income Tax Department  to improve Tax Payers Services by making compliance easy and smooth. Our  tax payers’ portal received a maximum of 72 lakh returns in a day;  processed more than 6.5 crore returns this year; average processing period  reduced from 93 days in financial year 13-14 to 16 days now;   and 45 per cent of the returns were processed within 24 hours. We intend  to further improve this, roll out a next-generation Common IT Return Form  for tax payer convenience, and also plan to strengt

In [122]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,55.0,55.0,55.0,55.0,55.0,55.0,55.0
mean,25.0,1609.02,304.22,13.2,402.25,12.76,0.98
std,16.02,432.11,77.93,4.12,108.03,4.22,0.13
min,-2.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,11.5,1459.5,260.0,11.0,364.88,10.5,1.0
50%,25.0,1660.0,301.0,14.0,415.0,13.0,1.0
75%,38.5,1829.0,346.0,16.0,457.25,15.5,1.0
max,52.0,2291.0,452.0,22.0,572.75,21.0,1.0


In [123]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

100%|██████████| 55/55 [00:00<00:00, 19647.96it/s]


54

In [183]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': 45,
  'sentence_chunk': '47    products, synthetic diamonds, cotton, fertilizer grade urea etc. This will also help in trade facilitation by better identification of the above items, getting clarity on availing concessional import duty through various notifications and thus reducing dwell time. These changes shall come into effect from 01.05.2023. A.4   Amendment in the Second Schedule to the Customs Tariff Act, 1975 The Second Schedule (Export Tariff) is being amended to align the entries under heading 1202 with that of the First Schedule (Import Tariff) . B. LEGISLATIVE CHANGES IN GST LAWS B.1 Decriminalisation Section 132 and section 138 of CGST Act are being amended, inter alia, to - \uf0b7 raise the minimum threshold of tax amount for launching prosecution under GST from ` one crore to ` two crore, except for the offence of issuance of invoices without supply of goods or services or both; \uf0b7 reduce the compounding amount from the present range of 50 per cent t

In [126]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,54.0,54.0,54.0,54.0
mean,25.5,1592.46,263.48,398.12
std,15.73,375.29,64.6,93.82
min,-1.0,626.0,111.0,156.5
25%,12.25,1432.5,221.5,358.12
50%,25.5,1635.5,258.0,408.88
75%,38.75,1792.5,306.75,448.12
max,52.0,2232.0,395.0,558.0


In [184]:

pages_and_chunks_over_min_token_len = df.to_dict(orient="records")
# pages_and_chunks_over_min_token_len[:2]
print(pages_and_chunks_over_min_token_len)

[{'page_number': -1, 'sentence_chunk': 'Budget 2023-2024  Speech of Nirmala Sitharaman Minister of Finance February 1, 2023 Hon’ble Speaker,   I present the Budget for 2023-24. This is the first Budget in Amrit Kaal. Introduction 1. This Budget hopes to build on the foundation laid in the previous Budget, and the blueprint drawn for India@100. We envision a prosperous and inclusive India, in which the fruits of development reach all regions and citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and Scheduled Tribes. 2. In the 75th year of our Independence, the world has recognised the Indian economy as a ‘bright star’. Our current year’s economic growth is estimated to be at 7 per cent. It is notable that this is the highest among all the major economies. This is in spite of the massive slowdown globally caused by Covid-19 and a war. The Indian economy is therefore on the right track, and despite a time of challenges, heading towards a bright future. 3. Today as Ind

In [None]:
# Requires !pip install sentence-transformers
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="dunzhang/stella_en_1.5B_v5", 
                                      device="mps") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)

# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings =  embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-0.0783452   0.2825111   0.33010766 ... -0.2993959   0.09925018
  0.08549123]

Sentence: Sentences can be embedded one by one or as a list of strings.
Embedding: [-0.14116079  0.70169544  0.5244387  ...  0.05883359  0.30396914
  0.7997837 ]

Sentence: Embeddings are one of the most powerful concepts in machine learning!
Embedding: [-0.18257134  0.8398939   0.68544364 ... -1.0129157   0.49978787
  0.59593904]

Sentence: Learn to use embeddings well and you'll be well on your way to being an AI engineer.
Embedding: [-0.2706708   0.03768877  0.5299301  ... -0.49734288 -0.26635906
  0.34244403]



In [185]:
single_sentence = "Yo! How cool are embeddings?"
single_embedding = embedding_model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
print(f"Embedding size: {single_embedding.shape}")
print(len(single_embedding))

Sentence: Yo! How cool are embeddings?
Embedding:
[-0.29576552  0.75850856  0.45881143 ... -1.6232553   0.08443563
  1.228344  ]
Embedding size: (1024,)
1024


In [130]:
%%time

# Uncomment to see how long it takes to create embeddings on CPU

# Make sure the model is on the CPU
embedding_model.to("cpu")

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])


100%|██████████| 54/54 [00:35<00:00,  1.51it/s]

CPU times: user 3min 17s, sys: 2min 36s, total: 5min 54s
Wall time: 36.5 s





In [29]:
%%time

# Send the model to the Metal GPU
embedding_model.to("mps") # requires a GPU installed, for reference on my local machine, I'm using an M1 Pro

# Create embeddings one by one on the GPU
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])


100%|██████████| 98/98 [00:16<00:00,  6.07it/s]

CPU times: user 9.9 s, sys: 1.89 s, total: 11.8 s
Wall time: 16.8 s





In [148]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
print(text_chunks)

['Budget 2023-2024  Speech of Nirmala Sitharaman Minister of Finance February 1, 2023 Hon’ble Speaker,   I present the Budget for 2023-24. This is the first Budget in Amrit Kaal. Introduction 1. This Budget hopes to build on the foundation laid in the previous Budget, and the blueprint drawn for India@100. We envision a prosperous and inclusive India, in which the fruits of development reach all regions and citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and Scheduled Tribes. 2. In the 75th year of our Independence, the world has recognised the Indian economy as a ‘bright star’. Our current year’s economic growth is estimated to be at 7 per cent. It is notable that this is the highest among all the major economies. This is in spite of the massive slowdown globally caused by Covid-19 and a war. The Indian economy is therefore on the right track, and despite a time of challenges, heading towards a bright future. 3. Today as Indians stands with their head held high,

In [175]:
#Vector Search DB In PineCode
import pinecone
from pinecone import Pinecone
pc = Pinecone(api_key = os.environ['PINECONE_API_KEY'])
index = pc.Index("llmchat")
print(index)
index_name = 'llmchat'

<pinecone.data.index.Index object at 0x4ffefc6a0>


In [180]:
from langchain.schema import Document
documents = [
    Document(
        source = pdf_path,
        page_content=text_chunks["sentence_chunk"],
        metadata={"page": text_chunks["page_number"]}
    )
    for text_chunks in pages_and_chunks_over_min_token_len
]
print(documents)

[Document(metadata={'page': -1}, page_content='Budget 2023-2024  Speech of Nirmala Sitharaman Minister of Finance February 1, 2023 Hon’ble Speaker,   I present the Budget for 2023-24. This is the first Budget in Amrit Kaal. Introduction 1. This Budget hopes to build on the foundation laid in the previous Budget, and the blueprint drawn for India@100. We envision a prosperous and inclusive India, in which the fruits of development reach all regions and citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and Scheduled Tribes. 2. In the 75th year of our Independence, the world has recognised the Indian economy as a ‘bright star’. Our current year’s economic growth is estimated to be at 7 per cent. It is notable that this is the highest among all the major economies. This is in spite of the massive slowdown globally caused by Covid-19 and a war. The Indian economy is therefore on the right track, and despite a time of challenges, heading towards a bright future. 3. Today

In [None]:
# # Extract texts from documents
# texts = [doc.page_content for doc in documents]
# print(texts)
# # Generate embeddings
# embeddings = embedding_model.encode(texts, device="mps")  # Ensure the device matches your setup
# print(embedding)


['Budget 2023-2024  Speech of Nirmala Sitharaman Minister of Finance February 1, 2023 Hon’ble Speaker,   I present the Budget for 2023-24. This is the first Budget in Amrit Kaal. Introduction 1. This Budget hopes to build on the foundation laid in the previous Budget, and the blueprint drawn for India@100. We envision a prosperous and inclusive India, in which the fruits of development reach all regions and citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and Scheduled Tribes. 2. In the 75th year of our Independence, the world has recognised the Indian economy as a ‘bright star’. Our current year’s economic growth is estimated to be at 7 per cent. It is notable that this is the highest among all the major economies. This is in spite of the massive slowdown globally caused by Covid-19 and a war. The Indian economy is therefore on the right track, and despite a time of challenges, heading towards a bright future. 3. Today as Indians stands with their head held high,

KeyboardInterrupt: 

In [None]:
# # Generate unique IDs and extract metadata
# ids = [f"doc_{i}" for i in range(len(documents))]  # Unique IDs for each document
# metadatas = [doc.metadata for doc in documents]

In [None]:
# # Upsert into Pinecone
# index.upsert(vectors=[
#     {"id": doc_id, "values": embedding.tolist(), "metadata": metadata}
#     for doc_id, embedding, metadata in zip(ids, embeddings, metadatas)
# ])

{'upserted_count': 54}

In [182]:
from langchain_pinecone import PineconeVectorStore
embedding_fn = lambda text: embedding_model.encode(text, convert_to_tensor=True)
vectorstore_from_docs = PineconeVectorStore.from_documents(
    documents,
    index_name=index_name,
    embedding= embedding_fn  # Wrap embedding_model.encode with a lambda function
)


AttributeError: 'function' object has no attribute 'embed_documents'

In [None]:
from langchain_pinecone import PineconeVectorStore
embedding_fn = lambda text: embedding_model.encode(text, convert_to_tensor=True)
vectorstore_from_docs = PineconeVectorStore.from_documents(
    documents,
    index_name=index_name,
    embedding= embedding_fn  # Wrap embedding_model.encode with a lambda function
)


AttributeError: 'function' object has no attribute 'embed_documents'

In [None]:
from langchain_pinecone import PineconeVectorStore
embedding_fn = lambda text: embedding_model.encode(text, convert_to_tensor=True)
vectorstore_from_docs = PineconeVectorStore.from_documents(
    documents,
    index_name=index_name,
    embedding= embedding_fn  # Wrap embedding_model.encode with a lambda function
)


AttributeError: 'function' object has no attribute 'embed_documents'

In [None]:
from langchain_pinecone import PineconeVectorStore
embedding_fn = lambda text: embedding_model.encode(text, convert_to_tensor=True)
vectorstore_from_docs = PineconeVectorStore.from_documents(
    documents,
    index_name=index_name,
    embedding= embedding_fn  # Wrap embedding_model.encode with a lambda function
)


AttributeError: 'function' object has no attribute 'embed_documents'

In [None]:
from langchain_pinecone import PineconeVectorStore
embedding_fn = lambda text: embedding_model.encode(text, convert_to_tensor=True)
vectorstore_from_docs = PineconeVectorStore.from_documents(
    documents,
    index_name=index_name,
    embedding= embedding_fn  # Wrap embedding_model.encode with a lambda function
)


AttributeError: 'function' object has no attribute 'embed_documents'

In [None]:
from langchain_pinecone import PineconeVectorStore
embedding_fn = lambda text: embedding_model.encode(text, convert_to_tensor=True)
vectorstore_from_docs = PineconeVectorStore.from_documents(
    documents,
    index_name=index_name,
    embedding= embedding_fn  # Wrap embedding_model.encode with a lambda function
)


AttributeError: 'function' object has no attribute 'embed_documents'

In [176]:
from langchain_pinecone import PineconeVectorStore

def retrieve_query(query, k=4, index_name=index_name):
    """
    Retrieves the top-k most relevant documents from the vectorstore.
    """
    # Initialize the PineconeVectorStore
    pinecone_index = pc.Index(index_name)
    print(pinecone_index)
    vectorstore = PineconeVectorStore(pinecone_index, embedding_model.encode, text_key="page_content")

    # Perform similarity search
    matching_results = vectorstore.similarity_search(query, k=k)
    return matching_results


In [177]:
output = retrieve_query("what are the governments Achievements since 2014?", k=4)
print(output)

<pinecone.data.index.Index object at 0x39821e7d0>


AttributeError: 'function' object has no attribute 'embed_query'