In [1]:
import os
pdf_path = "Documents/budget_speech.pdf"
print(pdf_path)

Documents/budget_speech.pdf


In [2]:
import os
from dotenv import load_dotenv
load_dotenv()
 

True

In [None]:
# Requires !pip install PyMuPDF, see: https://github.com/pymupdf/pymupdf
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        if page_number <= 2:  # Skip the first 2 pages
         continue
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 5,  # adjust page numbers since our PDF starts on page 5
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts


58it [00:00, 743.33it/s]


[{'page_number': -2,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -1,
  'page_char_count': 1465,
  'page_word_count': 272,
  'page_sentence_count_raw': 16,
  'page_token_count': 366.25,
  'text': 'Budget 2023-2024    Speech of  Nirmala Sitharaman  Minister of Finance  February 1, 2023  Hon’ble Speaker,     I present the Budget for 2023-24. This is the first Budget in Amrit  Kaal.  Introduction  1.  This Budget hopes to build on the foundation laid in the previous  Budget, and the blueprint drawn for India@100. We envision a prosperous  and inclusive India, in which the fruits of development reach all regions and  citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and  Scheduled Tribes.   2.  In the 75th year of our Independence, the world has recognised the  Indian economy as a ‘bright star’. Our current year’s economic growth is  estimated to be at 7 per cent. It is notabl

In [None]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 28,
  'page_char_count': 1768,
  'page_word_count': 337,
  'page_sentence_count_raw': 15,
  'page_token_count': 442.0,
  'text': '30          Direct Taxes  132.  I now come to my direct tax proposals. These proposals aim to  maintain continuity and stability of taxation, further simplify and rationalise  various provisions to reduce the compliance burden, promote the  entrepreneurial spirit and provide tax relief to citizens.  133.  It has been the constant endeavour of the Income Tax Department  to improve Tax Payers Services by making compliance easy and smooth. Our  tax payers’ portal received a maximum of 72 lakh returns in a day;  processed more than 6.5 crore returns this year; average processing period  reduced from 93 days in financial year 13-14 to 16 days now;   and 45 per cent of the returns were processed within 24 hours. We intend  to further improve this, roll out a next-generation Common IT Return Form  for tax payer convenience, and also plan to strengt

In [None]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-2,0,1,1,0.0,
1,-1,1465,272,16,366.25,Budget 2023-2024 Speech of Nirmala Sithara...
2,0,1811,323,15,452.75,2 profile is because of several accompl...
3,1,1536,294,18,384.0,3 9. The economy has become a lot more...
4,2,2061,345,15,515.25,4 1) Economic Empowerment of Women: Dee...


In [None]:
# Get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,55.0,55.0,55.0,55.0,55.0
mean,25.0,1609.02,304.22,13.2,402.25
std,16.02,432.11,77.93,4.12,108.03
min,-2.0,0.0,1.0,1.0,0.0
25%,11.5,1459.5,260.0,11.0,364.88
50%,25.0,1660.0,301.0,14.0,415.0
75%,38.5,1829.0,346.0,16.0,457.25
max,52.0,2291.0,452.0,22.0,572.75


In [None]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [None]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 55/55 [00:00<00:00, 582.00it/s]


In [None]:

df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,55.0,55.0,55.0,55.0,55.0,55.0
mean,25.0,1609.02,304.22,13.2,402.25,12.76
std,16.02,432.11,77.93,4.12,108.03,4.22
min,-2.0,0.0,1.0,1.0,0.0,0.0
25%,11.5,1459.5,260.0,11.0,364.88,10.5
50%,25.0,1660.0,301.0,14.0,415.0,13.0
75%,38.5,1829.0,346.0,16.0,457.25,15.5
max,52.0,2291.0,452.0,22.0,572.75,21.0


In [None]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 22

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 55/55 [00:00<00:00, 845006.30it/s]


In [None]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
random.sample(pages_and_texts, k=1)

[{'page_number': 12,
  'page_char_count': 1725,
  'page_word_count': 301,
  'page_sentence_count_raw': 17,
  'page_token_count': 431.25,
  'text': '14        Mission Karmayogi  58.  Under Mission Karmayogi, Centre, States and Union Territories are  making and implementing capacity-building plans for civil servants. The  government has also launched an integrated online training platform, iGOT  Karmayogi, to provide continuous learning opportunities for lakhs of  government employees to upgrade their skills and facilitate people-centric  approach.    59.  For  enhancing  ease  of  doing  business,  more  than   39,000  compliances  have  been  reduced  and  more  than   3,400 legal provisions have been decriminalized. For furthering the trust- based governance, we have introduced the Jan Vishwas Bill to amend 42  Central Acts. This Budget proposes a series of measures to unleash the  potential of our economy.   Centres of Excellence for Artificial Intelligence  60.  For realizing the vi

In [None]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,55.0,55.0,55.0,55.0,55.0,55.0,55.0
mean,25.0,1609.02,304.22,13.2,402.25,12.76,0.98
std,16.02,432.11,77.93,4.12,108.03,4.22,0.13
min,-2.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,11.5,1459.5,260.0,11.0,364.88,10.5,1.0
50%,25.0,1660.0,301.0,14.0,415.0,13.0,1.0
75%,38.5,1829.0,346.0,16.0,457.25,15.5,1.0
max,52.0,2291.0,452.0,22.0,572.75,21.0,1.0


In [None]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

100%|██████████| 55/55 [00:00<00:00, 44680.75it/s]


54

In [None]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': 9,
  'sentence_chunk': '11    Bharat Shared Repository of Inscriptions (Bharat SHRI) 41. ‘Bharat Shared Repository of Inscriptions’ will be set up in a digital epigraphy museum, with digitization of one lakh ancient inscriptions in the first stage.  Support for poor prisoners 42. For poor persons who are in prisons and unable to afford the penalty or the bail amount, required financial support will be provided.   Priority 3: Infrastructure & Investment 43. Investments in Infrastructure and productive capacity have a large multiplier impact on growth and employment. After the subdued period of the pandemic, private investments are growing again. The Budget takes the lead once again to ramp up the virtuous cycle of investment and job creation.   Capital Investment as driver of growth and jobs 44. Capital investment outlay is being increased steeply for the third year in a row by 33 per cent to ` 10 lakh crore, which would be 3.3 per cent of GDP. This will be almost three

In [None]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,54.0,54.0,54.0,54.0
mean,25.5,1592.46,263.48,398.12
std,15.73,375.29,64.6,93.82
min,-1.0,626.0,111.0,156.5
25%,12.25,1432.5,221.5,358.12
50%,25.5,1635.5,258.0,408.88
75%,38.75,1792.5,306.75,448.12
max,52.0,2232.0,395.0,558.0


In [None]:

pages_and_chunks_over_min_token_len = df.to_dict(orient="records")
# pages_and_chunks_over_min_token_len[:2]
print(pages_and_chunks_over_min_token_len)

[{'page_number': -1, 'sentence_chunk': 'Budget 2023-2024  Speech of Nirmala Sitharaman Minister of Finance February 1, 2023 Hon’ble Speaker,   I present the Budget for 2023-24. This is the first Budget in Amrit Kaal. Introduction 1. This Budget hopes to build on the foundation laid in the previous Budget, and the blueprint drawn for India@100. We envision a prosperous and inclusive India, in which the fruits of development reach all regions and citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and Scheduled Tribes. 2. In the 75th year of our Independence, the world has recognised the Indian economy as a ‘bright star’. Our current year’s economic growth is estimated to be at 7 per cent. It is notable that this is the highest among all the major economies. This is in spite of the massive slowdown globally caused by Covid-19 and a war. The Indian economy is therefore on the right track, and despite a time of challenges, heading towards a bright future. 3. Today as Ind

In [None]:
# # Requires !pip install sentence-transformers
# from sentence_transformers import SentenceTransformer
# embedding_model = SentenceTransformer(model_name_or_path="dunzhang/stella_en_1.5B_v5", 
#                                       device="mps") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)

from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings ## embeddings using Interface endpoint

embeddings = HuggingFaceInferenceAPIEmbeddings(api_key= os.environ['HUGGING_API_KEY'], model_name="sentence-transformers/all-mpnet-base-v2")
# embeddings = HuggingFaceEndpointEmbeddings() ## embedding using local huggingface
print(embeddings)


api_key=SecretStr('**********') model_name='sentence-transformers/all-mpnet-base-v2' api_url=None additional_headers={}


In [None]:
# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings =  embeddings.embed_documents(sentences)
embeddings_dict = dict(zip(sentences, embeddings))
print(embeddings_dict)
# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

{'The Sentences Transformers library provides an easy and open-source way to create embeddings.': [-0.020798319950699806, 0.030316464602947235, -0.020121799781918526, 0.06864849478006363, -0.02552560716867447, -0.008476873859763145, -0.00020723622583318502, -0.0632377415895462, 0.0281606987118721, -0.033335376530885696, 0.03026341088116169, 0.05307215824723244, -0.05035270005464554, 0.026228871196508408, 0.03333137184381485, -0.045157741755247116, 0.036304496228694916, -0.0013711730716750026, -0.012017124332487583, 0.0114947110414505, 0.05045110359787941, 0.047085680067539215, 0.021191375330090523, 0.05146066099405289, -0.020374629646539688, -0.03588895872235298, -0.0006677835481241345, -0.02943938970565796, 0.04958592355251312, -0.010563945397734642, -0.015201376751065254, -0.0013175965286791325, 0.044819723814725876, 0.015602342784404755, 8.603794299233414e-07, -0.0012139284517616034, -0.023797864094376564, -0.0009093867265619338, 0.007344875484704971, -0.0025393629912286997, 0.05233

In [None]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
print(text_chunks)

['Budget 2023-2024  Speech of Nirmala Sitharaman Minister of Finance February 1, 2023 Hon’ble Speaker,   I present the Budget for 2023-24. This is the first Budget in Amrit Kaal. Introduction 1. This Budget hopes to build on the foundation laid in the previous Budget, and the blueprint drawn for India@100. We envision a prosperous and inclusive India, in which the fruits of development reach all regions and citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and Scheduled Tribes. 2. In the 75th year of our Independence, the world has recognised the Indian economy as a ‘bright star’. Our current year’s economic growth is estimated to be at 7 per cent. It is notable that this is the highest among all the major economies. This is in spite of the massive slowdown globally caused by Covid-19 and a war. The Indian economy is therefore on the right track, and despite a time of challenges, heading towards a bright future. 3. Today as Indians stands with their head held high,

In [None]:
#Vector Search DB In PineCode
##Create a pinecone index first and the run this code
import pinecone
from pinecone import Pinecone
pc = Pinecone(api_key = os.environ['PINECONE_API_KEY'])
index = pc.Index("llmchat")
print(index)
index_name = 'llmchat'

<pinecone.data.index.Index object at 0x321107430>


In [None]:
from langchain.schema import Document
document_list = [
    Document(
        page_content=text_chunks["sentence_chunk"],
        metadata={"page": text_chunks["page_number"]}
    )
    for text_chunks in pages_and_chunks_over_min_token_len
]
print(document_list)

[Document(metadata={'page': -1}, page_content='Budget 2023-2024  Speech of Nirmala Sitharaman Minister of Finance February 1, 2023 Hon’ble Speaker,   I present the Budget for 2023-24. This is the first Budget in Amrit Kaal. Introduction 1. This Budget hopes to build on the foundation laid in the previous Budget, and the blueprint drawn for India@100. We envision a prosperous and inclusive India, in which the fruits of development reach all regions and citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and Scheduled Tribes. 2. In the 75th year of our Independence, the world has recognised the Indian economy as a ‘bright star’. Our current year’s economic growth is estimated to be at 7 per cent. It is notable that this is the highest among all the major economies. This is in spite of the massive slowdown globally caused by Covid-19 and a war. The Indian economy is therefore on the right track, and despite a time of challenges, heading towards a bright future. 3. Today

In [None]:
# converting the document into embeddings and insertion into pinecone vector db based on the chunks - Each chunk is stored as a single record
from langchain_pinecone import PineconeVectorStore
vectorstore_from_docs = PineconeVectorStore.from_documents(
        document_list,
        index_name=index_name,
        embedding=embeddings
    )