# Imports and Setup

In [1]:
# library imports 

import hashlib
import os

import cassio
from datasets import load_dataset
from dotenv import load_dotenv
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.cassandra import Cassandra
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from PyPDF2 import PdfReader
from typing_extensions import Concatenate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings

# Suppress FutureWarning and DeprecationWarning messages so they do not clutter
# the cell outputs below.
# NOTE(review): this also hides deprecation notices that may matter when
# upgrading dependencies — consider narrowing the filters.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [3]:
# Load credentials from a local .env file (if one exists) into the process
# environment. Variables that are already set are left untouched.
load_dotenv()

# Fail fast, naming every missing credential at once, instead of surfacing a
# bare KeyError for only the first one.
REQUIRED_ENV_VARS = ("GOOGLE_API_KEY", "ASTRA_DB_TOKEN", "ASTRA_DB_ID")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if name not in os.environ]
if missing_env_vars:
    raise KeyError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )

# Loading and Reading the Dataset

In [4]:
# Print the path of every file found anywhere under the 'data' directory
data_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('data')
    for name in names
)
for path in data_paths:
    print(path)

data/Indian Budget 2025-26.pdf


In [5]:
# Open the budget PDF with PyPDF2's PdfReader; its pages are read in the next cell
pdf_input = PdfReader('data/Indian Budget 2025-26.pdf')

In [6]:
# Extract the text of each page in order and concatenate it into one string,
# skipping pages for which extract_text() returns nothing
page_texts = (page.extract_text() for page in pdf_input.pages)
raw_text = ''.join(text for text in page_texts if text)

In [7]:
# Report the total character count, a divider, then the first 125 characters
document_preview = raw_text[:125]
print(
    f'Lenght of the document: {len(raw_text)}',
    '--------------',
    document_preview,
    sep='\n',
)

Lenght of the document: 93092
--------------
GOVERNMENT OF INDIA
BUDGET 2025-2026
SPEECH
OF
NIRMALA SITHARAMAN
MINISTER OF FINANCE
February 1,  2025 
CONTENTS  
 
PART – 


# Database and Embeddings

In [8]:
# Initialise the cassio connection to Astra DB with the token and database id
# read from the environment (os.environ raises KeyError if either is unset)
cassio.init(token=os.environ["ASTRA_DB_TOKEN"],
            database_id=os.environ["ASTRA_DB_ID"])

In [9]:
# Chat model that is later passed to astra_vector_index.query to answer questions.
# temperature=0 asks for the least random output.
# NOTE(review): max_retries=2 presumably caps retries of failed API calls —
# confirm against the langchain_google_genai documentation.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    max_retries=2,
)

In [10]:
# To keep each embedded piece small, split raw_text into overlapping chunks.
# The splitter breaks on newlines and targets chunks of up to 800 characters
# (measured with len) with a 200-character overlap between neighbouring chunks,
# so text near a boundary appears in both chunks.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)

# Apply the splitter to the full document text and report how many chunks resulted
texts = text_splitter.split_text(raw_text)
print(f'Chucks after split: {len(texts)}')

Chucks after split: 155


In [11]:
# Preview the first five chunks to check the split and the overlap between neighbours
texts[:5]

['GOVERNMENT OF INDIA\nBUDGET 2025-2026\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2025 \nCONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nBudget Theme  1 \nAgriculture as the 1st engine  3 \nMSMEs as the 2nd engine  6 \nInvestment as the 3rd engine  8 \nA. Investing in People  8 \nB. Investing in  the Economy  10 \nC. Investing in Innovation  14 \nExports as the 4th engine  15 \nReforms as the Fuel  16 \nFiscal Policy  18 \n \n \nPART – B \nIndirect taxes  20 \nDirect Taxes   23 \n \nAnnexure to Part -A 29 \nAnnexure to Part -B 31 \n \n   \n \nBudget 202 5-2026 \n \nSpeech of  \nNirmala Sitharaman  \nMinister of Finance  \nFebruary 1 , 202 5 \nHon’ble Speaker,  \n I present the Budget for 2025 -26. \nIntroduction  \n1. This Budget continues our Government ’s efforts to:  \na) accelerate growth,',
 'Minister of Finance  \nFebruary 1 , 202 5 \nHon’ble Speaker,  \n I present the Budget for 2025 -26. \nIntroduction  \n1. This Budget continues our Government

In [12]:
# Embedding model handed to the vector store below
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
)

# Vector store backed by the "qa_mini_demo" table.
# NOTE(review): session=None / keyspace=None presumably fall back to the
# connection configured by cassio.init — confirm.
astra_vector_storage = Cassandra(
    embedding=embeddings,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)

# Give every chunk an id derived from its content. Without explicit ids each
# run of this cell inserts the chunks again under fresh ids, so searches return
# the same passage several times. With stable ids a re-run overwrites the
# existing rows instead.
# NOTE(review): rows already inserted by earlier runs without ids are not
# removed by this change; empty the table once if duplicates are present.
chunk_ids = [hashlib.sha256(chunk.encode("utf-8")).hexdigest() for chunk in texts]

# Store the chunks and wrap the store in an index for question answering
astra_vector_storage.add_texts(texts, ids=chunk_ids)
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_storage)

# Creating ChatBot

In [18]:
def pdf_chatBot(top_k: int = 3, preview_chars: int = 92) -> None:
    """Run an interactive question-answering loop over the indexed budget document.

    Each non-empty question is answered with ``astra_vector_index.query`` using
    the module-level ``llm``. The chunks returned by
    ``astra_vector_storage.similarity_search_with_score`` are then printed
    together with their scores.

    The loop ends when the user types "quit" (any letter case). It also ends
    quietly when the prompt is interrupted or input is exhausted
    (KeyboardInterrupt / EOFError) instead of raising out of the function.

    Args:
        top_k: Number of supporting chunks requested per question.
        preview_chars: Number of leading characters shown from each chunk.

    Returns:
        None.
    """
    prompt = '\nWhat would you like to know about the 2025-26 Budget? (or type "quit" to exit): '

    while True:
        try:
            query = input(prompt).strip()
        except (EOFError, KeyboardInterrupt):
            # No further input is coming (closed stdin or interrupted prompt).
            break

        if query.lower() == 'quit':
            break

        # Blank input: ask again rather than sending an empty query.
        if not query:
            continue

        print(f'Question: {query}')
        print(f'Result: {astra_vector_index.query(query, llm=llm).strip()}')

        matches = astra_vector_storage.similarity_search_with_score(query, k=top_k)
        for document, score in matches:
            # The (score, preview) pair is formatted as a tuple, as in the recorded output.
            print(f'Relevent Documents: {score, document.page_content[:preview_chars]}')
        print('\n')

# Q&A Bot

In [19]:
# Start the interactive session; type "quit" at the prompt to end it
pdf_chatBot()

Question: What is the amount recevied by startups as part of the budget
Result: Startups have received commitments of more than ₹91,000 crore from Alternate Investment Funds (AIFs).  A new Fund of Funds with a fresh contribution of ₹10,000 crore will also be established to support startups. Additionally, a new scheme will provide term loans up to ₹2 crore for 5 lakh first-time entrepreneurs, including women, Scheduled Castes, and Scheduled Tribes.
Relevent Documents: (0.8325711718558805, 'be issued.   7  \n \nFund of Funds for Startups  \n31. The Alternate Investment Funds (AIFs) fo')
Relevent Documents: (0.8325711718558805, 'be issued.   7  \n \nFund of Funds for Startups  \n31. The Alternate Investment Funds (AIFs) fo')
Relevent Documents: (0.816801128193984, 'enhanced : \na) For Micro and Small Enterprises, from ` 5 crore to 10 crore, leading \nto addi')


Question: Who is the speaker?
Result: Nirmala Sitharaman, Minister of Finance
Relevent Documents: (0.7337355121591106, 'GOVERNME