In [1]:
pip install PyMuPDF chromadb sentence-transformers

Collecting PyMuPDF
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting op

In [2]:
import fitz

In [5]:
#Extract the text form pdf
def extract_text_from_pdf(pdf_path, num_pages=5):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(min(num_pages, doc.page_count)):
        page = doc.load_page(page_num)
        text += page.get_text("text")
    return text

In [4]:
pdf_path = "/content/budget_speech.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

In [7]:
print(pdf_text[:500])

GOVERNMENT OF INDIA
BUDGET 2023-2024
SPEECH
OF
NIRMALA SITHARAMAN
MINISTER OF FINANCE
February 1,  2023
CONTENTS 
PART-A 
Page No.
 
Introduction 
1
 
Achievements since 2014: Leaving no one behind 
2
 
Vision for Amrit Kaal – an empowered and inclusive economy 
3
 
Priorities of this Budget 
5
i. Inclusive Development  
ii. Reaching the Last Mile 
iii. Infrastructure and Investment 
iv. Unleashing the Potential 
v. Green Growth 
vi. Youth Power  
vii. Financial Sector 
 
 
 
 
 
 
 
 
 
Fi


In [8]:
#Creating a vector database and i am using chromadb as pinecone give me some authentication errors and search for that and i dont get the answer
import chromadb

In [9]:
# Initialize Chroma client
client = chromadb.Client()

In [10]:
#creating collection for storing the data
collection = client.create_collection("business_bot_pdf")

In [11]:
#creating chunks of data
chunks = pdf_text.split("\n")

In [12]:
#adding chunks to the collection of chromadb
for idx, chunk in enumerate(chunks):
    if chunk.strip():
        collection.add(
            documents=[chunk],
            metadatas=[{"source": f"chunk_{idx}"}],
            ids=[f"chunk_{idx}"]
        )

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:00<00:00, 92.6MiB/s]


In [15]:
#Creating embeddings
import numpy as np
from sentence_transformers import SentenceTransformer


In [16]:
#loading a pretrained transformer
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
#function to create embedding text
def embed_text(text):
    embedding_tensor = embedding_model.encode(text, convert_to_tensor=True)
    embedding_numpy = embedding_tensor.detach().cpu().numpy()
    return embedding_numpy

In [18]:
embeddings = []#list to store the values

In [19]:
for chunk in chunks:
    if chunk.strip():
        embeddings.append(embed_text(chunk))

In [20]:
#adding the embeddings into the chromadb collections
for idx, embedding in enumerate(embeddings):
    collection.add(
        documents=[chunks[idx]],
        metadatas=[{"source": f"chunk_{idx}"}],
        ids=[f"chunk_{idx}"],
        embeddings=[embedding]
    )



In [21]:
#function to retireve relevant documents based on the query
def retrieve_relevant_docs(query: str):
    query_embedding = embed_text(query)
    results = collection.query(query_embeddings=[query_embedding], n_results=3)
    documents = results.get("documents", [])
    return [doc for doc in documents]

In [22]:
#Example usuage
query = "What are the key priorities outlined in the 2023-2024 Budget speech by Nirmala Sitharaman, and how do they align with the vision for an empowered and inclusive economy?"
retrieved_docs = retrieve_relevant_docs(query)
print("Retrieved Documents:", retrieved_docs)

Retrieved Documents: [['Priorities of this Budget ', 'Annexure to Part B of the Budget Speech 2023-24 ', 'BUDGET 2023-2024']]


In [23]:
from transformers import BertForQuestionAnswering, BertTokenizer
import torch

In [24]:
#function to flatten
def flatten_and_join(documents):
    flat_documents = []
    for item in documents:
        if isinstance(item, list):
            flat_documents.extend(flatten_and_join(item))
        else:
            flat_documents.append(str(item))
    return flat_documents

In [25]:
#function to call the hugging face model and i am using this because my openai free trails are over and it said to upgrade to get the credits so for that i am using this free model from hugging face.
def generate_answer_with_bert(documents, query):
    documents = flatten_and_join(documents)
    context = " ".join(documents)
    if not context or not query:
        return "Error: Context or query is empty."
    #Loading the BERT tokenizer and model for question answering
    tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    #tokinizing the input
    inputs = tokenizer.encode_plus(query, context, return_tensors="pt", add_special_tokens=True)
    with torch.no_grad():
        output = model(**inputs)
    start_scores = output.start_logits
    end_scores = output.end_logits
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)
    #decoding the answer using the tokenizer
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_index:end_index+1]))
    return answer

In [30]:
#example query
query = "What are your ai services?"


In [31]:
#calling the genrated fucntion
generated_answer = generate_answer_with_bert(retrieved_docs, query)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
print("Generated Answer:", generated_answer)

Generated Answer: ai services ? [SEP]
