In [29]:
import os
pdf_path = "Documents/budget_speech.pdf"
print(pdf_path)

Documents/budget_speech.pdf


In [30]:
# Requires !pip install PyMuPDF, see: https://github.com/pymupdf/pymupdf
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        if page_number <= 2:  # Skip the first 2 pages
         continue
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 5,  # adjust page numbers since our PDF starts on page 5
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts


58it [00:00, 746.59it/s]


[{'page_number': -2,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -1,
  'page_char_count': 1465,
  'page_word_count': 272,
  'page_sentence_count_raw': 16,
  'page_token_count': 366.25,
  'text': 'Budget 2023-2024    Speech of  Nirmala Sitharaman  Minister of Finance  February 1, 2023  Hon’ble Speaker,     I present the Budget for 2023-24. This is the first Budget in Amrit  Kaal.  Introduction  1.  This Budget hopes to build on the foundation laid in the previous  Budget, and the blueprint drawn for India@100. We envision a prosperous  and inclusive India, in which the fruits of development reach all regions and  citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and  Scheduled Tribes.   2.  In the 75th year of our Independence, the world has recognised the  Indian economy as a ‘bright star’. Our current year’s economic growth is  estimated to be at 7 per cent. It is notabl

In [31]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 33,
  'page_char_count': 1589,
  'page_word_count': 347,
  'page_sentence_count_raw': 9,
  'page_token_count': 397.25,
  'text': '35        Annexure to Part B of the Budget Speech 2023-24  Amendments relating to Direct Taxes  A. PROVIDING TAX RELIEF UNDER NEW PERSONAL TAX REGIME  A.1     The new tax regime for Individual and HUF, introduced by the  Finance Act 2020, is now proposed to be the default regime.   A.2      This regime would also become the default regime for AOP (other  than co-operative), BOI and AJP.   A.3      Any individual, HUF, AOP (other than co-operative), BOI or AJP not  willing to be taxed under this new regime can opt to be taxed  under the old regime. For those person having income under the  head “profit and gains of business or profession” and having opted  for old regime can revoke that option only once and after that  they will continue to be taxed under the new regime. For those  not having income under the head “profit and gains of busines

In [32]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-2,0,1,1,0.0,
1,-1,1465,272,16,366.25,Budget 2023-2024 Speech of Nirmala Sithara...
2,0,1811,323,15,452.75,2 profile is because of several accompl...
3,1,1536,294,18,384.0,3 9. The economy has become a lot more...
4,2,2061,345,15,515.25,4 1) Economic Empowerment of Women: Dee...


In [33]:
# Get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,55.0,55.0,55.0,55.0,55.0
mean,25.0,1609.02,304.22,13.2,402.25
std,16.02,432.11,77.93,4.12,108.03
min,-2.0,0.0,1.0,1.0,0.0
25%,11.5,1459.5,260.0,11.0,364.88
50%,25.0,1660.0,301.0,14.0,415.0
75%,38.5,1829.0,346.0,16.0,457.25
max,52.0,2291.0,452.0,22.0,572.75


In [34]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [35]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 55/55 [00:00<00:00, 559.15it/s]


In [36]:

df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,55.0,55.0,55.0,55.0,55.0,55.0
mean,25.0,1609.02,304.22,13.2,402.25,12.76
std,16.02,432.11,77.93,4.12,108.03,4.22
min,-2.0,0.0,1.0,1.0,0.0,0.0
25%,11.5,1459.5,260.0,11.0,364.88,10.5
50%,25.0,1660.0,301.0,14.0,415.0,13.0
75%,38.5,1829.0,346.0,16.0,457.25,15.5
max,52.0,2291.0,452.0,22.0,572.75,21.0


In [37]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 22

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 55/55 [00:00<00:00, 860771.34it/s]


In [38]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
random.sample(pages_and_texts, k=1)

[{'page_number': 11,
  'page_char_count': 1772,
  'page_word_count': 316,
  'page_sentence_count_raw': 16,
  'page_token_count': 443.0,
  'text': '13        Sustainable Cities of Tomorrow  53.  States and cities will be encouraged to undertake urban planning  reforms and actions to transform our cities into ‘sustainable cities of  tomorrow’. This means efficient use of land resources, adequate resources  for  urban  infrastructure,  transit-oriented  development,  enhanced  availability and affordability of urban land, and opportunities for all.   Making Cities ready for Municipal Bonds  54.  Through property tax governance reforms and ring-fencing user  charges on urban infrastructure, cities will be incentivized to improve their  credit worthiness for municipal bonds.    Urban Infrastructure Development Fund   55.  Like the RIDF, an Urban Infrastructure Development Fund (UIDF) will  be established through use of priority sector lending shortfall. This will be  managed by the National

In [39]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,55.0,55.0,55.0,55.0,55.0,55.0,55.0
mean,25.0,1609.02,304.22,13.2,402.25,12.76,0.98
std,16.02,432.11,77.93,4.12,108.03,4.22,0.13
min,-2.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,11.5,1459.5,260.0,11.0,364.88,10.5,1.0
50%,25.0,1660.0,301.0,14.0,415.0,13.0,1.0
75%,38.5,1829.0,346.0,16.0,457.25,15.5,1.0
max,52.0,2291.0,452.0,22.0,572.75,21.0,1.0


In [40]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

100%|██████████| 55/55 [00:00<00:00, 37663.14it/s]


54

In [41]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': 27,
  'sentence_chunk': '29    opportunity, I propose to reduce basic customs duty on seeds used in their manufacture.  Precious Metals 127. Customs Duties on dore and bars of gold and platinum were increased earlier this fiscal. I now propose to increase the duties on articles made therefrom to enhance the duty differential. I also propose to increase the import duty on silver dore, bars and articles to align them with that on gold and platinum. Metals 128. To facilitate availability of raw materials for the steel sector, exemption from Basic Customs Duty on raw materials for manufacture of CRGO Steel, ferrous scrap and nickel cathode is being continued. 129. Similarly, the concessional BCD of 2.5 per cent on copper scrap is also being continued to ensure the availability of raw materials for secondary copper producers who are mainly in the MSME sector. Compounded Rubber 130. The basic customs duty rate on compounded rubber is being increased from 10 per cent to ‘25 p

In [42]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,54.0,54.0,54.0,54.0
mean,25.5,1592.46,263.48,398.12
std,15.73,375.29,64.6,93.82
min,-1.0,626.0,111.0,156.5
25%,12.25,1432.5,221.5,358.12
50%,25.5,1635.5,258.0,408.88
75%,38.75,1792.5,306.75,448.12
max,52.0,2232.0,395.0,558.0


In [43]:

pages_and_chunks_over_min_token_len = df.to_dict(orient="records")
# pages_and_chunks_over_min_token_len[:2]
print(pages_and_chunks_over_min_token_len)

[{'page_number': -1, 'sentence_chunk': 'Budget 2023-2024  Speech of Nirmala Sitharaman Minister of Finance February 1, 2023 Hon’ble Speaker,   I present the Budget for 2023-24. This is the first Budget in Amrit Kaal. Introduction 1. This Budget hopes to build on the foundation laid in the previous Budget, and the blueprint drawn for India@100. We envision a prosperous and inclusive India, in which the fruits of development reach all regions and citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and Scheduled Tribes. 2. In the 75th year of our Independence, the world has recognised the Indian economy as a ‘bright star’. Our current year’s economic growth is estimated to be at 7 per cent. It is notable that this is the highest among all the major economies. This is in spite of the massive slowdown globally caused by Covid-19 and a war. The Indian economy is therefore on the right track, and despite a time of challenges, heading towards a bright future. 3. Today as Ind

In [70]:
# # Requires !pip install sentence-transformers
# from sentence_transformers import SentenceTransformer
# embedding_model = SentenceTransformer(model_name_or_path="dunzhang/stella_en_1.5B_v5", 
#                                       device="mps") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)

from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings ## embeddings using Interface endpoint

embeddings = HuggingFaceInferenceAPIEmbeddings(api_key= os.environ['HUGGING_API_KEY'], model_name="sentence-transformers/all-mpnet-base-v2")
# embeddings = HuggingFaceEndpointEmbeddings() ## embedding using local huggingface
print(embeddings)


api_key=SecretStr('**********') model_name='sentence-transformers/all-mpnet-base-v2' api_url=None additional_headers={}


In [71]:
# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings =  embeddings.embed_documents(sentences)
embeddings_dict = dict(zip(sentences, embeddings))
print(embeddings_dict)
# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

{'The Sentences Transformers library provides an easy and open-source way to create embeddings.': [-0.020798319950699806, 0.030316464602947235, -0.020121799781918526, 0.06864849478006363, -0.02552560716867447, -0.008476873859763145, -0.00020723622583318502, -0.0632377415895462, 0.0281606987118721, -0.033335376530885696, 0.03026341088116169, 0.05307215824723244, -0.05035270005464554, 0.026228871196508408, 0.03333137184381485, -0.045157741755247116, 0.036304496228694916, -0.0013711730716750026, -0.012017124332487583, 0.0114947110414505, 0.05045110359787941, 0.047085680067539215, 0.021191375330090523, 0.05146066099405289, -0.020374629646539688, -0.03588895872235298, -0.0006677835481241345, -0.02943938970565796, 0.04958592355251312, -0.010563945397734642, -0.015201376751065254, -0.0013175965286791325, 0.044819723814725876, 0.015602342784404755, 8.603794299233414e-07, -0.0012139284517616034, -0.023797864094376564, -0.0009093867265619338, 0.007344875484704971, -0.0025393629912286997, 0.05233

In [None]:
%%time

# Uncomment to see how long it takes to create embeddings on CPU

# Make sure the model is on the CPU
embedding_model.to("cpu")

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])


100%|██████████| 54/54 [00:35<00:00,  1.51it/s]

CPU times: user 3min 17s, sys: 2min 36s, total: 5min 54s
Wall time: 36.5 s





In [None]:
%%time

# Send the model to the Metal GPU
embedding_model.to("mps") # requires a GPU installed, for reference on my local machine, I'm using an M1 Pro

# Create embeddings one by one on the GPU
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])


100%|██████████| 98/98 [00:16<00:00,  6.07it/s]

CPU times: user 9.9 s, sys: 1.89 s, total: 11.8 s
Wall time: 16.8 s





In [72]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
print(text_chunks)

['Budget 2023-2024  Speech of Nirmala Sitharaman Minister of Finance February 1, 2023 Hon’ble Speaker,   I present the Budget for 2023-24. This is the first Budget in Amrit Kaal. Introduction 1. This Budget hopes to build on the foundation laid in the previous Budget, and the blueprint drawn for India@100. We envision a prosperous and inclusive India, in which the fruits of development reach all regions and citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and Scheduled Tribes. 2. In the 75th year of our Independence, the world has recognised the Indian economy as a ‘bright star’. Our current year’s economic growth is estimated to be at 7 per cent. It is notable that this is the highest among all the major economies. This is in spite of the massive slowdown globally caused by Covid-19 and a war. The Indian economy is therefore on the right track, and despite a time of challenges, heading towards a bright future. 3. Today as Indians stands with their head held high,

In [None]:
#Vector Search DB In PineCode
##Create a pinecone index first and the run thsi code
import pinecone
from pinecone import Pinecone
pc = Pinecone(api_key = os.environ['PINECONE_API_KEY'])
index = pc.Index("llmchat")
print(index)
index_name = 'llmchat'

<pinecone.data.index.Index object at 0x16b6b66e0>


In [74]:
from langchain.schema import Document
document_list = [
    Document(
        page_content=text_chunks["sentence_chunk"],
        metadata={"page": text_chunks["page_number"]}
    )
    for text_chunks in pages_and_chunks_over_min_token_len
]
print(document_list)

[Document(metadata={'page': -1}, page_content='Budget 2023-2024  Speech of Nirmala Sitharaman Minister of Finance February 1, 2023 Hon’ble Speaker,   I present the Budget for 2023-24. This is the first Budget in Amrit Kaal. Introduction 1. This Budget hopes to build on the foundation laid in the previous Budget, and the blueprint drawn for India@100. We envision a prosperous and inclusive India, in which the fruits of development reach all regions and citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and Scheduled Tribes. 2. In the 75th year of our Independence, the world has recognised the Indian economy as a ‘bright star’. Our current year’s economic growth is estimated to be at 7 per cent. It is notable that this is the highest among all the major economies. This is in spite of the massive slowdown globally caused by Covid-19 and a war. The Indian economy is therefore on the right track, and despite a time of challenges, heading towards a bright future. 3. Today

In [None]:
# #checking if the text is converted to emebeddings or not with the embedding model that we are using
# embedded_documents =  embeddings.embed_documents(text_chunks)
# print("Documents embedded.", embedded_documents)

Documents embedded. [[0.004434800706803799, 0.07342689484357834, -0.023321377113461494, 0.03155279904603958, -0.016791580244898796, -0.005796727724373341, -0.03853666037321091, 0.01639395020902157, -0.0274188332259655, 0.006378340534865856, 0.03805853798985481, 0.022438358515501022, 0.017592303454875946, 0.08086136728525162, 0.02276364155113697, -0.08040440082550049, 0.01961592398583889, -0.013978501781821251, -0.04606174677610397, 0.011858731508255005, -0.02749793417751789, 0.019982261583209038, -0.034486524760723114, 0.024147916585206985, -0.040044501423835754, -0.011821617372334003, 0.0056400783360004425, -0.012414288707077503, -0.04405819997191429, -0.068231001496315, 0.047684915363788605, -0.029924634844064713, 0.02015235833823681, -0.030084991827607155, 2.6123625502805226e-06, -0.09252216666936874, 0.013352534733712673, -2.7214831789024174e-05, -0.03586035966873169, -0.0034283227287232876, -0.002434007590636611, -0.03347236290574074, 0.010838191956281662, 0.036433979868888855, -0

In [84]:
# converting the document into embeddings and insertion into pinecone vector db based on the chunks - Each chunk is stored as a single record
from langchain_pinecone import PineconeVectorStore
vectorstore_from_docs = PineconeVectorStore.from_documents(
        document_list,
        index_name=index_name,
        embedding=embeddings
    )

AttributeError: 'list' object has no attribute 'embed_documents'

In [None]:
# Function to retrieve matching results (context) from the vectorstore
def retrieve_query(query, k=2):
    """
    Retrieves the top-k most relevant documents from the vectorstore.
    """
    matching_results = vectorstore_from_docs.similarity_search(query, k)
    print(matching_results)
    return matching_results

In [None]:
print(retrieve_query("What are the priorities of 2023-2024 budget"))

NameError: name 'vectorstore_from_docs' is not defined

In [None]:
##checking my GPU memory
import subprocess

def get_gpu_memory():
    try:
        output = subprocess.check_output(['sysctl', '-n', 'hw.memsize'])
        total_memory_bytes = int(output.decode().strip())

        output = subprocess.check_output(['sysctl', '-n', 'hw.ncpu'])
        num_cpus = int(output.decode().strip())

        # Assume 2GB of memory is reserved for the operating system and CPU
        reserved_memory_bytes = 2 * 1024 * 1024 * 1024

        available_memory_bytes = total_memory_bytes - reserved_memory_bytes
        available_memory_gb = round(available_memory_bytes / (1024 ** 3), 2)

        return available_memory_gb
    except (subprocess.CalledProcessError, ValueError):
        return None

gpu_memory_gb = get_gpu_memory()
if gpu_memory_gb is not None:
    print(f"Available GPU memory: {gpu_memory_gb} GB")
else:
    print("Failed to retrieve GPU memory information.")


Available GPU memory: 34.0 GB


In [None]:
#Selecting the LLM Model Based on the Memory size
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 34.0 | Recommend model: Gemma 7B in 4-bit or float16 precision.
use_quantization_config set to: False
model_id set to: google/gemma-7b-it


In [None]:
## Downloading the model into our local
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

# 1. Create quantization config for smaller model loading (optional)
# Requires !pip install bitsandbytes accelerate, see: https://github.com/TimDettmers/bitsandbytes, https://huggingface.co/docs/accelerate/
# For models that require 4-bit quantization (use this if you have low GPU memory available)
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Bonus: Setup Flash Attention 2 for faster inference, default to "sdpa" or "scaled dot product attention" if it's not available
# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention
if (is_flash_attn_2_available()) and (torch.cuda.is_available()):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
#model_id = "google/gemma-7b-it"
model_id = model_id # (we already set this above)
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # use full memory
                                                 attn_implementation=attn_implementation) # which attention version to use

if not use_quantization_config and torch.cuda.is_available(): # quantization takes care of device setting automatically, so if it's not used, send model to GPU
    llm_model.to("cuda")


[INFO] Using attention implementation: sdpa
[INFO] Using model_id: google/gemma-7b-it


Loading checkpoint shards: 100%|██████████| 4/4 [00:22<00:00,  5.60s/it]


In [None]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=3072, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear(in_features=24576, out_features=3072, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((3072,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((3072,), eps=1

In [None]:
##checking the parameters size of the LLM model
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_num_params(llm_model)


8537680896

In [None]:
## checking the Space required for the model to run on local
def get_model_mem_size(model: torch.nn.Module):
    """
    Get how much memory a PyTorch model takes up.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 17075376128,
 'model_mem_mb': 16284.35,
 'model_mem_gb': 15.9}

In [None]:
## Check if GPU is available; otherwise, fallback to CPU
import torch
# Check if MPS (Metal Performance Shaders) is available; otherwise, fallback to CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Move the model to the selected device
llm_model = llm_model.to(device)
# print("llm_model", llm_model)

In [None]:
# Function to tokenize the input text
def tokenize_input(prompt, context):
    """
    Combines the query (prompt) and context into a single input format and tokenizes it.
    """
    combined_input = f"Context: {context}\n\nQuestion: {prompt}"
    input_ids = tokenizer(combined_input, return_tensors="pt").to(device)
    return input_ids

In [None]:
# Function to generate outputs from the LLM model
def generate_response(input_ids):
    """
    Generates a response using the LLM model based on tokenized input.
    """
    outputs = llm_model.generate(input_ids=input_ids["input_ids"], max_new_tokens=512)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [None]:
# Main input text
input_text = "what are the Priorities of 2023-2024 Budget?"
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False,  # Keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:
what are the Priorities of 2023-2024 Budget?

Prompt (formatted):
<bos><start_of_turn>user
what are the Priorities of 2023-2024 Budget?<end_of_turn>
<start_of_turn>model



In [None]:

# Function to generate a response using the local LLM model
def retrieve_answers_with_llm_model(query):
    """
    Combines retrieved context and query, formats the input, and generates a response using the local LLM model.
    """
    # Retrieve matching results from the vectorstore
    doc_search = retrieve_query(query)
    print(doc_search)
    
    # Combine all retrieved documents as context
    context = "\n".join([doc.page_content for doc in doc_search])
    print(f"Context:\n{context}\n")
    
    # Tokenize input for the LLM model
    input_ids = tokenize_input(query, context)
    
    # Generate the response from the model
    response = generate_response(input_ids)
    return response



In [None]:
# Use the input text to query the model
answer = retrieve_answers_with_llm_model(input_text)
print(f"Answer:\n{answer}")

[Document(id='516b13dc-5035-4e1a-a76a-b8a8d63adaa9', metadata={'page': -1.0}, page_content='Budget 2023-2024  Speech of Nirmala Sitharaman Minister of Finance February 1, 2023 Hon’ble Speaker,   I present the Budget for 2023-24. This is the first Budget in Amrit Kaal. Introduction 1. This Budget hopes to build on the foundation laid in the previous Budget, and the blueprint drawn for India@100. We envision a prosperous and inclusive India, in which the fruits of development reach all regions and citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and Scheduled Tribes. 2. In the 75th year of our Independence, the world has recognised the Indian economy as a ‘bright star’. Our current year’s economic growth is estimated to be at 7 per cent. It is notable that this is the highest among all the major economies. This is in spite of the massive slowdown globally caused by Covid-19 and a war. The Indian economy is therefore on the right track, and despite a time of challeng

KeyboardInterrupt: 