## Llama 3 [docs](https://scontent.fktm6-1.fna.fbcdn.net/v/t39.2365-6/453304228_1160109801904614_7143520450792086005_n.pdf?_nc_cat=108&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=FwYrXqt8EN8Q7kNvgGu4ixg&_nc_ht=scontent.fktm6-1.fna&oh=00_AYAbKc2EJnmRq-Yh8Qh9pbNRd5UbcIR75zjlcL3P16iPsA&oe=66D9B407)

1. Load the PDF.
2. Split its text into chunks.
3. Convert each chunk into a vector using an embedding model.
4. Store the vectors in a vector search database.
5. Run a similarity search over the stored vectors for the user's query.

In [43]:
# %pip install pinecone==5.1.0
# %pip install "pinecone-client[grpc]"

In [21]:
import langchain
import pinecone
from langchain.llms import Ollama
from langchain.document_loaders import PyPDFLoader,PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

In [2]:
from langchain.embeddings.base import Embeddings
import requests

class OllamaEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by an Ollama server's HTTP API.

    Every text is sent in its own POST request to
    ``{ollama_url}/api/embeddings``.
    """

    def __init__(self, model_name, ollama_url='http://localhost:11434', timeout=120):
        """Store the connection settings; no request is made here.

        Args:
            model_name: Name of the Ollama model used to embed text.
            ollama_url: Base URL of the Ollama server.
            timeout: Seconds to wait for the server before ``requests`` raises
                a timeout error. Pass ``None`` to wait indefinitely.
        """
        self.model_name = model_name
        self.ollama_url = ollama_url
        self.timeout = timeout

    def embed_documents(self, texts):
        """Return one embedding per entry of ``texts``, in the same order."""
        return [self._get_embedding(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding of a single query string."""
        return self._get_embedding(text)

    def _get_embedding(self, text):
        """POST ``text`` to the embeddings endpoint and return its vector.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no answer arrives within ``self.timeout``.
        """
        # Strip trailing slashes so a base URL such as "http://host:11434/"
        # does not produce a "//api/embeddings" path.
        endpoint = f"{self.ollama_url.rstrip('/')}/api/embeddings"
        response = requests.post(
            endpoint,
            json={
                "model": self.model_name,
                "prompt": text
            },
            # Without a timeout, requests would block forever on a stalled server.
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()['embedding']  # Adjust if the key is different


In [3]:
# Source PDF, given relative to the notebook's working directory.
file = 'budget.pdf'

# Build the loader first, then read the PDF into a list of Document objects.
loader = PyPDFLoader(file)
doc = loader.load()

In [4]:
# Preview the first pages from the list loaded above instead of calling
# loader.load() again, which would re-read the whole PDF.
doc[:4]

[Document(metadata={'source': 'budget.pdf', 'page': 0}, page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023'),
 Document(metadata={'source': 'budget.pdf', 'page': 1}, page_content=''),
 Document(metadata={'source': 'budget.pdf', 'page': 2}, page_content='CONTENTS \nPART-A \n Page No.  \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal  – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \niii. Infrastructure and Investment \niv. Unleashing the Potential \nv. Green Growth \nvi. Youth Power  \nvii. Financial Sector  \n \n \n \n \n \n \n \n \n\uf0b7 Fiscal Management 24 \nPART B  \n  \nIndirect Taxes  27 \n\uf0b7 Green Mobility  \n\uf0b7 Electronics   \n\uf0b7 Electrical   \n\uf0b7 Chemicals and Petrochemicals   \n\uf0b7 Marine products  \n\uf0b7 Lab Grown Diamonds  \n\uf0b7 Pr

In [5]:
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Divide the loaded documents into chunks.

    Args:
        docs: Documents to split.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_overlap``.

    Returns:
        The result of ``split_documents(docs)`` on the configured splitter.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

In [6]:
# Split the loaded pages with the default chunk settings, report how many
# chunks came out, and preview the first four.
document = chunk_data(docs=doc)
chunk_count = len(document)
print(f"len: {chunk_count}")
document[:4]

len: 141


[Document(metadata={'source': 'budget.pdf', 'page': 0}, page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023'),
 Document(metadata={'source': 'budget.pdf', 'page': 2}, page_content='CONTENTS \nPART-A \n Page No.  \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal  – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \niii. Infrastructure and Investment \niv. Unleashing the Potential \nv. Green Growth \nvi. Youth Power  \nvii. Financial Sector  \n \n \n \n \n \n \n \n \n\uf0b7 Fiscal Management 24 \nPART B  \n  \nIndirect Taxes  27 \n\uf0b7 Green Mobility  \n\uf0b7 Electronics   \n\uf0b7 Electrical   \n\uf0b7 Chemicals and Petrochemicals   \n\uf0b7 Marine products  \n\uf0b7 Lab Grown Diamonds  \n\uf0b7 Precious Metals  \n\uf0b7 Metals  \n\uf0b7 Compounded Rubber  \n\uf0b7 Cigar

In [104]:
# Model that answers the questions, served by Ollama.
llm = Ollama(model="qwen2:1.5b", temperature=0.0)
# Alternative model:
# llm = Ollama(temperature = 0.0, model="llama3.1:8b-instruct-q2_K")

# Embedding model, reached through the OllamaEmbeddings adapter defined earlier.
embedding = OllamaEmbeddings("nomic-embed-text")
# Alternative: OpenAI embeddings
# embedding = OpenAIEmbeddings(api_key="your-api-key")

In [8]:
# Embed a sample question and show the vector length; the Pinecone index
# must be created with this same dimension.
sample_question = "What is the budget of 2023"
vectors = embedding.embed_query(sample_question)
len(vectors)

768

### Vector search DB in Pinecone [docs](https://docs.pinecone.io/home)

In [88]:
import os

from pinecone import ServerlessSpec

# Read the API key from the environment so it is never stored in the notebook.
# NOTE(review): any key that was ever committed with this notebook should be
# treated as leaked and rotated in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )
region="us-east-1"

# Call the client class through the `pinecone` module: the bare name
# `Pinecone` imported at the top of this notebook is LangChain's vector-store
# wrapper, not the Pinecone client.
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

index_name = "langchainvector"

# Create the serverless index only on the first run; later runs reuse it.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # must equal the length of the embedding vectors
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region=region
        )
    )


In [25]:
# create vectors
# Embed all chunks through embed_documents, the document-side method of the
# embeddings object, rather than calling embed_query once per chunk.
chunk_vectors = embedding.embed_documents([chunk.page_content for chunk in document])

# Every record needs its own id. The page number alone is not unique because
# one page can be split into several chunks, and upserting records that share
# an id keeps only the last of them. The chunk position makes the id unique.
# A comprehension is used so that no loop variable overwrites the notebook's
# `doc` (the list of loaded pages).
doc_vector = [
    {
        "id": f"page{chunk.metadata['page']}-chunk{position}",
        "values": vector,
        "metadata": {
            "source": chunk.metadata["source"],
            "page": chunk.metadata["page"],
            "text": chunk.page_content,
        },
    }
    for position, (chunk, vector) in enumerate(zip(document, chunk_vectors))
]

In [46]:
index = pc.Index(index_name)

# Upload the vectors only when the namespace is still empty. A fresh
# Restart & Run All then has data to query, while later runs skip the upload.
index_stats = index.describe_index_stats()
namespace_stats = (index_stats.namespaces or {}).get("namespace1")
if namespace_stats is None or namespace_stats.vector_count == 0:
    index.upsert(
        vectors=doc_vector,
        namespace="namespace1",
        batch_size=100,  # send the records in several smaller requests
    )


In [67]:
# cosine similarity retrieval
def retrieve_similarity(query, top_k=5):
    """Embed ``query`` and look up its nearest vectors in the Pinecone index.

    Args:
        query: Text to search for.
        top_k: Number of matches requested from the index.

    Returns:
        The raw response of ``index.query`` for namespace ``"namespace1"``,
        requested with both vector values and metadata included.
    """
    search_request = {
        "namespace": "namespace1",
        "vector": embedding.embed_query(query),
        "top_k": top_k,
        "include_values": True,
        "include_metadata": True,
    }
    return index.query(**search_request)

In [83]:
from langchain.chains.question_answering import load_qa_chain
from langchain.schema import Document

In [105]:

# Build the question-answering chain around the Ollama LLM, using the
# "stuff" chain type.
chain = load_qa_chain(llm=llm, chain_type="stuff")

In [106]:
# search answer from vector DB
def retrieve_answer(query):
    """Answer ``query`` from the two closest chunks stored in the vector DB.

    The matches returned by ``retrieve_similarity`` are wrapped in
    ``Document`` objects (the text comes from each match's ``'text'``
    metadata entry) and handed to the QA chain together with the question.

    Returns:
        Whatever ``chain.run`` returns for those documents and the question.
    """
    matches = retrieve_similarity(query, top_k=2)['matches']

    input_docs = []
    for match in matches:
        match_metadata = match['metadata']
        input_docs.append(
            Document(page_content=match_metadata['text'], metadata=match_metadata)
        )

    return chain.run(input_documents=input_docs, question=query)

In [107]:
# Ask a question about the budget PDF and print the model's answer.
our_query = "What is the budget of 2024"
response = retrieve_answer(query=our_query)

print(response)

The budget for 2024 is estimated to be around 41.9 lakh crore, with a capital expenditure of about 7.3 lakh crore.
