# **LLM-Langchain-PineCone-VectorDB**

In [1]:
# Importing necessary libraries

import openai
import langchain
import pinecone

from langchain.document_loaders import PyPDFDirectoryLoader # For loading any PDF document
from langchain.text_splitter import RecursiveCharacterTextSplitter # Text into chunks
# from langchain.embeddings.openai import OpenAIEmbeddings # Embeddings (Text Chunk --> Vectors)
from langchain_openai import OpenAIEmbeddings

from langchain.vectorstores import Pinecone # For storing the vectors in VectorDB (PineCone)

from langchain.llms import OpenAI # Of-course we need LLM(s)

  from tqdm.autonotebook import tqdm


In [2]:
from dotenv import load_dotenv # Loading variable (and content) from ".env" file - loading environment variables
load_dotenv()

True

In [3]:
import os

<br>
<br>

### **STEP - 1 : Reading the `PDF` File** 

In [4]:
# Reading the document

def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory) # Passing the directory where PDF file/document is stored
    documents = file_loader.load() # Loading the file(s)
    return documents

In [5]:
doc = read_doc("documents/")

In [6]:
doc # Will show the output as Pages

[Document(metadata={'source': 'documents\\budget_speech.pdf', 'page': 0}, page_content='GOVERNMENT OF INDIA\nBUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nJuly 23,  2024'),
 Document(metadata={'source': 'documents\\budget_speech.pdf', 'page': 1}, page_content=''),
 Document(metadata={'source': 'documents\\budget_speech.pdf', 'page': 2}, page_content=' \nCONTENTS \n \nPART – A \n Page No. \nIntroduction 1 \nGlobal Context 1 \nInterim Budget 2 \nBudget Theme 2 \nBudget Priorities 2 \n(i) Productivity and resilience in Agriculture  \n(ii) Employment & Skilling \n(iii) Inclusive Human Resource Development and Social Justice  \n(iv) Manufacturing & Services \n(v) Urban Development  \n(vi) Energy Security \n(vii) Infrastructure \n(viii) Innovation, Research & Development \n(ix) Next Generation Reforms \nBudget Estimates 2024-25 20 \n \n \nPART – B \nIndirect taxes 22 \nDirect Taxes  25 \n \nAnnexure to Part-A 31 \nAnnexure to Part-B 36 \n \n  '),
 Document(metadata={

In [7]:
len(doc) # No. of pages in the PDF

62

<br>
<br>

### **STEP - 2 : Dividing the PDF content (Text) into 'text-chunks'**

In [8]:
# Dividing the document content into chunks

def chunk_data(docs, chunk_size = 800, chunk_overlap = 50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size, chunk_overlap = chunk_overlap) # Chunk Size and Chunk Overlap mentioned to the function
    doc = text_splitter.split_documents(docs) # Splitting the document content into text-chunks
    return docs

In [9]:
documents = chunk_data(docs=doc)

In [10]:
documents

[Document(metadata={'source': 'documents\\budget_speech.pdf', 'page': 0}, page_content='GOVERNMENT OF INDIA\nBUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nJuly 23,  2024'),
 Document(metadata={'source': 'documents\\budget_speech.pdf', 'page': 1}, page_content=''),
 Document(metadata={'source': 'documents\\budget_speech.pdf', 'page': 2}, page_content=' \nCONTENTS \n \nPART – A \n Page No. \nIntroduction 1 \nGlobal Context 1 \nInterim Budget 2 \nBudget Theme 2 \nBudget Priorities 2 \n(i) Productivity and resilience in Agriculture  \n(ii) Employment & Skilling \n(iii) Inclusive Human Resource Development and Social Justice  \n(iv) Manufacturing & Services \n(v) Urban Development  \n(vi) Energy Security \n(vii) Infrastructure \n(viii) Innovation, Research & Development \n(ix) Next Generation Reforms \nBudget Estimates 2024-25 20 \n \n \nPART – B \nIndirect taxes 22 \nDirect Taxes  25 \n \nAnnexure to Part-A 31 \nAnnexure to Part-B 36 \n \n  '),
 Document(metadata={

In [11]:
len(documents)

62

<br>
<br>

### **STEP - 3 : Initializing Embedding Technique**

In [12]:
# Embedding Techinique Initialization of OpenAI

embeddings = OpenAIEmbeddings(api_key = os.environ['OPENAI_API_KEY'])
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000023297295DD0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000023295D76250>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [13]:
vectors = embeddings.embed_query("How Are You")
vectors

[-0.0128998514264822,
 -0.0031354695092886686,
 0.013411243446171284,
 -0.04226651042699814,
 -0.031271591782569885,
 0.01761743798851967,
 -0.023127682507038116,
 -0.02060907892882824,
 -0.020941484719514847,
 -0.013168332166969776,
 0.015252253040671349,
 0.015137189999222755,
 0.00767726544290781,
 0.001140243373811245,
 -0.0071019502356648445,
 -0.009556629694998264,
 0.04142271354794502,
 0.004906162619590759,
 0.00026947938022203743,
 -0.011774790473282337,
 0.009026060812175274,
 0.022948695346713066,
 -0.0038546137511730194,
 -0.0031578429043293,
 -0.01954794116318226,
 0.002159031108021736,
 0.006756760645657778,
 -0.012260612100362778,
 0.008489099331200123,
 -0.019458448514342308,
 0.0077156201004981995,
 -0.0023460087832063437,
 -0.0044171446934342384,
 -0.002117480617016554,
 -0.0019352973904460669,
 -0.015661366283893585,
 -0.01066251378506422,
 -0.002315644873306155,
 0.01965022087097168,
 0.0015285812551155686,
 0.019701359793543816,
 0.003777905134484172,
 -0.011921815

In [14]:
len(vectors)

1536

In [15]:
# Vector Search DB in PineCone

from pinecone import Pinecone, ServerlessSpec
import time

pc = Pinecone(
    api_key="" # 👈 API Key for Pincone Vector Index goes here
)

index_name = "langchainvector"

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name, 
        dimension=1536, 
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)


In [16]:
index = pc.Index(index_name)

In [17]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [22]:
from uuid import uuid4
from langchain_core.documents import Document

uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)

['2a358e03-65f6-47ea-a98e-b0eb4ab0998e',
 'f7712dd5-257a-4beb-aa12-664130025e5d',
 'd4a0aa53-b32a-480d-bb30-058b6873d690',
 'bc055a7e-2e01-4261-adc7-67deec80d457',
 'd17e42fa-e801-469f-a426-1b164c30885c',
 '798130bd-ce3e-4b20-86e9-9b08b6dfd494',
 'def09c4d-ef76-4bed-a049-f9923bab590a',
 'a2eb708d-5575-4e76-9b5c-17ba0a0c80d3',
 '1fc8505f-96c7-4fcc-9de2-c28ebe8be2ee',
 'd145f770-26e8-47e3-a5ed-1cc7dee97fcb',
 '478e4f00-1310-4549-9af8-dfc650a92683',
 'c8657bb8-812d-426f-b2f6-0c71a65a4b13',
 '4ddffbea-d603-4c4e-b2a1-fac647432946',
 '29f2f6db-4f67-48bb-b102-65965bd4f7e9',
 '670bfb90-e88e-4191-8ea9-2528b40663cc',
 'c762fde9-eab5-4227-82c6-c3633a8f65d1',
 'cf806e33-c068-48bd-a74a-d38469a35f98',
 '1119eeec-b208-439f-ba81-4d0805dc7361',
 '886542c2-9c95-452f-bae3-e034e7db8da8',
 '0b280d65-5386-43b8-a78f-8d4d40324287',
 '7f7a5541-26c6-495e-b2ac-a321707ed2d5',
 '6f689329-63a5-452f-a909-56ce8443ba13',
 '62e6fdf5-38d3-4ba2-b33e-280e2791541a',
 'b0b1e7a5-93c3-43fb-a6a2-2b5f5b507905',
 'ad769859-2bf7-

<br>
<br>

### **STEP - 4 : Querying the Document**

In [23]:
# Lets Query - Using Cosine Similarity (Retrieve Resalts) - from VectorDB

def retrieve_query(query, k=2):
    matching_results = vector_store.similarity_search(
        query, 
        k = k,
        filter = {"source" : "documents\\budget_speech.pdf"})
    return matching_results

In [42]:
# Queries will be more like : "Question-Answering" type

from langchain.chains.question_answering import load_qa_chain
# from langchain import OpenAI

from langchain.chat_models import ChatOpenAI

In [43]:
# llm = OpenAI(model_name = "gpt-3.5-turbo", temperature = 0.5)

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)

In [44]:
chain = load_qa_chain(llm, chain_type = "stuff")

In [45]:
# Retrieving the Queries - Searching answers from the VectorDB

def retrieve_answers(query):
    doc_search = retrieve_query(query)
    print(doc_search)
    response = chain.run(input_documents = doc_search, question = query) # Running the chain
    return response

In [46]:
query1 = "How much the agriculture target will be increased and by how many crore of rupees ?"
answer = retrieve_answers(query1)
print(answer)

[Document(id='def09c4d-ef76-4bed-a049-f9923bab590a', metadata={'page': 6.0, 'source': 'documents\\budget_speech.pdf'}, page_content=' 3  \n \n8) Innovation, Research & Development and \n9) Next Generation Reforms \n8. Subsequent budgets will build on these, and add more priorities and \nactions. A more detailed formulation will be carried out as part  of the \n‘economic policy framework’ about which I will speak later in this speech.   \n9. This budget details some of the specific actions to be initiated in the \ncurrent year towards fulfilment of these priorities with potential for \ntransformative changes.  The  budget also covers some of the previously \nmade announcements with an intent to strengthen them and step up their \nimplementation for expediting our journey towards the goal of Viksit Bharat.   \nPriority 1: Productivity and resilience in Agriculture \nTransforming agriculture research \n10. Our government will undertake a comprehensive review of the \nagriculture research 