In [98]:
# import Libraries

import openai
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.llms import openai

In [113]:
from dotenv import load_dotenv
load_dotenv()

True

In [114]:
import os

In [115]:
## Lets Read the document
def read_doc(directory):
    file_loader=PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents

In [116]:
doc=read_doc('documents/')
len(doc)

58

In [117]:
## Divide the docs into chunks
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return doc

In [118]:
documents=chunk_data(docs=doc)
len(documents)

141

In [119]:
#Embedding Technique of OPENAI
embeddings=OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000023B5E7E5F60>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000023B5E7DAE60>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [120]:
vectors=embeddings.embed_query("How are you?")
len(vectors)

1536

In [121]:

import getpass
from pinecone import Pinecone, ServerlessSpec

if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("3f7c9918-65fe-4c5d-9bfc-c5f182325424")

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)
index_name="langchainvector"

In [122]:
import time

index_name = "langchainvector"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [127]:
import getpass
os.environ["OPENAI_API_KEY"] = getpass.getpass()
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
print(embeddings)

client=<openai.resources.embeddings.Embeddings object at 0x0000023B5C385000> async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000023B62FCB4C0> model='text-embedding-ada-002' dimensions=None deployment='text-embedding-ada-002' openai_api_version='' openai_api_base=None openai_api_type='' openai_proxy='' embedding_ctx_length=8191 openai_api_key=SecretStr('**********') openai_organization=None allowed_special=None disallowed_special=None chunk_size=1000 max_retries=2 request_timeout=None headers=None tiktoken_enabled=True tiktoken_model_name=None show_progress_bar=False model_kwargs={} skip_empty=False default_headers=None default_query=None retry_min_seconds=4 retry_max_seconds=20 http_client=None http_async_client=None check_embedding_ctx_length=True


In [128]:
from langchain_pinecone import PineconeVectorStore
print(index_name)
index = PineconeVectorStore.from_documents(doc,embeddings,index_name=index_name)

langchainvector


In [130]:
## Cosine Similarity Retrieve Results from Vector DB

def retrieve_query(query,k=2):
    matching_results=index.similarity_search(query,k=k)
    return matching_results


In [133]:
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI

In [144]:
llm=OpenAI(model_name="davinci-002",temperature=0.5)
chain=load_qa_chain(llm,chain_type="stuff")

In [145]:
## Search answers from VectorDB
def retrieve_answers(query):
    doc_search=retrieve_query(query)
    print(doc_search)
    response=chain.run(input_documents=doc_search,question=query)
    return response

In [150]:
our_query="Give me the top highlights of the budget?"
answer = retrieve_answers(our_query)
print(answer)

[Document(metadata={'page': 2.0, 'source': 'documents\\budget_speech.pdf'}, page_content='CONTENTS \nPART-A \n Page No.  \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal  – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \niii. Infrastructure and Investment \niv. Unleashing the Potential \nv. Green Growth \nvi. Youth Power  \nvii. Financial Sector  \n \n \n \n \n \n \n \n \n\uf0b7 Fiscal Management 24 \nPART B  \n  \nIndirect Taxes  27 \n\uf0b7 Green Mobility  \n\uf0b7 Electronics   \n\uf0b7 Electrical   \n\uf0b7 Chemicals and Petrochemicals   \n\uf0b7 Marine products  \n\uf0b7 Lab Grown Diamonds  \n\uf0b7 Precious Metals  \n\uf0b7 Metals  \n\uf0b7 Compounded Rubber  \n\uf0b7 Cigarettes  \n  \nDirect Taxes  30 \n\uf0b7 MSMEs and Professionals   \n\uf0b7 Cooperation  \n\uf0b7 Start-Ups  \n\uf0b7 Appeals  \n\uf0b7 Better targeting of tax conce

In [148]:
our_query="whose speech it was?"
answer = retrieve_answers(our_query)
print(answer)

[Document(metadata={'page': 0.0, 'source': 'documents\\budget_speech.pdf'}, page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023'), Document(metadata={'page': 4.0, 'source': 'documents\\budget_speech.pdf'}, page_content='Budget 2023-2024 \n \nSpeech of  \nNirmala Sitharaman \nMinister of Finance \nFebruary 1, 2023 \nHon’ble Speaker,  \n I present the Budget for 2023-24. This is the first Budget in Amrit \nKaal . \nIntroduction \n1. This Budget hopes to build on the foundation laid in the previous \nBudget, and the blueprint drawn for India@100. We envision a prosperous \nand inclusive India, in which the fruits of development reach all regions and \ncitizens, especially our youth, women, farmers, OBCs, Scheduled Castes and \nScheduled Tribes.  \n2. In the 75th year of our Independence, the world has recognised the \nIndian economy as a ‘bright star’. Our current year’s economic growth is \nestimated to be at 7 per c

In [149]:
our_query="what is the name of lpg scheme?"
answer = retrieve_answers(our_query)
print(answer)

[Document(metadata={'page': 21.0, 'source': 'documents\\budget_speech.pdf'}, page_content='18 \n \n \n Viability Gap Funding. A detailed framework for Pumped Storage Projects \nwill also be formulated.  \nRenewable Energy Evacuation \n80. The Inter-state transmission system for evacuation and grid \nintegration of 13 GW renewable energy from Ladakh will be constructed \nwith investment of ` 20,700 crore including central support of ` 8,300 crore. \nGreen Credit Programme \n81. For encouraging behavioural change, a Green Credit Programme will \nbe notified under the Environment (Protection) Act. This will incentivize \nenvironmentally sustainable and responsive actions by companies, \nindividuals and local bodies, and help mobilize additional resources for such \nactivities.  \nPM-PRANAM \n82. “PM Programme for Restoration, Awareness, Nourishment and \nAmelioration of Mother Earth” will be launched to incentivize States and \nUnion Territories to promote alternative fertilizers and bala