In [18]:
import openai
import pinecone
from langchain import HuggingFaceHub,LLMChain, PromptTemplate
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import os

### Let's read the document


In [4]:
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

In [5]:
doc = read_doc('documents/')
doc

[Document(page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023', metadata={'source': 'documents\\budget_speech.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': 'documents\\budget_speech.pdf', 'page': 1}),
 Document(page_content='CONTENTS \nPART-A \n Page No.  \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal  – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \niii. Infrastructure and Investment \niv. Unleashing the Potential \nv. Green Growth \nvi. Youth Power  \nvii. Financial Sector  \n \n \n \n \n \n \n \n \n\uf0b7 Fiscal Management 24 \nPART B  \n  \nIndirect Taxes  27 \n\uf0b7 Green Mobility  \n\uf0b7 Electronics   \n\uf0b7 Electrical   \n\uf0b7 Chemicals and Petrochemicals   \n\uf0b7 Marine products  \n\uf0b7 Lab Grown Diamonds  \n\uf0b7 Precious Met

### Divide the document into chunks

In [6]:
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return docs

In [7]:
documents=chunk_data(docs=doc)
documents

[Document(page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023', metadata={'source': 'documents\\budget_speech.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': 'documents\\budget_speech.pdf', 'page': 1}),
 Document(page_content='CONTENTS \nPART-A \n Page No.  \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal  – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \niii. Infrastructure and Investment \niv. Unleashing the Potential \nv. Green Growth \nvi. Youth Power  \nvii. Financial Sector  \n \n \n \n \n \n \n \n \n\uf0b7 Fiscal Management 24 \nPART B  \n  \nIndirect Taxes  27 \n\uf0b7 Green Mobility  \n\uf0b7 Electronics   \n\uf0b7 Electrical   \n\uf0b7 Chemicals and Petrochemicals   \n\uf0b7 Marine products  \n\uf0b7 Lab Grown Diamonds  \n\uf0b7 Precious Met

In [8]:
# embeddings = OpenAIEmbeddings(api_key=os.environ['HUGGINGFACEHUB_API_TOKEN'])

In [9]:
from langchain_community.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

In [10]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
vectors=embeddings.embed_query('How are you?')

In [12]:
vectors

[0.02710612490773201,
 0.011331792920827866,
 -0.0019524091621860862,
 -0.03695135936141014,
 0.017764905467629433,
 0.0009032208472490311,
 -0.03386485204100609,
 0.013378385454416275,
 0.01773053966462612,
 -0.013246694579720497,
 -0.04028133302927017,
 -0.015285246074199677,
 -0.012560301460325718,
 0.015230968594551086,
 0.015512327663600445,
 -0.05751274526119232,
 -0.017129523679614067,
 -0.06184074655175209,
 -0.018761251121759415,
 -0.007223888300359249,
 -0.049612533301115036,
 0.011142098344862461,
 2.9746854124823585e-05,
 -0.009051471017301083,
 0.05342879891395569,
 0.010582635179162025,
 0.03314786031842232,
 -0.004505310673266649,
 -0.006172284483909607,
 0.06208071485161781,
 -0.027285275980830193,
 0.02982654608786106,
 0.024574732407927513,
 -0.021511835977435112,
 1.6617831306575681e-06,
 0.047963518649339676,
 -0.02351677045226097,
 -0.054577283561229706,
 0.07369424402713776,
 -0.036343879997730255,
 0.023755019530653954,
 -0.06763102859258652,
 0.00746326521039009

In [13]:
len(vectors)

768

### Vector Search DB in Pinecone

In [14]:
pinecone.init(
    api_key="261bd569-a034-4f2c-be2d-0c474db330b6",
    environment="gcp-starter"
    )
index_name='langchainvector'

In [15]:
index=Pinecone.from_documents(doc,embeddings,index_name=index_name)

In [16]:
## Cosine Similarity Retreive Results from VectorDB
def retrieve_query(query,k=2):
    matching_results=index.similarity_search(query,k=k)
    return matching_results

In [17]:
from langchain.chains.question_answering import load_qa_chain

In [19]:
Hugging_Face_Hub= HuggingFaceHub(repo_id='google/flan-t5-large', model_kwargs={'temperature': 0})

  warn_deprecated(


In [20]:
chain = load_qa_chain(Hugging_Face_Hub,chain_type='stuff')

### Search answers from VectorDB

In [21]:
def retrieve_answers(query):
    doc_search=retrieve_query(query)
    print(doc_search)
    response=chain.run(input_documents=doc_search,question=query)
    return response

In [22]:
our_query = "How much the agriculture target will be increased by how many crore?"
answer = retrieve_answers(our_query)
print(answer)

[Document(page_content='5 \n \n \n 4) Green Growth: We are implementing many programmes for green \nfuel, green energy, green farming, green mobility, green buildings, \nand green equipment, and policies for efficient use of energy across \nvarious economic sectors. These green growth efforts help in \nreducing carbon intensity of the economy and provides for large-\nscale green job opportunities.  \nPriorities of this Budget  \n14. The Budget adopts the following seven priorities. They complement \neach other and act as the ‘Saptarishi’  guiding us through the Amrit Kaal.  \n1) Inclusive Development  \n2) Reaching the Last Mile \n3) Infrastructure and Investment \n4) Unleashing the Potential \n5) Green Growth \n6) Youth Power  \n7) Financial Sector \nPriority 1: Inclusive Development  \n15. The Government’s philosophy of Sabka Saath Sabka Vikas  has \nfacilitated inclusive development covering in specific, farmers, women, \nyouth, OBCs, Scheduled Castes, Scheduled Tribes, divyangjan a

  warn_deprecated(


 2,200 crore


In [None]:

output = Hugging_Face_Hub.predict('Can you tell me where is Hyderabad')
print(output)

In [None]:
prompt = PromptTemplate(
    input_variables = ['product'],
    template = 'what is the capital of {product} '
)

In [None]:

chain = LLMChain(llm = Hugging_Face_Hub, prompt=prompt)