In [16]:
import openai
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
import google.generativeai as genai


In [17]:
from dotenv import load_dotenv
load_dotenv()

True

In [6]:
import os 

In [18]:
def read_doc(directory):
    file_loader= PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents

In [19]:
doc=read_doc('documents/')
len(doc)

58

In [20]:
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return docs

In [None]:
documents=chunk_data(doc)
documents

In [22]:

def get_google_embeddings(query, api_key):
    genai.configure(api_key=api_key)
    result = genai.embed_content(
        model="models/text-embedding-004",
        content=query
    )
    return result['embedding']

In [23]:
from pinecone import Pinecone,ServerlessSpec
# Create a new index (only needs to be done once, unless the index is deleted)
pc=Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = os.getenv("PINECONE_INDEX_NAME")

if index_name not in pc.list_indexes().names():
    pc.create_index(        
        name=index_name,
        dimension=768,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws',region='us-east-1'),
        )


In [24]:
index=pc.Index(index_name)
index

<pinecone.data.index.Index at 0x1a8d96f4590>

In [25]:
document_contents = [doc.page_content for doc in documents]

embeddings = []
for doc_content in document_contents:
    try:
        # Ensure we are passing the content and get embeddings
        embedding = get_google_embeddings(doc_content, os.getenv("GOOGLE_API_KEY"))
        embeddings.append(embedding)
    except Exception as e:
        print(f"Error generating embeddings for document: {doc_content}. Error: {e}")
        pass  # Optionally append None for failed embeddings

Error generating embeddings for document: . Error: Invalid input: 'content' argument must not be empty. Please provide a non-empty value.
Error generating embeddings for document: . Error: Invalid input: 'content' argument must not be empty. Please provide a non-empty value.


In [26]:
vectors = [
    {
        "id": f"doc{i}",
        "values": embedding,  # The embedding vector
        "metadata": {"page_content": doc.page_content}  # Add metadata
    }
    for i, (embedding, doc) in enumerate(zip(embeddings, documents))
]

# Upsert vectors into Pinecone
index.upsert(vectors=vectors)


{'upserted_count': 56}

In [27]:
from langchain.schema import Document

In [28]:
def retrive_query(query, k=2):
    # First, generate the query vector using your embedding model (e.g., Google or OpenAI)
    query_vector = get_google_embeddings(query,os.getenv("GOOGLE_API_KEY"))
    
    # Perform similarity search in Pinecone using the query vector
    results = index.query(vector=query_vector, top_k=k, include_metadata=True,include_values=True)
    documents = [
        Document(
            page_content=match.get("metadata", {}).get("page_content", ""),  # Safely get metadata
            metadata=match.get("metadata", {})  # Default to empty dict if metadata is missing
        )
        for match in results["matches"]
    ]

    return documents

In [29]:
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI


In [None]:
! pip install langchain_groq

In [30]:
import getpass
import os

if not os.environ.get("GROQ_API_KEY"):
  os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

from langchain_groq import ChatGroq

llm = ChatGroq(model="llama3-8b-8192")



In [31]:
chain=load_qa_chain(llm,chain_type="stuff")

In [32]:
def retrive_answer(query):
    doc_search=retrive_query(query)
    # print(doc_search)
    response=chain.run(input_documents=doc_search,question=query)
    if not doc_search:
        return "No relevant documents found."
    return response

In [35]:
q="How is the agriculture doing"
answer=retrive_answer(q)
print(answer)

[Document(metadata={'page_content': '5 \n \n \n \n4) Green Growth: We are implementing many programmes for green \nfuel, green energy, green farming, green mobility, green buildings, \nand green equipment, and policies for efficient use of energy across \nvarious economic sectors. These green growth efforts help in \nreducing carbon intensity of the economy and provides for large-\nscale green job opportunities.  \nPriorities of this Budget \n14. The Budget adopts the following seven priorities. They complement \neach other and act as the ‘Saptarishi’ guiding us through the Amrit Kaal. \n1) Inclusive Development  \n2) Reaching the Last Mile \n3) Infrastructure and Investment \n4) Unleashing the Potential \n5) Green Growth \n6) Youth Power  \n7) Financial Sector \nPriority 1: Inclusive Development  \n15. The Government’s philosophy of Sabka Saath Sabka Vikas  has \nfacilitated inclusive development covering in specific, farmers, women, \nyouth, OBCs, Scheduled Castes, Scheduled Tribes, 