In [None]:
import openai
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI


In [None]:
from dotenv import load_dotenv
import os
# Load key/value pairs from a local .env file into the process environment.
load_dotenv()

# Fail fast with an actionable message when the key is absent. os.environ only
# accepts str values, so copying a missing key back into it would die with an
# opaque "str expected, not NoneType" TypeError instead of naming the problem.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in .env or export it before running this notebook."
    )


In [None]:
def read_pdf(directory):
    """Load the PDFs under ``directory`` with ``PyPDFDirectoryLoader``.

    Args:
        directory: Path handed straight to the loader's constructor.

    Returns:
        Whatever the loader's ``load()`` call produces.
    """
    return PyPDFDirectoryLoader(directory).load()

In [None]:
# Load everything under ./input; `doc` holds whatever read_pdf returns.
doc = read_pdf("./input")

In [None]:
# Sanity check: number of items the loader returned.
len(doc)

In [None]:
def chunk_data(docs, chunk_size = 100, chunk_overlap = 25):
    """Split ``docs`` with a ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``split_documents(docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

In [None]:
# Split the loaded pages into overlapping chunks using chunk_data's defaults.
documents = chunk_data(docs = doc)
# Show the chunk count and a short preview instead of echoing the whole list,
# which floods the cell output and bloats the saved notebook.
print(f"{len(documents)} chunks")
documents[:3]

In [None]:

# Embedding client built with the OpenAI key from the environment;
# os.environ[...] raises KeyError here if the key is missing.
embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])


In [None]:
# Smoke test: embed one sample query and display the length of the result.
vectors = embeddings.embed_query("How are you?")
len(vectors)

In [None]:
from pinecone import Pinecone, Index, ServerlessSpec
# from pinecone import Pinecone, ServerlessSpec
# Authenticated Pinecone client; the API key comes from the environment.
# NOTE(review): `environment` looks like a leftover from the pod-based client
# and is presumably ignored by the serverless client -- confirm before removing.
client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"), environment="us-east-1")

index_name = "samplevectordb"
# Connect to the created index through the authenticated client. Constructing
# `Index(...)` directly would put the index name in the constructor's first
# positional slot, which is the API key, so the handle could never authenticate.
# HOST is optional: without it the client resolves the host from the index name.
pinecone_host = os.getenv("HOST")
if pinecone_host:
    index_sample = client.Index(index_name, host=pinecone_host)
else:
    index_sample = client.Index(index_name)

In [None]:
from langchain.vectorstores import Pinecone as LangchainPinecone

# Embed and upsert the chunked `documents`, not the raw pages in `doc`:
# indexing whole pages would discard the chunking step and hand the QA chain
# page-sized context for every retrieved hit.
vector_store = LangchainPinecone.from_documents(documents, embeddings, index_name=index_name)

In [None]:
def get_data(query, k=2):
    """Run a similarity search for ``query`` against the notebook's ``vector_store``.

    Args:
        query: Text passed to ``vector_store.similarity_search``.
        k: Forwarded to the search as ``k`` (defaults to 2).

    Returns:
        Whatever ``similarity_search`` returns.
    """
    return vector_store.similarity_search(query, k=k)

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI

In [None]:
# Completion model that writes the final answer.
llm = OpenAI(
    model_name="gpt-3.5-turbo-instruct",
    temperature=0.5,
)
# Question-answering chain built on that model with the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")


In [None]:
def get_answers(query, k=2, verbose=True):
    """Answer ``query`` using the documents retrieved for it.

    Args:
        query: The question; used for both retrieval and the QA chain.
        k: Passed to ``get_data`` as ``k``. Defaults to 2, matching
            ``get_data``'s own default.
        verbose: When true (the default), print the retrieved documents
            before running the chain so the context can be inspected.

    Returns:
        The value returned by ``chain.run`` for the retrieved documents.
    """
    doc_search = get_data(query, k=k)
    if verbose:
        # Debug aid: show what the chain is about to receive as context.
        print(doc_search)
    return chain.run(input_documents=doc_search, question=query)

In [None]:
# Example query 1: run the retrieval + QA pipeline and print the answer.
input_query = "Where is election commission of India located?"
answer =  get_answers(input_query)
print(answer)

In [None]:
# Example query 2: same pipeline with a different question
# (rebinds `input_query` and `answer` from the previous cell).
input_query = "How many MP seats are present?"
answer =  get_answers(input_query)
print(answer)