In [36]:
import openai, langchain, pinecone, os
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain_pinecone import PineconeVectorStore
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI

from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# read document
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

In [6]:
doc = read_doc(".")
len(doc)

267

In [7]:
# divide the docs into chunks

def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return doc

In [8]:
document = chunk_data(docs=doc)
len(document)

2205

In [9]:
# embedding with openai
embedding = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])

In [11]:
vectors = embedding.embed_query("How are you")
len(vectors)

1536

In [19]:
# vector search DB  in Pinecone

index_name = "langchainvector"

In [32]:
# index = Pinecone.from_documents(doc, embedding=embedding, index_name=index_name)
# Pinecone.from_documents(doc, embedding=embedding, index=index_name)
index = PineconeVectorStore.from_documents(doc[:100], embedding=embedding, index_name=index_name)

In [35]:
# cosine similarity retreive results from vectorDB
def retrieve_query(query, k=2):
    matching_results = index.similarity_search(query, k=k)
    return matching_results

In [38]:
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.5)
chain = load_qa_chain(llm, chain_type="stuff")

In [39]:
# search answers from VectorDB
def retrieve_answers(query):
    doc_search = retrieve_query(query)
    print(doc_search)
    response = chain.run(input_documents=doc_search, question=query)
    return response

In [44]:
our_query = "who is chairman"
answer = retrieve_answers(our_query)
print(answer)

[Document(page_content='Committees\n  Audit Committee\n  Stakeholders’ Relationship Committee\n  Corporate Social Responsibility and \nGovernance Committee\n  Human Resources, Nomination and \nRemuneration Committee\n  Finance Committee\n  Environmental, Social and \nGovernance Committee\n  Risk Management Committee\n  Chairman        MemberOur LeadershipBOARD OF DIRECTORS\nThe face of India’s enterprising spirit, he led the creation \nof the world’s largest petroleum refinery, one of the \nmost expansive 4G and 5G networks and India’s largest \nretail footprintFormer Central Vigilance Commissioner, and \nFormer\xa0Chairman CBDT\nFormer Chairman of McKinsey & Company, India; \nChairman of the Capability Building Commission of \nIndia and Chairman of Quality Council of India; serves \non the\xa0Boards of various Reliance Group of Companies, \nLarsen\xa0& Toubro and CiplaLed the commissioning and start-up of the Jamnagar \ncomplex; spearheaded various large scale projects in a \ncareer s