In [1]:
!pip install --upgrade --quiet langchain langchain-openai langchain-pinecone pypdf

In [2]:
import os
from google.colab import userdata
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

Set the following environment variables to make using the Pinecone integration easier:

* PINECONE_API_KEY: Your Pinecone API key.
* PINECONE_INDEX_NAME: The name of the index you want to use.


In [4]:
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
os.environ['PINECONE_API_KEY'] = userdata.get('pinecone')
# index name for pine cone
os.environ['environment'] = "gcp-starter"

In [5]:
loader = PyPDFLoader("/content/yoloWorld.pdf")
pages = loader.load()

In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=1000,
    chunk_overlap=800,
    length_function=len,
    is_separator_regex=False,
)

docs = text_splitter.split_documents(pages)

In [7]:
embeddings = OpenAIEmbeddings()

In [8]:
embedding_size = embeddings.embed_query("Hi")
print(len(embedding_size))

1536


# Storing data in DB.

In [9]:
index_name = "test-index-for-op-stack"

index = PineconeVectorStore.from_documents(docs, embeddings, index_name=index_name)

# Similarity Search

In [10]:
query = "What is yolo-world?"
docs = index.similarity_search(query)
print(docs)

[Document(page_content='YOLO-World as an open-vocabulary detector', metadata={'page': 2.0, 'source': '/content/yoloWorld.pdf'}), Document(page_content='YOLO-World as an open-vocabulary detector', metadata={'page': 2.0, 'source': '/content/yoloWorld.pdf'}), Document(page_content='YOLO-World: Real-Time Open-Vocabulary Object Detection', metadata={'page': 0.0, 'source': '/content/yoloWorld.pdf'}), Document(page_content='YOLO-World: Real-Time Open-Vocabulary Object Detection', metadata={'page': 0.0, 'source': '/content/yoloWorld.pdf'})]


In [11]:
print(docs[0].page_content)

YOLO-World as an open-vocabulary detector


In [12]:
query = "What is yolo-world?"
docs = index.similarity_search(query, k = 3)
print(docs)

[Document(page_content='YOLO-World as an open-vocabulary detector', metadata={'page': 2.0, 'source': '/content/yoloWorld.pdf'}), Document(page_content='YOLO-World as an open-vocabulary detector', metadata={'page': 2.0, 'source': '/content/yoloWorld.pdf'}), Document(page_content='YOLO-World: Real-Time Open-Vocabulary Object Detection', metadata={'page': 0.0, 'source': '/content/yoloWorld.pdf'})]


# Add data to existing embeddings.

In [13]:
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

vectorstore.add_texts(["My name is Shubham!"])

['d90f880c-6d79-4a74-bbb9-ec3e9aec0e09']

# Retriever

**MMR**

In [14]:
retriever = index.as_retriever(search_type="mmr")

In [28]:
matched_docs = retriever.get_relevant_documents(query)
for i, d in enumerate(matched_docs):
    print(f"\n## Document {i}\n")
    print(d.page_content)


## Document 0

YOLO-World as an open-vocabulary detector

## Document 1

mance of YOLO-World-S; (2) using an excessive amount

## Document 2

ically, YOLO-World follows the standard YOLO archi-

## Document 3

of YOLO-World are evaluated for both the


**max_marginal_relevance_search**

In [16]:
# retriever = index.max_marginal_relevance_search(query, k=2, fetch_k=10)

# for i, doc in enumerate(retriever):
#     print(f"{i + 1}.", doc.page_content, "\n")

# Load QA Chain

In [17]:
llm = OpenAI(temperature=0)

chain = load_qa_chain(llm,
                      chain_type='stuff')

In [18]:
query = 'What is yolo world?'
similarity_docs = index.similarity_search(query)
response = chain.run(question = query, input_documents = similarity_docs)

  warn_deprecated(


In [19]:
print(response)

 YOLO-World is a real-time open-vocabulary object detection system.


# Retriever QA Chain

In [41]:
# query = 'What is YOLO-World ?'
query = 'What is yolo world ?'
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

In [42]:
result = qa_chain.invoke({'query':query})

In [43]:
result

{'query': 'What is yolo world ?',
 'result': " I don't know.",
 'source_documents': [Document(page_content='YOLO-World as an open-vocabulary detector', metadata={'page': 2.0, 'source': '/content/yoloWorld.pdf'}),
  Document(page_content='mance of YOLO-World-S; (2) using an excessive amount', metadata={'page': 13.0, 'source': '/content/yoloWorld.pdf'}),
  Document(page_content='ically, YOLO-World follows the standard YOLO archi-', metadata={'page': 1.0, 'source': '/content/yoloWorld.pdf'}),
  Document(page_content='of YOLO-World are evaluated for both the', metadata={'page': 6.0, 'source': '/content/yoloWorld.pdf'})]}

In [44]:
result['result']

" I don't know."