In [83]:
!pip install langchain
!pip install pypdf
!pip install openai
!pip install pinecone-client[grpc]
!pip install tiktoken #utility for embeddings class in OpenAI

Collecting grpc-gateway-protoc-gen-openapiv2==0.1.0 (from pinecone-client[grpc])
  Downloading grpc_gateway_protoc_gen_openapiv2-0.1.0-py3-none-any.whl (12 kB)
Collecting lz4>=3.1.3 (from pinecone-client[grpc])
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lz4, grpc-gateway-protoc-gen-openapiv2
Successfully installed grpc-gateway-protoc-gen-openapiv2-0.1.0 lz4-4.3.3


In [51]:
pip install langchain_community langchain_pinecone

Collecting langchain_pinecone
  Downloading langchain_pinecone-0.1.1-py3-none-any.whl (8.4 kB)
Collecting pinecone-client<4.0.0,>=3.2.2 (from langchain_pinecone)
  Downloading pinecone_client-3.2.2-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.9/215.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pinecone-client, langchain_pinecone
  Attempting uninstall: pinecone-client
    Found existing installation: pinecone-client 4.1.1
    Uninstalling pinecone-client-4.1.1:
      Successfully uninstalled pinecone-client-4.1.1
Successfully installed langchain_pinecone-0.1.1 pinecone-client-3.2.2


In [84]:
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone #store and query embeddings
from langchain_pinecone import PineconeVectorStore
from pinecone.grpc import PineconeGRPC
from langchain.chains import RetrievalQA
'''RetrievalQA is a chain that performs retrieval-based question answering. It combines retrieval (finding relevant documents) with
a QA model to answer questions based on those documents.'''
from langchain.prompts import PromptTemplate
from langchain.embeddings import OpenAIEmbeddings
#OpenAIEmbeddings generates embeddings for text using OpenAI's models. These embeddings can then be stored in a vector database like Pinecone for later retrieval.
from langchain.document_loaders import PyPDFDirectoryLoader #loads PDFs from a directory
from langchain.text_splitter import RecursiveCharacterTextSplitter
'''RecursiveCharacterTextSplitter splits text into smaller chunks based on character count, which is useful for processing and
embedding long documents that exceed the model's input size limitations.'''
import os
import pinecone

In [4]:
!mkdir pdfs

In [5]:
loader = PyPDFDirectoryLoader('pdfs')

In [6]:
data = loader.load()

In [7]:
data

[Document(page_content='Citation: Tao, L.; Xie, Z.; Xu, D.; Ma,\nK.; Qiu, Q.; Pan, S.; Huang, B.\nGeographic Named Entity\nRecognition by Employing Natural\nLanguage Processing and an\nImproved BERT Model. ISPRS Int. J.\nGeo‑Inf. 2022 ,11, 598. https://doi.org/\n10.3390/ijgi11120598\nAcademic Editors: Maria Antonia\nBrovelli and Wolfgang Kainz\nReceived: 15 September 2022\nAccepted: 24 November 2022\nPublished: 28 November 2022\nPublisher’s Note: MDPI stays neutral\nwith regard to jurisdictional claims in\npublished maps and institutional affil‑\niations.\nCopyright: © 2022 by the authors.\nLicensee MDPI, Basel, Switzerland.\nThis article is an open access article\ndistributed under the terms and\nconditions of the Creative Commons\nAttribution (CC BY) license ( https://\ncreativecommons.org/licenses/by/\n4.0/).\n International Journal of\nGeo-Information\nArticle\nGeographic Named Entity Recognition by Employing Natural\nLanguage Processing and an Improved BERT Model\nLiufeng Tao1,2, 

In [8]:
data[0]

Document(page_content='Citation: Tao, L.; Xie, Z.; Xu, D.; Ma,\nK.; Qiu, Q.; Pan, S.; Huang, B.\nGeographic Named Entity\nRecognition by Employing Natural\nLanguage Processing and an\nImproved BERT Model. ISPRS Int. J.\nGeo‑Inf. 2022 ,11, 598. https://doi.org/\n10.3390/ijgi11120598\nAcademic Editors: Maria Antonia\nBrovelli and Wolfgang Kainz\nReceived: 15 September 2022\nAccepted: 24 November 2022\nPublished: 28 November 2022\nPublisher’s Note: MDPI stays neutral\nwith regard to jurisdictional claims in\npublished maps and institutional affil‑\niations.\nCopyright: © 2022 by the authors.\nLicensee MDPI, Basel, Switzerland.\nThis article is an open access article\ndistributed under the terms and\nconditions of the Creative Commons\nAttribution (CC BY) license ( https://\ncreativecommons.org/licenses/by/\n4.0/).\n International Journal of\nGeo-Information\nArticle\nGeographic Named Entity Recognition by Employing Natural\nLanguage Processing and an Improved BERT Model\nLiufeng Tao1,2, Z

In [9]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)

In [10]:
text_chunks = text_splitter.split_documents(data)

In [11]:
text_chunks

[Document(page_content='Citation: Tao, L.; Xie, Z.; Xu, D.; Ma,\nK.; Qiu, Q.; Pan, S.; Huang, B.\nGeographic Named Entity\nRecognition by Employing Natural\nLanguage Processing and an\nImproved BERT Model. ISPRS Int. J.\nGeo‑Inf. 2022 ,11, 598. https://doi.org/\n10.3390/ijgi11120598\nAcademic Editors: Maria Antonia\nBrovelli and Wolfgang Kainz\nReceived: 15 September 2022\nAccepted: 24 November 2022\nPublished: 28 November 2022\nPublisher’s Note: MDPI stays neutral\nwith regard to jurisdictional claims in', metadata={'source': 'pdfs/107. NER.pdf', 'page': 0}),
 Document(page_content='published maps and institutional affil‑\niations.\nCopyright: © 2022 by the authors.\nLicensee MDPI, Basel, Switzerland.\nThis article is an open access article\ndistributed under the terms and\nconditions of the Creative Commons\nAttribution (CC BY) license ( https://\ncreativecommons.org/licenses/by/\n4.0/).\n International Journal of\nGeo-Information\nArticle\nGeographic Named Entity Recognition by Empl

In [12]:
len(text_chunks)

214

In [13]:
text_chunks[0]

Document(page_content='Citation: Tao, L.; Xie, Z.; Xu, D.; Ma,\nK.; Qiu, Q.; Pan, S.; Huang, B.\nGeographic Named Entity\nRecognition by Employing Natural\nLanguage Processing and an\nImproved BERT Model. ISPRS Int. J.\nGeo‑Inf. 2022 ,11, 598. https://doi.org/\n10.3390/ijgi11120598\nAcademic Editors: Maria Antonia\nBrovelli and Wolfgang Kainz\nReceived: 15 September 2022\nAccepted: 24 November 2022\nPublished: 28 November 2022\nPublisher’s Note: MDPI stays neutral\nwith regard to jurisdictional claims in', metadata={'source': 'pdfs/107. NER.pdf', 'page': 0})

In [14]:
text_chunks[10]

Document(page_content='ISPRS Int. J. Geo‑Inf. 2022 ,11, 598 2 of 22\nfirst subprocess of our approach is to identify the location of the mentioned contents; this\nsubprocess is called entity recognition (NER) in NLP [ 10–13].\nThere are single‑word place names, such as Beijing, Shanghai, Zhejiang, etc. There are\nalso long place names composed of multiple words, such as Ejin Jinqi Saihantaolai Sumu\nTownship (Inner Mongolia Autonomous Region); however, most of the place names are', metadata={'source': 'pdfs/107. NER.pdf', 'page': 1})

In [54]:
print(text_chunks[100].page_content)

text, such as news articles, and many vernacular words are not covered by those embed‑
dings. When that happens, an embedding for a generic unknown token is usually used
to represent this vernacular word and, as a result, the actual semantics of the word are
lost. Second, compared with the basic BiLSTM–CRF model, our presented model adds an
ALBERT layer to capture the dynamic and contextualized semantics of words.
4.3. BiLSTM Layer


In [16]:
os.environ["OPENAI_API_KEY"] = "sk-proj-I6QtQXxx4IvoTOh3QsyfT3BlbkFJf1Qve6gorT3IcMkMfpYK"

In [17]:
embeddings = OpenAIEmbeddings()

  warn_deprecated(


In [18]:
embeddings.embed_query("Hello! How are you?")

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [64]:
pinecone_api_key = os.environ.get('PINECONE_API_KEY', '5965fff2-b55f-4f7c-80ad-2dea835b8edd')
pinecone_api_env = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

In [99]:
os.environ["PINECONE_API_KEY"] = '5965fff2-b55f-4f7c-80ad-2dea835b8edd'

In [65]:
pinecone.init(api_key = pinecone_api_key, environment = pinecone_api_env)

AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )



In [100]:
pc = PineconeGRPC(
        api_key = pinecone_api_key
    )

In [101]:
index_name = "testing" # put in the name of your pinecone index here

In [102]:
pc.list_indexes().names()

['testing']

In [103]:
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        dimension = 1536,
        metric = "cosine",
        spec = ServerlessSpec(
            cloud = 'aws',
            region = 'us-east-1'
        )
    )

In [104]:
pc.list_indexes().names()

['testing']

In [105]:
index = pc.Index(index_name) #Accessing pinecone index

In [106]:
namespace = "wondervector5000"

docsearch = PineconeVectorStore.from_documents(
    documents = text_chunks,
    index_name = index_name,
    embedding = embeddings,
    namespace = namespace
)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [107]:
query = "YOLOv7 outperforms which models"

In [None]:
docs = docsearch.similarity_search(query)

In [None]:
docs

#We get numbers
'''
[Document(page_content='YOLOv7-tiny 6.2 3.5 320 30.8% 47.3% 32.2% 10.0% 31.9% 52.2%\nimprovement -39% -49% - = = = -0.9 = +0.7\nYOLOR-E6 [81] 115.8M 683.2G 1280 55.7% 73.2% 60.7% 40.1% 60.4% 69.2%\nYOLOv7-E6 97.2M 515.2G 1280 55.9% 73.5% 61.1% 40.6% 60.3% 70.0%\nimprovement -19% -33% - +0.2 +0.3 +0.4 +0.5 -0.1 +0.8\nYOLOR-D6 [81] 151.7M 935.6G 1280 56.1% 73.9% 61.2% 42.4% 60.5% 69.9%\nYOLOv7-D6 154.7M 806.8G 1280 56.3% 73.8% 61.4% 41.3% 60.6% 70.1%\nYOLOv7-E6E 151.7M 843.2G 1280 56.8% 74.4% 62.1% 40.8% 62.1% 70.6%'),
 Document(page_content='YOLOv5-L6 (r6.1) [23] 76.8M 445.6G 1280 63 - / 53.7% - -\nYOLOX-X [21] 99.1M 281.9G 640 58 51.5% / 51.1% - -\nYOLOv7-E6 97.2M 515.2G 1280 56 56.0% /55.9% 73.5% 61.2%\nYOLOR-E6 [81] 115.8M 683.2G 1280 45 55.8% / 55.7% 73.4% 61.1%\nPPYOLOE-X [85] 98.4M 206.6G 640 45 52.2% / 51.9% 69.9% 56.5%\nYOLOv7-D6 154.7M 806.8G 1280 44 56.6% /56.3% 74.0% 61.8%\nYOLOv5-X6 (r6.1) [23] 140.7M 839.2G 1280 38 - / 55.0% - -\nYOLOv7-E6E 151.7M 843.2G 1280 36 56.8% /56.8% 74.4% 62.1%'),
 Document(page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan\nkinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\nAbstract\nYOLOv7 surpasses all known object detectors in both\nspeed and accuracy in the range from 5 FPS to 160 FPS\nand has the highest accuracy 56.8% AP among all known'),
 Document(page_content='YOLOv5-X (r6.1) [23] 86.7M 205.7G 640 83 - / 50.7% - - - - -\nYOLOR-CSP [81] 52.9M 120.4G 640 106 51.1% / 50.8% 69.6% 55.7% 31.7% 55.3% 64.7%\nYOLOR-CSP-X [81] 96.9M 226.8G 640 87 53.0% / 52.7% 71.4% 57.9% 33.7% 57.1% 66.8%\nYOLOv7-tiny-SiLU 6.2M 13.8G 640 286 38.7% / 38.7% 56.7% 41.7% 18.8% 42.4% 51.9%\nYOLOv7 36.9M 104.7G 640 161 51.4% / 51.2% 69.7% 55.9% 31.8% 55.5% 65.0%\nYOLOv7-X 71.3M 189.9G 640 114 53.1% / 52.9% 71.2% 57.8% 33.8% 57.1% 67.4%')]
 '''

In [109]:
llm = OpenAI()

  warn_deprecated(


In [None]:
qa = RetrievalQA.from_chain_type(llm = llm, chain_type = "stuff", retriever = docsearch.as_retriever())

In [None]:
qa.run(query)
#' YOLOv7 outperforms YOLOv5-L6, YOLOX-X, YOLOR-E6, PPYOLOE-X, YOLOv5-X6, and YOLOR-CSP.'

In [None]:
import sys
while True:
  user_input = input(f"Input Prompt: ")
  if user_input == 'exit':
    print('Exiting')
    sys.exit()
  if user_input == '':
    continue
  result = qa({'query': user_input})
  print(f"Answer: {result['result']}")

'''
Input Prompt:  what is a yolo?
Answer:  YOLO (You Only Look Once) is a type of object detector, specifically a deep learning algorithm used for object detection in images and videos. It was developed by Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao from the Institute of Information Science at Academia Sinica in Taiwan. YOLOv7 is the latest version of the algorithm, which has set a new state-of-the-art for real-time object detectors in terms of speed and accuracy.
Input Prompt:  who is invented the yolo?
Answer:  Joseph Redmon and Ali Farhadi
Input Prompt:  what was the accuracy of the yolov7?
Answer:  The accuracy of the YOLOv7 was 56.8% AP test-dev / 56.8% AP min-val.
Input Prompt:  exit
Exiting
'''