In [48]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken



In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

In [106]:
# Provide the OpenAI API key without hard-coding it in the notebook.
# You need an OpenAI account to create a key; the account overview is at
# https://platform.openai.com/account/billing/overview
import os
from getpass import getpass

# Keep a key that is already exported in the environment; otherwise prompt for it.
# getpass does not echo the input, so the key never ends up in the saved notebook.
# A blank/whitespace-only value is treated as "not set".
if not os.environ.get("OPENAI_API_KEY", "").strip():
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ").strip()

In [107]:
# Mount Google Drive inside the Colab runtime and remember the Drive root folder.
from google.colab import drive

drive.mount(
    '/content/gdrive',
    force_remount=True,  # remount even if the drive is already mounted
)

# Root folder of the mounted drive.
root_dir = "/content/gdrive/My Drive/"

Mounted at /content/gdrive


In [108]:
# Path of the PDF to index (on the mounted Google Drive), opened with PyPDF2.
pdf_path = '/content/gdrive/My Drive/Data/po_order_2.pdf'
reader = PdfReader(pdf_path)

In [109]:
# Show the PdfReader object's repr (sanity check that `reader` is bound).
reader

<PyPDF2._reader.PdfReader at 0x7b0c18225f60>

In [110]:
# Read every page of the PDF and collect the extracted text in raw_text.
# Pages for which extract_text() returns an empty value are skipped.
# Pages are joined with a newline so the last word of one page is not fused
# with the first word of the next; for a single-page PDF the result is the
# same as plain concatenation.
page_texts = (page.extract_text() for page in reader.pages)
raw_text = '\n'.join(text for text in page_texts if text)

In [111]:
# Inspect the full extracted text.
raw_text

'VAOLABSPOBox12345HappyRoad,VA454545\nPurchase\nOrder\nDate\nP.O.\nNo.\n08/08/2022778899\nShip\nTo\nTrueMan321CedarAvenueBelleview,IL78901\nItem\nDescription\nQty\nRate\nAmount\nHepatrox330Hepatrox20PLbags5010.00500.00\nTotal$500.00\nPhone\n#\nFax\n#\nE-mail\n(804)123-12345(804)123-1234john.doe@vao.world'

In [112]:
# Preview only the first 100 characters of the extracted text.
raw_text[:100]

'VAOLABSPOBox12345HappyRoad,VA454545\nPurchase\nOrder\nDate\nP.O.\nNo.\n08/08/2022778899\nShip\nTo\nTrueMan321'

In [113]:
# Break the extracted text into smaller chunks so that retrieval does not run
# into the model's token-size limits.
splitter_options = dict(
    separator="\n",       # split on line breaks
    chunk_size=1000,      # target chunk length, as measured by length_function
    chunk_overlap=200,    # overlap carried over between neighbouring chunks
    length_function=len,  # len() of a str, i.e. length in characters
)
text_splitter = CharacterTextSplitter(**splitter_options)
texts = text_splitter.split_text(raw_text)

In [114]:
# Display raw_text again (same value as shown earlier; split_text returns a new
# list and is not expected to modify its input).
raw_text

'VAOLABSPOBox12345HappyRoad,VA454545\nPurchase\nOrder\nDate\nP.O.\nNo.\n08/08/2022778899\nShip\nTo\nTrueMan321CedarAvenueBelleview,IL78901\nItem\nDescription\nQty\nRate\nAmount\nHepatrox330Hepatrox20PLbags5010.00500.00\nTotal$500.00\nPhone\n#\nFax\n#\nE-mail\n(804)123-12345(804)123-1234john.doe@vao.world'

In [115]:
# Number of chunks produced by the splitter.
len(texts)

1

In [116]:
# Inspect the first chunk.
texts[0]

'VAOLABSPOBox12345HappyRoad,VA454545\nPurchase\nOrder\nDate\nP.O.\nNo.\n08/08/2022778899\nShip\nTo\nTrueMan321CedarAvenueBelleview,IL78901\nItem\nDescription\nQty\nRate\nAmount\nHepatrox330Hepatrox20PLbags5010.00500.00\nTotal$500.00\nPhone\n#\nFax\n#\nE-mail\n(804)123-12345(804)123-1234john.doe@vao.world'

In [117]:
# Create the HuggingFace embedding model used to vectorise the text chunks
# (the model is downloaded on first use).
# HuggingFaceEmbeddings is imported here because no earlier cell imports it;
# without this import the cell raises NameError on a fresh kernel.
# NOTE(review): this class needs the `sentence-transformers` package installed.
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings()

In [118]:
# Embed the text chunks and build a FAISS vector index over them.
docsearch = FAISS.from_texts(texts=texts, embedding=embeddings)

In [119]:
# Show the FAISS vector store object's repr (sanity check that the index exists).
docsearch

<langchain.vectorstores.faiss.FAISS at 0x7b0c1a320250>

In [120]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [121]:
# Build the question-answering chain on top of an OpenAI completion model.
# temperature=0 makes the answers as deterministic as the API allows, which is
# what field extraction from a document needs and keeps re-runs comparable
# (the library default samples with a non-zero temperature).
llm = OpenAI(temperature=0)
# chain_type="stuff": the retrieved documents are placed into a single prompt.
chain = load_qa_chain(llm, chain_type="stuff")

In [122]:
# Ask for several purchase-order fields in a single question.
query = "what is the Ship To Address, Date, Material Description, Material Quantity?"
# Retrieve the chunks most similar to the question, then let the chain answer from them.
docs = docsearch.similarity_search(query=query)
chain.run(question=query, input_documents=docs)

' The Ship To Address is TrueMan321CedarAvenueBelleview,IL78901, the Date is 08/08/2022, the Material Description is Hepatrox330, and the Material Quantity is 20 PL bags.'

In [123]:
# Ask what a bare number from the document refers to.
query = "what is 50"
docs = docsearch.similarity_search(query=query)
# NOTE: despite its name, query_embedings holds the chain's answer text, not
# embeddings; the name is kept because other cells may refer to it.
query_embedings = chain.run(question=query, input_documents=docs)
query_embedings

' 50 is the quantity of Hepatrox 330 Hepatrox PL bags.'

In [None]:
# Question unrelated to the indexed purchase order; the recorded answer is
# "I don't know."
query = "What was the cost of training the GPT4all model?"
docs = docsearch.similarity_search(query=query)
chain.run(question=query, input_documents=docs)

" I don't know."

In [124]:
# Question unrelated to the indexed purchase order; the recorded answer is
# "I don't know."
query = "How was the model trained?"
docs = docsearch.similarity_search(query=query)
chain.run(question=query, input_documents=docs)

" I don't know."

In [125]:
# Question unrelated to the indexed purchase order; the recorded answer is
# "I don't know."
query = "what was the size of the training dataset?"
docs = docsearch.similarity_search(query=query)
chain.run(question=query, input_documents=docs)

" I don't know."

In [126]:
# Question unrelated to the indexed purchase order; the recorded answer is
# "I don't know."
query = "How is this different from other models?"
docs = docsearch.similarity_search(query=query)
chain.run(question=query, input_documents=docs)

" I don't know."

In [127]:
# Question unrelated to the indexed purchase order; the recorded answer is
# "I don't know."
query = "What is Google Bard?"
docs = docsearch.similarity_search(query=query)
chain.run(question=query, input_documents=docs)



" I don't know."

In [128]:
# Question unrelated to the indexed purchase order; the recorded answer is
# "I don't know."
query = "what trying to make rakesh meena?"
docs = docsearch.similarity_search(query=query)
chain.run(question=query, input_documents=docs)



" I don't know."