In [1]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Loader for the PDF that the rest of the notebook chunks and indexes.
loader = UnstructuredPDFLoader(file_path="attention.pdf")

In [2]:
# Run the loader; `data` is the list of documents it returns.
data = loader.load()

# Sanity check: how many documents came back, and how large the first one is.
print(f'You have {len(data)} documents in your data.')
print(f'There are {len(data[0].page_content)} characters in the first document.')

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


You have 1 documents in your data.
There are 39933 characters in the first document.


### Chunk data into smaller documents

In [3]:
# Splitter configured with chunk_size=1000 and no overlap (chunk_overlap=0)
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)

# Break the loaded documents into smaller chunk documents.
texts = text_splitter.split_documents(data)

print(f'There are {len(texts)} chunked documents now.')

There are 43 chunked documents now.


In [4]:
# Display the first chunk to inspect what the splitter produced.
texts[0]

Document(page_content='7\n\n1\n\n0\n\n2\n\nc\n\ne\n\nD\n\n6\n\n]\n\nL\n\nC\n\n.\n\ns\n\nc\n\n[\n\n5\n\nv\n\n2\n\n6\n\n7\n\n3\n\n0\n\n.\n\n6\n\n0\n\n7\n\n1\n\n:\n\nv\n\ni\n\nX\n\nr\n\na\n\nAttention Is All You Need\n\nAshish Vaswani∗\n\nGoogle Brain\n\navaswani@google.com\n\nNoam Shazeer∗\n\nGoogle Brain\n\nnoam@google.com\n\nNiki Parmar∗\n\nGoogle Research\n\nnikip@google.com\n\nJakob Uszkoreit∗\n\nGoogle Research\n\nusz@google.com\n\nLlion Jones∗\n\nGoogle Research\n\nllion@google.com\n\nAidan N. Gomez∗ †\n\nUniversity of Toronto\n\naidan@cs.toronto.edu\n\nŁukasz Kaiser∗\n\nGoogle Brain\n\nlukaszkaiser@google.com\n\nIllia Polosukhin∗ ‡\n\nillia.polosukhin@gmail.com\n\nAbstract\n\nThe dominant sequence transduction models are based on complex recurrent or\n\nconvolutional neural networks that include an encoder and a decoder. The best\n\nperforming models also connect the encoder and decoder through an attention\n\nmechanism. We propose a new simple network architecture, the Transforme

### Create embeddings for the documents to prepare them for semantic search

For more information on how to use Weaviate, see https://weaviate.io/developers/weaviate/quickstart/end-to-end

In [5]:
from langchain.vectorstores.weaviate import Weaviate
from langchain.embeddings.openai import OpenAIEmbeddings
import weaviate

embeddings = OpenAIEmbeddings()

# The Weaviate server is expected on localhost:8080.
# Start it with `docker-compose up -d`; stop it with `docker-compose down`.
client = weaviate.Client(url="http://localhost:8080")

# Vector store wrapper over the "Document" class, reading text from the
# "text" property.
# NOTE(review): this rebinds the name `weaviate` from the imported module to
# the vector store instance; a later cell passes `weaviate` to the chain, so
# the name is kept as-is here.
weaviate = Weaviate(client=client, index_name="Document", text_key="text")
# weaviate.add_documents(texts) # run once to add documents to weaviate

# The stored objects can also be browsed at http://localhost:8080/v1/objects
# Show the current schema as the cell output.
client.schema.get()

{'classes': [{'class': 'Document',
   'description': "This property was generated by Weaviate's auto-schema feature on Mon Apr 24 19:59:29 2023",
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'text2vec-openai': {'model': 'ada',
     'modelVersion': '002',
     'type': 'text',
     'vectorizeClassName': True}},
   'properties': [{'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Mon Apr 24 19:59:29 2023",
     'moduleConfig': {'text2vec-openai': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'text',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Mon Apr 24 19:59:29 2023",
     'moduleConfig': {'text2vec-openai': {'skip': False,
       'vectorizePropertyName': False}},
     'n

In [None]:
# This whole cell is a bare triple-quoted string, so none of the code inside
# it executes; it is kept as a record of the one-time schema setup.
# NOTE(review): the schema below names its only property "content", while the
# vector store in this notebook is constructed with text key "text" --
# confirm which property name is intended before re-enabling this code.
'''Created once at the very beginning, no need to recreate

schema = {
    "classes": [
    {
      "class": "Document",
      "description": "A document",
      "vectorizer": "text2vec-openai",
      "moduleConfig": {
        "text2vec-openai": {
          "model": "ada",
          "modelVersion": "002",
          "type": "text"
        }
      },
      "properties": [{
                "name": "content",
                "dataType": ["text"],
                }]
    }
    ]
}

client.schema.create(schema)
client.schema.get()'''

In [6]:
from langchain.llms import OpenAI
from langchain.chains import ChatVectorDBChain

# LLM with default settings, combined with the vector store into a chat chain.
MyOpenAI = OpenAI()
qa = ChatVectorDBChain.from_llm(llm=MyOpenAI, vectorstore=weaviate)

# Each run of this cell starts from an empty conversation.
chat_history = []

print("Please enter a question or dialogue to get started!")

# Read one question from the user and send it, with the history so far,
# through the chain.
query = input("")
result = qa(
    {
        "question": query,
        "chat_history": chat_history,
    }
)
print(result["answer"])

# Record this turn as the (question, answer) history.
chat_history = [(query, result["answer"])]


Please enter a question or dialogue to get started!
 Transformers are trained faster than architectures based on recurrent or convolutional layers and they allow for significantly more parallelization. They also require significantly less time to train than convolutional neural networks.
