In [1]:
import json

import tqdm
import weaviate
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.weaviate import create_unstructured_weaviate_class, stage_for_weaviate
from weaviate.util import generate_uuid5

In [2]:
# Partition the example PDF into document elements using the "fast" strategy.
# The path is relative to the directory the notebook is run from.
filename = "../../example-docs/layout-parser-paper-fast.pdf"
elements = partition_pdf(
    filename=filename,
    strategy="fast",
)

In [3]:
# Name of the Weaviate class for the partitioned document elements. It is used
# both to build the schema class and as the target class of the batch import.
unstructured_class_name = "UnstructuredDocument"

In [4]:
# Build the class definition for the chosen class name and wrap it in the
# {"classes": [...]} mapping that is later handed to `client.schema.create`.
unstructured_class = create_unstructured_weaviate_class(unstructured_class_name)
schema = dict(classes=[unstructured_class])

In [5]:
# Connect to the Weaviate instance at localhost:8080; a server must already be
# listening there before this cell is run.
client = weaviate.Client("http://localhost:8080")

In [6]:
# Create the class only if Weaviate does not already contain it, so this cell
# can be re-run (Restart Kernel -> Run All) against the same instance; creating
# a class whose name already exists is rejected by the server.
if not client.schema.contains(schema):
    client.schema.create(schema)

In [7]:
# Convert the partitioned elements into the objects that the batch import
# passes, one by one, to `batch.add_data_object`.
data_objects = stage_for_weaviate(elements)

In [8]:
# Upload the staged objects through the client's batch context manager,
# configured with a batch size of 10; tqdm displays upload progress.
with client.batch(batch_size=10) as batch:
    for data_object in tqdm.tqdm(data_objects):
        # The UUID is derived from the object itself via uuid5, so identical
        # content maps to the same id on every run.
        batch.add_data_object(
            data_object=data_object,
            class_name=unstructured_class_name,
            uuid=generate_uuid5(data_object),
        )

100%|██████████████████████████████████████████████████████████████████████| 28/28 [00:46<00:00,  1.66s/it]


In [9]:
# Near-text query: fetch the `text` property of the single best match for the
# given concept. NOTE(review): nearText needs a text vectorizer module enabled
# on the Weaviate instance -- confirm for your deployment.
near_text = {"concepts": ["document understanding"]}

result = (
    client.query
    # Reuse the class name the objects were imported under instead of repeating
    # the literal, so changing the name in one place keeps the query in sync.
    .get(unstructured_class_name, ["text"])
    .with_near_text(near_text)
    .with_limit(1)
    .do()
)

# Pretty-print the raw GraphQL response.
print(json.dumps(result, indent=4))

{
    "data": {
        "Get": {
            "UnstructuredDocument": [
                {
                    "text": "Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classi\ufb01cation [11,"
                }
            ]
        }
    }
}
