In [20]:
import os

import tqdm

from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector

In [13]:
EXAMPLE_DOCS_DIRECTORY = "../../example-docs"

In [30]:
docs = []
files = os.listdir(EXAMPLE_DOCS_DIRECTORY)
for f in tqdm.tqdm(files):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, f)
    if not os.path.isfile(filename):
        continue
    if not filename.endswith(".pdf"):
        continue
    loader = UnstructuredFileLoader(filename, mode="elements", strategy="fast")
    docs.extend(loader.load())

 34%|███████████████████████▋                                             | 11/32 [00:00<00:00, 104.68it/s]PDF text is not extractable. Cannot use the fast partitioning strategy. Falling back to partitioning with the ocr_only strategy.
100%|██████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00,  7.10it/s]


In [31]:
docs[0]

Document(page_content='1 2 0 2', metadata={'source': '../../example-docs/layout-parser-paper-fast.pdf', 'filename': '../../example-docs/layout-parser-paper-fast.pdf', 'page_number': 1, 'category': 'UncategorizedText'})

In [32]:
vectorstore = PGVector.from_documents(
    docs, 
    embedding=OpenAIEmbeddings(),
    connection_string="postgresql://localhost:5432/postgres",
)

In [34]:
results = vectorstore.similarity_search("document image analysis", k=3, filter={"category": "NarrativeText"})

In [36]:
results[1].metadata

{'source': '../../example-docs/copy-protected.pdf',
 'filename': '../../example-docs/copy-protected.pdf',
 'page_number': 2,
 'category': 'NarrativeText'}

In [40]:
from copy import deepcopy
import datetime

In [38]:
m = deepcopy(results[1].metadata)

In [41]:
m["date"] = datetime.datetime.now()

In [42]:
m

{'source': '../../example-docs/copy-protected.pdf',
 'filename': '../../example-docs/copy-protected.pdf',
 'page_number': 2,
 'category': 'NarrativeText',
 'date': datetime.datetime(2023, 5, 9, 15, 58, 56, 419472)}