In [10]:
import os

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode

import pandas as pd

import requests

In [3]:
def get_text_from_file(filepath: str) -> str:
    """Convert a document with docling and return its tables as Markdown.

    Args:
        filepath: Path of the document passed to ``DocumentConverter.convert``.

    Returns:
        Every table of the converted document rendered with
        ``DataFrame.to_markdown``, each followed by a newline. An empty string
        when the converted document has no tables.
    """
    # These options are registered for PDF input only (see ``format_options``
    # below): OCR is switched off, table-structure recognition with cell
    # matching is switched on.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.table_structure_options.mode = (
        TableFormerMode.ACCURATE
    )  # use more accurate TableFormer model

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    result = converter.convert(filepath)

    # Render each table once and join the pieces in a single pass instead of
    # growing a string inside the loop; the table index is not needed.
    return "".join(
        table.export_to_dataframe().to_markdown() + "\n"
        for table in result.document.tables
    )

In [4]:
# Extract the tables of the sample image and show the resulting Markdown.
# NOTE(review): get_text_from_file registers its pipeline options for
# InputFormat.PDF only — confirm which settings apply to this PNG input.
file_path = "data/ab-5.png"

tables = get_text_from_file(filepath=file_path)

print(tables)

|    | 0           | 1                 | 2                 | 3                        |
|---:|:------------|:------------------|:------------------|:-------------------------|
|  0 | Pos Art-Nr. |                   | Bezeichnung       | Menge Einzelpreis Betrag |
|  1 | 1123        | Farbe Weiss       | 25,49             | 25,49 €                  |
|  2 | 2           |                   | 60,00             | 10 6d0,Do €              |
|  3 |             | Ncttobetrag       | Ncttobetrag       | 625,49 €                 |
|  4 |             | Umsatzsteucr 1990 | Umsatzsteucr 1990 | 118,84 €                 |



In [5]:
# Address the service via localhost, as the /extract-table call on the same
# port does; 0.0.0.0 is a bind address and not a portable client destination.
url = "http://localhost:8111/process"
data = {"order_number": "1235", "table": tables}

# Bound the wait (generous upper limit, adjust as needed) so a stalled service
# cannot hang the kernel, and surface HTTP errors before decoding the body.
response = requests.post(url, json=data, timeout=300)
response.raise_for_status()

print(response.json())

[{'SKU': '', 'Found': False}, {'SKU': '', 'Found': False}]


In [8]:
# Run the table extraction on the PDF sample and print the Markdown result.
file_path = "data/ab-4.pdf"

tables = get_text_from_file(filepath=file_path)

print(tables)

|    | Pos.   | Menge    | Bezeichnung                       | Einzelpreis   | Gesamtpreis   |
|---:|:-------|:---------|:----------------------------------|:--------------|:--------------|
|  0 | 1      | 1 Stk.   | Fernseher 40 Zoll | Musterartikel | 1000,00 EUR   | 1000,00 EUR   |
|  1 | 2      | Pauschal | Anfahrt und Aufbau                | 120,00 EUR    | 120,00 EUR    |
|  2 |        |          |                                   | Zwischensumme | 1120,00 EUR   |
|  3 |        |          |                                   | 19% MwSt.     | 212,80 EUR    |
|  4 |        |          |                                   | Gesamtbetrag  | 1332,80 EUR   |



In [9]:
# Address the service via localhost, as the /extract-table call on the same
# port does; 0.0.0.0 is a bind address and not a portable client destination.
url = "http://localhost:8111/process"
data = {"order_number": "1234", "table": tables}

# Bound the wait (generous upper limit, adjust as needed) so a stalled service
# cannot hang the kernel, and surface HTTP errors before decoding the body.
response = requests.post(url, json=data, timeout=300)
response.raise_for_status()

print(response.json())

[{'SKU': '123456', 'Found': False}, {'SKU': '123457', 'Found': True}]


In [15]:
# Ask the local service to extract the tables of the sample PDF.
file_path = "./data/ab-4.pdf"
url = "http://localhost:8111/extract-table"

headers = {"Content-Type": "application/json"}
payload = {"filepath": file_path}

# Bound the wait (generous upper limit, adjust as needed) so a stalled service
# cannot hang the kernel, and surface HTTP errors before decoding the body.
response = requests.post(url, json=payload, headers=headers, timeout=300)
response.raise_for_status()

# Decode the JSON body once and keep only its "tables" entry.
table = response.json().get("tables")
print(table)

|    | Pos.   | Menge    | Bezeichnung                       | Einzelpreis   | Gesamtpreis   |
|---:|:-------|:---------|:----------------------------------|:--------------|:--------------|
|  0 | 1      | 1 Stk.   | Fernseher 40 Zoll | Musterartikel | 1000,00 EUR   | 1000,00 EUR   |
|  1 | 2      | Pauschal | Anfahrt und Aufbau                | 120,00 EUR    | 120,00 EUR    |
|  2 |        |          |                                   | Zwischensumme | 1120,00 EUR   |
|  3 |        |          |                                   | 19% MwSt.     | 212,80 EUR    |
|  4 |        |          |                                   | Gesamtbetrag  | 1332,80 EUR   |



In [18]:
# Trigger the local webhook for one order and display its JSON reply.
url = "http://localhost:5678/webhook/33cb034f-b754-4992-be53-884db4dd9479"

headers = {"Content-Type": "application/json"}
payload = {"order_number": 1234}

# Bound the wait (generous upper limit, adjust as needed) so a stalled workflow
# cannot hang the kernel, and surface HTTP errors before decoding the body.
response = requests.post(url, json=payload, headers=headers, timeout=300)
response.raise_for_status()
response.json()

[{'output': {'SKU': '123456',
   'Product Name': 'Fernseher 40 Zoll | Musterartikel',
   'Found': True}},
 {'output': {'SKU': '123457',
   'Product Name': 'Anfahrt und Aufbau',
   'Found': True}}]