## RAG Project

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.chunking.title import chunk_by_title
from unstructured.partition.md import partition_md
from unstructured.partition.pptx import partition_pptx
from unstructured.staging.base import dict_to_elements

import chromadb
import os

In [5]:
# Build the Unstructured API client used by the PDF partition request below.
# The key is read from the environment so it never appears in the notebook.
_unstructured_api_key = os.getenv("UNSTRUCTURED_API_KEY")
if not _unstructured_api_key:
    # Fail here with an actionable message rather than at the first request.
    raise RuntimeError(
        "UNSTRUCTURED_API_KEY is not set; export it before running this notebook."
    )

s = UnstructuredClient(
    api_key_auth=_unstructured_api_key
)

### Preprocess PDF

In [6]:
filename = "./example_datasets/donut_paper.pdf"

# Read the whole PDF into memory and wrap it in the SDK's upload container.
with open(filename, "rb") as f:
    files=shared.Files(
        content=f.read(),
        file_name=filename,
    )

# Request parameters for the partition call: the "hi_res" strategy with the
# "yolox" model, table-structure inference enabled, and an empty
# skip_infer_table_types list. Later cells read `text_as_html` from the
# resulting Table elements.
req = shared.PartitionParameters(
    files=files,
    strategy="hi_res",
    hi_res_model_name="yolox",
    pdf_infer_table_structure=True,
    skip_infer_table_types=[],
)

try:
    resp = s.general.partition(req)
    # Convert the response's element dicts into element objects.
    pdf_elements = dict_to_elements(resp.elements)
except SDKError as e:
    # Show the API error, then stop the cell. Swallowing it would leave
    # `pdf_elements` undefined (or stale from an earlier run), and every
    # following cell would fail or silently use old data.
    print(e)
    raise

In [7]:
# Inspect the first parsed PDF element as a plain dict (type, id, text, metadata).
pdf_elements[0].to_dict()

{'type': 'Title',
 'element_id': '59a9f0edd370eaa8c5c59cd9256e63bd',
 'text': 'OCR-free Document Understanding Transformer',
 'metadata': {'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 1,
  'filename': 'donut_paper.pdf'}}

In [8]:
# Collect every PDF element whose category is "Table".
tables = [
    element
    for element in pdf_elements
    if element.category == "Table"
]

In [9]:
# HTML rendering of the first table, read from the element's metadata.
# Raises IndexError if no Table elements were found in the PDF.
table_html = tables[0].metadata.text_as_html

In [10]:
from io import StringIO
from lxml import etree

# Parse the table's HTML from an in-memory text stream, using an XML parser
# configured with remove_blank_text=True, then print an indented rendering.
file_obj = StringIO(table_html)
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(file_obj, parser)

pretty_table = etree.tostring(tree, pretty_print=True).decode()
print(pretty_table)

<table>
  <tr>
    <td>NAVER CLOVA 4Upstage</td>
    <td>2NAVER Search STmax 6Google</td>
    <td>3SNAVER AI Lal 7LBox</td>
  </tr>
</table>



### Filter Out References

In [11]:
# Locate the "References" section heading: the first element whose text is
# exactly "References" and whose category is "Title".
# Indexing with [0] raises IndexError when the PDF has no such heading.
reference_title = [
    element
    for element in pdf_elements
    if element.text == "References" and element.category == "Title"
][0]

In [12]:
# Inspect the "References" title element as a plain dict.
reference_title.to_dict()

{'type': 'Title',
 'element_id': '05db3e5dd95df32622138973f0d4b9ed',
 'text': 'References',
 'metadata': {'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 15,
  'parent_id': 'f6fdb355e327d011494e182b9f994661',
  'filename': 'donut_paper.pdf'}}

In [14]:
# Keep the title's element id; the following cells compare it against each
# element's metadata.parent_id to find the entries under "References".
references_id = reference_title.id
references_id

'05db3e5dd95df32622138973f0d4b9ed'

In [15]:
# Print the first element whose parent is the "References" title, as a spot
# check of what the parent_id filter will match. Prints nothing if none match.
for element in pdf_elements:
    if element.metadata.parent_id != references_id:
        continue
    print(element)
    break

1. Afzal, M.Z., Capobianco, S., Malik, M.I., Marinai, S., Breuel, T.M., classification with Dengel, A., Liwicki, M.: Deepdocclassifier: Document deep convolutional neural network. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 1111–1115 (2015). https://doi.org/10.1109/ICDAR.2015.7333933 1, 4, 14


In [16]:
# Drop every element whose parent is the "References" title.
# NOTE(review): the "References" title element itself has a different
# parent_id, so this filter keeps it — confirm that is intended.
pdf_elements = [
    element
    for element in pdf_elements
    if element.metadata.parent_id != references_id
]

### Filter Out Headers

In [17]:
# Collect the elements categorised as "Header" so they can be inspected
# before being removed.
headers = [
    element
    for element in pdf_elements
    if element.category == "Header"
]

In [18]:
# Inspect one header element (index 1) as a plain dict.
headers[1].to_dict()

{'type': 'Header',
 'element_id': 'a4b916e36299d9c6f3f676a6480f550c',
 'text': 'OCR-free Document Understanding Transformer',
 'metadata': {'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 3,
  'filename': 'donut_paper.pdf'}}

In [19]:
# Remove all "Header" elements from the PDF element list.
pdf_elements = [
    element
    for element in pdf_elements
    if element.category != "Header"
]

### Preprocess PPTX Files

In [21]:
# Partition the slide deck with the unstructured library's partition_pptx
# (the API client `s` is not used here).
# Note: `filename` is rebound, so it no longer refers to the PDF path.
filename = "./example_datasets/donut_slide.pptx"
pptx_elements = partition_pptx(filename=filename)

### Preprocess Markdown File

In [22]:
# Partition the README with the unstructured library's partition_md
# (the API client `s` is not used here).
# Note: `filename` is rebound again, now to the Markdown path.
filename = "./example_datasets/donut_readme.md"
md_elements = partition_md(filename=filename)

INFO: Reading document from string ...
INFO: Reading document ...


### Creating Chunks

In [23]:
# Concatenate the PDF, PPTX, and Markdown elements into one list and chunk
# that combined list by title.
# NOTE(review): chunking a single combined list presumably lets a chunk span a
# document boundary — confirm that is acceptable for retrieval.
elements = chunk_by_title(pdf_elements + pptx_elements + md_elements)

In [24]:
# Sanity check: show the container type returned by chunk_by_title.
type(elements)

list

In [26]:
# Peek at the first chunk as a plain dict.
elements[0].to_dict()

{'type': 'CompositeElement',
 'element_id': '0f7a2a45-4278-49ff-a19c-842b06606043',
 'text': 'OCR-free Document Understanding Transformer\n\n2 2 0 2\n\nGeewook Kim1∗, Teakgyu Hong4†, Moonbin Yim2†, Jeongyeon Nam1, Jinyoung Park5†, Jinyeong Yim6†, Wonseok Hwang7†, Sangdoo Yun3, Dongyoon Han3, and Seunghyun Park1\n\nt c O 6',
 'metadata': {'filename': 'donut_paper.pdf',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 1,
  'orig_elements': 'eJzVU01v1DAQ/SujnBdInMTecC0SqyJaRLdCValWk3gcrN3YkdehhIr/znjVRStUDj1VXCzNmzdfb8a3DxntaCAXN1ZnbyGrG2xMTlqXKifEZVd3ddPpRtSSZNnqbAHZQBE1RmT+Q2bsjhwOlIK1d1PcjDhSeD1qk7jJHefx4MZx3NkOo/XuzaN7h66fsKc9+28zcn12x+jIyMZNQ0uB8eIXQ5F+xJTj8uzzKxOI4J3vptQ3XDtNYR/Raet6WAd0e+PDwKEp7LH02sYdZZzo73nLtiGdy5yolSXVJMyyYQlk15pGSlm/xLwHJDxjJ6cCCRCQgzgd/tpxE9T7YH+SXifeE0IYqauqEq0SS6NUp1RRClSdrNpCGGrwvxPiPdG991v4YIfi6yREoRawJtz28wQr7/qKwVzkC/jovWutgxs7iCN2TsyY+YELHLiPc+tmP/F9fcKwrf+wGE3EFCqP4Bfv9sR1V/c8ojqiV2xo7+FmcuWCj5ezc1lYYTL5duGKOP23eXKHEsXp/i4wBBbv