# Using unstructured

In [2]:
#Examples: https://colab.research.google.com/gist/alejandro-ao/47db0b8b9d00b10a96ab42dd59d90b86/langchain-multimodal.ipynb#scrollTo=8326a750

#!more .env
import os
import base64
import uuid
from dotenv import load_dotenv

load_dotenv() 

True

### Unstructured: Using chunking_strategy="by_title" and reconstructing orig_elements


In [3]:
from langchain_community.document_loaders import UnstructuredPDFLoader

path = "../Data/"
file1 = "BV_page237.pdf"
file2 = "test_sample.pdf"
file3 = "bearing_block.pdf"
file4 = "Image-based-pdf-sample.pdf"
file5 = "BV_page231.pdf"
file6 = "Table 12.pdf"

In [4]:

# Using the UnstructuredPDFLoader to load the document. In special Images and Tables.
# Since we are usinng chunking strategy by_title, the image and tables are inside the orig_elements metadata.
loader = UnstructuredPDFLoader(path+file1,  
                               mode="elements", 
                               strategy="hi_res",
                               extract_image_block_types=["Image"],
                               extract_text_as_html = True,
                               extract_image_block_to_payload = True, 
                               chunking_strategy="by_title", 
                               max_characters=4000,  
                               new_after_n_chars=3800
                               )
pages = loader.load()
print(f"Loaded {len(pages)} documents from {file1}")

  from .autonotebook import tqdm as notebook_tqdm


Loaded 3 documents from BV_page237.pdf


In [35]:
from unstructured.partition.pdf import partition_pdf

chunks = partition_pdf(path+file6,  
                        mode="elements", 
                        strategy="hi_res",
                        extract_image_block_types=["Image"],
                        extract_image_block_to_payload = True,
                        infer_table_structure=True,
                        chunking_strategy="by_title", 
                        max_characters=4000,  
                        new_after_n_chars=3800
                        )
#pages = loader.load()
print(f"Loaded {len(chunks)} documents from {file6}")

Loaded 2 documents from Table 12.pdf


In [15]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader(path+file2)
pages = loader.load()
document_text = "".join([page.page_content for page in pages])

# Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Adjust as needed
    chunk_overlap=200  # Adjust as needed
)
chunks = text_splitter.create_documents([document_text])

In [36]:
chunks

[<unstructured.documents.elements.CompositeElement at 0x230024ae550>,
 <unstructured.documents.elements.Table at 0x230024afa50>]

In [37]:
chunks[1].metadata.to_dict()

{'last_modified': '2025-02-04T15:18:04',
 'text_as_html': '<table><tr><td>Material (annealed)</td><td>Specified minimum tensile strength 6 2 (N/mm?)</td><td>Design temperature (°C)</td></tr><tr><td>&lt;50</td><td>75</td><td>100</td><td>125</td><td>150</td><td>175</td><td>200</td><td>225</td><td>250</td><td>275</td><td>300</td></tr><tr><td>Copper</td><td>215</td><td>41</td><td>41</td><td>40</td><td>40</td><td>34</td><td>1,13 |</td><td>0,75</td><td/><td/><td/><td/></tr><tr><td>Aluminium brass</td><td>325|</td><td>78]</td><td>78]</td><td>78]</td><td>78|</td><td>78]</td><td>51]</td><td>1,00</td><td/><td/><td/><td/></tr><tr><td>Copper- nickel 95/5 and 90/10</td><td>275</td><td>68</td><td>68</td><td>67</td><td>2,71</td><td>64</td><td>62</td><td>59</td><td>56</td><td>52</td><td>48</td><td>44</td></tr><tr><td>Copper- nickel 70/30</td><td>365</td><td>81</td><td>79</td><td>77</td><td>75</td><td>73</td><td>71</td><td>69</td><td>67</td><td>2,71</td><td>64</td><td>62</td></tr></table>',
 'filetype'

In [7]:
from IPython.display import display, HTML
correct_table = '''<table border="1" class="dataframe">
    <tr>
        <th rowspan="2">Material (annealed)</th>
        <th rowspan="2">Specified minimum tensile strength (N/mm²)</th>
        <th colspan="12">Design temperature (°C)</th>
    </tr>
    <tr>
        <th>≤50</th>
        <th>75</th>
        <th>100</th>
        <th>125</th>
        <th>150</th>
        <th>175</th>
        <th>200</th>
        <th>225</th>
        <th>250</th>
        <th>275</th>
        <th>300</th>
    </tr>
    <tr>
        <td>Copper</td>
        <td>215</td>
        <td>41</td>
        <td>41</td>
        <td>40</td>
        <td>40</td>
        <td>34</td>
        <td>27.5</td>
        <td>18.5</td>
        <td></td>
        <td></td>
        <td></td>
        <td></td>
    </tr>
    <tr>
        <td>Aluminium brass</td>
        <td>325</td>
        <td>78</td>
        <td>78</td>
        <td>78</td>
        <td>78</td>
        <td>78</td>
        <td>51</td>
        <td>24.5</td>
        <td></td>
        <td></td>
        <td></td>
        <td></td>
    </tr>
    <tr>
        <td>Copper-nickel 95/5 and 90/10</td>
        <td>275</td>
        <td>68</td>
        <td>68</td>
        <td>67</td>
        <td>65.5</td>
        <td>64</td>
        <td>62</td>
        <td>59</td>
        <td>56</td>
        <td>52</td>
        <td>48</td>
        <td>44</td>
    </tr>
    <tr>
        <td>Copper-nickel 70/30</td>
        <td>365</td>
        <td>81</td>
        <td>79</td>
        <td>77</td>
        <td>75</td>
        <td>73</td>
        <td>71</td>
        <td>69</td>
        <td>67</td>
        <td>65.5</td>
        <td>64</td>
        <td>62</td>
    </tr>
</table>'''
# Define the HTML table
html_table = '''   
    <table border="1" class="dataframe">
  <tbody>
    <tr>
      <td></td>
    </tr>
    <tr>
      <td></td>
    </tr>
  </tbody>
</table>
'''

# Display the HTML table
display(HTML(html_table))


In [6]:
import base64
from io import BytesIO
from PIL import Image

# Base64 encoded image string
base64_image = "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCACEAXADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigAooooAKKKKACiiigAooooAKKKRlDKVYAqRgg9DQBw93rF7401O40Xw9cSW2k27+XqOswthiw6w25/v+r/w9ucVsjwbpeP8Aj710/wDcdvf/AI7TfAMaR/Dvw0saKoOl2zEKMcmJST+JJNdFQBz/APwhul/8/Wuf+D29/wDj1H/CG6X/AM/Wuf8Ag9vf/j1cW+lX3jL4n+LbG48UeINNtNISyS2h0u8+zriSIu24bTuO7v15x0Axn+MNOvfhnDpWvWPjTWbh2vfs7p4gvpbi02tDKcukS7ycqMYzg4z6gA9E/wCEN0v/AJ+tc/8AB7e//HqP+EN0v/n61z/we3v/AMerx/8A4Xbrn/Qw+Bv/AAC1P/43R/wu3XP+hh8Df+AWp/8AxugD2D/hDdL/AOfrXP8Awe3v/wAeo/4Q3S/+frXP/B7e/wDx6vH/APhduuf9DD4G/wDALU//AI3R/wALt1z/AKGHwN/4Ban/APG6APRpLbxp4RUppEf/AAlenE4jt727WC7g/wC2zDbIvX72GHHJqH/hLfiH/wBEw/8AK/b/AOFef/8AC7dc/wChh8Df+AWp/wDxuj/hduuf9DD4G/8AALU//jdAHaap8R/Fvh6wbVNe+HklnpcLoLm4j1mCZo1ZwuQgGWOWHGR9R1r0yvmDxt8UtV8R+EL7SbnWfClxDP5e6Kwtb5Jm2yKw2mVAg5GTk9M45xW//wALt1z/AKGHwN/4Ban/APG6APoCiuF+GXjiXxnpuoNeXmlXF5aXOwf2asqI0RRCrbZcP94uuSAPl49T3VABRXPN420P7bdWkMl7dy2knlTmy064uUjfAJUvHGy5GeRnIpf+Ey0v/n11z/wRXv8A8ZoA6Ciuf/4TLS/+fXXP/BFe/wDxmj/hMtL/AOfXXP8AwRXv/wAZoA6Ciuf/AOEy0v8A59dc/wDBFe//ABmj/hMtL/59dc/8EV7/APGaAOgorn/+Ey0v/n11z/wRXv8A8Zqxp/ifTdT1EWEAvo7pommVLrT7i23IpUMQZUUHBdenqKANiiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAoorm4/HegTzXMVpLfXv2aYwSyWWmXNzGrgAld8cbKSMjvQB0lFc/8A8Jlpf/Prrn/givf/AIzR/wAJlpf/AD665/4Ir3/4zQAeBP8Aknnhr/sFWv8A6KWugrh/CfiS00zwbodheWOuR3Vrp9vDMn9h3h2usahhkRYOCD0rY/4TLS/+fXXP/BFe/wDxmgDz+DSPEWqfF7x3/YHij+w/L/s/zv8AQI7nzs2/y/fI24w3Tru9q6D/AIRL4h/9FP8A/KBb/wCNHgpZ7j4j+OdW+xX0Fje/YPs0t3Zy2/m7IWVsCRVPB46enrXoFAHA2uq+JPBlxLF4ruLrXtMkAePVrPT1U25wdySRR5bbwCGAPXB9q3iv4nWkdho58Jaxo15eX2qQ2brPJuESSBhudFYMoBC8mvR6zNR8OaHq5zqWj6feHrm4tkkP5kUAYC+KPE+nkrqvhT7Wirkz6JeJcA/9s5Nj/lmrNr8RfC884trnUv7NuuM2+pxNaOPbEgGfwzUifDzwYkplHhbRyxGMNZoR+RGKl/4QTwf/ANCpof8A4Lof/iaAN6OSOaNZInV0YZVlOQR7GnVyMvw38PJI8ulx3eiTucmTSbp7YZ/3FOw/ippp0jxvpmTp3iWz1SMY2watZ7Gx/wBdYcfmUNAHYUVyB8Wa9pxI1rwdfCMHH2jSpVvEPvt+WQf98mren+P/AAtqNx9lj1iCG7zj7Ndg28ufTZIFJ/AUAXNT8L6Rq+oRahcwTJexRmJbm1upbaXYTkqXiZSVyM4JIqsfBmlMCDc64QeCDrt7z/5GroQQQCDkHoRRQBT0rSrDQ9Lt9N0y2S2s7ddkcSdAPqeST1JPJPJq5RRQAUUUUAFFFFABXL+JtG1641ax1nw7eWEV9aW89uYNQjdoZVkMbdUIKkGMc4OfwrqKKAPP/wDi7/8A1I3/AJN1T1bUvizo2jX2qXC+CmgsreS4kWMXRYqiliBkgZwPUV6ZXP8Ajv8A5J54l/7BV1/6KagDoKK4j4dfEnTfH2mnZsttVhXNxZlskDpvT1Xp9CcHsT29ABRRRQAUUUUAFFFFABRRRQAV5Vqnxd1HT9Yv7FPDmlMtrcyQK9x4mtbd3CMVDGN8MucZwfWvVa5G08L+HtS1fWzqXh7TLq5S9yJrmzikeRWjRwckEkAsy8/3aAON/wCF0ap/0Leh/wDhYWVH/C6NU/6FvQ//AAsLKvQP+EE8H/8AQqaH/wCC6H/4mj/hBPB//QqaH/4Lof8A4mgDz/8A4XRqn/Qt6H/4WFlR/wALo1T/AKFvQ/8AwsLKvQP+EE8H/wDQqaH/AOC6H/4mj/hBPB//AEKmh/8Aguh/+JoA8/8A+F0ap/0Leh/+FhZUf8Lo1T/oW9D/APCwsq9A/wCEE8H/APQqaH/4Lof/AImj/hBPB/8A0Kmh/wDguh/+JoA8d8ZfErxV4k0ZtN0uHQ9HWXKzyp4mspXdCPuqd67fc9fp3pfDjxdqnw/8PXGk/wBmaHf+ddtc+b/wlNlFjKIu3G5v7mc5717f/wAIJ4P/AOhU0P8A8F0P/wATR/wgng//AKFTQ/8AwXQ//E0Aef8A/C6NU/6FvQ//AAsLKj/hdGqf9C3of/hYWVegf8IJ4P8A+hU0P/wXQ/8AxNH/AAgng/8A6FTQ/wDwXQ//ABNAHn//AAujVP8AoW9D/wDCwsq6Twv8UtG1iyuX1q80bRLuC4MX2dtZgnDrsRg6upAI+YjjOCpGc5A3P+EE8H/9Cpof/guh/wDiaw9C8F+FZtY8TpL4a0Z0h1NEiVrCIhF+yW7YX5eBuZjgdyT3oA3P+E78H/8AQ16H/wCDGH/4qj/hO/B//Q16H/4MYf8A4qj/AIQTwf8A9Cpof/guh/8AiaP+EE8H/wDQqaH/AOC6H/4mgA/4Tvwf/wBDXof/AIMYf/iqP+E78H/9DXof/gxh/wDiqP8AhBPB/wD0Kmh/+C6H/wCJo/4QTwf/ANCpof8A4Lof/iaAD/hO/B//AENeh/8Agxh/+Ko/4Tvwf/0Neh/+DGH/AOKo/wCEE8H/APQqaH/4Lof/AImj/hBPB/8A0Kmh/wDguh/+JoAP+E78H/8AQ16H/wCDGH/4qj/hO/B//Q16H/4MYf8A4qj/AIQTwf8A9Cpof/guh/8AiaP+EE8H/wDQqaH/AOC6H/4mgA/4Tvwf/wBDXof/AIMYf/iq0bmx0jxDp8RurWy1KylQSRmRFmjdSMhlzkEEYIIrkvGngvwra+BfENxb+GtGhni0y5eOSOwiVkYRMQQQuQQec1ueBP8Aknnhr/sFWv8A6KWgCgfhzpFqS2iXeqaE27djTbtljJ94m3R/+O0gsfHmmf8AHtrGla1EDnZqFs1tLj08yLKk/wDABXYUUAcePGmo2AA13whq9oP4prILexD3zGd4H1QVpaR408Na9MsGm61aTXJzi3Z9k3HX922GH5VvVynj3StO1DRrVr2wtblhqenoDNCrna13CrDkdCpII7gkUAdXRXP/APCCeD/+hU0P/wAF0P8A8TR/wgng/wD6FTQ//BdD/wDE0AdBWXrWtDRhZKthd3097cfZ4YLUxhi3lvISTI6qAFjbv6VT/wCEE8H/APQqaH/4Lof/AImsPXfBfhWHWPDCReGtGRJtTdJVWwiAdfslw2G+XkblU4PcA9qANz/hIdU/6EzXP+/1l/8AJFH/AAkOqf8AQma5/wB/rL/5Io/4QTwf/wBCpof/AILof/iaP+EE8H/9Cpof/guh/wDiaAD/AISHVP8AoTNc/wC/1l/8kVl+JdQ1rWfCur6Xb+D9ZWe9spreNpJ7IKGdCoJxcE4yfQ1qf8IJ4P8A+hU0P/wXQ/8AxNH/AAgng/8A6FTQ/wDwXQ//ABNAHzfo3wr+Kfh/VrfVNL0d7e7gbcjre2/4gjzOQehB619EWHiPxA1hAdR8E6ql5sHnLb3Nm0YbvtJnBx9RVj/hBPB//QqaH/4Lof8A4mj/AIQTwf8A9Cpof/guh/8AiaAD/hIdU/6EzXP+/wBZf/JFH/CQ6p/0Jmuf9/rL/wCSKP8AhBPB/wD0Kmh/+C6H/wCJo/4QTwf/ANCpof8A4Lof/iaAD/hIdU/6EzXP+/1l/wDJFH/CQ6p/0Jmuf9/rL/5Io/4QTwf/ANCpof8A4Lof/iaP+EE8H/8AQqaH/wCC6H/4mgA/4SHVP+hM1z/v9Zf/ACRR/wAJDqn/AEJmuf8Af6y/+SKP+EE8H/8AQqaH/wCC6H/4mj/hBPB//QqaH/4Lof8A4mgA/wCEh1T/AKEzXP8Av9Zf/JFaGhakdY8PaZqhQIby0iuNo/h3oGx+tYWs+EvB2maJf37eFNEItreSXA02Ik7VJwBt5PFdDpNkumaNY2CABLa3jhUDsFUD+lAFysSciw8X2sxIEep25tSS3/LWLdIgA9SjTE/7g/DbrF8VDy9AmvlLB9PZb1SgyxER3Mo/3lDL9GNAG1RSAggEHIPQ0tABRRRQAUUUUAFFFFABRRRQAVz/AIe/5Dniz/sKx/8ApFa10Fc/4e/5Dniz/sKx/wDpFa0AdBRRRQAUUUUAFFFFABRRRQBz/jv/AJJ54l/7BV1/6KajwJ/yTzw1/wBgq1/9FLR47/5J54l/7BV1/wCimo8Cf8k88Nf9gq1/9FLQB0FFFFABXP8AjL/kB23/AGFdN/8AS2Gugrn/ABl/yA7b/sK6b/6Ww0AdBRRRQAVz/iH/AJDnhP8A7Csn/pFdV0Fc/wCIf+Q54T/7Csn/AKRXVAHQUUUUAFFFFABRRRQAUUUUAFFFFABRRRQBia/uup9L0tM/6TdLLNg8iKL94T7gsI0PtJW3WNbj7T4uvpzhks7WO3j9VdyXkH4qIPyrZoAy9Sg16W4VtL1LTbaDYAUutPedi2TzuWZABjHGPXnnihNp3i2eCSGTW9CKSKVYf2NLyCMH/l6ro6qapfppek3d/ICVt4Wk2gZLYGQAO5PQD3oA5Xwz/wAJbqHhTR71Nb0ZUuLGGVVk0iVmAZAcEi5AJ564H0rV+x+MP+g7of8A4Jpv/kqtHQ7D+ytA03TuP9EtYoOOnyoF/pV+gDPuodYfToEs76xivht86aayeSN+Pm2oJVK5OCMs2Bxz1rP+x+MP+g7of/gmm/8AkqugooA5/wCx+MP+g7of/gmm/wDkqj7H4w/6Duh/+Cab/wCSq6CigDPtYdYTTp0vL6xlvju8maGzeONOPl3IZWLYOScMuRxx1rP+x+MP+g7of/gmm/8AkqugooA5/wCx+MP+g7of/gmm/wDkqj7H4w/6Duh/+Cab/wCSq6CigDL02DXorhm1TUtNuYNhCpa6e8DBsjks0zgjGeMdxzxzz+nwa9L4k8VNpepabbQf2nGGS60952LfY7bkMsyADGOMdjzzgdpXP+Hv+Q54s/7Csf8A6RWtAB9j8Yf9B3Q//BNN/wDJVH2Pxh/0HdD/APBNN/8AJVdBRQBz/wBj8Yf9B3Q//BNN/wDJVaGpw6xL5X9k31ja4z5n2qye43dMY2ypjv65yOmOdCigDn/sfjD/AKDuh/8Agmm/+SqPsfjD/oO6H/4Jpv8A5KroKKAOf+x+MP8AoO6H/wCCab/5KrQ8nWP7H8r7dY/2n/z8fY38n73/ADy83d93j7/Xn2rQooA4Pxpa+Kl8C+IWuNZ0aSAaZcmRI9JlRmXymyAxuSAcd8HHoaPBdr4qbwL4ea31nRo4DplsY0k0mV2VfKXALC5AJx3wM+grc8d/8k88S/8AYKuv/RTUeBP+SeeGv+wVa/8AopaAD7H4w/6Duh/+Cab/AOSq0NMh1iLzf7WvrG6zjy/slm9vt65zulfPbpjGD1zxoUUAYc9r4qa4ka31nRo4C5MaSaTK7KueAWFyATjvgZ9BXP8Aiy18VLo9uZ9Z0Z0/tOwACaTKp3fa4dpybk8BsEjuARkZyO8rn/GX/IDtv+wrpv8A6Ww0AH2Pxh/0HdD/APBNN/8AJVSQWvipbiJrjWdGkgDgyJHpMqMy55AY3JAOO+Dj0NblFAGXqUGvS3CtpepabbQbAGS60952LZPIZZkAGMcY7Hnnjl9dtfFQ1jwwJdZ0ZnOpuIiukyqFb7JccsPtJ3DbuGBjkg54we8rn/EP/Ic8J/8AYVk/9IrqgA+x+MP+g7of/gmm/wDkqj7H4w/6Duh/+Cab/wCSq6CigDPuodYfToEs76xivht86aazeSN+Pm2oJVK5OCMs2Bxz1rP+x+MP+g7of/gmm/8AkqugooA5/wCx+MP+g7of/gmm/wDkqj7H4w/6Duh/+Cab/wCSq6CigCnp0epRWzLql3aXM+8lXtbVoFC4GAVaRyTnPOe4445y/sfjD/oO6H/4Jpv/AJKroKKAOf8AsfjD/oO6H/4Jpv8A5Ko+x+MP+g7of/gmm/8AkqugooAx7G28SR3kbX+q6VPajO+ODTJInbg4wxnYDnH8J9OOtF9beJJLyRrDVtKgtTjZHPpkkrrwM5YTqDzn+Efj1rYooA4vSLDxUL7Wni1rQw8l8DLnSZWywgiXOPtI2/Kq8c+uecDYgtfFS3ETXGs6NJAHBkSPSZUZlzyAxuSAcd8HHoadpR+z+INctHPzyzRXkYx/yzaJY/xO+J/zFbVABWFfj+1fENtpuM2lkFvbnjhpN37lPwZWc9wUTsa3a5Gz8Q2On6trgnh1V5ZL7JMGlXU6ALFGgAeONl/hzgHuaAOuorn/APhMtL/59dc/8EV7/wDGaP8AhMtL/wCfXXP/AARXv/xmgDoKK5//AITLS/8An11z/wAEV7/8Zo/4TLS/+fXXP/BFe/8AxmgDoKK5/wD4TLS/+fXXP/BFe/8Axmj/AITLS/8An11z/wAEV7/8ZoA6Ciuf/wCEy0v/AJ9dc/8ABFe//GaP+Ey0v/n11z/wRXv/AMZoA6Ciuf8A+Ey0v/n11z/wRXv/AMZo/wCEy0v/AJ9dc/8ABFe//GaAOgrn/D3/ACHPFn/YVj/9IrWj/hMtL/59dc/8EV7/APGaw9C8WadFrHid2ttZIl1NHXbot4xA+yW6/MBFlTlTwcHGD0IJAO8orn/+Ey0v/n11z/wRXv8A8Zo/4TLS/wDn11z/AMEV7/8AGaAOgorn/wDhMtL/AOfXXP8AwRXv/wAZo/4TLS/+fXXP/BFe/wDxmgDoKK5//hMtL/59dc/8EV7/APGaP+Ey0v8A59dc/wDBFe//ABmgDoKK5/8A4TLS/wDn11z/AMEV7/8AGaP+Ey0v/n11z/wRXv8A8ZoAPHf/ACTzxL/2Crr/ANFNR4E/5J54a/7BVr/6KWsfxZ4ktNT8G65YWdjrkl1dafcQwp/Yd4NztGwUZMWBkkda3PBcE1r4F8PW9xFJDPFplskkcilWRhEoIIPIIPGKANyiiigArn/GX/IDtv8AsK6b/wClsNb5IVSzEADkk9q8+8eePPC1ppkds+uWbzxahZTSRwP5rIkd1E7lgmcYVSefoOSKAPQqK8//AOF2/Dz/AKGH/wAkrj/43R/wu34ef9DD/wCSVx/8boA9Arn/ABD/AMhzwn/2FZP/AEiuq5//AIXb8PP+hh/8krj/AON1j6z8X/Al3qvh6aDXd8dpqDzTn7JONiG1njB5Tn5nUcevpmgD1iivP/8Ahdvw8/6GH/ySuP8A43R/wu34ef8AQw/+SVx/8boA9Aorz/8A4Xb8PP8AoYf/ACSuP/jdH/C7fh5/0MP/AJJXH/xugD0CivP/APhdvw8/6GH/AMkrj/43R/wu34ef9DD/AOSVx/8AG6APQKK8/wD+F2/Dz/oYf/JK4/8AjdH/AAu34ef9DD/5JXH/AMboA9Aorz//AIXb8PP+hh/8krj/AON0f8Lt+Hn/AEMP/klcf/G6APQKK8//AOF2/Dz/AKGH/wAkrj/43R/wu34ef9DD/wCSVx/8boA6XxCPsYtdbTIawf8AfHIANu5Al3eyjEnrmMeprbrzq6+Mnw3vLOe1m18NFNG0bqbK4wVIwR9z3rsPDF8+p+E9Gv5M77qxgmbPXLRqT/OgDVrjNX+Hceq6xdajH4q8U6d9pYO1vp+o+TCpChchdpxnGTz1rs6KAPP/APhVn/U++Of/AAcf/YUf8Ks/6n3xz/4OP/sK9AooA8//AOFWf9T745/8HH/2FH/CrP8AqffHP/g4/wDsK9AooA8//wCFWf8AU++Of/Bx/wDYUf8ACrP+p98c/wDg4/8AsK9AooA8k8V/DHxHb6HLceF/HHiufUYvnFveaqxWZe6qV24b0zx24zkcd8JNN1zxzNrsOueMPFtpLprQoEg1N42DMZAwYODyNg9O9fRlYt34Q8M6heS3l74e0q6uZSDJNPZxyOxAA5JBPQCgDmf+FWf9T745/wDBx/8AYUf8Ks/6n3xz/wCDj/7Cug/4QTwf/wBCpof/AILof/iaP+EE8H/9Cpof/guh/wDiaAOf/wCFWf8AU++Of/Bx/wDYV0nhfwxD4Wsrm3i1HUtQe5uDcS3GoziWVm2ImN2BkBUUDNc/4L8F+FbrwL4euLjw1o008umWzySSWETM7GJSSSVySTzmtz/hBPB//QqaH/4Lof8A4mgDoKK5PQdJ03RvHWt2+l6faWMDaZYu0drCsSlvNuxkhQBnAAz7CusoAKKbJLHDE0srqkajLMxwAPUmuTvvif4LsJlhfX7aeV3EapaK1ySx6L+7DcmgDrqK5eLx7pU02xbHXtuM7/7EuyP0jz+lU7/4reDtLuFt9Q1C7s5m+7HcaZdRsfoDGDQB2lFcefHM1/8AL4f8Ma1qWRlZpYfscJ/4HNtJ/BTQYPH+q58y80bQYSBgQRtezD1+ZtiD/vk0AdhXP6p448MaNKYb3W7NbgHH2eJ/NmJ9PLTLfpWefh5Y3246/q2sa2W+9HdXjRw/9+otiY+oNdBpehaTokXlaXplnZJ3FvCqZ+uBzQBz58Zarf8AGheD9VuRnAn1DbYxY9fn+cj6JR/Z/jvU/wDj81vTNGi3Z8vTbY3EuPTzJfl/JK7CigDkF+HGi3DCTWp9S12QNuH9p3bSID7RDEY/75rpbDTLDSrcW+nWVtZwDpHbxLGv5ACrVFABRRRQAVh+KPDEPimytreXUdS097a4FxFcadOIpVbY6Y3YOAVdgcVuUUAef/8ACrP+p98c/wDg4/8AsKP+FWf9T745/wDBx/8AYV6BRQB5/wD8Ks/6n3xz/wCDj/7CsvxL8PZtG8K6vqlv468atPZWU1xGsmrkqWRCwBwoOMj1FeqVz/jv/knniX/sFXX/AKKagD5Y8Map8Q/F2uQ6TpPiTXZJ5OWY6jMEiTu7ndwB/gBkkCvoO3+FUy20QufiB41ecKBI8eqlFZsckKVJAz2yfrW54F8C6X4E0NbGxXzLiTDXV2y4ed/6KOcL29yST1FAHn//AAqz/qffHP8A4OP/ALCj/hVn/U++Of8Awcf/AGFegUUAef8A/CrP+p98c/8Ag4/+wo/4VZ/1Pvjn/wAHH/2FegUUAef/APCrP+p98c/+Dj/7Cj/hVn/U++Of/Bx/9hXoFFAHn/8Awqz/AKn3xz/4OP8A7Cu00nTYdG0ax0u3aRoLK3jt42kILFUUKCcADOB6CrlFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFAHP+BP8Aknnhr/sFWv8A6KWugrlfhzqNtfeBNIt4ZD9o0+1isruFgVeCaNAroynkEEd+2DXVUAef63/wmH/Cw7//AIRT+w/+QVZ/af7V87/nrdbdnl/8Czn2qnqWrfE3RrdbjVNQ+HljAzhFkuprmJS2CcAsQM4BOPY1qa9c+KtH8ZXN/ofhP+27W70+2hZ/7RitvLeOSckYfJORKP8APTn9etPG/j9tM0q/8JweH7WG7Ny9/Pc22pIuIpFCmAgBslwM8469s0AMsrrRfEF6b74g+LfCF95OFtNOsdRBtFxkmR1dvnc5xyCAB78X/Fs/hDXtL0m00rxr4b0l9M1GHUIWWeF0DRhgBsDqP4s/hjvWX/wpfVP+hk0P/wAI+yo/4Uvqn/QyaH/4R9lQBoAeEb4Z1/4nrquVw8I1mG1gb/gEJX9Sa2dI1L4YaAB/ZOpeFbNsY3w3UAc/Vs5P4muW/wCFL6p/0Mmh/wDhH2VH/Cl9U/6GTQ//AAj7KgD0D/hO/B//AENeh/8Agxh/+Ko/4Tvwf/0Neh/+DGH/AOKrz/8A4Uvqn/QyaH/4R9lR/wAKX1T/AKGTQ/8Awj7KgD0D/hO/B/8A0Neh/wDgxh/+Ko/4Tvwf/wBDXof/AIMYf/iq8r8S/CTUdN8K6vfv4g0aVLaymmaOPwrZwswVCcB15QnH3hyOorU/4Uvqn/QyaH/4R9lQB3D+PNIuL6Ox0MSeILpkMjppMsMixKO7uzqi5PABbJ9Ksf8ACRap/wBCZrn/AH+sv/kisvwD4AHgqTVbiXUY7261F4jIYLKO0iRY1IULEnAPzNkjrxxnJPaUAZPh/wAR6f4ks3nsmkSSFzFcW067JreQdUdex/MHsTWtWRf+FfDuq3bXeo6Dpd5csAGmuLOORyB05YE1W/4QTwf/ANCpof8A4Lof/iaAOgorn/8AhBPB/wD0Kmh/+C6H/wCJo/4QTwf/ANCpof8A4Lof/iaAOgorn/8AhBPB/wD0Kmh/+C6H/wCJo/4QTwf/ANCpof8A4Lof/iaAOgrl/EtxrU/iDRtE0i/GnrdR3FzdXSwpJIkcRjGFD5XJMoGSDj0qx/wgng//AKFTQ/8AwXQ//E1c03w1oOjXLXOl6JptjOyGNpLW1SJipIOCVAOMgHHtQBT/AOEe1T/oc9c/782X/wAj1Xv/AAheanp11YXfi/XJLa6ieGZPLsxuRgQwyLcEcE9K6iigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKAPOPHv/FLeIdD8S6T+4vtQ1GDTb5R/q7qJ84Lr3dcfK3BHTkcV6PRRQAUUUUAFFFFABRRRQB5lqOr6jr/xdbwfLfXFnpNtaC7YWMphlnbj5XkHzBfZSvua63/hDdL/AOfrXP8Awe3v/wAeoooAjuPAui3VtLb3EusywSoUkjk1u9ZXUjBBBlwQR2NdJRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFAH/9k="

# Decode the Base64 string to bytes
image_data = base64.b64decode(base64_image)

# Convert the bytes into a PIL Image
image = Image.open(BytesIO(image_data))

# Display the image
image.show()

In [4]:
import pdfplumber
import pandas as pd

# Function to extract tables from a PDF and convert to HTML
def extract_table_to_html(pdf_path):
    html_output = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()

            for table in tables:
                # Convert table to DataFrame for easier manipulation
                df = pd.DataFrame(table)

                # Convert DataFrame to HTML
                html_output += df.to_html(index=False, header=False, border=1)

    return html_output

# Example usage
pdf_path = path + "BV_standard.pdf"  # Replace with your PDF file path
html_table = extract_table_to_html(pdf_path)

# Save the HTML output to a file with UTF-8 encoding
with open('output_table.html', 'w', encoding='utf-8') as f:
    f.write(html_table)

print("Table extracted and saved as output_table.html")

Table extracted and saved as output_table.html


In [5]:
import pdfplumber

def is_mergeable(cell):
    """
    Returns True only if the cell is exactly None.
    (We do not consider empty strings "" as mergeable.)
    """
    return cell is None

def is_group_header_row(row):
    """
    Returns True if the row appears to be a group header row.
    We define such a row as one where the first cell is not None
    and all remaining cells are None.
    For example:
        ["INERT GAS", None, None, None, None, None]  → True
    """
    if not row:
        return False
    # The row is a group header if the first cell is not None and every other cell is None.
    return (row[0] is not None) and all(cell is None for cell in row[1:])

def build_html_table(table_data):
    """
    Build an HTML table from table_data (a list of rows, each a list of cell values)
    applying dynamic merging.
    
    • Horizontal merging:
      For a given cell, we look to the right and merge cells whose value is None.
      
    • Vertical merging:
      For a given cell, we look downward and merge only if every cell in the same block
      (from the current column up to the horizontal block) is None.
      However, if the very next row is a group header row (i.e. its first cell is not None and
      all others are None), we stop vertical merging so that the group header remains separate.
    
    All cells are rendered using <td> (i.e. no special treatment for “header rows”).
    """
    nrows = len(table_data)
    ncols = max(len(row) for row in table_data)
    
    # used[i][j] will be True if that cell has already been merged into another cell.
    used = [[False] * ncols for _ in range(nrows)]
    
    html = '<table border="1" class="dataframe">\n'
    for i in range(nrows):
        html += "  <tr>\n"
        for j in range(ncols):
            if used[i][j]:
                continue  # skip cells already merged
            
            # Get the cell text; if missing or None, use an empty string.
            try:
                cell_text = table_data[i][j]
            except IndexError:
                cell_text = ""
            if cell_text is None:
                cell_text = ""
            cell_text = cell_text.strip()
            
            # --- Horizontal merging (compute colspan) ---
            colspan = 1
            for k in range(j + 1, ncols):
                if (k < len(table_data[i])) and (not used[i][k]) and is_mergeable(table_data[i][k]):
                    colspan += 1
                    used[i][k] = True
                else:
                    break
            
            # --- Vertical merging (compute rowspan) ---
            rowspan = 1
            for ii in range(i + 1, nrows):
                # If the next row is a group header row, do not merge vertically.
                if is_group_header_row(table_data[ii]):
                    break
                valid = True
                for jj in range(j, j + colspan):
                    if (jj >= len(table_data[ii])) or used[ii][jj] or (not is_mergeable(table_data[ii][jj])):
                        valid = False
                        break
                if valid:
                    rowspan += 1
                    for jj in range(j, j + colspan):
                        used[ii][jj] = True
                else:
                    break
            
            attrs = ""
            if rowspan > 1:
                attrs += f' rowspan="{rowspan}"'
            if colspan > 1:
                attrs += f' colspan="{colspan}"'
            
            html += f'    <td{attrs}>{cell_text}</td>\n'
        html += "  </tr>\n"
    html += "</table>\n"
    
    return html

def extract_tables_to_html(pdf_path):
    """
    Opens the PDF at pdf_path, extracts tables using pdfplumber,
    and returns the HTML string (with merged cells) for all tables.
    """
    html_tables = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                # table is a list-of-lists (rows), with cell values (strings or None)
                html_table = build_html_table(table)
                html_tables.append(html_table)
    
    return html_tables

# Example usage:
pdf_path = path + "BV_standard.pdf"  # Replace with your actual PDF file path.
tables = extract_tables_to_html(pdf_path)
print(f"There were {len(tables)} tables in {file2}.")
# Save the HTML output to a file with UTF-8 encoding.
#with open('output_table.html', 'w', encoding='utf-8') as f:
#    f.write(html_tables)


There were 1245 tables in test_sample.pdf.


In [4]:
from docling.document_converter import DocumentConverter

source = path + file1  # document per local path or URL
converter = DocumentConverter()
result = converter.convert(source)
md = result.document.export_to_markdown() 

In [9]:
print(result.document.export_to_html)

<bound method DoclingDocument.export_to_html of DoclingDocument(schema_name='DoclingDocument', version='1.1.0', name='BV_page237', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=17612189631379352603, filename='BV_page237.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/pictures/0'), RefItem(cref='#/tables/0'), RefItem(cref='#/texts/2'), RefItem(cref='#/pictures/1'), RefItem(cref='#/texts/3'), RefItem(cref='#/texts/4')], content_layer=<ContentLayer.BODY: 'body'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), groups=[], texts=[TextItem(self_ref='#/texts/0', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.CAPTION: 'caption'

In [7]:
import pymupdf4llm

md_text = pymupdf4llm.to_markdown(path+"BV_page231_processed.pdf")

print(md_text)

Processing ../Data/BV_page231_processed.pdf...
```
             Table 12 : Permissible stresses for copper and copper alloy pipes
              Specified Design temperature (°C)
 Material (annealed) minimum tensile
             strength (N/mm?) <50 75 100 125 150 175 200 | 225 250 | 275 300
 Copper 215 Al 41 40 40 34 27,5 18,5
 Aluminium brass 325 78 78 78 78 78 51 24,5
 Copper-nickel 95/5
 and 90/10 275 68 68 67 65,5 64 62 59 56 52 48 44
 Copper-nickel 70/30 365 81 79 77 75 73 71 69 67 65,5 64 62
2.2.3. Thickness reduction due to bending
a) Unless otherwise justified, the thickness reduction b due to bending is to be determined by the following formula:
    _ Dt
     2,5p
  where:
  p : Bending radius measured on the centre line of the pipe, in mm
         as defined in [1.4.1]
   to : as defined in [2.2.1].
b) When the bending radius is not given, the thickness reduction is to be taken equal to:
   to
   10
c) For straight pipes, the thickness reduction is to be taken equal to 0.
2.2

## Text

In [49]:
document_text = "".join([chunk.metadata.text_as_html if 'Table' in str(type(chunk)) else chunk.text for chunk in chunks])
document_text

'NATIONAL PARTNERSHIP FOR QUALITY AFTERSCHOOL LEARNING www.sedl.org/afterschool/toolkits\n\n����������� �������� �������\n\nTutoring to Enhance Science Skills Tutoring Two: Learning to Make Data Tables . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n\nSample Data for Data Tables\n\nUse these data to create data tables following the Guidelines for Making a Data Table and Checklist for a Data Table.\n\nExample 1: Pet Survey (GR 2–3) Ms. Hubert’s afterschool students took a survey of the 600 students at Morales Elementary School. Students were asked to select their favorite pet from a list of eight animals. Here are the results.\n\nLizard 25, Dog 250, Cat 115, Bird 50, Guinea pig 30, Hamster 45, Fish 75, Ferret 10\n\nExample 2: Electromagnets—Increasing Coils (GR 3–5) The following data were collected using an electromagnet with a 1.5 volt battery, a

In [47]:
pages

[Document(metadata={'source': '../Data/test_sample.pdf', 'coordinates': {'points': ((150.0, 149.1642499999999), (150.0, 215.8003611111111), (1140.4117801666675, 215.8003611111111), (1140.4117801666675, 149.1642499999999)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'filename': 'test_sample.pdf', 'file_directory': '../Data', 'last_modified': '2025-01-26T15:59:25', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'CompositeElement', 'element_id': '62b9a05766ea6dad665f549f87a727e7'}, page_content='NATIONAL PARTNERSHIP FOR QUALITY AFTERSCHOOL LEARNING www.sedl.org/afterschool/toolkits\n\n����������� �������� �������\n\nTutoring to Enhance Science Skills Tutoring Two: Learning to Make Data Tables . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n\nSample Data for Data Tables\n\nUse these data to create data tables f

## Tables

In [1]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
#Add tables to the document text
tables = []
for chunk in chunks:
    if 'Table' in str(type(chunk)):
            
        tables.append(str(chunk.metadata.text_as_html))
# Join all table elements into a single string
document_table = "\n".join(tables)
print(document_table)

<table><thead><th>Number of Coils</th><th>Number of Paperclips</th></thead><tr><td></td><td>3,5,4</td></tr><tr><td></td><td>11, 10, 12</td></tr><tr><td></td><td>15, 13, 14</td></tr></table>
<table><tr><td rowspan="2">Speed (mph) 407.447</td><td>Driver</td><td>Car</td><td>Eng ine</td><td>Date</td></tr><tr><td></td><td>Craig Breedlove</td><td>Spirit of America</td><td>GE J 47</td><td>8/5/63</td></tr><tr><td>413.199</td><td>Tom Green</td><td>Wingfoot Express</td><td>WE J46</td><td>10/2/64</td></tr><tr><td>434.22</td><td>Art Arfons</td><td>Green Monster</td><td>GE J   79</td><td>10/5/64</td></tr><tr><td>468.719</td><td>Craig Breed ove</td><td>Spirit of America</td><td>GE J 79</td><td>10/13/64</td></tr><tr><td>526.277</td><td>Craig Breed ove</td><td>Spirit of America</td><td>GE J 79</td><td>10/15/65</td></tr><tr><td>536.712</td><td>Art Arfons</td><td>Green Monster</td><td>GE J 79</td><td>10/27/65</td></tr><tr><td>555.127</td><td>Craig Breed ove</td><td>Spirit of America, Sonic 1</td><td>GE 

In [39]:
document_all = "\n".join([document_text, document_table])
document_all

'Pt C, Ch 1, Sec 10\n\n1.4 Symbols and units\n\n1.4.1 The following symbols and related units are commonly used in this Section. Additional symbols, related to some formulae indicated in this Section, are listed wherever it is necessary.\n\n: Design pressure, in MPa\n\np\n\nT : Design temperature, in °C\n\nt : Rule required minimum thickness, in mm\n\nD : Pipe external diameter, in mm.\n\n1.5 Class of piping systems\n\n1.5.1 Purpose of the classes of piping systems\n\nPiping systems are subdivided into three classes, denoted as class |, class II and class III, for the purpose of acceptance of materials, selection of joints, heat treatment, welding, pressure testing and the certification of fittings.\n\n1.5.2 Definitions of the classes of piping systems\n\na) Classes |, Il and III are defined in Tab 3\n\nb) The following systems are not covered by Tab 3:\n\n* cargo piping for oil tankers, gas tankers and chemical tankers, and\n\n¢ fluids for refrigerating plants.\n\nTable 3 : Class of p

### Chunks

In [43]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,  # Adjust as needed
    chunk_overlap=200  # Adjust as needed
)
chunks = text_splitter.create_documents([document_all])
chunks

[Document(metadata={}, page_content='Pt C, Ch 1, Sec 10\n\n1.4 Symbols and units\n\n1.4.1 The following symbols and related units are commonly used in this Section. Additional symbols, related to some formulae indicated in this Section, are listed wherever it is necessary.\n\n: Design pressure, in MPa\n\np\n\nT : Design temperature, in °C\n\nt : Rule required minimum thickness, in mm\n\nD : Pipe external diameter, in mm.\n\n1.5 Class of piping systems\n\n1.5.1 Purpose of the classes of piping systems\n\nPiping systems are subdivided into three classes, denoted as class |, class II and class III, for the purpose of acceptance of materials, selection of joints, heat treatment, welding, pressure testing and the certification of fittings.\n\n1.5.2 Definitions of the classes of piping systems\n\na) Classes |, Il and III are defined in Tab 3\n\nb) The following systems are not covered by Tab 3:\n\n* cargo piping for oil tankers, gas tankers and chemical tankers, and\n\n¢ fluids for refrigera

## Images

In [5]:
from unstructured.staging.base import elements_from_base64_gzipped_json

orig_elements = elements_from_base64_gzipped_json(sub_docs[0].metadata["orig_elements"])
orig_elements[1].category

'Title'

In [11]:
orig_elements[7].category

'ListItem'

In [12]:
# Retrive the image_base64 from orig_elements metadata and store in a list. More than one image can exist per chunk.
for doc in sub_docs:
    image_base64_list = []
    if 'orig_elements' in doc.metadata:
        for orig_element in elements_from_base64_gzipped_json(doc.metadata["orig_elements"]):
            if orig_element.category == "Image" and orig_element.metadata.image_base64 != "":
                image_base64_list.append(orig_element.metadata.image_base64)
                print(f"    {doc.metadata['element_id']} \
                      {doc.metadata['page_number']} \
                        {orig_element.category}: \
                            {orig_element.metadata.image_base64}")
        if (len(image_base64_list) > 0):
            doc.metadata['list_image_base64'] = image_base64_list
            print(f"Document {doc.metadata['element_id']} has {len(image_base64_list)} images")    
    
            