# In [1]:
# Import necessary libraries
from bs4 import BeautifulSoup  # BeautifulSoup is a library for parsing HTML and XML documents
import re  # The re module provides support for regular expressions

# Load the whole hOCR document into memory as UTF-8 text
with open("/content/combined_hocr_143209.hocr", encoding="utf-8") as file:
    hocr_content = file.read()

# Build the parse tree using Python's built-in "html.parser" backend
soup = BeautifulSoup(hocr_content, "html.parser")

# Extract key information from the HOCR content

# Title: text of the first content area whose title attribute contains
# bbox "8 13 1302 159"; falls back to "" when no such div exists
div_title_tag = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 8 13 1302 159" in attr)
title_text = div_title_tag.get_text(strip=True) if div_title_tag else ""

# Extract the date text from bounding box "398 1813 887 1862"
div_date_tag = soup.find("div", class_="ocr_carea", title=lambda value: value and "bbox 398 1813 887 1862" in value)
date_text = ""  # Stays empty when the div is missing or contains no date-shaped text
if div_date_tag:
    # Join the text nodes with a single space. get_text(strip=True) on its own
    # glues the stripped nodes together with no separator, which removes the
    # whitespace the pattern below requires whenever the words live in
    # separate child elements.
    text = div_date_tag.get_text(" ", strip=True)
    # Shape matched: digits, comma, whitespace, word, whitespace, digits
    match = re.search(r"\b\d+,\s\w+\s\d+\b", text)
    if match:
        date_text = match.group(0)

# Extract the place names from bounding box "1 1991 1292 2047"
div_place_tag = soup.find("div", class_="ocr_carea", title=lambda value: value and "bbox 1 1991 1292 2047" in value)
place_text = ""  # Stays empty when the div is missing or names neither city
if div_place_tag:
    # Join the text nodes with a single space. Without a separator,
    # get_text(strip=True) glues neighbouring words together, and the \b
    # word boundaries around the city names would then fail to match.
    text = div_place_tag.get_text(" ", strip=True)
    matches = re.findall(r'\bLeipzig\b|\bBerlin\b', text)  # Every occurrence, in document order
    if matches:
        place_text = ' and '.join(matches)

# Extract the organization name from bounding box "1 1991 1292 2047"
div_org_name_tag = soup.find("div", class_="ocr_carea", title=lambda value: value and "bbox 1 1991 1292 2047" in value)
org_name_text = ""  # Stays empty when the div is missing or the pattern does not match
if div_org_name_tag:
    # Join the text nodes with a single space. The pattern below contains
    # literal spaces, which are never present when get_text(strip=True)
    # concatenates the stripped nodes with no separator.
    text = div_org_name_tag.get_text(" ", strip=True)
    # Shape matched: "<capital>. G. <Capitalised word>"
    match = re.search(r"\b[A-Z]\. G\. [A-Z][a-z]+\b", text)
    if match:
        org_name_text = match.group(0)

# Extract various text elements from different divs with specific bounding boxes
# These elements will be used to construct the TEI XML document later

# Every lookup below takes the first <div class="ocr_carea"> whose title
# attribute contains the quoted "bbox ..." string and stores its text, or ""
# when no such div exists.
# NOTE(review): get_text(strip=True) joins the stripped text nodes with no
# separator, so text held in separate child elements runs together - confirm
# this is intended.

# Heading 1: bbox "169 375 1135 485"
div_h1 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 169 375 1135 485" in attr)
h1 = div_h1.get_text(strip=True) if div_h1 else ""

# Subheading 1: bbox "7 538 1300 760"
div_sh1 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 7 538 1300 760" in attr)
sh1 = div_sh1.get_text(strip=True) if div_sh1 else ""

# Paragraph 1: bbox "526 863 779 905"
div_p1 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 526 863 779 905" in attr)
p1 = div_p1.get_text(strip=True) if div_p1 else ""

# Paragraph 2: bbox "116 937 1194 993"
div_p2 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 116 937 1194 993" in attr)
p2 = div_p2.get_text(strip=True) if div_p2 else ""

# Paragraph 3: bbox "497 1029 803 1136"
div_p3 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 497 1029 803 1136" in attr)
p3 = div_p3.get_text(strip=True) if div_p3 else ""

# Paragraph 4: bbox "445 1152 853 1205"
div_p4 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 445 1152 853 1205" in attr)
p4 = div_p4.get_text(strip=True) if div_p4 else ""

# Paragraph 5: bbox "153 1307 1140 1698"
div_p5 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 153 1307 1140 1698" in attr)
p5 = div_p5.get_text(strip=True) if div_p5 else ""

# Paragraph 6: bbox "398 1813 887 1862"
div_p6 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 398 1813 887 1862" in attr)
p6 = div_p6.get_text(strip=True) if div_p6 else ""

# Paragraph 7: bbox "636 1951 659 1971"
div_p7 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 636 1951 659 1971" in attr)
p7 = div_p7.get_text(strip=True) if div_p7 else ""

# Paragraph 8: bbox "1 1991 1292 2047"
div_p8 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 1 1991 1292 2047" in attr)
p8 = div_p8.get_text(strip=True) if div_p8 else ""

# Paragraph 9: bbox "0 2090 1311 2138"
div_p9 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 0 2090 1311 2138" in attr)
p9 = div_p9.get_text(strip=True) if div_p9 else ""

# Every lookup below takes the first <div class="ocr_carea"> whose title
# attribute contains the quoted "bbox ..." string and stores its text, or ""
# when no such div exists.
# NOTE(review): get_text(strip=True) joins the stripped text nodes with no
# separator, so text held in separate child elements runs together - confirm
# this is intended.

# Heading 2: bbox "660 11 1245 108"
div_h2 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 660 11 1245 108" in attr)
h2 = div_h2.get_text(strip=True) if div_h2 else ""

# Heading 3: bbox "50 123 990 217"
div_h3 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 50 123 990 217" in attr)
h3 = div_h3.get_text(strip=True) if div_h3 else ""

# Paragraph 10: bbox "35 234 1869 1248"
div_p10 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 35 234 1869 1248" in attr)
p10 = div_p10.get_text(strip=True) if div_p10 else ""

# Heading 4: bbox "35 1301 1752 1480"
div_h4 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 35 1301 1752 1480" in attr)
h4 = div_h4.get_text(strip=True) if div_h4 else ""

# Subheading 2: bbox "29 1496 580 1561"
div_sh2 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 29 1496 580 1561" in attr)
sh2 = div_sh2.get_text(strip=True) if div_sh2 else ""

# Paragraph 11: bbox "30 1571 1860 2216"
div_p11 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 30 1571 1860 2216" in attr)
p11 = div_p11.get_text(strip=True) if div_p11 else ""

# Subheading 3: bbox "29 2236 488 2302"
div_sh3 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 29 2236 488 2302" in attr)
sh3 = div_sh3.get_text(strip=True) if div_sh3 else ""

# Paragraph 12: bbox "23 2314 1854 2882"
div_p12 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 23 2314 1854 2882" in attr)
p12 = div_p12.get_text(strip=True) if div_p12 else ""

# Every lookup below takes the first <div class="ocr_carea"> whose title
# attribute contains the quoted "bbox ..." string and stores its text, or ""
# when no such div exists.
# NOTE(review): get_text(strip=True) joins the stripped text nodes with no
# separator, so text held in separate child elements runs together - confirm
# this is intended.

# Heading 5: bbox "528 39 1386 138"
div_h5 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 528 39 1386 138" in attr)
h5 = div_h5.get_text(strip=True) if div_h5 else ""

# Paragraph 13: bbox "45 150 1871 916"
div_p13 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 45 150 1871 916" in attr)
p13 = div_p13.get_text(strip=True) if div_p13 else ""

# Paragraph 14: bbox "45 966 1868 1303"
div_p14 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 45 966 1868 1303" in attr)
p14 = div_p14.get_text(strip=True) if div_p14 else ""

# Paragraph 15: bbox "116 1343 1870 2304"
div_p15 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 116 1343 1870 2304" in attr)
p15 = div_p15.get_text(strip=True) if div_p15 else ""

# Paragraph 16: bbox "114 2337 1870 2920"
div_p16 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 114 2337 1870 2920" in attr)
p16 = div_p16.get_text(strip=True) if div_p16 else ""

# Every lookup below takes the first <div class="ocr_carea"> whose title
# attribute contains the quoted "bbox ..." string and stores its text, or ""
# when no such div exists.
# NOTE(review): get_text(strip=True) joins the stripped text nodes with no
# separator, so text held in separate child elements runs together - confirm
# this is intended.

# Heading 6: bbox "88 85 1191 163"
div_h6 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 88 85 1191 163" in attr)
h6 = div_h6.get_text(strip=True) if div_h6 else ""

# Paragraph 17: bbox "77 177 1832 503"
div_p17 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 77 177 1832 503" in attr)
p17 = div_p17.get_text(strip=True) if div_p17 else ""

# Paragraph 18: bbox "83 543 1774 616"
div_p18 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 83 543 1774 616" in attr)
p18 = div_p18.get_text(strip=True) if div_p18 else ""

# Heading 7: bbox "103 683 686 960"
div_h7 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 103 683 686 960" in attr)
h7 = div_h7.get_text(strip=True) if div_h7 else ""

# Paragraph 19: bbox "305 975 1835 1233"
div_p19 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 305 975 1835 1233" in attr)
p19 = div_p19.get_text(strip=True) if div_p19 else ""

# Paragraph 20: bbox "304 1256 1838 1937"
div_p20 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 304 1256 1838 1937" in attr)
p20 = div_p20.get_text(strip=True) if div_p20 else ""

# Heading 8: bbox "228 2010 634 2082"
div_h8 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 228 2010 634 2082" in attr)
h8 = div_h8.get_text(strip=True) if div_h8 else ""

# Paragraph 21: bbox "300 2098 1833 2597"
div_p21 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 300 2098 1833 2597" in attr)
p21 = div_p21.get_text(strip=True) if div_p21 else ""

# Paragraph 22: bbox "301 2636 1830 2792"
div_p22 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 301 2636 1830 2792" in attr)
p22 = div_p22.get_text(strip=True) if div_p22 else ""

# Paragraph 23: bbox "298 2811 1835 2943"
div_p23 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 298 2811 1835 2943" in attr)
p23 = div_p23.get_text(strip=True) if div_p23 else ""

# Every lookup below takes the first <div class="ocr_carea"> whose title
# attribute contains the quoted "bbox ..." string and stores its text, or ""
# when no such div exists.
# NOTE(review): get_text(strip=True) joins the stripped text nodes with no
# separator, so text held in separate child elements runs together - confirm
# this is intended.

# Paragraph 24: bbox "321 12 1844 154"
div_p24 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 321 12 1844 154" in attr)
p24 = div_p24.get_text(strip=True) if div_p24 else ""

# Paragraph 25: bbox "319 137 1842 268"
div_p25 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 319 137 1842 268" in attr)
p25 = div_p25.get_text(strip=True) if div_p25 else ""

# Paragraph 26: bbox "317 293 1842 553"
div_p26 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 317 293 1842 553" in attr)
p26 = div_p26.get_text(strip=True) if div_p26 else ""

# Paragraph 27: bbox "317 568 1839 761"
div_p27 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 317 568 1839 761" in attr)
p27 = div_p27.get_text(strip=True) if div_p27 else ""

# Paragraph 28: bbox "315 785 1470 940"
div_p28 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 315 785 1470 940" in attr)
p28 = div_p28.get_text(strip=True) if div_p28 else ""

# Paragraph 29: bbox "317 954 1836 1100"
div_p29 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 317 954 1836 1100" in attr)
p29 = div_p29.get_text(strip=True) if div_p29 else ""

# Paragraph 30: bbox "314 1111 1201 1184"
div_p30 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 314 1111 1201 1184" in attr)
p30 = div_p30.get_text(strip=True) if div_p30 else ""

# Heading 9: bbox "237 1247 784 1314"
div_h9 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 237 1247 784 1314" in attr)
h9 = div_h9.get_text(strip=True) if div_h9 else ""

# Paragraph 31: bbox "306 1343 1836 1858"
div_p31 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 306 1343 1836 1858" in attr)
p31 = div_p31.get_text(strip=True) if div_p31 else ""

# Paragraph 32: bbox "303 1872 1831 2063"
div_p32 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 303 1872 1831 2063" in attr)
p32 = div_p32.get_text(strip=True) if div_p32 else ""

# Paragraph 33: bbox "306 2086 1272 2417"
div_p33 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 306 2086 1272 2417" in attr)
p33 = div_p33.get_text(strip=True) if div_p33 else ""

# Heading 10: bbox "225 2460 649 2547"
div_h10 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 225 2460 649 2547" in attr)
h10 = div_h10.get_text(strip=True) if div_h10 else ""

# Paragraph 34: bbox "297 2570 1825 2890"
div_p34 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 297 2570 1825 2890" in attr)
p34 = div_p34.get_text(strip=True) if div_p34 else ""

# Every lookup below takes the first <div class="ocr_carea"> whose title
# attribute contains the quoted "bbox ..." string and stores its text, or ""
# when no such div exists.
# NOTE(review): get_text(strip=True) joins the stripped text nodes with no
# separator, so text held in separate child elements runs together - confirm
# this is intended.

# Heading 11: bbox "184 17 1364 92"
div_h11 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 184 17 1364 92" in attr)
h11 = div_h11.get_text(strip=True) if div_h11 else ""

# Paragraph 35: bbox "253 111 1782 372"
div_p35 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 253 111 1782 372" in attr)
p35 = div_p35.get_text(strip=True) if div_p35 else ""

# Heading 12: bbox "168 425 521 498"
div_h12 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 168 425 521 498" in attr)
h12 = div_h12.get_text(strip=True) if div_h12 else ""

# Paragraph 36: bbox "177 515 1782 778"
div_p36 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 177 515 1782 778" in attr)
p36 = div_p36.get_text(strip=True) if div_p36 else ""

# Paragraph 37: bbox "175 794 1779 1052"
div_p37 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 175 794 1779 1052" in attr)
p37 = div_p37.get_text(strip=True) if div_p37 else ""

# Paragraph 38: bbox "146 1109 1779 2050"
div_p38 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 146 1109 1779 2050" in attr)
p38 = div_p38.get_text(strip=True) if div_p38 else ""

# Heading 13: bbox "152 2205 1684 2305"
div_h13 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 152 2205 1684 2305" in attr)
h13 = div_h13.get_text(strip=True) if div_h13 else ""

# Heading 14: bbox "169 2304 993 2381"
div_h14 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 169 2304 993 2381" in attr)
h14 = div_h14.get_text(strip=True) if div_h14 else ""

# Paragraph 39: bbox "163 2392 1768 2897"
div_p39 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 163 2392 1768 2897" in attr)
p39 = div_p39.get_text(strip=True) if div_p39 else ""

# Every lookup below takes the first <div class="ocr_carea"> whose title
# attribute contains the quoted "bbox ..." string and stores its text, or ""
# when no such div exists.
# NOTE(review): get_text(strip=True) joins the stripped text nodes with no
# separator, so text held in separate child elements runs together - confirm
# this is intended.

# Paragraph 40: bbox "247 37 1131 104"
div_p40 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 247 37 1131 104" in attr)
p40 = div_p40.get_text(strip=True) if div_p40 else ""

# Paragraph 41: bbox "245 151 1852 281"
div_p41 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 245 151 1852 281" in attr)
p41 = div_p41.get_text(strip=True) if div_p41 else ""

# Paragraph 42: bbox "244 315 1850 573"
div_p42 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 244 315 1850 573" in attr)
p42 = div_p42.get_text(strip=True) if div_p42 else ""

# Paragraph 43: bbox "245 604 1851 739"
div_p43 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 245 604 1851 739" in attr)
p43 = div_p43.get_text(strip=True) if div_p43 else ""

# Paragraph 44: bbox "245 770 1854 1407"
div_p44 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 245 770 1854 1407" in attr)
p44 = div_p44.get_text(strip=True) if div_p44 else ""

# Paragraph 45: bbox "223 1400 1468 1477"
div_p45 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 223 1400 1468 1477" in attr)
p45 = div_p45.get_text(strip=True) if div_p45 else ""

# Heading 15: bbox "248 1641 1458 1806"
div_h15 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 248 1641 1458 1806" in attr)
h15 = div_h15.get_text(strip=True) if div_h15 else ""

# Paragraph 46: bbox "242 1828 1852 2089"
div_p46 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 242 1828 1852 2089" in attr)
p46 = div_p46.get_text(strip=True) if div_p46 else ""

# Paragraph 47: bbox "244 2137 1850 2463"
div_p47 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 244 2137 1850 2463" in attr)
p47 = div_p47.get_text(strip=True) if div_p47 else ""

# Paragraph 48: bbox "244 2479 1849 2615"
div_p48 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 244 2479 1849 2615" in attr)
p48 = div_p48.get_text(strip=True) if div_p48 else ""

# Paragraph 49: bbox "244 2628 1850 2763"
div_p49 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 244 2628 1850 2763" in attr)
p49 = div_p49.get_text(strip=True) if div_p49 else ""

# Paragraph 50: bbox "244 2777 1847 2913"
div_p50 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 244 2777 1847 2913" in attr)
p50 = div_p50.get_text(strip=True) if div_p50 else ""

# Every lookup below takes the first <div class="ocr_carea"> whose title
# attribute contains the quoted "bbox ..." string and stores its text, or ""
# when no such div exists.
# NOTE(review): get_text(strip=True) joins the stripped text nodes with no
# separator, so text held in separate child elements runs together - confirm
# this is intended.

# Paragraph 51: bbox "96 20 1695 378"
div_p51 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 96 20 1695 378" in attr)
p51 = div_p51.get_text(strip=True) if div_p51 else ""

# Paragraph 52: bbox "96 401 395 466"
div_p52 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 96 401 395 466" in attr)
p52 = div_p52.get_text(strip=True) if div_p52 else ""

# Paragraph 53: bbox "93 489 1695 683"
div_p53 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 93 489 1695 683" in attr)
p53 = div_p53.get_text(strip=True) if div_p53 else ""

# Paragraph 54: bbox "94 705 1697 900"
div_p54 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 94 705 1697 900" in attr)
p54 = div_p54.get_text(strip=True) if div_p54 else ""

# Paragraph 55: bbox "92 919 1695 1111"
div_p55 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 92 919 1695 1111" in attr)
p55 = div_p55.get_text(strip=True) if div_p55 else ""

# Paragraph 56: bbox "94 1132 1694 1262"
div_p56 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 94 1132 1694 1262" in attr)
p56 = div_p56.get_text(strip=True) if div_p56 else ""

# Heading 16: bbox "92 1351 1381 1426"
div_h16 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 92 1351 1381 1426" in attr)
h16 = div_h16.get_text(strip=True) if div_h16 else ""

# Paragraph 57: bbox "92 1439 1694 1639"
div_p57 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 92 1439 1694 1639" in attr)
p57 = div_p57.get_text(strip=True) if div_p57 else ""

# Paragraph 58: bbox "88 1652 1696 2172"
div_p58 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 88 1652 1696 2172" in attr)
p58 = div_p58.get_text(strip=True) if div_p58 else ""

# p59: text of the area at bbox "91 2190 1694 2387"
div_p59 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 91 2190 1694 2387" in attr)
p59 = div_p59.get_text(strip=True) if div_p59 else ""

# p60: text of the area at bbox "89 2467 1692 2885"
div_p60 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 89 2467 1692 2885" in attr)
p60 = div_p60.get_text(strip=True) if div_p60 else ""

# Every lookup below takes the first <div class="ocr_carea"> whose title
# attribute contains the quoted "bbox ..." string and stores its text, or ""
# when no such div exists.
# NOTE(review): get_text(strip=True) joins the stripped text nodes with no
# separator, so text held in separate child elements runs together - confirm
# this is intended.

# p61: text of the area at bbox "267 2 892 72"
div_p61 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 267 2 892 72" in attr)
p61 = div_p61.get_text(strip=True) if div_p61 else ""

# p62: text of the area at bbox "264 119 1867 253"
div_p62 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 264 119 1867 253" in attr)
p62 = div_p62.get_text(strip=True) if div_p62 else ""

# p63: text of the area at bbox "263 285 1866 416"
div_p63 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 263 285 1866 416" in attr)
p63 = div_p63.get_text(strip=True) if div_p63 else ""

# p64: text of the area at bbox "265 447 1864 578"
div_p64 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 265 447 1864 578" in attr)
p64 = div_p64.get_text(strip=True) if div_p64 else ""

# p65: text of the area at bbox "263 616 1867 811"
div_p65 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 263 616 1867 811" in attr)
p65 = div_p65.get_text(strip=True) if div_p65 else ""

# h17_0: text of the area at bbox "191 967 246 1022"
div_h17_0 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 191 967 246 1022" in attr)
h17_0 = div_h17_0.get_text(strip=True) if div_h17_0 else ""

# h17: text of the area at bbox "266 961 1081 1035"
div_h17 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 266 961 1081 1035" in attr)
h17 = div_h17.get_text(strip=True) if div_h17 else ""

# p66: text of the area at bbox "262 1048 1866 1259"
div_p66 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 262 1048 1866 1259" in attr)
p66 = div_p66.get_text(strip=True) if div_p66 else ""

# p67: text of the area at bbox "261 1287 1865 1419"
div_p67 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 261 1287 1865 1419" in attr)
p67 = div_p67.get_text(strip=True) if div_p67 else ""

# p68: text of the area at bbox "262 1450 1411 1520"
div_p68 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 262 1450 1411 1520" in attr)
p68 = div_p68.get_text(strip=True) if div_p68 else ""

# p69: text of the area at bbox "259 1551 1864 1743"
div_p69 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 259 1551 1864 1743" in attr)
p69 = div_p69.get_text(strip=True) if div_p69 else ""

# p70: text of the area at bbox "257 1781 1863 1973"
div_p70 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 257 1781 1863 1973" in attr)
p70 = div_p70.get_text(strip=True) if div_p70 else ""

# p71: text of the area at bbox "257 2008 1863 2199"
div_p71 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 257 2008 1863 2199" in attr)
p71 = div_p71.get_text(strip=True) if div_p71 else ""

# p72: text of the area at bbox "257 2216 1860 2411"
div_p72 = soup.find("div", class_="ocr_carea", title=lambda attr: attr and "bbox 257 2216 1860 2411" in attr)
p72 = div_p72.get_text(strip=True) if div_p72 else ""

# Extract text from the div containing bounding box "184 2516 239 2572"
div_h18_0 = soup.find("div", class_="ocr_carea", title=lambda value: value and "bbox 184 2516 239 2572" in value)
if div_h18_0:
    h18_0 = div_h18_0.get_text(strip=True)  # Extract text from the found div
else:
    h18_0 = ""  # Set h18_0 to empty string if the div is not found

# Extract text from the div containing bounding box "260 2515 627 2582"
div_h18 = soup.find("div", class_="ocr_carea", title=lambda value: value and "bbox 260 2515 627 2582" in value)
if div_h18:
    h18 = div_h18.get_text(strip=True)  # Extract text from the found div
else:
    h18 = ""  # Set h18 to empty string if the div is not found

# Extract text from the div containing bounding box "255 2612 1858 2869"
div_p73 = soup.find("div", class_="ocr_carea", title=lambda value: value and "bbox 255 2612 1858 2869" in value)
if div_p73:
    p73 = div_p73.get_text(strip=True)  # Extract text from the found div
else:
    p73 = ""  # Set p73 to empty string if the div is not found

# Extract text from the div containing bounding box "49 21 675 102"
div_h19 = soup.find("div", class_="ocr_carea", title=lambda value: value and "bbox 49 21 675 102" in value)
if div_h19:
    h19 = div_h19.get_text(strip=True)  # Extract text from the found div
else:
    h19 = ""  # Set h19 to empty string if the div is not found

# Extract text from the div containing bounding box "117 133 1871 643"
div_p74 = soup.find("div", class_="ocr_carea", title=lambda value: value and "bbox 117 133 1871 643" in value)
if div_p74:
    p74 = div_p74.get_text(strip=True)  # Extract text from the found div
else:
    p74 = ""  # Set p74 to empty string if the div is not found

# Extract text from the div containing bounding box "113 667 1868 924"
div_p75 = soup.find("div", class_="ocr_carea", title=lambda value: value and "bbox 113 667 1868 924" in value)
if div_p75:
    p75 = div_p75.get_text(strip=True)  # Extract text from the found div
else:
    p75 = ""  # Set p75 to empty string if the div is not found

# Extract text from the div containing bounding box "38 2731 1857 2894"
div_p76 = soup.find("div", class_="ocr_carea", title=lambda value: value and "bbox 38 2731 1857 2894" in value)
if div_p76:
    p76 = div_p76.get_text(strip=True)  # Extract text from the found div
else:
    p76 = ""  # Set p76 to empty string if the div is not found


# Construct the TEI XML document using the extracted information

# OCR text is free-form and may contain "&", "<" or ">". Inserted verbatim,
# those characters would make the output malformed XML, so every extracted
# value is escaped before it is placed in the template.
from xml.sax.saxutils import escape as xml_escape


def _xml(value):
    """Return ``value`` as text that is safe to place inside an XML element.

    ``str()`` is applied first so a non-string value is rendered the same way
    a plain f-string placeholder would render it.
    """
    return xml_escape(str(value))


# Define the TEI XML structure with extracted data
tei_xml = f"""
<TEI>
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title>{_xml(title_text)}</title>
        <author>
          <orgName>{_xml(org_name_text)}</orgName>
        </author>
      </titleStmt>
      <publicationStmt>
        <publPlace>{_xml(place_text)}</publPlace>
        <date>{_xml(date_text)}</date>
      </publicationStmt>
    </fileDesc>
  </teiHeader>
  <text>
    <body>
      <div>
        <!-- Heading -->
        <head>{_xml(h1)}</head>
        <!-- Subtext -->
        <head>{_xml(sh1)}</head>
        <p>{_xml(p1)}</p>
        <p>{_xml(p2)}</p>
        <p>{_xml(p3)}</p>
        <p>{_xml(p4)}</p>
        <p>{_xml(p5)}</p>
        <p>{_xml(p6)}</p>
        <p>{_xml(p7)}</p>
        <p>{_xml(p8)}</p>
        <p>{_xml(p9)}</p>
      </div>
      <div>
        <!-- Heading -->
        <head>{_xml(h2)}</head>
        <!-- Heading -->
        <head>{_xml(h3)}</head>
        <p>{_xml(p10)}</p>
        <!-- Heading -->
        <head>{_xml(h4)}</head>
        <!-- Sub-heading -->
        <head>{_xml(sh2)}</head>
        <p>{_xml(p11)}</p>
        <!-- Sub-heading -->
        <head>{_xml(sh3)}</head>
        <p>{_xml(p12)}</p>
      </div>
      <div>
        <!-- Heading -->
        <head>{_xml(h5)}</head>
        <p>{_xml(p13)}</p>
        <p>{_xml(p14)}</p>
        <p>{_xml(p15)}</p>
        <p>{_xml(p16)}</p>
      </div>
      <div>
        <!-- Heading -->
        <head>{_xml(h6)}</head>
        <p>{_xml(p17)}</p>
        <p>{_xml(p18)}</p>
        <!-- Heading -->
        <head>{_xml(h7)}</head>
        <p>{_xml(p19)}</p>
        <p>{_xml(p20)}</p>
        <!-- Heading -->
        <head>{_xml(h8)}</head>
        <p>{_xml(p21)}</p>
        <p>{_xml(p22)}</p>
        <p>{_xml(p23)}</p>
      </div>
      <div>
        <p>{_xml(p24)} {_xml(p25)}</p>
        <p>{_xml(p26)}</p>
        <p>{_xml(p27)}</p>
        <p>{_xml(p28)}</p>
        <p>{_xml(p29)}</p>
        <p>{_xml(p30)}</p>
        <!-- Heading -->
        <head>{_xml(h9)}</head>
        <p>{_xml(p31)}</p>
        <p>{_xml(p32)}</p>
        <p>{_xml(p33)}</p>
        <!-- Heading -->
        <head>{_xml(h10)}</head>
        <p>{_xml(p34)}</p>
      </div>
      <div>
        <!-- Heading -->
        <head>{_xml(h11)}</head>
        <p>{_xml(p35)}</p>
        <!-- Heading -->
        <head>{_xml(h12)}</head>
        <p>{_xml(p36)}</p>
        <p>{_xml(p37)}</p>
        <p>{_xml(p38)}</p>
        <!-- Heading -->
        <head>{_xml(h13)}</head>
        <!-- Heading -->
        <head>{_xml(h14)}</head>
        <p>{_xml(p39)}</p>
      </div>
      <div>
        <p>{_xml(p40)}</p>
        <p>{_xml(p41)}</p>
        <p>{_xml(p42)}</p>
        <p>{_xml(p43)}</p>
        <p>{_xml(p44)}</p>
        <p>{_xml(p45)}</p>
        <!-- Heading -->
        <head>{_xml(h15)}</head>
        <p>{_xml(p46)}</p>
        <p>{_xml(p47)}</p>
        <p>{_xml(p48)}</p>
        <p>{_xml(p49)}</p>
        <p>{_xml(p50)}</p>
      </div>
      <div>
        <p>{_xml(p51)}</p>
        <p>{_xml(p52)}</p>
        <p>{_xml(p53)}</p>
        <p>{_xml(p54)}</p>
        <p>{_xml(p55)}</p>
        <p>{_xml(p56)}</p>
        <!-- Heading -->
        <head>{_xml(h16)}</head>
        <p>{_xml(p57)}</p>
        <p>{_xml(p58)}</p>
        <p>{_xml(p59)}</p>
        <p>{_xml(p60)}</p>
      </div>
      <div>
        <p>{_xml(p61)}</p>
        <p>{_xml(p62)}</p>
        <p>{_xml(p63)}</p>
        <p>{_xml(p64)}</p>
        <p>{_xml(p65)}</p>
        <!-- Heading -->
        <head>{_xml(h17_0)} {_xml(h17)}</head>
        <p>{_xml(p66)}</p>
        <p>{_xml(p67)}</p>
        <p>{_xml(p68)}</p>
        <p>{_xml(p69)}</p>
        <p>{_xml(p70)}</p>
        <p>{_xml(p71)}</p>
        <p>{_xml(p72)}</p>
        <!-- Heading -->
        <head>{_xml(h18_0)} {_xml(h18)}</head>
        <p>{_xml(p73)}</p>
      </div>
      <div>
      <!-- Heading -->
        <head>{_xml(h19)}</head>
        <p>{_xml(p74)}</p>
        <p>{_xml(p75)}</p>
        <p>{_xml(p76)}</p>
      </div>
    </body>
  </text>
</TEI>
"""

# Persist the assembled TEI document to disk as UTF-8 text
output_path = "final_output_143209.tei.xml"
with open(output_path, mode="w", encoding="utf-8") as tei_file:
    tei_file.write(tei_xml)  # the whole document is written in a single call

