# In [1]:
# Import necessary libraries
import re  # The re module provides support for regular expressions
from xml.sax.saxutils import escape  # escape() makes text safe to embed in XML element content

from bs4 import BeautifulSoup  # BeautifulSoup is a library for parsing HTML and XML documents

# Locations of the input hOCR file and of the TEI file this script writes.
HOCR_PATH = "/content/combined_hocr_142207.hocr"
TEI_PATH = "final_output_142207.tei.xml"

# Read the HOCR file
with open(HOCR_PATH, "r", encoding="utf-8") as file:
    hocr_content = file.read()

# Parse the HOCR content using BeautifulSoup
soup = BeautifulSoup(hocr_content, "html.parser")


def _carea_text(bbox):
    """Return the text of the first ``ocr_carea`` div located at *bbox*.

    Args:
        bbox: The four bounding-box coordinates as they appear in the div's
            ``title`` attribute, e.g. ``"944 186 2235 334"``.

    Returns:
        The div's text as produced by ``get_text(strip=True)``, or ``""``
        when no ``ocr_carea`` div carries that bounding box.
    """
    # The lookahead keeps "... 334" from also matching a title whose last
    # coordinate merely starts with the same digits (e.g. "... 3345").
    pattern = re.compile(rf"bbox {re.escape(bbox)}(?!\d)")
    div = soup.find(
        "div",
        class_="ocr_carea",
        title=lambda value: bool(value and pattern.search(value)),
    )
    # NOTE(review): get_text(strip=True) strips every text node and joins them
    # without a separator. If the hOCR keeps each word in its own element, the
    # words end up concatenated and the "\s" / "\b" parts of the patterns
    # further down cannot match -- confirm against the actual input.
    return div.get_text(strip=True) if div is not None else ""


def _first_match(pattern, text):
    """Return the first match of regex *pattern* in *text*, or ``""`` if none."""
    found = re.search(pattern, text)
    return found.group(0) if found else ""


# ---------------------------------------------------------------------------
# Extract the text of every region that ends up in the TEI document.
# The regions are grouped by the <div> of the TEI body they are written to.
# ---------------------------------------------------------------------------

# Title used in the TEI header
title_text = _carea_text("944 186 2235 334")

# 1st <div>
h1 = _carea_text("1144 537 2039 645")
sh1 = _carea_text("1387 691 1793 748")
h2 = _carea_text("1344 796 1835 906")
p1 = _carea_text("1466 1040 1717 1082")
p2 = _carea_text("1060 1113 2124 1254")
p3 = _carea_text("1436 1289 1740 1338")
p4 = _carea_text("1385 1348 1792 1457")
p5 = _carea_text("1102 1558 2074 1873")
p6 = _carea_text("1361 1979 1810 2028")
p7 = _carea_text("997 2177 2175 2230")
p8 = _carea_text("939 2274 2235 2349")

# Header metadata derived from regions already read above: the date comes
# from the same region as p6, place and organisation from the same region
# as p7, so those texts are reused instead of searching the tree again.
# NOTE(review): the "." in the date pattern is unescaped and therefore matches
# any character, not only a literal period -- left as is; confirm intent.
date_text = _first_match(r"\b\d+.\s\w+\s\d+\b", p6)
place_text = ' and '.join(re.findall(r'\bLeipzig\b|\bBerlin\b', p7))
org_name_text = _first_match(r"\b[A-Z]\.G\.[A-Z][a-z]+\b", p7)

# 2nd <div>
h3 = _carea_text("956 168 2246 260")
sh2 = _carea_text("1326 265 1870 317")
p9 = _carea_text("954 337 2244 389")
h4 = _carea_text("952 496 1634 561")
p10 = _carea_text("1188 574 2229 801")
h5 = _carea_text("949 862 2243 990")
sh3 = _carea_text("941 2099 1158 2151")
p11 = _carea_text("1184 996 2244 2214")

# 3rd <div>
h6 = _carea_text("405 240 2217 341")
# NOTE(review): p12 uses the same bounding box as h6 and is not written to the
# TEI document below -- possibly a copy-paste slip; confirm the intended bbox.
p12 = _carea_text("405 240 2217 341")
p13 = _carea_text("400 555 2224 1002")
p14 = _carea_text("402 1031 2221 1227")
p15 = _carea_text("400 1220 2217 1349")
h7 = _carea_text("405 1524 1330 1596")
p16 = _carea_text("474 1644 2220 1836")
p17 = _carea_text("475 1880 2226 2268")
p18 = _carea_text("474 2303 2226 2869")
p19 = _carea_text("477 2928 2226 3122")

# 4th <div>
p20 = _carea_text("362 241 2112 563")
p21 = _carea_text("361 577 2111 836")
h8_0 = _carea_text("292 968 349 1024")
h8 = _carea_text("367 936 1110 1033")
p22 = _carea_text("366 1060 2113 1442")
p23 = _carea_text("366 1457 2116 1776")
p24 = _carea_text("751 1818 2115 1945")
p25 = _carea_text("850 1959 2115 2090")
p26 = _carea_text("428 2101 2116 2248")
p27 = _carea_text("401 2229 2117 2391")
p28 = _carea_text("371 2360 2117 2496")
p29 = _carea_text("904 2472 1930 2537")
p30 = _carea_text("784 2561 2119 2689")
p31 = _carea_text("853 2706 2117 2834")
p32 = _carea_text("373 2872 2119 3132")

# 5th <div>
p33 = _carea_text("497 247 2245 379")
p34 = _carea_text("496 394 2245 653")
p35 = _carea_text("496 667 2243 863")
h9_0 = _carea_text("422 987 478 1045")
h9 = _carea_text("494 981 1303 1054")
p36 = _carea_text("490 1082 2237 1339")
p37 = _carea_text("486 1352 2237 2114")
p38 = _carea_text("485 2128 2235 2384")
h10_0 = _carea_text("413 2499 470 2556")
h10 = _carea_text("488 2492 2235 2644")
p39 = _carea_text("486 2665 2233 2984")
p40 = _carea_text("489 2998 2234 3130")

# 6th <div>
p41 = _carea_text("355 282 2106 794")
p42 = _carea_text("358 809 2108 1318")
p43 = _carea_text("358 1333 2109 1590")
h11_0 = _carea_text("284 1691 340 1747")
h11 = _carea_text("357 1687 1235 1760")
# NOTE(review): p44 and p45 use the same bounding box as p43, so the same text
# is written three times -- looks like a copy-paste slip; confirm the bboxes.
p44 = _carea_text("358 1333 2109 1590")
p45 = _carea_text("358 1333 2109 1590")
p46 = _carea_text("2071 3225 2107 3275")
p47 = _carea_text("358 2817 2106 3169")

# 7th <div>
p48 = _carea_text("1057 197 2298 874")
h12_0 = _carea_text("1026 1305 1047 1324")
h12 = _carea_text("1056 946 1441 1003")
p49 = _carea_text("1056 1017 2296 1208")
h13_0 = _carea_text("1004 1281 1033 1324")
h13 = _carea_text("1055 1276 1370 1332")
p50 = _carea_text("1055 1348 2295 1494")

# Construct the TEI XML document using the extracted information.
# Every value goes through escape() so that "&", "<" or ">" occurring in the
# OCR text cannot break the well-formedness of the generated XML.
tei_xml = f"""
<TEI>
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title>{escape(title_text)}</title>
        <author>
          <orgName>{escape(org_name_text)}</orgName>
        </author>
      </titleStmt>
      <publicationStmt>
        <publPlace>{escape(place_text)}</publPlace>
        <date>{escape(date_text)}</date>
      </publicationStmt>
    </fileDesc>
  </teiHeader>
  <text>
    <body>
      <div>
        <!-- Heading -->
        <head>{escape(h1)}</head>
        <!-- Subtext -->
        <head>{escape(sh1)}</head>
         <!-- Heading -->
        <head>{escape(h2)}</head>
        <p>{escape(p1)}</p>
        <p>{escape(p2)}</p>
        <p>{escape(p3)}</p>
        <p>{escape(p4)}</p>
        <p>{escape(p5)}</p>
        <p>{escape(p6)}</p>
        <p>{escape(p7)}</p>
        <p>{escape(p8)}</p>
      </div>
      <div>
       <!-- Heading -->
        <head>{escape(h3)}</head>
        <!-- Subtext -->
        <head>{escape(sh2)}</head>
        <p>{escape(p9)}</p>
        <!-- Heading -->
        <head>{escape(h4)}</head>
        <p>{escape(p10)}</p>
        <!-- Heading -->
        <head>{escape(h5)}</head>
        <!-- Sub-heading -->
        <head>{escape(sh3)}</head>
        <p>{escape(p11)}</p>
      </div>
      <div>
        <!-- Heading -->
        <head>{escape(h6)}</head>
        <p>{escape(p13)}</p>
        <p>{escape(p14)}</p>
        <p>{escape(p15)}</p>
        <!-- Heading -->
        <head>{escape(h7)}</head>
        <p>{escape(p16)}</p>
        <list type="number">
          <item n="1">{escape(p17)}</item>
          <item n="2">{escape(p18)}</item>
        </list>
        <p>{escape(p19)}</p>
      </div>
      <div>
        <p>{escape(p20)}</p>
        <p>{escape(p21)}</p>
        <!-- Heading -->
        <head>{escape(h8_0)} {escape(h8)}</head>
        <p>{escape(p22)} {escape(p23)} {escape(p24)} {escape(p25)} {escape(p26)} {escape(p27)} {escape(p28)} {escape(p29)} {escape(p30)} {escape(p31)} {escape(p32)}</p>
      </div>
      <div>
        <p>{escape(p33)}</p>
        <p>{escape(p34)}</p>
        <p>{escape(p35)}</p>
        <!-- Heading -->
        <head>{escape(h9_0)} {escape(h9)}</head>
        <p>{escape(p36)}</p>
        <p>{escape(p37)}</p>
        <p>{escape(p38)}</p>
        <!-- Heading -->
        <head>{escape(h10_0)} {escape(h10)}</head>
        <p>{escape(p39)}</p>
        <p>{escape(p40)}</p>
      </div>
      <div>
        <p>{escape(p41)}</p>
        <p>{escape(p42)}</p>
        <p>{escape(p43)}</p>
        <!-- Heading -->
        <head>{escape(h11_0)} {escape(h11)}</head>
        <p>{escape(p44)} {escape(p45)}</p>
        <p>{escape(p46)} {escape(p47)}</p>
      </div>
      <div>
        <p>{escape(p48)}</p>
        <!-- Heading -->
        <head>{escape(h12_0)} {escape(h12)}</head>
        <p>{escape(p49)}</p>
        <head>{escape(h13_0)} {escape(h13)}</head>
        <p>{escape(p50)}</p>
      </div>
    </body>
  </text>
</TEI>
"""

# Write the TEI XML document to a file
with open(TEI_PATH, "w", encoding="utf-8") as file:
    file.write(tei_xml)