# In [1]:
# Import necessary libraries
import re  # Import re module for regular expressions
from xml.sax.saxutils import escape  # Escape &, < and > in text placed into XML

from bs4 import BeautifulSoup  # Import BeautifulSoup for HTML parsing

# Load the hOCR markup from disk as UTF-8 text
hocr_path = "/content/combined_hocr_143122.hocr"
with open(hocr_path, mode="r", encoding="utf-8") as hocr_file:
    hocr_content = hocr_file.read()

# Build the parse tree that all later bounding-box lookups search
soup = BeautifulSoup(hocr_content, "html.parser")

# Extract key information from the HOCR content


def _carea_text(bbox):
    """Return the text of the first ``ocr_carea`` div whose title carries *bbox*.

    Args:
        bbox: The four bounding-box coordinates exactly as they appear in the
            hOCR ``title`` attribute, e.g. ``"205 161 1491 307"``.

    Returns:
        The div's text as produced by ``get_text(strip=True)``, or ``""`` when
        no matching div exists in ``soup``.
    """
    # The negative lookahead stops a box ending in "307" from also matching a
    # title whose last coordinate is "3070".
    bbox_pattern = re.compile(rf"bbox {re.escape(bbox)}(?!\d)")
    area = soup.find(
        "div",
        class_="ocr_carea",
        title=lambda value: bool(value) and bbox_pattern.search(value) is not None,
    )
    # NOTE(review): get_text(strip=True) strips every text fragment and joins
    # them with no separator, so words held in separate child elements end up
    # fused together. Confirm against the hOCR input whether separator=" " is
    # wanted; behaviour is intentionally left unchanged here.
    return area.get_text(strip=True) if area is not None else ""


# --- Header metadata -------------------------------------------------------

# Title: the whole text of the area at "205 161 1491 307"
title_text = _carea_text("205 161 1491 307")

# Date: first "<digits>. <word> <digits>" pattern inside the area at
# "634 1895 1064 1941"; empty when the area or the pattern is missing
date_area_text = _carea_text("634 1895 1064 1941")
date_match = re.search(r"\b\d+\.\s\w+\s\d+\b", date_area_text)
date_text = date_match.group(0) if date_match else ""

# Place and organization are both read from the area at "204 2134 1494 2186",
# so that area is looked up only once
place_area_text = _carea_text("204 2134 1494 2186")

# Every occurrence of "Leipzig" or "Berlin", in order of appearance
place_text = ' and '.join(re.findall(r'\bLeipzig\b|\bBerlin\b', place_area_text))

# Organization: first "<X>. G. <Name>" pattern in the same area
org_match = re.search(r"\b[A-Z]\. G\. [A-Z][a-z]+\b", place_area_text)
org_name_text = org_match.group(0) if org_match else ""

# --- Body text -------------------------------------------------------------
# Extract various text elements from different divs with specific bounding
# boxes. These elements will be used to construct the TEI XML document later.
# hN = heading N, shN = subheading N, pN = paragraph N.

h1 = _carea_text("364 547 1331 656")
sh1 = _carea_text("198 705 1023 764")
h2 = _carea_text("443 820 1244 926")
p1 = _carea_text("722 1072 976 1113")
p2 = _carea_text("303 1147 1395 1199")
p3 = _carea_text("363 1389 1336 1739")
p4 = date_area_text  # same area as the date: "634 1895 1064 1941"
p5 = place_area_text  # same area as place/organization: "204 2134 1494 2186"
p6 = _carea_text("203 2236 1513 2277")

h3 = _carea_text("1116 494 1522 563")
p7 = _carea_text("672 581 1956 897")
h4 = _carea_text("673 957 1512 1086")
p8 = _carea_text("671 1097 1955 1286")
h5 = _carea_text("672 1353 1208 1415")
h6 = _carea_text("1122 1444 1488 1503")
p9 = _carea_text("664 1516 1952 2074")
h7 = _carea_text("663 2134 1608 2272")
p10 = _carea_text("661 2289 1947 2541")
p11 = _carea_text("662 2538 938 2596")
p12 = _carea_text("1096 2604 1663 2675")
p13 = _carea_text("1132 3027 1465 3077")

p14 = _carea_text("308 295 2129 692")
h8 = _carea_text("308 896 1500 985")
p15 = _carea_text("648 1025 2133 1294")
h9 = _carea_text("311 1402 2041 1505")
p16 = _carea_text("302 1535 2135 2646")
p17 = _carea_text("650 2678 1409 2746")
p18 = _carea_text("653 2768 2131 2931")
p19 = _carea_text("654 2959 2137 3033")
p20 = _carea_text("310 3095 1839 3168")

p21 = _carea_text("1008 201 2237 400")
p22 = _carea_text("1001 454 2234 1158")
p23 = _carea_text("998 1217 2228 1562")
p24 = _carea_text("989 1616 2226 2238")

p25 = _carea_text("24 153 2243 719")
p26 = _carea_text("23 724 1727 817")

p27 = _carea_text("351 265 1467 357")
h10 = _carea_text("643 391 1180 441")
p28 = _carea_text("350 452 1471 677")
h11 = _carea_text("724 716 1099 769")
p29 = _carea_text("354 778 1473 1092")
p30 = _carea_text("356 1136 1476 1556")
h12 = _carea_text("598 1590 1239 1641")
p31 = _carea_text("358 1652 1477 1877")
h13 = _carea_text("671 1914 1168 1964")
p32 = _carea_text("360 1976 1479 2113")

# Construct the TEI XML document using the extracted information

# Define the TEI XML structure with extracted data.
# Every interpolated value is OCR output and may contain "&", "<" or ">";
# escape() turns those into entities so the written file stays well-formed XML.
# The literal template text (including its whitespace) is left untouched.
tei_xml = f"""
<TEI>
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title>{escape(title_text)}</title>
        <author>
          <orgName>{escape(org_name_text)}</orgName>
        </author>
      </titleStmt>
      <publicationStmt>
        <publPlace>{escape(place_text)}</publPlace>
        <date>{escape(date_text)}</date>
      </publicationStmt>
    </fileDesc>
  </teiHeader>
  <text>
    <body>
      <div>
        <!-- Heading -->
        <head>{escape(h1)}</head>
        <!-- Subtext -->
        <head>{escape(sh1)}</head>
        <!-- Heading -->
        <head>{escape(h2)}</head>
        <p>{escape(p1)}</p>
        <p>{escape(p2)}</p>
        <p>{escape(p3)}</p>
        <p>{escape(p4)}</p>
        <p>{escape(p5)}</p>
        <p>{escape(p6)}</p>
      </div>
      <div>
        <!-- Heading -->
        <head>{escape(h3)}</head>
        <p>{escape(p7)}</p>
        <!-- Heading -->
        <head>{escape(h4)}</head>
        <p>{escape(p8)}</p>
        <!-- Heading -->
        <head>{escape(h5)} {escape(h6)}</head>
        <p>{escape(p9)}</p>
        <!-- Heading -->
        <head>{escape(h7)}</head>
        <p>{escape(p10)} {escape(p11)}</p>
        <p>{escape(p12)}</p>
        <div type="footer">
          <!--Footer-->
          <fw>{escape(p13)}</fw>
        </div>
      </div>
      <div>
        <p>{escape(p14)}</p>
        <!-- Heading -->
        <head>{escape(h8)}</head>
        <p>{escape(p15)}</p>
        <!-- Heading -->
        <head>{escape(h9)}</head>
         <p>{escape(p16)} {escape(p17)} {escape(p18)} {escape(p19)}</p>
         <p>{escape(p20)}</p>
      </div>
      <div>
        <p>{escape(p21)}</p>
        <p>{escape(p22)}</p>
        <p>{escape(p23)}</p>
        <p>{escape(p24)}</p>
      </div>
      <div>
        <p>{escape(p25)} {escape(p26)}</p>
      </div>
      <div>
        <p>{escape(p27)}</p>
        <!-- Heading -->
        <head>{escape(h10)}</head>
        <p>{escape(p28)}</p>
        <!-- Heading -->
        <head>{escape(h11)}</head>
        <p>{escape(p29)}</p>
        <p>{escape(p30)}</p>
        <!-- Heading -->
        <head>{escape(h12)}</head>
        <p>{escape(p31)}</p>
        <!-- Heading -->
        <head>{escape(h13)}</head>
        <p>{escape(p32)}</p>
      </div>
    </body>
  </text>
</TEI>
"""

# Persist the assembled TEI document as UTF-8 text
output_path = "final_output_143122.tei.xml"
with open(output_path, mode="w", encoding="utf-8") as tei_file:
    tei_file.write(tei_xml)

