# In [None]:
# Import necessary libraries
import re  # The re module provides support for regular expressions
from xml.sax.saxutils import escape  # escape() makes text safe to embed in XML element content

from bs4 import BeautifulSoup  # BeautifulSoup is a library for parsing HTML and XML documents

# Load the whole hOCR document into memory as UTF-8 text.
with open("OCR-D-HOCR-Final_20240110-142856_2-2.xml", encoding="utf-8") as file:
    hocr_content = file.read()

# Build the parse tree that all later lookups run against, using Python's
# built-in "html.parser" backend.
soup = BeautifulSoup(markup=hocr_content, features="html.parser")

# Extract the place names and the organization name.
#
# Both values are searched for in the text of the same "ocr_carea" div: the
# one whose title attribute contains the bounding box below.
PLACE_ORG_BBOX = "bbox 271 2112 1554 2169"

# Look the div up once. soup.find() returns the first match, or None when no
# div matches; the title may be missing on some divs, hence the `value and`.
div_place_tag = soup.find(
    "div",
    class_="ocr_carea",
    title=lambda value: value and PLACE_ORG_BBOX in value,
)
# Same element as div_place_tag; the separate name is kept so that any code
# relying on it keeps working.
div_org_name_tag = div_place_tag

# Defaults used when the div is missing or nothing matches.
place_text = ""
org_name_text = ""

if div_place_tag:
    # get_text(strip=True) strips every text fragment and concatenates them
    # without a separator.
    text = div_place_tag.get_text(strip=True)

    # Collect every whole-word occurrence of "Leipzig" or "Berlin".
    matches = re.findall(r"\b(?:Leipzig|Berlin)\b", text)
    # dict.fromkeys() drops repeated cities while keeping first-seen order, so
    # a city mentioned twice is not listed twice. Joining an empty collection
    # yields "", which is the same as the default.
    place_text = " und ".join(dict.fromkeys(matches))

    # Organization name format: two dotted capital initials, a space, then a
    # capitalized word (pattern shape: "X.Y. Name").
    # NOTE(review): [a-z] does not cover umlauts or "ß" - confirm this is
    # sufficient for the names expected in the source material.
    match = re.search(r"\b[A-Z]\.[A-Z]\. [A-Z][a-z]+\b", text)
    if match:
        org_name_text = match.group(0)

# Extract various text elements from different divs with specific bounding boxes.
# These elements will be used to construct the TEI XML document later.
#
# Every element comes from an "ocr_carea" div that is identified by the
# bounding box string found in its title attribute. If a div is not found,
# the corresponding text falls back to the empty string.


def _find_carea(bbox):
    """Return the first ``ocr_carea`` div whose title contains *bbox*.

    Returns None when no such div exists. Divs without a title attribute are
    skipped (the ``value and`` guard).
    """
    return soup.find(
        "div",
        class_="ocr_carea",
        title=lambda value: value and bbox in value,
    )


def _carea_text(tag):
    """Return the text of *tag*, or "" when *tag* is None.

    ``get_text(strip=True)`` strips each text fragment inside the tag and
    concatenates the fragments without a separator.
    """
    return tag.get_text(strip=True) if tag else ""


# First paragraph
div_p1 = _find_carea("bbox 271 210 1561 347")
p1 = _carea_text(div_p1)

# First heading
div_h1 = _find_carea("bbox 273 375 1320 448")
h1 = _carea_text(div_h1)

# Second paragraph
div_p2 = _find_carea("bbox 377 462 1559 601")
p2 = _carea_text(div_p2)

# Third paragraph
div_p3 = _find_carea("bbox 271 615 1559 841")
p3 = _carea_text(div_p3)

# Fourth paragraph
div_p4 = _find_carea("bbox 273 911 946 965")
p4 = _carea_text(div_p4)

# Second heading
div_h2 = _find_carea("bbox 274 995 1404 1069")
h2 = _carea_text(div_h2)

# Fifth paragraph
div_p5 = _find_carea("bbox 379 1078 1556 1305")
p5 = _carea_text(div_p5)

# Sixth paragraph
div_p6 = _find_carea("bbox 271 1317 1556 1541")
p6 = _carea_text(div_p6)

# Construct the TEI XML document using the extracted information.
#
# The extracted values are plain text taken from the parsed hOCR document and
# may contain "&", "<" or ">". Each value is passed through escape() so that
# these characters become entities and the written file stays well-formed XML.
# The element layout and whitespace of the template are unchanged.
tei_xml = f"""
<TEI>
  <teiHeader>
    <fileDesc>
        <orgName>{escape(org_name_text)}</orgName>
      <publicationStmt>
        <publPlace>{escape(place_text)}</publPlace>
      </publicationStmt>
    </fileDesc>
  </teiHeader>
  <text>
    <body>
      <div>
        <p>{escape(p1)}</p>
        <head>{escape(h1)}</head>
        <p>{escape(p2)}</p>
        <p>{escape(p3)}</p>
        <p>{escape(p4)}</p>
        <head>{escape(h2)}</head>
        <p>{escape(p5)}</p>
        <p>{escape(p6)}</p>
      </div>
    </body>
  </text>
</TEI>
"""

# Write the TEI XML document to a file (UTF-8, overwriting any existing file).
with open("output_142856_2-2.tei.xml", "w", encoding="utf-8") as file:
    file.write(tei_xml)