In [None]:
# Import necessary libraries
import re  # The re module provides support for regular expressions
from xml.sax.saxutils import escape  # escape() makes text safe to embed in XML element content

from bs4 import BeautifulSoup  # BeautifulSoup is a library for parsing HTML and XML documents

# Load the complete hOCR page into memory, decoding it as UTF-8.
with open(
    "/content/OCR-D-HOCR_OCR-D-HOCR_00001.xml", mode="r", encoding="utf-8"
) as hocr_file:
    hocr_content = hocr_file.read()

# Build the parse tree that every later lookup queries. Python's built-in
# "html.parser" backend is selected explicitly.
soup = BeautifulSoup(markup=hocr_content, features="html.parser")


# Locate the "ocr_carea" div whose title attribute contains the bounding box
# "bbox 479 1384 728 1415"; this is the area the date is read from.
div_date_tag = soup.find(
    "div",
    class_="ocr_carea",
    title=lambda value: value and "bbox 479 1384 728 1415" in value,
)

# date_text stays empty when the div is missing or holds no matching date.
date_text = ""
if div_date_tag:
    # Join the text nodes with a single space. Without a separator,
    # get_text(strip=True) concatenates the stripped pieces directly, so text
    # split across child elements would lose the whitespace that the pattern
    # below requires and the date would never match.
    text = div_date_tag.get_text(" ", strip=True)
    # Expected shape: 1-2 digit day, comma, whitespace, a word (presumably the
    # month name), whitespace, 4-digit year.
    match = re.search(r"\b\d{1,2},\s\w+\s\d{4}\b", text)
    if match:
        date_text = match.group(0)

# Extract the paragraph texts from the text areas identified by their bounding
# boxes. These values are interpolated into the TEI XML document later.
#
# In both lookups the text nodes are joined with a single space: without a
# separator, get_text(strip=True) concatenates the stripped pieces directly,
# which glues together text that is split across child elements.

# First paragraph: the "ocr_carea" div whose title attribute contains
# "bbox 177 99 767 315". p1 is an empty string when no such div exists.
div_p1 = soup.find(
    "div",
    class_="ocr_carea",
    title=lambda value: value and "bbox 177 99 767 315" in value,
)
p1 = div_p1.get_text(" ", strip=True) if div_p1 else ""

# Second paragraph: the "ocr_carea" div whose title attribute contains
# "bbox 176 346 477 433". p2 is an empty string when no such div exists.
div_p2 = soup.find(
    "div",
    class_="ocr_carea",
    title=lambda value: value and "bbox 176 346 477 433" in value,
)
p2 = div_p2.get_text(" ", strip=True) if div_p2 else ""

# Construct the TEI XML document using the extracted information.
#
# The extracted values are free text, so they may contain "&", "<" or ">".
# Each one is passed through escape() before interpolation; inserting them
# verbatim would produce malformed XML as soon as one of those characters
# appears in the recognized text.
#
# NOTE(review): the element names and nesting are kept exactly as they were
# (e.g. <publPlace>, <orgName> directly under <fileDesc>); confirm them against
# the TEI schema the consumers of this file expect.
tei_xml = f"""
<TEI>
  <teiHeader>
    <fileDesc>
    <titleStmt>
        <title></title>
        </titleStmt>
        <orgName></orgName>
      <publicationStmt>
        <publPlace></publPlace>
        <date>{escape(date_text)}</date>
      </publicationStmt>
    </fileDesc>
  </teiHeader>
  <text>
    <body>
      <div>
        <p>{escape(p1)}</p>
        <p>{escape(p2)}</p>
      </div>
    </body>
  </text>
</TEI>
"""

# Persist the generated TEI document to disk as UTF-8 text, replacing any
# existing file of the same name.
with open("output_141748.tei.xml", mode="w", encoding="utf-8") as tei_file:
    tei_file.write(tei_xml)