## Sanitize ISO doc


In [None]:
# Installs the required Python packages: beautifulsoup4 for HTML parsing and html2text for HTML-to-Markdown conversion.

!pip install beautifulsoup4
!pip install html2text



In [None]:
# Imports

import re
import os
from bs4 import BeautifulSoup
import difflib
import html2text

In [None]:
# Open HTML

# If no source document is present, write a minimal placeholder page so the
# rest of the notebook still has an input file to read.
if not os.path.exists('docs/p1.html'):
    # open(..., 'w') raises FileNotFoundError when the parent directory is
    # missing, so make sure 'docs/' exists before creating the placeholder.
    os.makedirs('docs', exist_ok=True)
    with open('docs/p1.html', 'w', encoding='utf-8') as f:
        f.write('<html><body>Forward ... A complete listing of these bodies can be found at www.iso.org/members.html.</body></html>')

# Load the raw HTML; later cells derive `clean_html` from this string.
with open('docs/p1.html', 'r', encoding='utf-8') as f:
    html = f.read()

In [None]:
# Strip three unwanted regions from the raw HTML. Each pattern captures the
# text to delete and ends with a lookahead, so the header that follows the
# removed region is left in place.

# 'Foreword' section, up to (not including) the 'Introduction' header.
pattern1 = r'(<h1 class="sts-sec-title">Foreword</h1>.*?)(?=<h1 class="sts-sec-title">Introduction</h1>)'

# Instructional text under 'Terms and definitions', up to the 3.1 header.
pattern2 = r'(<div class="sts-p"><i>The Terms and definitions clause is a mandatory element of the text.</i></div>.*?)(?=<h2 class="sts-sec-title">3.1&nbsp;&nbsp;&nbsp;Types of ontologies</h2>)'

# 'Annex A', up to (not including) the 'Bibliography' header.
pattern3 = r'(<h1 class="sts-app-header">Annex A</h1>.*?)(?=<h2 class="sts-sec-title">Bibliography</h2>)'

# Apply the removals in order; DOTALL lets '.' span line breaks so a region
# may cover many lines.
clean_html = html
for section_pattern in (pattern1, pattern2, pattern3):
    clean_html = re.compile(section_pattern, re.DOTALL).sub('', clean_html)

# Replace the text in the 'Normative references' section with "There are no normative references"
def replace_normative_references(html):
    """Replace the body of the 'Normative references' section with a fixed sentence.

    Parses ``html``, looks for the first ``div.sts-section`` whose
    ``h1.sts-sec-title`` text contains 'Normative references', removes every
    direct child tag of that section except the heading, and appends a
    ``div.sts-p`` reading "There are no normative references.".

    Returns the serialised document; if no matching section exists the
    document is returned re-serialised but otherwise untouched.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # find_all searches the whole tree, so nested sts-section divs are
    # considered as well as top-level ones.
    for section in soup.find_all('div', class_='sts-section'):
        h1 = section.find('h1', class_='sts-sec-title')
        if h1 is None or 'Normative references' not in h1.get_text():
            continue

        # Drop every direct child tag except the heading itself. Identity is
        # used on purpose: bs4 tags compare equal when their name, attributes
        # and contents match, so `!=` would also spare a look-alike heading.
        for child in section.find_all(recursive=False):
            if child is not h1:
                child.decompose()

        # Add the replacement paragraph
        new_para = soup.new_tag('div', attrs={"class": "sts-p"})
        new_para.string = "There are no normative references."
        section.append(new_para)
        break  # Only replace the first matching section

    return str(soup)

clean_html = replace_normative_references(clean_html)

# Replace the text in the 'Parts in the OBI series' section with "New text"
def replace_parts(html):
    """Swap the body of section 4, 'Parts in the OBI series', for fixed text.

    The section runs from its ``h1.sts-sec-title`` header up to the next
    ``h1.sts-sec-title`` (or the end of the input). The header is kept and
    everything after it is replaced by one ``div.sts-p`` paragraph. Input
    without that header is returned unchanged.
    """
    header_re = re.compile(
        r'<h1 class="sts-sec-title"[^>]*>4\s*&nbsp;&nbsp;&nbsp;Parts in the OBI series</h1>'
    )
    section_re = re.compile(
        r'(<h1 class="sts-sec-title"[^>]*>4\s*&nbsp;&nbsp;&nbsp;Parts in the OBI series</h1>.*?)(?=<h1 class="sts-sec-title"|$)',
        re.DOTALL
    )

    # Replacement paragraph; the PoscCaesar URL is wrapped in <a> so it is
    # clickable in the rendered page.
    replacement_text = ''.join((
        'This document is Part 1 of the multi-part ISO 23726 OBI series. ',
        'Part 2 Vocabulary is development and in the ISO process. ',
        'Part 3 is the foundation of the OBI series and describes the Industrial Data Ontology. ',
        'IDO is used for representing industrial data and information, building vocabularies, and managing asset models which employ reference data libraries and exploit OWL DL. ',
        'IDO supports automated machine reasoning, data quality checks, and information models used in all life cycle phases of industrial systems, processes, and products. ',
        'The ISO standard for IDO is in development in ISO and due for public release in 2026. ',
        'A draft of the IDO standard document and a download of the digital artefact (current at the start of the ISO process in 2024) can be downloaded from the PoscCaesar web site at ',
        '<a href="https://rds.posccaesar.org/ontology/lis14/">https://rds.posccaesar.org/ontology/lis14/</a>. ',
        'Other parts of OBI are envisaged and Part 100, the Schedule Data Ontology, is already in development in ISO. ',
        'Guideline documents for practical implementation are also being developed and made available by industry associations.',
    ))

    def swap_body(found):
        section = found.group(1)
        heading = header_re.search(section)
        if heading is None:
            # No recognisable header: leave the matched text as it was.
            return section
        return f'{heading.group(0)}\n<div class="sts-p">{replacement_text}</div>\n'

    return section_re.sub(swap_body, html)
 
# Run the "Parts in the OBI series" rewrite on the working HTML string.
clean_html = replace_parts(clean_html)

In [None]:
# Front matter placed ahead of the cleaned document: first the page title...
title_html = '<h1 class="sts-sec-title">Ontology-based Interoperability for Industrial Data</h1>\n'

# ...then the authors, abstract and licence block that follows the title.
new_section_html = '''
<h3>Lead authors: Melinda Hodkiewicz and Andreas Neumann</h3>
<h3>Development team: Pål Rylandsholm, Maja Milicic Brandt, Johan W Kluwer, Caitlin Woods, Dirk Walter, Inghild Kaarstad</h3>
<h2>Abstract</h2>
<div class="sts-p">This document provides guidance to industry users and the semantic data modelling community on 1) the vision for the ISO 23726 Ontology-based Interoperability (OBI) series, and 2) a set of principles which resources will have to comply with in order to be considered <code>compliant</code> with IDO and the ISO 23726 series. The <a href="https://rds.posccaesar.org/ontology/lis14/">Industrial Data Ontology (IDO)</a> is the upper ontology in the ISO 23726 series. IDO is currently inside the ISO process and due to be published as an ISO standard in 2026.<br><br>
The contents of this document will be submitted to ISO as part of ISO 23726-1 in October 2025. The standardisation process is expected to take 3 years. During this period the contents of this document will evolve as other organisations and national bodies work to shape the ideas presented in this initial version. Once inside the ISO process only members of the ISO TC184/SC4 WG26 committee and the liaison groups will have access to the draft standard and any associated digital artefacts until it is published in 2027/2028.
</div>
<h2>Licence</h2>
<div class="sts-p"><a href="https://creativecommons.org/licenses/by-sa/4.0/">CC-BY-SA-4.0 licence</a></div>
'''

# Prepend title and front matter, in that order, to the working HTML.
clean_html = ''.join((title_html, new_section_html, clean_html))

In [None]:
# Remove metadata from the document by deleting every HTML comment.
# DOTALL lets '.' cross line breaks, so multi-line comments are removed too;
# the lazy '.*?' stops at the first closing '-->'.
clean_html = re.compile(r'<!--.*?-->', re.DOTALL).sub('', clean_html)

In [None]:
# Adds figures (see README for instructions)

# Number given to the next figure; advanced by rename_graphic_path().
fig_counter = 1

# Match: ./p1_files/graphic-<whatever>.png
pattern = re.compile(r'\./p1_files/graphic-[^"]+\.png')


def rename_graphic_path(match):
    """Return the next './figs/Fig<N>.png' path and advance the counter.

    Used as the replacement callback for `pattern`; the matched text itself
    is ignored, so figures are numbered in order of appearance.
    """
    global fig_counter
    figure_number = fig_counter
    fig_counter = figure_number + 1
    return f'./figs/Fig{figure_number}.png'


# Rewrite every matched graphic path to its sequential ./figs/ name.
clean_html = re.sub(pattern, rename_graphic_path, clean_html)

In [None]:
# Change 'document' to 'draft' in the "Scope" section

def replace_document_with_draft_in_scope(html):
    """Rename 'document(s)' to 'draft(s)' inside the Scope section only.

    A Scope section is a ``div.sts-section`` whose ``h1.sts-sec-title``
    contains 'Scope', matched up to the first ``</div>`` followed by another
    ``</div>``. Within that span the whole words Documents/documents/
    Document/document are replaced case-for-case; text outside it is not
    touched.
    """
    # Applied in this order; each pattern is a whole-word match.
    word_swaps = (
        (r'\bDocuments\b', 'Drafts'),
        (r'\bdocuments\b', 'drafts'),
        (r'\bDocument\b', 'Draft'),
        (r'\bdocument\b', 'draft'),
    )

    scope_block_pattern = re.compile(
        r'(<div class="sts-section"[^>]*?>\s*<h1 class="sts-sec-title">[^<]*Scope[^<]*</h1>.*?</div>\s*</div>)',
        re.DOTALL
    )

    def replace_in_scope_block(match):
        block_text = match.group(1)
        for word_re, substitute in word_swaps:
            block_text = re.sub(word_re, substitute, block_text)
        return block_text

    return scope_block_pattern.sub(replace_in_scope_block, html)

# Apply the Scope-section wording change to the working HTML string.
clean_html = replace_document_with_draft_in_scope(clean_html)

In [None]:
# Remove the paragraph with the ISO/TC 184/SC 4 committee information: a
# div.sts-p that starts with "The ISO/TC 184/SC 4 committee" and ends with
# "ontology-based interoperable ecosystem." (DOTALL: it may span lines).
pattern = re.compile(
    r'<div class="sts-p">The ISO/TC 184/SC 4 committee.*?ontology-based\s*interoperable ecosystem\.\s*</div>',
    flags=re.DOTALL,
)

clean_html = pattern.sub('', clean_html)

In [None]:
# Redirect sd.iso.org hrefs to existing term IDs

def redirect_links_to_existing_term_ids(html_string, log_path="link_redirection_log.txt"):
    """Point sd.iso.org links at the matching term section inside this document.

    Every ``div.sts-tbx-sec`` that has an ``id`` contributes its preferred and
    admitted terms (lower-cased, parenthesised parts removed) to a
    term -> section-id map. Each ``<a>`` whose href contains 'sd.iso.org' is
    then handled as follows:

    * exact term match: href becomes ``#<section id>``;
    * close match (difflib ratio >= 0.85): href is redirected and the match
      is logged;
    * no match: the ``<a>`` wrapper is removed (its text is kept) and the
      anchor text is logged.

    Args:
        html_string: HTML document to process.
        log_path: File that receives the fuzzy-match / unmatched log for this
            call (overwritten each time). Pass a falsy value to skip writing.

    Returns:
        The serialised HTML with the links rewritten.
    """
    soup = BeautifulSoup(html_string, 'html.parser')

    # Step 1: Build map of term text (preferred + admitted) → section ID
    term_to_id = {}
    for section in soup.find_all('div', class_='sts-tbx-sec'):
        section_id = section.get('id')
        if not section_id:
            continue

        for term_class in ['preferredTerm', 'admittedTerm']:
            for term_elem in section.find_all(class_=term_class):
                term_text = term_elem.get_text(strip=True).lower()
                term_text = re.sub(r'\(.*?\)', '', term_text).strip()
                if term_text:
                    # A later section with the same term text overrides an earlier one.
                    term_to_id[term_text] = section_id

    log_lines = []

    # Step 2: Redirect all external hrefs to internal term matches
    for a in soup.find_all('a', href=True):
        if 'sd.iso.org' not in a['href']:
            continue

        anchor_text = a.get_text(strip=True).lower()
        anchor_text_cleaned = re.sub(r'\(.*?\)', '', anchor_text).strip()

        matched_id = term_to_id.get(anchor_text_cleaned)
        if matched_id:
            a['href'] = f'#{matched_id}'
            continue

        match = difflib.get_close_matches(anchor_text_cleaned, term_to_id, n=1, cutoff=0.85)
        if match:
            matched_id = term_to_id[match[0]]
            a['href'] = f'#{matched_id}'
            log_lines.append(f"[Fuzzy Match] '{anchor_text}' → '{match[0]}' → #{matched_id}")
        else:
            log_lines.append(f"[Unmatched – removed] '{anchor_text}'")
            a.unwrap()

    # Step 3: Persist the log. Previously the collected lines were discarded
    # and `log_path` was never used.
    if log_path:
        with open(log_path, 'w', encoding='utf-8') as log_file:
            log_file.write('\n'.join(log_lines))
            if log_lines:
                log_file.write('\n')

    return str(soup)

clean_html = redirect_links_to_existing_term_ids(clean_html)

In [None]:
# Generate a Table of Contents (TOC) for the document

def generate_toc(html):
    """Build a nested 'Table of Contents' and insert it after the first <h1>.

    All h1-h6 headings are collected; when the very first heading is an <h1>
    it is treated as the page title and left out of the TOC. Headings without
    an ``id`` get one derived from their text (stripped, lower-cased, spaces
    replaced by '-'). The TOC is a ``div#toc`` holding nested ``<ul>`` lists,
    one nesting level per heading level.

    Returns the serialised HTML, or ``html`` unchanged when it has no headings.
    """
    soup = BeautifulSoup(html, 'html.parser')
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    if not headings:
        return html  # nothing to do

    # Skip the first <h1> (page title)
    if headings[0].name == 'h1':
        headings = headings[1:]

    toc_soup = BeautifulSoup('<div id="toc"><h2>Table of Contents</h2></div>', 'html.parser')
    toc_div = toc_soup.div
    root_list = toc_soup.new_tag('ul')
    toc_div.append(root_list)

    # open_lists[k] is the <ul> currently open at nesting depth k.
    open_lists = [root_list]
    depth = 0

    for heading in headings:
        level = int(heading.name[1])

        if heading.has_attr('id'):
            anchor = heading['id']
        else:
            anchor = heading.text.strip().lower().replace(' ', '-')
            heading['id'] = anchor

        # Open nested lists until we are as deep as this heading's level...
        while depth < level:
            nested = toc_soup.new_tag('ul')
            open_lists[-1].append(nested)
            open_lists.append(nested)
            depth += 1
        # ...or close lists when the heading is shallower than the last one.
        while depth > level:
            open_lists.pop()
            depth -= 1

        entry = toc_soup.new_tag('li')
        link = toc_soup.new_tag('a', href=f'#{anchor}')
        link.string = heading.text.strip()
        entry.append(link)
        open_lists[-1].append(entry)

    # Insert TOC after the first <h1> (title), or at top of <body> if not found
    title_heading = soup.find('h1')
    if title_heading:
        title_heading.insert_after(toc_div)
    elif soup.body:
        soup.body.insert(0, toc_div)

    return str(soup)

clean_html = generate_toc(clean_html)

In [None]:
# Add breakline before figure captions

def add_space_before_labels(html_string):
    """Insert a <br> before each figure caption label that lacks spacing.

    A label (class 'sts-caption-label') is considered already spaced when its
    previous sibling tag is a <br>, is a non-empty <p>/<div>/<section>, or
    has a whitespace-only ``.string``. All other labels get a <br> inserted
    directly before them. Returns the serialised HTML.
    """
    soup = BeautifulSoup(html_string, 'html.parser')

    def has_spacing(prev):
        # No previous sibling tag at all: nothing separates the label yet.
        if not prev:
            return False
        if prev.name == 'br':
            return True
        if prev.name in ['p', 'div', 'section'] and prev.get_text(strip=True):
            return True
        return bool(prev.string and prev.string.strip() == '')

    for class_name in ['sts-caption-label']:
        for label in soup.find_all(class_=class_name):
            if has_spacing(label.find_previous_sibling()):
                continue
            label.insert_before(soup.new_tag('br'))

    return str(soup)

clean_html = add_space_before_labels(clean_html)

In [None]:
# Change definition headings to <h3> tags

def convert_label_divs_to_h3(clean_html):
    """Turn every ``div.sts-tbx-label`` into an ``<h3>`` with the same text.

    Only the stripped text of the div is carried over; the div's attributes
    and any nested markup are not. Returns the serialised HTML.
    """
    soup = BeautifulSoup(clean_html, 'html.parser')

    for label_div in soup.find_all('div', class_='sts-tbx-label'):
        label_text = label_div.get_text(strip=True)
        heading = soup.new_tag('h3')
        heading.string = label_text
        label_div.replace_with(heading)

    return str(soup)

clean_html = convert_label_divs_to_h3(clean_html)

In [None]:
# Persist the cleaned HTML; the Markdown conversion cell reads this file back.
with open('docs/p1_clean.html', 'w', encoding='utf-8') as out_file:
    out_file.write(clean_html)

In [None]:
# Convert the cleaned HTML to Markdown

def normalize_anchor(value):
    """Return ``value`` as an anchor id: NBSPs dropped, trimmed, whitespace runs -> '-'."""
    without_nbsp = value.replace('\xa0', '')
    return re.sub(r'\s+', '-', without_nbsp.strip())

def preprocess_html(html):
    """Prepare cleaned HTML for html2text conversion.

    Steps, in order:
      1. remove the ``div#toc`` table of contents, if present;
      2. replace every ``<a>`` that has both an href and text with the
         literal Markdown ``[text](href)`` (fragment hrefs are normalised);
      3. normalise every ``id`` attribute with ``normalize_anchor``;
      4. put a ``XXXDIVHEREXXX_<nnnn>`` marker before each ``div.sts-tbx-sec``
         and unwrap the div, keeping its content.

    Returns:
        ``(html, placeholders)`` where ``placeholders`` is a list of
        ``(marker, div_id)`` pairs; ``div_id`` is None for a div without id.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # 1. Remove the Table of Contents div
    toc = soup.find('div', id='toc')
    if toc:
        toc.decompose()

    # 2. Replace all <a> with Markdown links
    for anchor in soup.find_all('a'):
        target = anchor.get('href', '')
        label = anchor.get_text().strip()
        if target and label:
            if target.startswith('#'):
                target = f"#{normalize_anchor(target[1:])}"
            anchor.replace_with(f"[{label}]({target})")

    # 3. Normalize all id attributes
    for tagged in soup.find_all(attrs={"id": True}):
        tagged['id'] = normalize_anchor(tagged['id'])

    # 4. Replace each sts-tbx-sec div with a placeholder
    placeholders = []
    for index, term_div in enumerate(soup.find_all('div', class_='sts-tbx-sec')):
        marker = f"XXXDIVHEREXXX_{index:04d}"
        placeholders.append((marker, term_div.get('id')))
        term_div.insert_before(marker)
        term_div.unwrap()  # Remove the original div, keeping the content

    return str(soup), placeholders

def fix_bibliography_bolding(md_text):
    """Turn the bibliography's table rows into plain 'key – text' lines.

    The bibliography starts at a line equal to '## bibliography' (ignoring
    case and surrounding whitespace) and ends at the next line that starts
    with '#'. Inside it, the first '---|---' separator row is dropped and, on
    every other line, the first '|' (with its surrounding whitespace) becomes
    ' – '. Lines outside the bibliography are passed through untouched.
    """
    output = []
    inside_bib = False
    separator_dropped = False

    for line in md_text.split('\n'):
        if line.strip().lower() == '## bibliography':
            # (Re)enter the bibliography and expect a fresh separator row.
            inside_bib = True
            separator_dropped = False
            output.append(line)
        elif not inside_bib:
            output.append(line)
        elif line.startswith('#'):
            # Next heading closes the bibliography.
            inside_bib = False
            output.append(line)
        elif not separator_dropped and re.match(r'^---\s*\|\s*---$', line.strip()):
            # Skip the table separator row once.
            separator_dropped = True
        else:
            output.append(re.sub(r'\s*\|\s*', ' – ', line, count=1))

    return '\n'.join(output)

def demote_all_headings_but_first(md_text):
    """Prefix one extra '#' to every Markdown heading except the first.

    A heading is a line of optional whitespace, one to six '#', then a
    whitespace character. The '#' is added at the very start of the line.
    """
    heading_re = re.compile(r'^\s*#{1,6}\s')
    seen_first = False
    demoted = []

    for line in md_text.split('\n'):
        if heading_re.match(line):
            if seen_first:
                line = '#' + line  # one level deeper
            seen_first = True
        demoted.append(line)

    return '\n'.join(demoted)

def ensure_blank_line_before_headings(md_text):
    """Insert an empty line before any heading whose preceding line is non-blank.

    The first line never gets a blank line in front of it.
    """
    heading_re = re.compile(r'^\s*#{1,6}\s')
    spaced = []
    previous = None  # the original line just before the current one

    for line in md_text.split('\n'):
        if heading_re.match(line) and previous is not None and previous.strip() != '':
            spaced.append('')
        spaced.append(line)
        previous = line

    return '\n'.join(spaced)

def ensure_blank_lines_around_images(markdown):
    """Make sure each Markdown image line has a blank line before and after it.

    An image line is one whose stripped text starts with ``![...](...)``.
    Blank lines are only added where the neighbouring line is not already
    blank. The text is split with ``splitlines`` and re-joined with '\\n'.
    """
    lines = markdown.splitlines()
    image_re = re.compile(r'!\[.*?\]\(.*?\)')
    last_index = len(lines) - 1
    padded = []

    for index, line in enumerate(lines):
        if not image_re.match(line.strip()):
            padded.append(line)
            continue

        # Blank line before, judged against what has been emitted so far.
        if padded and padded[-1].strip() != '':
            padded.append('')

        padded.append(line)

        # Blank line after, judged against the next source line.
        if index < last_index and lines[index + 1].strip() != '':
            padded.append('')

    return '\n'.join(padded)

def convert_starred_lettered_to_clean_list(markdown):
    """Rewrite bullet items of the form '* a) text' as plain 'a) text' lines.

    Leading indentation and the '*' bullet are dropped, and a blank line is
    placed before each converted item unless the previously emitted line is
    already blank. Other lines are kept as they are.
    """
    item_re = re.compile(r'^\s*\*\s*([a-zA-Z])\)\s+(.*)')
    out = []

    for line in markdown.splitlines():
        found = item_re.match(line)
        if found is None:
            out.append(line)
            continue

        # Separate the item from whatever precedes it.
        if out and out[-1].strip() != '':
            out.append('')

        out.append(f"{found.group(1)}) {found.group(2)}")

    return '\n'.join(out)

def link_bibliography_references(markdown_text):
    """Cross-link in-text citations '[n]' with their bibliography entries.

    The bibliography begins at the first line that starts with
    '### bibliography' (case-insensitive, after stripping). Entries after it
    of the form '[n] – text' are wrapped in ``<div id="ref-n">``; every '[n]'
    before it becomes ``<a href="#ref-n">[n]</a>``.

    Raises:
        ValueError: if no bibliography heading is found.
    """
    lines = markdown_text.splitlines()

    # Step 1: locate the bibliography heading.
    bib_start = next(
        (idx for idx, line in enumerate(lines)
         if line.strip().lower().startswith("### bibliography")),
        None,
    )
    if bib_start is None:
        raise ValueError("Bibliography section not found.")

    body_lines = lines[:bib_start]
    bib_lines = lines[bib_start:]  # index 0 is the heading itself

    # Step 2: give each '[n] – ...' entry an anchor target.
    entry_re = re.compile(r'^\[(\d+)]\s*–')
    for offset in range(1, len(bib_lines)):
        entry = entry_re.match(bib_lines[offset])
        if entry:
            number = entry.group(1)
            remainder = bib_lines[offset].split("–", 1)[1].strip()
            bib_lines[offset] = f'<div id="ref-{number}">[{number}] – {remainder}</div>'

    # Step 3: make the in-text citations clickable (body only).
    body_text = re.sub(
        r'\[(\d+)]',
        lambda cite: f'<a href="#ref-{cite.group(1)}">[{cite.group(1)}]</a>',
        '\n'.join(body_lines),
    )

    return body_text + '\n' + '\n'.join(bib_lines)

def demote_bibliography_heading(markdown_text):
    """Rewrite the first '### Bibliography' line as '## Bibliography'.

    Despite the name, this reduces the number of '#' from three to two. Only
    the first line whose stripped text is exactly '### Bibliography' is
    changed; the text is split with ``splitlines`` and re-joined with '\\n'.
    """
    lines = markdown_text.splitlines()
    hit = next(
        (n for n, text in enumerate(lines)
         if re.match(r'^###\s+Bibliography$', text.strip())),
        None,
    )
    if hit is not None:
        lines[hit] = '## Bibliography'
    return '\n'.join(lines)

def preserve_visual_line_breaks(markdown_text):
    """Append Markdown hard breaks (two trailing spaces) inside paragraphs.

    A text line gets its trailing whitespace replaced by two spaces unless
    the next line is blank. Block-level lines are left alone: blank lines,
    headings ('#'), raw HTML ('<'), bullet items, numbered items and '---'
    rules. The last line has no following line and is therefore treated like
    a line inside a paragraph.
    """
    lines = markdown_text.splitlines()
    total = len(lines)

    def is_block_level(text):
        return bool(
            text == ''
            or text.startswith('#')
            or text.startswith('<')
            or re.match(r'^[-*+]\s', text)
            or re.match(r'^\d+\.', text)
            or re.match(r'^---+$', text)
        )

    result = []
    for index, line in enumerate(lines):
        followed_by_blank = index + 1 < total and lines[index + 1].strip() == ''
        if is_block_level(line.strip()) or followed_by_blank:
            result.append(line)
        else:
            # Same paragraph continues on the next line: force a line break.
            result.append(line.rstrip() + '  ')

    return '\n'.join(result)

def bold_first_term_in_subsections(md_text):
    """Bold the first text line that follows each '#### 3.x.y' heading.

    After such a heading, blank lines and lines starting with '<' or '#' are
    skipped. The first remaining line is wrapped in '**...**' followed by two
    spaces, keeping its leading indentation, unless it already starts with
    '**'. Only that one line is considered per heading.
    """
    lines = md_text.splitlines()

    for start, heading in enumerate(lines):
        if not re.match(r"^####\s+3\.\d+\.\d+", heading.strip()):
            continue

        for pos in range(start + 1, len(lines)):
            candidate = lines[pos]
            body = candidate.lstrip()
            if not candidate.strip() or body.startswith("<") or body.startswith("#"):
                continue  # not term text; keep looking

            if not body.startswith("**"):
                indent = len(candidate) - len(body)
                lines[pos] = " " * indent + f"**{candidate.strip()}**  "
            break

    return "\n".join(lines)

def replace_dcmi_terms_url(md_text: str) -> str:
    """
    Find any http(s) URL that contains ...dublin-core/dcmi-terms/... (allowing Unicode hyphen variants)
    and replace just that URL with the canonical DCMI Terms URL, preserving trailing punctuation.

    Args:
        md_text: Markdown text to scan.

    Returns:
        The text with every matching URL replaced by the canonical one. A single
        trailing '.', ',', ';' or ':' directly after the URL is kept in place.
    """
    CANON = "https://www.dublincore.org/specifications/dublin-core/dcmi-terms/2020-01-20/"

    # Accept ASCII hyphen and common Unicode hyphen/dash variants
    H = r"\-\u2010\u2011\u2012\u2013\u2014\u2212"

    # Match a URL whose path contains dublin{H}core/dcmi{H}terms/.
    # - 'https?' accepts both schemes (the former 'http?' could never match 'https://').
    # - The tail is lazy and followed by a lookahead for the URL terminator
    #   ( ) ] " < > whitespace or end of text ), so one trailing punctuation
    #   mark lands in 'punc' instead of being swallowed by the URL.
    pattern = re.compile(
        rf'https?://[^\s<>"\)\]]*dublin[{H}]core/dcmi[{H}]terms/[^\s<>"\)\]]*?'
        rf'(?P<punc>[.,;:]?)(?=[\s<>"\)\]]|$)',
        re.IGNORECASE
    )

    return pattern.sub(lambda m: CANON + (m.group('punc') or ''), md_text)

# Step 1: Read the cleaned HTML back from disk
with open("docs/p1_clean.html", "r", encoding="utf-8") as f:
    html = f.read()

# Step 2: Preprocess HTML; term-section divs are swapped for text markers
patched_html, div_placeholders = preprocess_html(html)

# Step 3: Convert to Markdown (keep links, no hard wrapping, single line breaks)
converter = html2text.HTML2Text()
converter.ignore_links = False
converter.body_width = 0
converter.single_line_break = True
markdown = converter.handle(patched_html)

# Step 4: Postprocess
# Put an empty anchor div back wherever a term-section marker was left.
for placeholder, div_id in div_placeholders:
    markdown = markdown.replace(placeholder, f'<div class="sts-tbx-sec" id="{div_id}"></div>')

# Order matters: fix_bibliography_bolding looks for '## bibliography', the
# heading demotion then turns it into '###', which is what
# link_bibliography_references searches for, and demote_bibliography_heading
# finally rewrites it to '## Bibliography'.
for postprocess in (
    fix_bibliography_bolding,
    demote_all_headings_but_first,
    ensure_blank_line_before_headings,
    ensure_blank_lines_around_images,
    convert_starred_lettered_to_clean_list,
    link_bibliography_references,
    demote_bibliography_heading,
    preserve_visual_line_breaks,
    bold_first_term_in_subsections,
    replace_dcmi_terms_url,
):
    markdown = postprocess(markdown)

# Step 5: Write final Markdown
with open("docs/p1_clean.md", "w", encoding="utf-8") as f:
    f.write(markdown)

In [None]:
# File cleanup and renaming

# Paths
old_file = "docs/p1_clean.md"
new_file = "docs/index.md"
# Intermediate files to remove. old_file is listed as well; after a successful
# rename it no longer exists, so the loop below reports it as "not found".
files_to_delete = [
    "docs/p1_clean.md",
    "docs/p1.html",
    "docs/p1_clean.html"
]

# Rename p1_clean.md -> index.md
if os.path.exists(old_file):
    # os.replace overwrites an existing index.md on every platform, whereas
    # os.rename raises FileExistsError on Windows when the target exists
    # (e.g. when the notebook is run a second time).
    os.replace(old_file, new_file)
    print(f"Renamed {old_file} -> {new_file}")
else:
    print(f"{old_file} not found, skipping rename.")

# Delete specified files
for file_path in files_to_delete:
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Deleted {file_path}")
    else:
        print(f"{file_path} not found, skipping delete.")


Renamed docs/p1_clean.md -> docs/index.md
docs/p1_clean.md not found, skipping delete.
Deleted docs/p1.html
Deleted docs/p1_clean.html
