In [1]:
from wtpsplit import SaT

# Sentence segmenter (wtpsplit SaT) shared by the cells below; the arguments
# select the "sat-3l" model with language "nl" and style/domain "ud".
sat = SaT("sat-3l", language="nl", style_or_domain="ud")

config.json:   0%|          | 0.00/5.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/855M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/530 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/855M [00:00<?, ?B/s]

head_config.json:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

pytorch_adapter.bin:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

pytorch_model_head.bin:   0%|          | 0.00/343k [00:00<?, ?B/s]

There are adapters available but none are activated for the forward pass.


In [2]:
from docx import Document

# Open the annotated Word document that the exploratory cells below work on.
doc = Document("data/Ontology-based Annotation.docx")

In [3]:
# Segment the raw text of every paragraph in the document into one flat list.
# Note: paragraph.text is the unprocessed text, so the inline "(label)"
# annotations are still part of these sentences.
sentences = [
    sentence
    for paragraph in doc.paragraphs
    for sentence in sat.split(paragraph.text)
]

In [4]:
# Pick one paragraph (index 3) as a worked example and print its raw text,
# which still contains the inline "(label)" annotations.
test_paragraph = doc.paragraphs[3]
print(test_paragraph.text)

Wilhelmshöhe (E53 Place), voorheen Weissenstein (E53 Place) genoemd en ook reeds een blaauwe maandag Napoleonshöhe (E53 Place) geheten, is een lustplaats aan den Keurvorst van Hessen Cassel (E21 Person) toebehorende een groot uur (E52 Time-Span) van de stad gelegen: de landgraven Karel (E21 Person) en Frederik de II (E21 Person) hebben hetzelve begonnen & het is op eene heerlyke wyze door den thans regeerende Keurvorst Willem de IX (E21 Person) voltooid geworden. Even als ik reeds meermalen gedaan heb zal ik u maar alles in die orde mededeelen als het ons is voorgekomen. Langs eene lange breede laan gingen wy Zondag ll (E52 Time-Span) naar het zoo beroemde Weissenstein (E53 Place) en wel op dien dag omdat alle waterwerken dan in beweging zyn hetgeen men niet dan met groote moeite op andere dagen gedaan kan krygen waarby men dan toch altyd het groote gewoel der wandelaars zoo wel inwoners der stad als vreemdelingen mist. Op eenige afstand der stad gekomen zagen wy regts af de enorme Cas

In [5]:
from dataclasses import dataclass, asdict
from typing import Tuple, List


@dataclass
class Entity:
    """A labelled character span inside a piece of text."""
    # Label text as written between the parentheses in the source document;
    # format_example later splits it on "," into individual type names.
    label: str
    # (start, end) character offsets into the text the entity belongs to.
    # Used as a slice (text[start:end]), so the end offset is exclusive.
    span: Tuple[int, int]


@dataclass
class AnnotatedText:
    """Text together with the entities found in it.

    As produced by annotations_from_paragraph, ``text`` is the paragraph text
    with the parenthesised labels removed and every entity span indexes into
    that text.
    """
    # The text the entity spans refer to.
    text: str
    # Entities in the order they were recorded.
    entities: List[Entity]


def annotations_from_paragraph(paragraph) -> AnnotatedText:
    """Extract entity annotations from a paragraph and strip the labels from its text.

    The paragraph is scanned run by run (``paragraph.runs``; each run must
    expose ``.text`` and ``.bold`` -- presumably a python-docx paragraph, as
    that is what the callers in this notebook pass in). The conventions the
    code implements are:

    * Consecutive bold runs form an entity; whitespace-only runs in between do
      not end it.
    * Text from "(" up to the next ")" is a label. It may be spread over
      several runs. The label text (parentheses included) is left out of the
      clean text.
    * When a label closes while an entity is pending, an ``Entity`` is recorded
      whose span runs from the entity start to the current end of the clean
      text (so it can include whitespace-only runs that followed the bold
      text).
    * While an entity is pending, any text that precedes "(" in the same run
      (e.g. the separating space) is dropped from the clean text.
    * A non-bold, non-whitespace run without "(" cancels a pending entity.
    * Parenthesised text that does not follow an entity is removed from the
      clean text without recording anything.

    Limitations visible in the code: the "(" check comes before the bold check,
    so a bold run that itself contains "(" is handled as a label start rather
    than as entity text; and only the first "(...)" of a run is interpreted --
    whatever follows the ")" is appended to the clean text verbatim.

    Args:
        paragraph: Object with a ``runs`` sequence as described above.

    Returns:
        AnnotatedText holding the label-free text and the recorded entities,
        with spans expressed as character offsets into that text.
    """
    clean_text = ""
    entities = []

    # Track positions
    # (clean_position is kept in step with len(clean_text) throughout)
    clean_position = 0

    # Track entity building state
    current_entity_text = ""
    current_entity_start = None
    in_entity = False
    in_label = False
    label_buffer = ""

    for run in paragraph.runs:
        text = run.text

        if in_label:
            # We're in the middle of collecting a label
            # that was opened in an earlier run
            closing_paren_pos = text.find(")")
            if closing_paren_pos != -1:
                # Found the end of the label
                label_buffer += text[: closing_paren_pos + 1]
                # label_buffer starts at "(" and now ends at the first ")"
                label = label_buffer[
                    label_buffer.find("(") + 1 : label_buffer.find(")")
                ]
                if current_entity_text:  # Only add if we have an entity
                    entities.append(
                        Entity(label=label, span=(current_entity_start, clean_position))
                    )

                # Reset states
                current_entity_text = ""
                current_entity_start = None
                in_entity = False
                in_label = False
                label_buffer = ""

                # Add any remaining text after the label
                remaining_text = text[closing_paren_pos + 1 :]
                if remaining_text:
                    clean_text += remaining_text
                    clean_position += len(remaining_text)
            else:
                # Still collecting label
                label_buffer += text
        else:
            # Not in label - check if this run starts a label
            # (this check takes precedence over the bold/whitespace branches)
            opening_paren_pos = text.find("(")

            if opening_paren_pos != -1:
                # Found start of label
                # First add any text before the label if not in entity
                # (with a pending entity the prefix is deliberately skipped,
                # so it never reaches the clean text)
                if not in_entity and opening_paren_pos > 0:
                    prefix_text = text[:opening_paren_pos]
                    clean_text += prefix_text
                    clean_position += len(prefix_text)

                in_label = True
                label_buffer = text[opening_paren_pos:]

                # Check if label ends in this same run
                closing_paren_pos = text.find(")", opening_paren_pos)
                if closing_paren_pos != -1:
                    label = text[opening_paren_pos + 1 : closing_paren_pos]
                    if current_entity_text:  # Only add if we have an entity
                        entities.append(
                            Entity(
                                label=label, span=(current_entity_start, clean_position)
                            )
                        )

                    # Reset states
                    current_entity_text = ""
                    current_entity_start = None
                    in_entity = False
                    in_label = False
                    label_buffer = ""

                    # Add any remaining text after the label
                    # (appended as-is; a second "(" in it is not interpreted)
                    remaining_text = text[closing_paren_pos + 1 :]
                    if remaining_text:
                        clean_text += remaining_text
                        clean_position += len(remaining_text)

            elif run.bold and not in_entity:
                # Start of new entity
                current_entity_start = clean_position
                in_entity = True
                current_entity_text += text
                clean_text += text
                clean_position += len(text)

            elif run.bold and in_entity:
                # Continuation of entity
                current_entity_text += text
                clean_text += text
                clean_position += len(text)

            elif text.strip() == "":
                # Whitespace - include if in entity
                # (either way the text is kept; a pending entity stays open)
                if in_entity:
                    current_entity_text += text
                    clean_text += text
                    clean_position += len(text)
                else:
                    clean_text += text
                    clean_position += len(text)
            else:
                # Regular text
                if in_entity:
                    # We were collecting an entity but found non-label text
                    # so the bold text is kept in the output but not recorded
                    in_entity = False
                    current_entity_text = ""
                    current_entity_start = None
                clean_text += text
                clean_position += len(text)
    return AnnotatedText(clean_text, entities)


In [6]:
from collections import deque


def get_sentence_boundaries(text: str, sentences: list[str]) -> list[tuple[int, int]]:
    """Return the (start, end) character span of each sentence within ``text``.

    Each sentence is looked up in ``text`` starting from the end of the
    previous one, so the spans stay correct even when the segmenter dropped
    whitespace between sentences. When the sentences concatenate exactly to
    ``text`` (whitespace preserved) the result equals the running sum of the
    sentence lengths.

    Args:
        text: The text that was segmented.
        sentences: The segments, in document order.

    Returns:
        One ``(start, end)`` tuple per sentence, end exclusive, with
        ``end - start == len(sentence)``.
    """
    boundaries: list[tuple[int, int]] = []
    cursor = 0

    for sent in sentences:
        # Searching from the cursor maps repeated sentences to successive
        # occurrences instead of always matching the first one.
        found = text.find(sent, cursor)
        # If the segmenter altered the sentence so it cannot be located,
        # fall back to assuming it follows the previous one directly.
        start = found if found != -1 else cursor
        end = start + len(sent)
        boundaries.append((start, end))
        cursor = end

    return boundaries


def split_into_sentences_with_entities(
    original_text: str, sentences: list[str], entities: list[Entity]
) -> list[tuple[str, list[Entity]]]:
    """Distribute paragraph-level entities over the sentences they overlap.

    Every entity whose span intersects a sentence is copied into that
    sentence with its offsets made relative to the sentence start and clipped
    to the sentence. An entity that crosses a sentence boundary therefore
    appears, clipped, in each sentence it touches.

    Args:
        original_text: The text the entity spans refer to.
        sentences: The segments of ``original_text``, in order.
        entities: Entities with spans relative to ``original_text``.

    Returns:
        A list of ``(sentence, entities_in_sentence)`` pairs, one per sentence.
    """
    spans = get_sentence_boundaries(original_text, sentences)

    paired = []
    for sentence, (lo, hi) in zip(sentences, spans):
        local_entities = []

        for ent in entities:
            start = ent.span[0]
            end = ent.span[1]

            # Skip entities that lie entirely before or after this sentence.
            if start >= hi or end <= lo:
                continue

            # Shift into sentence coordinates and clip to the sentence.
            rel_start = max(0, start - lo)
            rel_end = min(len(sentence), end - lo)

            # Keep only non-empty clipped spans.
            if rel_end > rel_start:
                local_entities.append(
                    Entity(label=ent.label, span=(rel_start, rel_end))
                )

        paired.append((sentence, local_entities))

    return paired


@dataclass
class DemoLabel:
    """One labelled mention in an exported example."""
    # The mention's surface text (format_example fills it with the
    # whitespace-stripped slice of the sentence), not character offsets.
    span: str
    # Type names; format_example obtains them by splitting the entity label
    # on "," and stripping each part.
    types: list[str]


@dataclass
class DemoExample:
    """A sentence with its labelled mentions, as written to the JSONL export."""
    # The sentence text.
    text: str
    # Mentions found in the sentence.
    labels: list[DemoLabel]
    # Free-form origin marker; the batch cell passes the document path
    # relative to the data directory, the demo cell passes "".
    source: str


def format_example(sentence: str, entities: list[Entity], source: str) -> DemoExample:
    """Convert a sentence and its offset-based entities into a DemoExample.

    Args:
        sentence: The sentence text the entity spans index into.
        entities: Entities with spans relative to ``sentence``.
        source: Origin marker stored unchanged on the example.

    Returns:
        A DemoExample whose labels carry the stripped surface text of each
        entity and the comma-separated parts of its label as type names.
    """
    labels = []
    for ent in entities:
        # A label such as "A, B" yields the type names ["A", "B"].
        type_names = [part.strip() for part in ent.label.split(",")]
        begin, end = ent.span[0], ent.span[1]
        surface = sentence[begin:end].strip()
        labels.append(DemoLabel(types=type_names, span=surface))

    return DemoExample(text=sentence, labels=labels, source=source)


# Run the full pipeline on the single test paragraph: strip the labels,
# segment the clean text, attach entities per sentence, and display the result.
annotated_paragraph = annotations_from_paragraph(test_paragraph)
sentences = list(sat.split(annotated_paragraph.text))
sentences_with_entities = split_into_sentences_with_entities(
    annotated_paragraph.text,
    sentences,
    annotated_paragraph.entities,
)
[
    format_example(sent, ents, "")
    for sent, ents in sentences_with_entities
]

[DemoExample(text='Wilhelmshöhe, voorheen Weissenstein genoemd en ook reeds een blaauwe maandag Napoleonshöhe geheten, is een lustplaats aan den Keurvorst van Hessen Cassel toebehorende een groot uur van de stad gelegen: ', labels=[DemoLabel(span='Wilhelmshöhe', types=['E53 Place']), DemoLabel(span='Weissenstein', types=['E53 Place']), DemoLabel(span='Napoleonshöhe', types=['E53 Place']), DemoLabel(span='Keurvorst van Hessen Cassel', types=['E21 Person']), DemoLabel(span='groot uur', types=['E52 Time-Span'])], source=''),
 DemoExample(text='de landgraven Karel en Frederik de II hebben hetzelve begonnen & het is op eene heerlyke wyze door den thans regeerende Keurvorst Willem de IX voltooid geworden. ', labels=[DemoLabel(span='Karel', types=['E21 Person']), DemoLabel(span='Frederik de II', types=['E21 Person']), DemoLabel(span='Keurvorst Willem de IX', types=['E21 Person'])], source=''),
 DemoExample(text='Even als ik reeds meermalen gedaan heb zal ik u maar alles in die orde mededeelen

In [None]:
from pathlib import Path

datadir = Path("data")
# glob() yields files in filesystem order; sorting makes the order of the
# exported examples reproducible between runs and machines.
input_files = sorted(datadir.glob("*.docx"))

demo_examples = []

for file in input_files:
    # Word keeps a temporary "~$<name>.docx" owner file next to a document
    # that is open for editing; it matches the glob but is not a real document.
    if file.name.startswith("~$"):
        continue

    doc = Document(file)
    # Same for every sentence of this document, so compute it once.
    source = str(file.relative_to(datadir))

    for paragraph in doc.paragraphs:
        annotated_paragraph = annotations_from_paragraph(paragraph)
        sentences = list(sat.split(annotated_paragraph.text))
        sentences_with_entities = split_into_sentences_with_entities(
            annotated_paragraph.text, sentences, annotated_paragraph.entities
        )
        demo_examples.extend(
            format_example(sent, ents, source)
            for sent, ents in sentences_with_entities
        )


In [None]:
import json

# Export the examples as JSON Lines: one JSON object per line, with non-ASCII
# characters written as-is rather than escaped.
with (datadir / "examples_to_clean.jsonl").open("w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(asdict(ex), ensure_ascii=False) + "\n" for ex in demo_examples
    )

In [None]:
# Show the first lines of the exported JSONL file as a quick sanity check.
!head data/examples_to_clean.jsonl