In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import spacy
from spacy import displacy

In [None]:
# Tokenizer and weights are loaded from one checkpoint name so they cannot drift apart.
MODEL_NAME = "xlm-roberta-large-finetuned-conll03-german"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# aggregation_strategy="simple" groups sub-word tokens into whole entities; it is
# the replacement for the deprecated grouped_entities=True flag.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
example = "My name is Wolfgang and I live in Berlin"

# Quick smoke test on a short sentence before processing the real text.
ner_results = nlp(example)
print(ner_results)

In [None]:
# Read the whole chapter into memory as one UTF-8 decoded string.
with open("data/fontane_brandenburg01_1862_ch1.txt", 'r', encoding='utf-8') as f:
    fontane=f.read()

In [None]:
# Size of the loaded text in characters.
print(len(fontane))

In [None]:
# Run the pipeline on the entire text in a single call and print every result.
ner_results = nlp(fontane)
print(ner_results)

In [None]:
# Number of results returned by the single whole-text call.
print(len(ner_results))

Der Text überschreitet die maximale Eingabelänge des Modells. Er muss daher in kleinere Abschnitte zerlegt werden (Chunking).

In [None]:
def _drop_overlapping(entities):
    """Keep one entity per run of overlapping spans.

    `entities` must already be sorted by start offset. When two spans overlap,
    the longer one wins; ties are broken by the higher 'score' (a missing score
    counts as 0.0), and on a full tie the earlier entity is kept.
    """
    def rank(ent):
        return (ent['end'] - ent['start'], ent.get('score') or 0.0)

    kept = []
    for ent in entities:
        if kept and ent['start'] < kept[-1]['end']:
            # Overlaps the most recently kept span: keep only the better of the two.
            if rank(ent) > rank(kept[-1]):
                kept[-1] = ent
        else:
            kept.append(ent)
    return kept


def ner_chunks(text, nlp_pipeline, chunk_chars=3000, overlap=100, batch_size=16,
               resolve_overlaps=True):
    """Run an NER pipeline over `text` in overlapping character windows.

    The text is cut into windows of `chunk_chars` characters, each starting
    `chunk_chars - overlap` characters after the previous one. Every window is
    passed to `nlp_pipeline`, and the 'start'/'end' offsets of the returned
    entities are shifted so that they index into the full `text`.

    Args:
        text: The complete input string.
        nlp_pipeline: Callable that takes a list of strings and returns one list
            of entity dicts per string (each dict carrying 'start' and 'end').
        chunk_chars: Window size in characters.
        overlap: Number of characters shared by consecutive windows.
        batch_size: Number of windows handed to `nlp_pipeline` per call.
        resolve_overlaps: If True (default), entities whose spans overlap -- e.g.
            an entity cut off at a window edge and its complete version from the
            neighbouring window -- are reduced to the longest one. Pass False to
            only remove exact duplicates.

    Returns:
        A list of entity dicts (copies, with absolute offsets) sorted by
        (start, end). Entities without usable offsets are omitted.

    Raises:
        AssertionError: If `overlap` is negative or not smaller than `chunk_chars`.
        ValueError: If `batch_size` is smaller than 1.
    """
    assert 0 <= overlap < chunk_chars, "overlap must be >=0 and < chunk_chars"
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    step = chunk_chars - overlap
    text_len = len(text)
    # The window before start `s` ends at `s + overlap`. If that already reaches
    # the end of the text, a window at `s` would be fully contained in it and
    # only add duplicated or truncated entities, so it is skipped.
    starts = [s for s in range(0, text_len, step) if s == 0 or s + overlap < text_len]
    chunks = [text[s:s + chunk_chars] for s in starts]

    results = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        batch_out = nlp_pipeline(batch)
        # Some pipeline versions return a flat list of entities (instead of a
        # list of lists) when given a single input; normalise that shape.
        if len(batch) == 1 and batch_out and isinstance(batch_out[0], dict):
            batch_out = [batch_out]
        for chunk_start, ents in zip(starts[i:i + batch_size], batch_out):
            for e in ents:
                # Offsets may be absent or None; such entities cannot be
                # mapped back onto the full text.
                if e.get('start') is None or e.get('end') is None:
                    continue
                shifted = dict(e)  # copy so the pipeline's own output is not mutated
                shifted['start'] = e['start'] + chunk_start
                shifted['end'] = e['end'] + chunk_start
                results.append(shifted)

    # Drop exact duplicates produced by the overlapping regions.
    results.sort(key=lambda x: (x['start'], x['end']))
    seen = set()
    uniq = []
    for e in results:
        key = (e['start'], e['end'], e.get('entity_group') or e.get('entity'), e.get('word'))
        if key not in seen:
            seen.add(key)
            uniq.append(e)

    if resolve_overlaps:
        uniq = _drop_overlapping(uniq)
    return uniq



In [None]:
# Tag the full text in 1200-character windows that overlap by 50 characters,
# then show the entity count and the first ten entities.
ents = ner_chunks(fontane, nlp, chunk_chars=1200, overlap=50)
print(len(ents), ents[:10])

spaCy (displaCy) nutzen, um die Ergebnisse darzustellen.

In [None]:
def results_to_spacy_doc(text, entities, lang="de"):
    """
    Build a spaCy Doc for `text` whose `doc.ents` are the given NER results.

    Args:
        text: The full text the entity offsets refer to.
        entities: Iterable of dicts with absolute character offsets 'start' and
            'end' and a label under 'entity_group' or 'entity'.
        lang: Language code for the blank spaCy pipeline used for tokenization.

    Returns:
        The tokenized Doc with non-overlapping entity spans set. Entities that
        have no usable offsets, that do not cover at least one whole token, or
        that lose out to a longer overlapping entity are left out.
    """
    # A blank pipeline contains only the tokenizer, no trained components.
    # The local name avoids shadowing the notebook-level `nlp` pipeline.
    blank_nlp = spacy.blank(lang)
    # spaCy rejects texts longer than `max_length`; with a tokenizer-only
    # pipeline it is safe to raise the limit to fit the input.
    blank_nlp.max_length = max(blank_nlp.max_length, len(text))
    doc = blank_nlp(text)

    spans = []
    for e in entities:
        start = e.get("start")
        end = e.get("end")
        if start is None or end is None:
            # No character offsets available for this entity; nothing to align.
            continue
        label = e.get("entity_group") or e.get("entity")

        # Entity spans must align with token boundaries. "contract" shrinks the
        # character range to the tokens lying completely inside it and returns
        # None when no token does.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            continue
        spans.append(span)

    # set_ents raises if two spans share a token, which can happen when chunk
    # overlap yields differing spans for the same mention. filter_spans removes
    # duplicates and overlaps, preferring the longest span.
    doc.set_ents(spacy.util.filter_spans(spans), default="unmodified")
    return doc

In [None]:
# Build a spaCy Doc from the chunked NER results and render its entities inline.
doc = results_to_spacy_doc(fontane, ents, lang="de")
displacy.render(doc, style="ent", jupyter=True)