In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [3]:
# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_NAME = "xlm-roberta-large-finetuned-conll03-german"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# `aggregation_strategy="simple"` is the supported replacement for the deprecated
# `grouped_entities=True`: sub-word tokens are merged into whole entities that are
# reported under the 'entity_group' key.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Quick sanity check on a short sentence before processing real text.
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-german were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'entity_group': 'PER', 'score': np.float32(0.9999894), 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity_group': 'LOC', 'score': np.float32(0.99996984), 'word': 'Berlin', 'start': 34, 'end': 40}]


In [4]:
# Read the whole chapter file into memory as one UTF-8 decoded string.
with open("data/fontane_brandenburg01_1862_ch1.txt", encoding="utf-8") as f:
    fontane = f.read()

In [27]:
# Preview the first 1000 characters of the loaded text.
print(fontane[:1000])

Wustrau.
Da liegen wir zwei Beide
Bis zum Appell im Grab.

Der Ruppiner See, der genau die Form eines halben Mondes
hat, scheidet sich seinen Ufern nach in zwei sehr verschiedene Hälften.
Die nördliche Hälfte ist sandig und unfruchtbar, und, die hübsch
gelegenen Städte Alt- und Neu-Ruppin abgerechnet, ohne allen
malerischen Reiz; die Südhälfte aber ist theils angebaut, theils
bewaldet und seit alten Zeiten her von vier hübschen Dörfern ein-
gefaßt. Das eine dieser Dörfer, Treskow geheißen, war bis vor
Kurzem ein altes Kämmerei-Gut der Stadt Ruppin; die drei an-
dern sind Rittergüter. Ihre Namen sind: Gnewkow, Car


In [5]:
# Run the NER pipeline on the entire chapter in a single call and print the result.
# This rebinds `ner_results`, replacing the result of the short example sentence.
# NOTE(review): presumably the pipeline truncates the input to the model's maximum
# sequence length, so entities late in the text may be missing — confirm.
ner_results = nlp(fontane)
print(ner_results)

[{'entity_group': 'LOC', 'score': np.float32(0.9998619), 'word': 'Wustrau', 'start': 0, 'end': 7}, {'entity_group': 'LOC', 'score': np.float32(0.98664886), 'word': 'Ruppiner See', 'start': 63, 'end': 75}, {'entity_group': 'LOC', 'score': np.float32(0.99878836), 'word': 'Alt-', 'start': 270, 'end': 274}, {'entity_group': 'LOC', 'score': np.float32(0.9998606), 'word': 'Neu-Ruppin', 'start': 279, 'end': 289}, {'entity_group': 'LOC', 'score': np.float32(0.9998048), 'word': 'Treskow', 'start': 477, 'end': 484}, {'entity_group': 'LOC', 'score': np.float32(0.9999672), 'word': 'Ruppin', 'start': 547, 'end': 553}, {'entity_group': 'LOC', 'score': np.float32(0.9998862), 'word': 'Gnewkow', 'start': 608, 'end': 615}, {'entity_group': 'LOC', 'score': np.float32(0.99990237), 'word': 'Carwe', 'start': 617, 'end': 622}, {'entity_group': 'LOC', 'score': np.float32(0.9998777), 'word': 'Wustrau', 'start': 627, 'end': 634}, {'entity_group': 'PER', 'score': np.float32(0.78738153), 'word': 'von Wol', 'start

In [6]:
# Number of entities currently held in `ner_results`.
print(len(ner_results))

25


In [None]:
def ner_by_char_chunks(text, nlp, chunk_chars=1000, overlap=100):
    """Run an NER callable over ``text`` in overlapping character windows.

    The text is cut into windows of at most ``chunk_chars`` characters. Each
    window starts ``overlap`` characters before the end of the previous one,
    so an entity that straddles one window edge can still be seen whole in
    the neighbouring window.

    Args:
        text: The full input string.
        nlp: Callable taking a string and returning an iterable of dicts.
            Dicts with non-``None`` ``'start'`` and ``'end'`` keys (offsets
            relative to the string passed in) are kept; others are skipped
            because they cannot be mapped back onto ``text``.
        chunk_chars: Maximum window length in characters; must be positive.
        overlap: Number of characters shared by consecutive windows; must be
            smaller than ``chunk_chars`` so every window advances.

    Returns:
        A list of entity dicts (shallow copies) whose ``'start'``/``'end'``
        are offsets into ``text``, sorted by ``(start, end)``. Exact
        duplicates (same span, label and word) are dropped. An entity cut by
        a window edge may still appear as a partial span next to the complete
        one, because only identical entries are deduplicated.

    Raises:
        ValueError: If ``chunk_chars`` is not positive or ``overlap`` is not
            smaller than ``chunk_chars``.
    """
    if chunk_chars <= 0:
        raise ValueError("chunk_chars must be a positive integer")
    if overlap >= chunk_chars:
        raise ValueError("overlap must be smaller than chunk_chars")

    entities = []
    text_len = len(text)
    start = 0
    while start < text_len:
        end = min(start + chunk_chars, text_len)
        for ent in nlp(text[start:end]):
            # Offsets may be absent or None; such entities cannot be relocated.
            if ent.get('start') is None or ent.get('end') is None:
                continue
            shifted = dict(ent)  # copy so the callable's own dict is not mutated
            shifted['start'] += start
            shifted['end'] += start
            entities.append(shifted)
        if end >= text_len:
            # Final window processed. Stepping back by `overlap` here would
            # re-enter the same window forever.
            break
        start = end - overlap  # overlap so entities spanning the boundary are caught

    # Overlapping windows report the same entity twice; keep the first
    # occurrence in (start, end) order.
    seen = set()
    uniq = []
    for ent in sorted(entities, key=lambda x: (x['start'], x['end'])):
        key = (ent['start'], ent['end'], ent.get('entity_group') or ent.get('entity'), ent.get('word'))
        if key not in seen:
            seen.add(key)
            uniq.append(ent)
    return uniq



In [None]:
# Chunked NER over the whole chapter: 2000-character windows overlapping by 200
# characters. Print the number of entities and the first ten of them.
ents = ner_by_char_chunks(fontane, nlp, chunk_chars=2000, overlap=200)
print(len(ents), ents[:10])