In [1]:
import os
os.environ['TRANSFORMERS_CACHE'] = './cache'

In [2]:
import spacy 
# !python -m spacy download de_core_news_sm
nlp = spacy.load("de_core_news_sm")



In [141]:
import pandas as pd
from datasets import Dataset

#  Jsonl to IOB

In [None]:
import import_ipynb
from Jsonl_to_IOB import *
path = 'all.jsonl'
data = get_data(path)
tokenized = tokenized_output(data)
tokenized.head()

In [3]:
def tokenization(data):
    count = 0 
    i=0
    starts = [x['start_offset'] for x in data['entities']]
    ends = [x['end_offset'] for x in data['entities']]
    starts.append(99999)
    ends.append(99999)
    
    sen_temp = []
    tag_temp = []
    
    text = data['text']
    if text.startswith(" "):
        text = text[1:]
        count = 1
    doc = nlp(text)
    

    for t in doc:
        sen_temp.append(t.text)
        if count == starts[i]:
            tag_temp.append('B-ORG')
            count = count+len(t.text)+1
            if count > ends[i]:
                i += 1
                
        elif count > starts[i] and count < ends[i]:
            if tag_temp[-1] == 'O':
                tag_temp.append('B-ORG')
            else:
                tag_temp.append('I-ORG')
            count = count+len(t.text)+1
            if count > ends[i]:
                i += 1
                
        else:
            tag_temp.append('O')
            count = count+len(t.text)+1
    
    return sen_temp, tag_temp

In [4]:
#load data
import json
path = 'all.jsonl'

with open(path,'r', encoding = 'utf-8') as f:
    data = [json.loads(line) for line in f]

In [11]:
data[0]

{'id': 26614,
 'text': 'Maßnahmenbekanntgabe zu MA 40, Prüfung der Nebenbeschäftigungen',
 'doc_id': 0,
 'para_id': 0,
 'entities': [{'id': 51631,
   'label': 'ORG',
   'start_offset': 24,
   'end_offset': 29}],
 'relations': [],
 'Comments': []}

In [12]:
tokenization(data[0])

(['Maßnahmenbekanntgabe',
  'zu',
  'MA',
  '40',
  ',',
  'Prüfung',
  'der',
  'Nebenbeschäftigungen'],
 ['O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O'])

In [9]:

doc_id_list = []
para_id_list = []
txt_list = []
tag_list = []

for d in data:
    doc_id_list.append(d['doc_id'])
    para_id_list.append(d['para_id'])
    txt, tag = tokenization(d)
    txt_list.append(txt)
    tag_list.append(tag)
    
tokenized = pd.DataFrame({'doc_id' : doc_id_list,
                         'para_id': para_id_list,
                         'tokens' : txt_list,
                         'ner_tags' : tag_list})
tokenized

Unnamed: 0,doc_id,para_id,tokens,ner_tags
0,0,0,"[Maßnahmenbekanntgabe, zu, MA, 40, ,, Prüfung,...","[O, O, B-ORG, I-ORG, O, O, O, O]"
1,0,1,[INHALTSVERZEICHNIS],[O]
2,0,2,[ABKÜRZUNGSVERZEICHNIS],[O]
3,0,3,"[bzw., beziehungsweise, Nr., Nummer]","[O, O, O, O]"
4,0,4,"[Erledigung, des, Prüfungsberichtes, Der, Stad...","[O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, O..."
...,...,...,...,...
3700,41,31,"[Empfehlung, Nr., 8, Vor, Erstellung, eines, L...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3701,41,32,"[Ergebnis, der, Prüfung, des, Stadtrechnungsho...","[O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, O..."
3702,41,33,"[Empfehlung, Nr., 9, Die, Führung, von, Bautag...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3703,41,34,"[Ergebnis, der, Prüfung, des, Stadtrechnungsho...","[O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, O..."


# Preprocess for using transformer

In [13]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-german-dbmdz-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [159]:
label_encoding_dict = {'O': 0,
                 'B-ORG':1,
                'I-ORG':2}
label_list = ['O','B-ORG','I-ORG']

In [116]:
def align_labels(data, label_encoding_dict):

    tokenized_inputs = tokenizer(data["tokens"],
                        max_length = 128, padding = 'max_length',
                        truncation=True, is_split_into_words=True)

    label_id_temp = {}
    for i, label in enumerate(data['ner_tags']):
        label_id_temp.update({i:label})

    word_ids = tokenized_inputs.word_ids(batch_index=0)

    labels = []
    for w_id in word_ids:
        if w_id == None:
            labels.append(-100)
        else:
            tag = label_id_temp[w_id]
            labels.append(label_encoding_dict[tag])

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [139]:
train = tokenized.copy()
train['input_ids'] = ""
train['token_type_ids'] = ""
train['attention_mask']= ""
train['labels'] = ""

for index, row in tokenized.iterrows():
    inputs = align_labels(row,label_encoding_dict)
    train.at[index,'input_ids'] = inputs['input_ids']
    train.at[index,'token_type_ids'] = inputs['token_type_ids']
    train.at[index,'attention_mask'] = inputs['attention_mask']
    train.at[index,'labels'] = inputs['labels']

train

Unnamed: 0,doc_id,para_id,tokens,ner_tags,input_ids,token_type_ids,attention_mask,labels
0,0,0,"[Maßnahmenbekanntgabe, zu, MA, 40, ,, Prüfung,...","[O, O, B-ORG, I-ORG, O, O, O, O]","[102, 2400, 3366, 837, 1621, 205, 21669, 2161,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, ..."
1,0,1,[INHALTSVERZEICHNIS],[O],"[102, 5331, 30925, 22171, 3610, 949, 24626, 50...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, -10..."
2,0,2,[ABKÜRZUNGSVERZEICHNIS],[O],"[102, 9059, 30918, 12939, 30945, 13895, 15853,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, ..."
3,0,3,"[bzw., beziehungsweise, Nr., Nummer]","[O, O, O, O]","[102, 2100, 566, 9542, 1559, 566, 5311, 103, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, -100, -100, -100, -10..."
4,0,4,"[Erledigung, des, Prüfungsberichtes, Der, Stad...","[O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, O...","[102, 8179, 3553, 132, 222, 17639, 15602, 3088...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 0, ..."
...,...,...,...,...,...,...,...,...
3700,41,31,"[Empfehlung, Nr., 8, Vor, Erstellung, eines, L...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[102, 11340, 1559, 566, 642, 445, 13248, 683, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3701,41,32,"[Ergebnis, der, Prüfung, des, Stadtrechnungsho...","[O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, O...","[102, 3942, 125, 6868, 222, 668, 24831, 1312, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 1, 1, 1, 1, 2, 0, 0, 0, 0, ..."
3702,41,33,"[Empfehlung, Nr., 9, Die, Führung, von, Bautag...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[102, 11340, 1559, 566, 680, 229, 5203, 195, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3703,41,34,"[Ergebnis, der, Prüfung, des, Stadtrechnungsho...","[O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, O...","[102, 3942, 125, 6868, 222, 668, 24831, 1312, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 1, 1, 1, 1, 2, 0, 0, 0, 0, ..."


In [142]:
train_drop = train.drop(['doc_id','para_id'],axis = 1)
train_dataset = Dataset.from_pandas(train_drop)
print(type(train_dataset))

<class 'datasets.arrow_dataset.Dataset'>


In [154]:
dset = train_dataset.train_test_split(test_size=0.2)
dset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2964
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 741
    })
})

In [155]:
dset['train']

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 2964
})

# Fine-tune model

In [145]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch
from seqeval.metrics import classification_report
from seqeval.metrics import accuracy_score
# from seqeval.metrics import f1_score
# from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score


model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_encoding_dict))

Downloading:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-german-dbmdz-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

In [168]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

y_true = [['B-PER', 'I-PER', 'O'], ['O', 'O', 'B-LOC']]
y_pred = [['B-PER', 'O', 'O'], ['O', 'O', 'B-LOC']]
result = classification_report(y_true, y_pred, output_dict = True)
result['PER']
result

{'LOC': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1},
 'PER': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1},
 'micro avg': {'precision': 0.5, 'recall': 0.5, 'f1-score': 0.5, 'support': 2},
 'macro avg': {'precision': 0.5, 'recall': 0.5, 'f1-score': 0.5, 'support': 2},
 'weighted avg': {'precision': 0.5,
  'recall': 0.5,
  'f1-score': 0.5,
  'support': 2}}

In [170]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    
    result = classification_report(true_labels, true_predictions, output_dict = True)

    print(result)
    
    return result['ORG']


batch_size = 32
args = TrainingArguments(
    "test-ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer)


trainer = Trainer(
    model,
    args,
    train_dataset=dset['train'],
    eval_dataset=dset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Training...")
trainer.train()
print("Evaluating...")
trainer.evaluate()
# print("Saving...")
# trainer.save_model('un-ner.model')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2964
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 279


Training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Support
1,No log,0.047051,0.803057,0.900461,0.848975,1517
2,No log,0.05135,0.813827,0.884641,0.847757,1517
3,No log,0.054068,0.807556,0.90178,0.852071,1517


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 741
  Batch size = 32


{'ORG': {'precision': 0.8030570252792475, 'recall': 0.9004614370468029, 'f1-score': 0.8489745183343691, 'support': 1517}, 'micro avg': {'precision': 0.8030570252792475, 'recall': 0.9004614370468029, 'f1-score': 0.8489745183343691, 'support': 1517}, 'macro avg': {'precision': 0.8030570252792475, 'recall': 0.9004614370468029, 'f1-score': 0.8489745183343691, 'support': 1517}, 'weighted avg': {'precision': 0.8030570252792475, 'recall': 0.9004614370468029, 'f1-score': 0.8489745183343691, 'support': 1517}}


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 741
  Batch size = 32


{'ORG': {'precision': 0.813826561552456, 'recall': 0.8846407382992749, 'f1-score': 0.8477574226152874, 'support': 1517}, 'micro avg': {'precision': 0.813826561552456, 'recall': 0.8846407382992749, 'f1-score': 0.8477574226152874, 'support': 1517}, 'macro avg': {'precision': 0.813826561552456, 'recall': 0.8846407382992749, 'f1-score': 0.8477574226152874, 'support': 1517}, 'weighted avg': {'precision': 0.813826561552456, 'recall': 0.8846407382992749, 'f1-score': 0.8477574226152874, 'support': 1517}}


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 741
  Batch size = 32


{'ORG': {'precision': 0.807556080283353, 'recall': 0.9017798286090969, 'f1-score': 0.8520710059171597, 'support': 1517}, 'micro avg': {'precision': 0.807556080283353, 'recall': 0.9017798286090969, 'f1-score': 0.8520710059171597, 'support': 1517}, 'macro avg': {'precision': 0.807556080283353, 'recall': 0.9017798286090969, 'f1-score': 0.8520710059171597, 'support': 1517}, 'weighted avg': {'precision': 0.807556080283353, 'recall': 0.9017798286090969, 'f1-score': 0.8520710059171597, 'support': 1517}}




Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 741
  Batch size = 32


Evaluating...


{'ORG': {'precision': 0.807556080283353, 'recall': 0.9017798286090969, 'f1-score': 0.8520710059171597, 'support': 1517}, 'micro avg': {'precision': 0.807556080283353, 'recall': 0.9017798286090969, 'f1-score': 0.8520710059171597, 'support': 1517}, 'macro avg': {'precision': 0.807556080283353, 'recall': 0.9017798286090969, 'f1-score': 0.8520710059171597, 'support': 1517}, 'weighted avg': {'precision': 0.807556080283353, 'recall': 0.9017798286090969, 'f1-score': 0.8520710059171597, 'support': 1517}}


{'eval_loss': 0.05406766012310982,
 'eval_precision': 0.807556080283353,
 'eval_recall': 0.9017798286090969,
 'eval_f1-score': 0.8520710059171597,
 'eval_support': 1517,
 'eval_runtime': 213.0744,
 'eval_samples_per_second': 3.478,
 'eval_steps_per_second': 0.113,
 'epoch': 3.0}

In [176]:
# from transformers import pipeline

# pipe = pipeline("ner", model=model_checkpoint, tokenizer=tokenizer)
# test_text = ' '.join(dset['test'][0]['tokens'])

# test_result = pipe(test_text)
# test_result

loading configuration file config.json from cache at ./cache\models--bert-base-german-dbmdz-cased\snapshots\1338901726062fab13465d4b37f0f0c55b662a78\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-german-dbmdz-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31102
}

loading configuration file config.json from cache at ./cache\models--bert-base-german-dbmdz-cased\snapshots\1338901726062fab13465d4b37f0f0c55b662a78\config.json
Model config BertConfig {
  "_name_o

[{'entity': 'LABEL_1',
  'score': 0.6156101,
  'index': 1,
  'word': 'Dabei',
  'start': 0,
  'end': 5},
 {'entity': 'LABEL_0',
  'score': 0.54003084,
  'index': 2,
  'word': 'war',
  'start': 6,
  'end': 9},
 {'entity': 'LABEL_1',
  'score': 0.50910103,
  'index': 3,
  'word': 'festzustellen',
  'start': 10,
  'end': 23},
 {'entity': 'LABEL_0',
  'score': 0.52891177,
  'index': 4,
  'word': ',',
  'start': 24,
  'end': 25},
 {'entity': 'LABEL_0',
  'score': 0.533248,
  'index': 5,
  'word': 'dass',
  'start': 26,
  'end': 30},
 {'entity': 'LABEL_0',
  'score': 0.53162944,
  'index': 6,
  'word': 'der',
  'start': 31,
  'end': 34},
 {'entity': 'LABEL_1',
  'score': 0.52718717,
  'index': 7,
  'word': 'in',
  'start': 35,
  'end': 37},
 {'entity': 'LABEL_1',
  'score': 0.5688556,
  'index': 8,
  'word': 'der',
  'start': 38,
  'end': 41},
 {'entity': 'LABEL_1',
  'score': 0.5575596,
  'index': 9,
  'word': 'Maßnahmen',
  'start': 42,
  'end': 51},
 {'entity': 'LABEL_1',
  'score': 0.611