In [None]:
# This code can be removed or commented out if not run on Colab.

from google.colab import drive
drive.mount('/content/drive')

!pip install transformers
!pip install transformers torch
!pip install torch
!pip install datasets==2.6.1
!pip install seqeval
!pip install accelerate

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [None]:
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# `word_tokenize` needs the Punkt models. Recent NLTK releases load them from
# the 'punkt_tab' resource, older ones from 'punkt', so request both; a name
# the installed release does not know is reported by the downloader rather
# than raised.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)

def _token_char_spans(text, tokens, start=0):
    """Return one ``(begin, end)`` character span in ``text`` per token, scanning left to right."""
    spans = []
    cursor = start
    for token in tokens:
        begin = text.find(token, cursor)
        length = len(token)
        if begin == -1 and token in ('``', "''"):
            # NLTK's word_tokenize rewrites a straight double quote as `` or '',
            # so the token text is not in the source; look for the quote itself.
            begin = text.find('"', cursor)
            length = 1
        if begin == -1:
            # Token cannot be located: store an empty span (it can never match
            # an annotation) and leave the cursor where it is, so the tokens
            # that follow are still searched from the correct position.
            spans.append((cursor, cursor))
            continue
        spans.append((begin, begin + length))
        cursor = begin + length
    return spans


def iob_tagging(text, annotations, tokenize=None):
    """Tokenize ``text`` and derive one IOB tag per token from character-span annotations.

    Args:
        text: Raw document string.
        annotations: Iterable of dicts providing ``'start'`` and ``'end'``
            character offsets into ``text`` and a ``'tag'`` label name.
        tokenize: Optional callable turning a string into a list of token
            strings. Defaults to NLTK's ``word_tokenize``.

    Returns:
        A pair ``(all_tokens, all_tags)``. Each is a list with a single entry,
        because the whole text is processed as one sentence: the token list and
        the parallel list of ``'O'`` / ``'B-<tag>'`` / ``'I-<tag>'`` strings.
        An annotation whose start or end offset does not fall inside any token
        is skipped.
    """
    if tokenize is None:
        tokenize = word_tokenize

    # The text is deliberately handled as a single "sentence"; the loop is kept
    # so sentence splitting can be switched back on without restructuring.
    sentences = [text]
    all_tokens = []
    all_tags = []

    for sentence in sentences:
        tokens = tokenize(sentence)
        tags = ['O'] * len(tokens)
        token_positions = _token_char_spans(text, tokens, text.index(sentence))

        for annotation in annotations:
            start, end = annotation['start'], annotation['end']
            label = annotation['tag']
            # First token containing the start offset / the end offset
            # (the end offset is exclusive, hence the shifted comparison).
            start_token = next(
                (i for i, (lo, hi) in enumerate(token_positions) if lo <= start < hi), None
            )
            end_token = next(
                (i for i, (lo, hi) in enumerate(token_positions) if lo < end <= hi), None
            )
            if start_token is None or end_token is None:
                continue

            tags[start_token] = f'B-{label}'
            for i in range(start_token + 1, end_token + 1):
                tags[i] = f'I-{label}'

        all_tokens.append(tokens)
        all_tags.append(tags)

    return all_tokens, all_tags

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the two annotation exports. JSON is UTF-8 encoded, so the encoding is
# stated explicitly instead of relying on the platform default.
with open('/content/drive/MyDrive/Thesis/1000-aufl_annotations-1.json', 'r', encoding='utf-8') as file:
    data_aufl = json.load(file)

with open('/content/drive/MyDrive/Thesis/1000-sentences_annotations-3.json', 'r', encoding='utf-8') as file:
    data_sentences = json.load(file)

texts = []
sentences = []
entities = []

datas = [data_aufl, data_sentences]

# Convert every annotated document into a flat token list plus a parallel
# IOB tag list; documents without annotations are left out.
for export in datas:
    for document in export['examples']:
        annotations = document['annotations']
        if not annotations:
            continue
        text = document['content']
        token_lists, tag_lists = iob_tagging(text, annotations)
        sentences.append([token for row in token_lists for token in row])
        entities.append([tag for row in tag_lists for tag in row])
        texts.append(text)

tokenized_sentences = sentences
iob_tags = entities

tags_flattened = [item for row in iob_tags for item in row]

# Sorted so the tag inventory has the same order on every run: iteration order
# of a set of strings is not stable between interpreter sessions, and the
# label ids built in the next cell are positions in this list.
tags_set = sorted(set(tags_flattened))
print(tags_set)

# NOTE(review): the printed tag inventory also contains 'Wort:aaO' /
# 'Wort:Auflage' tags that have no entry here -- confirm whether this table is
# still meant to be used.
tag2id = {
    "O": 0,
    "B-Autor": 1, "I-Autor": 2,
    "B-Aktenzeichen": 3, "I-Aktenzeichen": 4,
    "B-Auflage": 5, "I-Auflage": 6,
    "B-Datum": 7, "I-Datum": 8,
    "B-Editor": 9,
    "B-Gesetz": 10, "I-Gesetz": 11,
    "B-Gericht": 12, "I-Gericht": 13,
    "B-Jahr": 14,
    "B-Nummer": 15, "I-Nummer": 16,
    "B-Randnummer": 17, "I-Randnummer": 18,
    "B-Paragraph": 19, "I-Paragraph": 20,
    "B-Seite-Beginn": 21, "I-Seite-Beginn": 22,
    "B-Seite-Fundstelle": 23,
    "B-Titel": 24, "I-Titel": 25,
    "B-Zeitschrift": 26, "I-Zeitschrift": 27,
    "I-Editor": 28,
    "I-Seite-Fundstelle": 29,
}
# Derived by inversion so the two tables cannot drift apart.
id2tag = {idx: tag for tag, idx in tag2id.items()}
print(len(tokenized_sentences))

['B-Gericht', 'B-Wort:Auflage', 'I-Datum', 'I-Nummer', 'B-Zeitschrift', 'I-Wort:aaO', 'I-Gesetz', 'B-Datum', 'I-Randnummer', 'I-Paragraph', 'I-Gericht', 'B-Editor', 'B-Autor', 'I-Aktenzeichen', 'B-Aktenzeichen', 'B-Nummer', 'B-Auflage', 'I-Zeitschrift', 'B-Gesetz', 'B-Randnummer', 'B-Paragraph', 'B-Seite-Beginn', 'B-Jahr', 'B-Seite-Fundstelle', 'B-Titel', 'B-Wort:aaO', 'I-Autor', 'I-Auflage', 'O', 'I-Titel']
249


In [None]:
import pandas as pd
from transformers import BertTokenizerFast
from datasets import Dataset

# Tokenizer that belongs to the German BERT checkpoint; the "fast" variant is
# needed because the alignment function below calls `word_ids()`.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-german-cased')

# One row per document: its token list and the parallel list of IOB tags.
data = {'tokens': tokenized_sentences, 'tags': iob_tags}
df = pd.DataFrame(data)

# Same content as a Hugging Face dataset, ready for `.map(...)`.
dataset = Dataset.from_pandas(df)

def tokenize_and_align_labels(examples):
    """Sub-word tokenize a batch of pre-split sentences and align the word-level tags.

    Uses the module-level ``tokenizer`` and ``label_map``.

    Args:
        examples: Batch mapping with ``'tokens'`` (list of word lists) and
            ``'tags'`` (list of tag-string lists, parallel to the words).

    Returns:
        The tokenizer output (padded / truncated to 128 positions) with an
        added ``"labels"`` entry. Only the first sub-word piece of each word
        carries that word's label id; special tokens, padding and follow-up
        pieces are set to -100.
    """
    encoded = tokenizer(
        examples['tokens'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128,
    )

    def _align(word_ids, word_tags):
        # Walk over the sub-word positions, remembering which word the previous
        # position belonged to so a word's first piece can be recognised.
        aligned = []
        last_word = None
        for current_word in word_ids:
            starts_word = current_word is not None and current_word != last_word
            aligned.append(label_map[word_tags[current_word]] if starts_word else -100)
            last_word = current_word
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples['tags'])
    ]
    return encoded

# Sort the label inventory so each tag receives the same integer id on every
# run; `tags_set` originates from a Python set, whose iteration order is not
# stable between interpreter sessions.
label_list = sorted(tags_set)
label_map = {label: i for i, label in enumerate(label_list)}

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# The split is stored under its own name: binding it to `train_test_split`
# would shadow the scikit-learn function imported at the top of the notebook.
# A fixed seed makes the 90/10 partition reproducible.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_dataset = dataset_splits['train']
eval_dataset = dataset_splits['test']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# add the validation data

# JSON is UTF-8 encoded; state it explicitly so the German text is decoded the
# same way on every platform.
with open('/content/drive/MyDrive/all_regions_validation_data.json', 'r', encoding='utf-8') as file:
    all_regions_validation = json.load(file)

with open('/content/drive/MyDrive/all_regions_validation2.json', 'r', encoding='utf-8') as file:
    all_regions_validation2 = json.load(file)

# Concatenate the two files field by field (first file's entries come first).
all_regions_sentences = (
    list(all_regions_validation['sentences'])
    + list(all_regions_validation2['sentences'])
)
all_regions_tokenized = (
    list(all_regions_validation['tokenized_sentence'])
    + list(all_regions_validation2['tokenized_sentence'])
)
all_regions_predictions = (
    list(all_regions_validation['predicted_labels'])
    + list(all_regions_validation2['predicted_labels'])
)

validation_sentences = all_regions_sentences
validation_tokenized = all_regions_tokenized
validation_labels = all_regions_predictions

# Every tokenized sentence needs a label sequence; catch a mismatch here rather
# than when the dataset is built.
assert len(validation_tokenized) == len(validation_labels), \
    "validation tokens and labels differ in number of sentences"

#we now have:

print(len(sentences))
print(len(iob_tags))
print(len(validation_tokenized))
print(len(validation_labels))

# add post-labeling-program data

with open('/content/drive/MyDrive/validated_data.json', 'r', encoding='utf-8') as file:
    data_update = json.load(file)

sentences_new = list(data_update['sentences'])
tokenized_sentences_new = list(data_update['tokenized_sentence'])
predicted_labels_new = list(data_update['predicted_labels'])

# Extend the hand-annotated corpus with the new data. `entities` holds one tag
# list per hand-annotated document and is not rebound anywhere in the visible
# notebook, so its length marks where that corpus ends. Slicing to that length
# first makes this cell safe to re-run: on the first run the slice is the whole
# list, on later runs it drops what an earlier run appended.
base_count = len(entities)
tokenized_sentences = tokenized_sentences[:base_count] + tokenized_sentences_new
iob_tags = iob_tags[:base_count] + predicted_labels_new

print(len(tokenized_sentences))
print(len(validation_tokenized))
print(len(tokenized_sentences) + len(validation_tokenized))

# adding old legal text data

# Explicit UTF-8: the default encoding of `open` depends on the platform.
with open('/content/drive/MyDrive/old_documents_labeled.json', 'r', encoding='utf-8') as file:
    data_old = json.load(file)

# Copy the three parallel fields out of the loaded object.
sentences_old = list(data_old['sentences'])
tokenized_sentences_old = list(data_old['tokenized_sentence'])
predicted_labels_old = list(data_old['predicted_labels'])

print(len(sentences_old))
print(len(tokenized_sentences_old))
print(len(predicted_labels_old))

249
249
422
422
554
422
976
58
58
58


In [None]:
!pip install evaluate
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric
from sklearn.metrics import classification_report
import numpy as np

sentences = tokenized_sentences
tags = iob_tags

dataset_dict = {'tokens': sentences, 'tags': tags}

dataset = Dataset.from_dict(dataset_dict)

# 90/10 split of the training corpus with a fixed seed so it is reproducible.
# The result gets its own name (binding it to `train_test_split` would shadow
# the scikit-learn function imported earlier), and the held-out 10% is kept as
# `holdout_dataset` because `test_dataset` below is the separate
# old-documents corpus.
dataset_splits = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_dataset = dataset_splits['train']
holdout_dataset = dataset_splits['test']

val_sentences = validation_tokenized
val_tags = validation_labels

val_dataset_dict = {'tokens': val_sentences, 'tags': val_tags}
val_dataset = Dataset.from_dict(val_dataset_dict)

test_dataset_dict = {'tokens': tokenized_sentences_old, 'tags': predicted_labels_old}
test_dataset = Dataset.from_dict(test_dataset_dict)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-german-cased')

# Label inventory over all three corpora, so a tag that only occurs in the test
# data still has an id. 'I-Seite-Fundstelle' is always included; the set union
# keeps it from being listed twice. Sorting gives every label the same id on
# each run (plain set iteration order is not stable between sessions).
label_list = sorted(
    {tag for doc in tags + val_tags + predicted_labels_old for tag in doc}
    | {'I-Seite-Fundstelle'}
)
label_map = {label: i for i, label in enumerate(label_list)}

def tokenize_and_align_labels(examples):
    """Encode a batch of word-split sentences and attach per-position label ids.

    Reads the module-level ``tokenizer`` and ``label_map``.

    Args:
        examples: Batch mapping with ``'tokens'`` (list of word lists) and
            ``'tags'`` (list of tag-string lists).

    Returns:
        The tokenizer output, padded / truncated to 128 positions, extended by
        a ``"labels"`` list. A position receives its word's label id only when
        it is the first sub-word piece of that word and the word has a tag
        (its index lies within the tag list); every other position -- special
        tokens, padding, continuation pieces, words beyond the tag list -- is
        -100.
    """
    batch = tokenizer(
        examples['tokens'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128
    )

    all_label_ids = []
    for row, word_tags in enumerate(examples['tags']):
        tag_count = len(word_tags)
        row_label_ids = []
        prior_word = None
        for word in batch.word_ids(batch_index=row):
            is_first_piece = word is not None and word != prior_word
            if is_first_piece and word < tag_count:
                row_label_ids.append(label_map[word_tags[word]])
            else:
                row_label_ids.append(-100)
            prior_word = word
        all_label_ids.append(row_label_ids)

    batch["labels"] = all_label_ids
    return batch

tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

print('test set:')
print(tokenized_test_dataset)

# Store the label names in the model config so a saved checkpoint reports tag
# names instead of anonymous LABEL_<n> ids.
model = BertForTokenClassification.from_pretrained(
    'bert-base-german-cased',
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: idx for idx, label in enumerate(label_list)},
)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    # Without this the Trainer enables every installed logging integration;
    # on Colab that includes wandb, which blocks the run asking for an API key.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset
)

trainer.train()

test_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)
print(test_results)

# Get predictions on the test set
predictions, labels, _ = trainer.predict(tokenized_test_dataset)
predicted_ids = predictions.argmax(-1)  # computed once, reused for both lists

# Flat, position-aligned lists of gold and predicted tag names; positions whose
# gold id is -100 (special tokens, padding, continuation pieces) are excluded.
true_labels = []
pred_labels = []
for gold_row, pred_row in zip(labels, predicted_ids):
    for gold_id, pred_id in zip(gold_row, pred_row):
        if gold_id == -100:
            continue
        true_labels.append(label_list[gold_id])
        pred_labels.append(label_list[pred_id])

# Generate classification report. zero_division=0 reports 0.0 for classes with
# no gold or no predicted samples without emitting a warning per class.
report = classification_report(true_labels, pred_labels, labels=label_list, zero_division=0)
print(report)





  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

test set:
Dataset({
    features: ['tokens', 'tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 58
})


model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 1


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

In [None]:
def combine_bio_tags(labels):
    """Return the labels with any leading ``'B-'`` or ``'I-'`` marker removed.

    Args:
        labels: Iterable of tag strings such as ``'B-Autor'``, ``'I-Autor'``
            or ``'O'``.

    Returns:
        A new list of the same length; labels without one of the two markers
        are passed through unchanged.
    """
    return [
        label[2:] if label[:2] in ('B-', 'I-') else label
        for label in labels
    ]

# Combine the BIO tags in true and predicted labels
true_labels_combined = combine_bio_tags(true_labels)
predicted_labels_combined = combine_bio_tags(pred_labels)

# Generate the classification report. Classes that were never predicted, or
# that have no gold samples, have an undefined precision/recall; zero_division=0
# reports them as 0.0 without emitting a warning per class.
report = classification_report(true_labels_combined, predicted_labels_combined, zero_division=0)
print(report)

                  precision    recall  f1-score   support

    Aktenzeichen       0.42      0.67      0.52        12
         Auflage       1.00      1.00      1.00         6
           Autor       0.55      1.00      0.71        12
           Datum       1.00      1.00      1.00        24
          Editor       1.00      1.00      1.00         2
         Gericht       0.00      0.00      0.00         7
          Gesetz       1.00      0.44      0.61        25
            Jahr       1.00      0.50      0.67         2
          Nummer       0.89      0.94      0.91        33
               O       0.86      0.99      0.92       512
       Paragraph       1.00      0.90      0.95        58
      Randnummer       0.67      0.75      0.71         8
    Seite-Beginn       0.94      0.94      0.94        34
Seite-Fundstelle       0.75      1.00      0.86         3
           Titel       0.44      0.69      0.54        16
        Wort:aaO       0.00      0.00      0.00         0
     Zeitschr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
