In [1]:
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers
!pip install transformers torch
!pip install torch
!pip install datasets==2.6.1
!pip install seqeval
!pip install accelerate

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
!pip install psutil transformers datasets
import psutil
import time
!pip install evaluate

class MemoryUsageCallback:
    def __init__(self):
        self.process = psutil.Process()
        self.mem_usage = []

    def on_epoch_end(self, args, state, control, **kwargs):
        mem_info = self.process.memory_info().rss / 1024 ** 2  # in MB
        self.mem_usage.append(mem_info)
        print(f'Epoch {state.epoch} - Memory Usage: {mem_info:.2f} MB')

    def on_train_end(self, args, state, control, **kwargs):
        print(f'Max Memory Usage: {max(self.mem_usage):.2f} MB')

memory_callback = MemoryUsageCallback()



In [9]:
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')

def iob_tagging(text, annotations):
    #sentences = sent_tokenize(text)
    sentences = [text]
    all_tokens = []
    all_tags = []

    for sentence in sentences:
        tokens = word_tokenize(sentence)
        tags = ['O'] * len(tokens)
        sentence_start = text.index(sentence)
        token_positions = []
        position = sentence_start
        for token in tokens:
            position = text.find(token, position)
            token_positions.append((position, position + len(token)))
            position += len(token)

        for annotation in annotations:
            start, end = annotation['start'], annotation['end']
            label = annotation['tag']
            start_token = next((i for i, pos in enumerate(token_positions) if pos[0] <= start < pos[1]), None)
            end_token = next((i for i, pos in enumerate(token_positions) if pos[0] < end <= pos[1]), None)

            if start_token is not None and end_token is not None and start_token < len(tags) and end_token < len(tags):
                tags[start_token] = f'B-{label}'
                for i in range(start_token + 1, end_token + 1):
                    tags[i] = f'I-{label}'

        all_tokens.append(tokens)
        all_tags.append(tags)

    return all_tokens, all_tags

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
with open('/content/drive/MyDrive/just-citation-aufl_annotations.json', 'r') as file:
    data_aufl = json.load(file)

with open('/content/drive/MyDrive/just-citation-checking_annotations.json', 'r') as file:
    data_sentences = json.load(file)

with open('/content/drive/MyDrive/new_just-citation-aufl_annotations.json', 'r') as file:
    new_data_aufl = json.load(file)

with open('/content/drive/MyDrive/new_just-citation-checking_annotations.json', 'r') as file:
    new_data_sentences = json.load(file)

texts = []
sentences = []
entities = []

tag2id = {"O": 0, "B-citation": 1, "I-citation": 2}
id2tag = {0: "O", 1: "B-citation", 2: "I-citation"}

loopz = [data_aufl, data_sentences, new_data_aufl, new_data_sentences]

for i in loopz:
  for document in i['examples']:
    if document['annotations'] != []:
      text = document['content']
      annotations = document['annotations']
      # if annotations != []:
      token_lists, tag_lists = iob_tagging(text, annotations)
      flattened_token_lists = [item for row in token_lists for item in row]
      flattened_tag_lists = [tagz for columnz in tag_lists for tagz in columnz]
      sentences.append(flattened_token_lists)
      entities.append(flattened_tag_lists)
      texts.append(text)

tokenized_sentences = sentences
iob_tags = entities

tags_flattened = [item for row in iob_tags for item in row]

print(len(tokenized_sentences))

# post-labeling-program data

with open('/content/drive/MyDrive/aufl_just_citation_data.json', 'r') as file:
  post_labeling_aufl = json.load(file)

with open('/content/drive/MyDrive/sentences_just_citation_data.json', 'r') as file:
  post_labeling_sentences = json.load(file)

262


In [11]:
for i in post_labeling_aufl['sentences']:
  texts.append(i)

for i in post_labeling_aufl['tokenized_sentence']:
  tokenized_sentences.append(i)

for i in post_labeling_aufl['predicted_labels']:
  iob_tags.append(i)

for i in post_labeling_sentences['sentences']:
  texts.append(i)

for i in post_labeling_sentences['tokenized_sentence']:
  tokenized_sentences.append(i)

for i in post_labeling_sentences['predicted_labels']:
  iob_tags.append(i)

In [12]:
import pandas as pd
from transformers import BertTokenizerFast
from datasets import Dataset

data = {
    'tokens': tokenized_sentences,
    'tags': iob_tags
}

df = pd.DataFrame(data)

dataset = Dataset.from_pandas(df)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-german-cased')

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], is_split_into_words=True, truncation=True, padding='max_length', max_length=128)
    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_map[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

label_list = ['O', 'B-citation', 'I-citation']
label_map = {label: i for i, label in enumerate(label_list)}

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

train_test_split = tokenized_dataset.train_test_split(test_size=0.1,shuffle=True)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']




  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from transformers import BertForTokenClassification, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric

model = BertForTokenClassification.from_pretrained('bert-base-german-cased', num_labels=len(label_map))

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

metric = load_metric("seqeval")

class CustomTrainer(Trainer):
    def __init__(self, memory_callback, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.memory_callback = memory_callback

    def train(self, resume_from_checkpoint=None, trial=None, **kwargs):
        super().train(resume_from_checkpoint, trial, **kwargs)
        self.memory_callback.on_train_end(self.args, self.state, self.control)

    def evaluation_loop(self, *args, **kwargs):
        eval_dataloader = self.get_eval_dataloader()
        self.memory_callback.on_epoch_end(self.args, self.state, self.control)
        return super().evaluation_loop(*args, **kwargs)

trainer = CustomTrainer(
    memory_callback=memory_callback,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

In [None]:
from sklearn.metrics import classification_report

# Existing code
results = trainer.evaluate()
print(f"Testing accuracy: {results['eval_accuracy']}")

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape

    out_label_list = [[] for _ in range(batch_size)]
    out_pred_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != -100:
                out_label_list[i].append(label_list[label_ids[i][j]])
                out_pred_list[i].append(label_list[preds[i][j]])

    return out_pred_list, out_label_list

predictions, labels, _ = trainer.predict(eval_dataset)
pred_tags, true_tags = align_predictions(predictions, labels)

for i in pred_tags:
  print (i)

for i in true_tags:
  print(i)

flat_pred_tags = [tag for pred in pred_tags for tag in pred]
flat_true_tags = [tag for true in true_tags for tag in true]

# classification report
print(classification_report(flat_true_tags, flat_pred_tags))



Testing accuracy: 0.9416983523447402
['I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation']
['O', 'O', 'O', 'O', 'O', 'O', 'B-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'O', 'O', 'I-citation', 'I-citation', 'I-citation', 'I-citation']
['O', 'O', 'O', 'B-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'O', 'B-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-c

In [None]:
def combine_bio_tags(labels):
    combined_labels = []
    for label in labels:
        if label.startswith('B-') or label.startswith('I-'):
            combined_labels.append(label[2:])
        else:
            combined_labels.append(label)
    return combined_labels

true_labels = flat_true_tags
true_labels_combined = combine_bio_tags(true_labels)
predicted_labels_combined = combine_bio_tags(flat_pred_tags)

# classification report
report = classification_report(true_labels_combined, predicted_labels_combined)
print(report)

              precision    recall  f1-score   support

           O       0.97      0.97      0.97       321
    citation       0.99      0.99      0.99      1600

    accuracy                           0.99      1921
   macro avg       0.98      0.98      0.98      1921
weighted avg       0.99      0.99      0.99      1921



In [None]:
def predict_entities_readable(text):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenized_sentence = []
    output_labels = []
    encoding = tokenizer(text, return_tensors="pt", truncation=True, padding=True, is_split_into_words=False)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

    predictions = torch.argmax(logits, dim=-1)
    predicted_label_indices = predictions[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    predicted_labels = [id2tag[label] for label in predicted_label_indices]
    processed_tokens = []
    processed_labels = []

    for token, label in zip(tokens, predicted_labels):
        if token.startswith("##"):
            processed_tokens[-1] += token[2:]
        else:
            if token not in ["[CLS]", "[SEP]", "[PAD]"]:
                processed_tokens.append(token)
                processed_labels.append(label)

    for token, label in zip(processed_tokens, processed_labels):
        tokenized_sentence.append(token)
        output_labels.append(label)
    return tokenized_sentence, output_labels

input_text = "BGHZ 108, 372/375; HÃ¼gel/Kramer GBO 3. Aufl. Â§ 71 Rn. 96f"
tokens, labels = predict_entities_readable(input_text)
print(tokens)
print(labels)

['BGHZ', '108', ',', '372', '/', '375', ';', 'HÃ¼gel', '/', 'Kramer', 'GBO', '3', '.', 'Aufl', '.', 'Â§', '71', 'Rn', '.', '96f']
['B-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'O', 'B-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation']


In [None]:
# Predicting the old dataset

In [None]:
import re

input_file = '/content/drive/MyDrive/all-braces-rgz.txt'

def extract_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    sentences = re.findall(r'(?<=txt\s).+', text)
    return sentences

sentences_list_old = extract_sentences(input_file)

print(len(sentences_list_old))

6342


In [None]:
tokenized_sentences = []
predicted_labels = []

for i in sentences_list_old:
  tokens, predictions = predict_entities_readable(i)
  tokenized_sentences.append(tokens)
  predicted_labels.append(predictions)

In [None]:
for i in range(1, 500):
  print(sentences_list_old[i])
  print(tokenized_sentences[i])
  print(predicted_labels[i])
  print()

Vgl. Entsch. des Obertribunals Bd. 41 S. 255.
['Vgl', '.', 'Entsch', '.', 'des', 'Obertribunals', 'Bd', '.', '41', 'S', '.', '255', '.']
['B-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation']

vgl. Savigny, Besitz, Einleitung S. LXXII
['vgl', '.', 'Savigny', ',', 'Besitz', ',', 'Einleitung', 'S', '.', 'LXXII']
['O', 'O', 'B-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation']

Simon u. Strampff, Rechtsprechung Bd. I S. 221
['Simon', 'u', '.', 'Strampff', ',', 'Rechtsprechung', 'Bd', '.', 'I', 'S', '.', '221']
['B-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation', 'I-citation']

Entscheidungen Bd. 10 S. 162
['Entscheidungen', 'Bd', '.', '10', 'S', '.', '162']
['B-citation', 'I-citation', 'I-citation', 'I-cit

In [None]:
data = {
    "texts": sentences_list_old,
    "tokenized texts": tokenized_sentences,
    "predicted labels": predicted_labels
}

with open("old_documents_just_citation_predictions.json", "w") as file:
    json.dump(data, file, indent=4)