# Setup 

In [None]:
!pip install transformers[torch] datasets scikit-learn pandas numpy

In [52]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from collections import Counter
from sklearn.metrics import f1_score
import numpy as np
from pathlib import Path

In [None]:
from functions import load_jsonl

# Global Variables


In [None]:
# Data
train_file_path = Path('data_germeval/train.jsonl')
dev_file_path = Path('data_germeval/development.jsonl')
test_file_path = Path('data_germeval/test.jsonl')

# Model 
model_name = 'google-bert/bert-base-german-cased'

# Log directories
output_dir_bin_maj = Path('./logs/run_final_bin_maj/')
output_dir_bin_one = Path('./logs/run_final_bin_one/')
output_dir_bin_all = Path('./logs/run_final_bin_all/')
output_dir_multi_maj = Path('./logs/run_final_multi_maj/')
output_dir_disagree_bin = Path('./logs/run_final_disagree_bin/')

# Model directories
model_path_bin_maj = Path("models/bin_maj_model")
model_path_bin_one = Path("models/bin_one_model")
model_path_bin_all = Path("models/bin_all_model")
model_path_multi_maj = Path("models/multi_maj_model")
model_path_disagree_bin = Path("models/disagree_bin_model")

#do we want to do the training?
run_this = False

# Let us load our data

In [5]:
train_data = load_jsonl(train_file_path)
dev_data = load_jsonl(dev_file_path)
test_data = load_jsonl(test_file_path)

# Let us define the functions that compute the different labels for the germeval task

Version "bin_maj"

In [9]:
def assign_bin_maj(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and computes 1 if a majority of annotators assigned a label other than 0-Kein, predicts 0 if a majority assigned 0-Kein. If there was no majority, either label is considered correct for evaluation.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: dictionary of the form {'id': , 'text': , 'label': }
    """
    text = item['text']
    text = text.replace('\n', ' ')
    if not is_test:
        labels = [ann['label'] for ann in item['annotations']]
        label_counts = Counter(labels)
        majority_label, majority_count = label_counts.most_common(1)[0]
        bin_maj_label = 1 if majority_label != '0-Kein' else 0
    else:
        bin_maj_label = None
    return {'id': item['id'], 'text': text, 'label': bin_maj_label}
    

Version "bin_one"

In [10]:
def assign_bin_one(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and computes 1 if at least one annotator assigned a label other than 0-Kein, 0 otherwise.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: dictionary of the form {'id': , 'text': , 'label': }
    """
    text = item['text']
    text = text.replace('\n', ' ')
    if not is_test:
        bin_one_label = 1 if any(ann['label'] != '0-Kein' for ann in item['annotations']) else 0
    else:
        bin_one_label = None
    return {'id': item['id'], 'text': text, 'label': bin_one_label}

Version "bin_all"

In [11]:
def assign_bin_all(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and computes 1 if all annotators assigned labels other than 0-Kein, 0 otherwise.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: dictionary of the form {'id': , 'text': , 'label': }
    """
    text = item['text']
    text = text.replace('\n', ' ')
    if not is_test:
        bin_all_label = 1 if all(ann['label'] != '0-Kein' for ann in item['annotations']) else 0
    else:
        bin_all_label = None
    return {'id': item['id'], 'text': text, 'label': bin_all_label}

Version "multi_maj"

In [12]:
def assign_multi_maj(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and predicts the majority label if there is one, if there is no majority label, any of the labels assigned is counted as a correct prediction for evaluation.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: dictionary of the form {'id': , 'text': , 'label': }
    """
    text = item['text']
    text = text.replace('\n', ' ')
    if not is_test:
        labels = [ann['label'] for ann in item['annotations']]
        label_counts = Counter(labels)
        majority_label, majority_count = label_counts.most_common(1)[0]
        multi_maj_label = majority_label if majority_count > len(labels) / 2 else labels[0]
        multi_maj_label = int(multi_maj_label.split('-')[0])
    else:
        multi_maj_label = None
    return {'id': item['id'], 'text': text, 'label': multi_maj_label}

Version "disagree bin"

In [13]:
def assign_disagree_bin(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and predicts 1 if there is a disagreement between annotators on 0-Kein versus all other labels and 0 otherwise.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: dictionary of the form {'id': , 'text': , 'label': }
    """
    text = item['text']
    text = text.replace('\n', ' ')
    if not is_test:
        labels = [ann['label'] for ann in item['annotations']]
        unique_labels = set(labels)
        disagree_bin_label = 1 if '0-Kein' in unique_labels and len(unique_labels) > 1 else 0
    else:
        disagree_bin_label = None
    return {'id': item['id'], 'text': text, 'label': disagree_bin_label}

# Let us not define the function, that transforms our data into the suitable objects, i.e., huggingface datasets

In [14]:
def transform(func, data, is_test=False):
    """
    Computes a particular label for a whole set of data
    :param func: one of the five functions defined above
    :param data: list of dictionaries
    :param is_test: if False annotations are available. If True not
    :return: huggingface dataset  
    """
    transformed_data = []
    for item in data:
        transformed_data.append(func(item, is_test))
    return Dataset.from_list(transformed_data)

# Now we have to load the tokenizer and then we can train our models

In [15]:
tokenizer = BertTokenizer.from_pretrained(model_name)

In [16]:
def tokenize_seqs(examples):
    return tokenizer(
        examples["text"],
        truncation=True,   # or padding=True
        max_length=512
    )

do we want to train?

# Preparing for training

Let's define our metrics for evaluation:

In [18]:
def compute_metrics(eval_preds):
    preds = eval_preds.predictions.argmax(-1)
    lbls = eval_preds.label_ids
    f1 = f1_score(lbls, preds, average='weighted')
    return {'f1': f1}

# Let's set up our bin_maj data and model:

In [50]:
tokenized_train_bin_maj = transform(assign_bin_maj, train_data).map(tokenize_seqs, batched=True)
tokenized_dev_bin_maj = transform(assign_bin_maj, dev_data).map(tokenize_seqs, batched=True)
tokenized_test_bin_maj = transform(assign_bin_maj, test_data, is_test= True).map(tokenize_seqs, batched=True)

Map: 100%|██████████| 3588/3588 [00:02<00:00, 1330.33 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 2005.72 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1952.15 examples/s]


In [20]:
model_bin_maj = BertForSequenceClassification.from_pretrained(model_name, num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
training_args_bin_maj= TrainingArguments(
    output_dir=output_dir_bin_maj,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=5e-3,
    num_train_epochs=3,
    logging_steps= 30,
    save_strategy='epoch',
    eval_strategy='epoch',
    save_total_limit=1,
    seed=42,
    data_seed=42,
    fp16=True,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  
    dataloader_pin_memory=False
)

In [22]:
trainer_bin_maj = Trainer(
    model=model_bin_maj,
    args=training_args_bin_maj,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_maj,
    eval_dataset=tokenized_dev_bin_maj,
    processing_class = tokenizer
)

In [23]:
if run_this:
    trainer_bin_maj.train()
    trainer_bin_maj.save_model(model_path_bin_maj)

# Let's set up our bin_one data and model:

In [51]:
tokenized_train_bin_one = transform(assign_bin_one, train_data).map(tokenize_seqs, batched=True)
tokenized_dev_bin_one = transform(assign_bin_one, dev_data).map(tokenize_seqs, batched=True)
tokenized_test_bin_one = transform(assign_bin_one, test_data, is_test=True).map(tokenize_seqs, batched=True)

Map: 100%|██████████| 3588/3588 [00:02<00:00, 1216.63 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1796.64 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1282.16 examples/s]


In [25]:
model_bin_one = BertForSequenceClassification.from_pretrained(model_name, num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
training_args_bin_one= TrainingArguments(
    output_dir=output_dir_bin_one,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=5e-3,
    num_train_epochs=3,
    logging_steps= 30,
    save_strategy='epoch',
    eval_strategy='epoch',
    save_total_limit=1,
    seed=42,
    data_seed=42,
    fp16=True,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  
    dataloader_pin_memory=False
)

In [27]:
trainer_bin_one = Trainer(
    model=model_bin_one,
    args=training_args_bin_one,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_one,
    eval_dataset=tokenized_dev_bin_one,
    processing_class = tokenizer
)

In [28]:
if run_this:
    trainer_bin_one.train()
    trainer_bin_one.save_model(model_path_bin_one)

# Let's set up our bin_all data and model:

In [29]:
tokenized_train_bin_all = transform(assign_bin_all, train_data).map(tokenize_seqs, batched=True)
tokenized_dev_bin_all = transform(assign_bin_all, dev_data).map(tokenize_seqs, batched=True)
tokenized_test_bin_all = transform(assign_bin_all, test_data, is_test=True).map(tokenize_seqs, batched=True)

Map: 100%|██████████| 3588/3588 [00:02<00:00, 1387.87 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1191.41 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1678.94 examples/s]


In [30]:
model_bin_all = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
training_args_bin_all = TrainingArguments(
    output_dir=output_dir_bin_all,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=5e-3,
    num_train_epochs=3,
    logging_steps=30,
    save_strategy='epoch',
    eval_strategy='epoch',
    save_total_limit=1,
    seed=42,
    data_seed=42,
    fp16=True,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    dataloader_pin_memory=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
trainer_bin_all = Trainer(
    model=model_bin_all,
    args=training_args_bin_all,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_all,
    eval_dataset=tokenized_dev_bin_all,
    processing_class=tokenizer
)

In [32]:
if run_this:
    trainer_bin_all.train()
    trainer_bin_all.save_model(model_path_bin_all)

# Let's set up our multi_maj data and model:

In [33]:
tokenized_train_multi_maj = transform(assign_multi_maj, train_data).map(tokenize_seqs, batched=True)
tokenized_dev_multi_maj = transform(assign_multi_maj, dev_data).map(tokenize_seqs, batched=True)
tokenized_test_multi_maj = transform(assign_multi_maj, test_data, is_test=True).map(tokenize_seqs, batched=True)

Map: 100%|██████████| 3588/3588 [00:01<00:00, 1947.05 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1389.22 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1279.60 examples/s]


In [34]:
model_multi_maj = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)
training_args_multi_maj = TrainingArguments(
    output_dir=output_dir_multi_maj,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=5e-3,
    num_train_epochs=3,
    logging_steps=30,
    save_strategy='epoch',
    eval_strategy='epoch',
    save_total_limit=1,
    seed=42,
    data_seed=42,
    fp16=True,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    dataloader_pin_memory=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
trainer_multi_maj = Trainer(
    model=model_multi_maj,
    args=training_args_multi_maj,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_multi_maj,
    eval_dataset=tokenized_dev_multi_maj,
    processing_class=tokenizer
)

In [36]:
if run_this:
    trainer_multi_maj.train()
    trainer_multi_maj.save_model(model_path_multi_maj)

# Let's set up our disagree_bin data and model:

In [37]:
tokenized_train_disagree_bin = transform(assign_disagree_bin, train_data).map(tokenize_seqs, batched=True)
tokenized_dev_disagree_bin = transform(assign_disagree_bin, dev_data).map(tokenize_seqs, batched=True)
tokenized_test_disagree_bin = transform(assign_disagree_bin, test_data, is_test=True).map(tokenize_seqs, batched=True)

Map: 100%|██████████| 3588/3588 [00:02<00:00, 1757.55 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1156.88 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1840.66 examples/s]


In [38]:
model_disagree_bin = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
training_args_disagree_bin = TrainingArguments(
    output_dir=output_dir_disagree_bin,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=5e-3,
    num_train_epochs=3,
    logging_steps=30,
    save_strategy='epoch',
    eval_strategy='epoch',
    save_total_limit=1,
    seed=42,
    data_seed=42,
    fp16=True,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    dataloader_pin_memory=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
trainer_disagree_bin = Trainer(
    model=model_disagree_bin,
    args=training_args_disagree_bin,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_disagree_bin,
    eval_dataset=tokenized_dev_disagree_bin,
    processing_class=tokenizer
)

In [40]:
if run_this:
    trainer_disagree_bin.train()
    trainer_disagree_bin.save_model(model_path_disagree_bin)

# Let us make some predictions!

# First we have to load the different models

In [41]:
model_bin_maj = BertForSequenceClassification.from_pretrained(model_path_bin_maj)

trainer_bin_maj = Trainer(
    model=model_bin_maj,
    args=training_args_bin_maj,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_maj,
    eval_dataset=tokenized_dev_bin_maj,
    processing_class = tokenizer
)

model_bin_all = BertForSequenceClassification.from_pretrained(model_path_bin_all)

trainer_bin_all = Trainer(
    model=model_bin_all,
    args=training_args_bin_all,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_all,
    eval_dataset=tokenized_dev_bin_all,
    processing_class = tokenizer
)

model_bin_one = BertForSequenceClassification.from_pretrained(model_path_bin_one)

trainer_bin_one = Trainer(
    model=model_bin_one,
    args=training_args_bin_one,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_one,
    eval_dataset=tokenized_dev_bin_one,
    processing_class = tokenizer
)

model_multi_maj = BertForSequenceClassification.from_pretrained(model_path_multi_maj)

trainer_multi_maj = Trainer(
    model=model_multi_maj,
    args=training_args_multi_maj,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_multi_maj,
    eval_dataset=tokenized_dev_multi_maj,
    processing_class = tokenizer
)

model_disagree_bin = BertForSequenceClassification.from_pretrained(model_path_disagree_bin)

trainer_disagree_bin = Trainer(
    model=model_disagree_bin,
    args=training_args_disagree_bin,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_disagree_bin,
    eval_dataset=tokenized_dev_disagree_bin,
    processing_class = tokenizer
)

# Now we can make some predictions

In [42]:
def determine_label(prediction):
  labels = []
  for score in prediction[0]:
    labels.append(np.where(score == max(score))[0][0])
  return labels

# First let's check the dev sets, since here we have reference data (though there is actually no need to do this, as we already know the outcome - but good to see it works)

In [43]:
dev_predictions_bin_maj = trainer_bin_maj.predict(test_dataset=tokenized_dev_bin_maj)
dev_predictions_bin_one = trainer_bin_one.predict(test_dataset=tokenized_dev_bin_one)
dev_predictions_bin_all = trainer_bin_all.predict(test_dataset=tokenized_dev_bin_all)
dev_predictions_multi_maj = trainer_multi_maj.predict(test_dataset=tokenized_dev_multi_maj)
dev_predictions_disagree_bin = trainer_disagree_bin.predict(test_dataset=tokenized_dev_disagree_bin)

In [44]:
print(f"Dev set F1 score Bin Maj: {dev_predictions_bin_maj.metrics['test_f1']:.4f}")
print(f"Dev set F1 score Bin One: {dev_predictions_bin_one.metrics['test_f1']:.4f}")
print(f"Dev set F1 score Bin All: {dev_predictions_bin_all.metrics['test_f1']:.4f}")
print(f"Dev set F1 score Multi Maj: {dev_predictions_multi_maj.metrics['test_f1']:.4f}")
print(f"Dev set F1 score Disagree Bin: {dev_predictions_disagree_bin.metrics['test_f1']:.4f}")

Dev set F1 score Bin Maj: 0.7610
Dev set F1 score Bin One: 0.7572
Dev set F1 score Bin All: 0.8308
Dev set F1 score Multi Maj: 0.6532
Dev set F1 score Disagree Bin: 0.6741


# Let's make predictions for the testset now. 

In [45]:
tokenized_test_bin_maj = tokenized_test_bin_maj.remove_columns(['label'])
test_predictions_bin_maj = trainer_bin_maj.predict(test_dataset=tokenized_test_bin_maj)
tokenized_test_bin_one = tokenized_test_bin_one.remove_columns(['label'])
test_predictions_bin_one = trainer_bin_one.predict(test_dataset=tokenized_test_bin_one)
tokenized_test_bin_all = tokenized_test_bin_all.remove_columns(['label'])
test_predictions_bin_all = trainer_bin_all.predict(test_dataset=tokenized_test_bin_all)
tokenized_test_multi_maj = tokenized_test_multi_maj.remove_columns(['label'])
test_predictions_multi_maj = trainer_multi_maj.predict(test_dataset=tokenized_test_multi_maj)
tokenized_test_disagree_bin = tokenized_test_disagree_bin.remove_columns(['label'])
test_predictions_disagree_bin = trainer_disagree_bin.predict(test_dataset=tokenized_test_disagree_bin)

In [46]:
def determine_label(prediction):
  labels = []
  for score in prediction[0]:
    labels.append(np.where(score == max(score))[0][0])
  return labels

In [47]:
predicted_bin_maj_labels_test_set = determine_label(test_predictions_bin_maj)
predicted_bin_one_labels_test_set = determine_label(test_predictions_bin_one)
predicted_bin_all_labels_test_set = determine_label(test_predictions_bin_all)
predicted_multi_maj_labels_test_set = determine_label(test_predictions_multi_maj)
predicted_disagree_bin_labels_test_set = determine_label(test_predictions_disagree_bin)

In [48]:
predicted_multi_maj_labels_test_set_with_label =[['0-Kein', '1-Gering', '2-Vorhanden', '3-Stark', '4-Extrem'][i] for i in predicted_multi_maj_labels_test_set]

In [49]:
import csv

id_test = tokenized_test_bin_maj['id']

rows = zip(id_test,
           predicted_bin_maj_labels_test_set,
           predicted_bin_one_labels_test_set,
           predicted_bin_all_labels_test_set,
           predicted_multi_maj_labels_test_set_with_label,
           predicted_disagree_bin_labels_test_set)

header = ["id", "bin_maj", "bin_one", "bin_all", "multi_maj", "disagree_bin"]

# Write to CSV file
with open('test.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Write the header
    csvwriter.writerow(header)

    # Write the data
    csvwriter.writerows(rows)