# Setup 

In [1]:
!pip install transformers[torch] scikit-learn pandas




[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import Dataset
from transformers import Trainer, TrainingArguments
import json
from collections import Counter
from sklearn.metrics import f1_score
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Let us load our data

In [3]:
def load_jsonl(file_path):
    """
    Load a JSONL file and return a list of JSON objects.
    :param file_path: str, path to the JSONL file
    :return: list of dicts, each representing a JSON object
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line))
    return data

In [4]:
train_file_path = 'data_germeval/train.jsonl'
dev_file_path = 'data_germeval/development.jsonl'
test_file_path = 'data_germeval/test.jsonl'

In [5]:
train_data = load_jsonl(train_file_path)
dev_data = load_jsonl(dev_file_path)
test_data = load_jsonl(test_file_path)

In [6]:
train_data

[{'id': 'bc55d8e060148f4bbf1204ecf7ecb2d3',
  'text': 'Darum ist also die grüninin die betroffenen Gebiete gefahren, warum eigentlich?',
  'annotations': [{'user': 'A002', 'label': '0-Kein'},
   {'user': 'A008', 'label': '0-Kein'},
   {'user': 'A004', 'label': '0-Kein'},
   {'user': 'A010', 'label': '3-Stark'},
   {'user': 'A012', 'label': '2-Vorhanden'}]},
 {'id': '43f768c41d2d58f4290640f7eee3bd27',
  'text': 'wieder ein zuckerl für erdogan ...\nund die hiesige "willst du watsche mit fuss" fraktion.\n\naber hauptsache alle anderen (wesentlich besser integrierten "ausländer" müssen alle paar monate nach hause fahren um ihre arbeitserlaubnis zu behalten!\n\nUND, falls jetzt der einwand kommt unsere türkischen freunde wären ja alles österreichische staatsbürger ...\n\ndas mag schon so sein. aber integrationswillen ist von 95 % dieser menschen überhaupt keiner zu sehen!\n\nstimmt nicht? man besuche die wiener "arbeiterbezirke" und die wiener schulen ...\n\nJETZT kommt von den rot bewerter

In [7]:
dev_data

[{'id': '56787b0c4d8200c38864bfdabc745b55',
  'text': 'Das ist ein richtig gutes Portrait von Greta!',
  'annotations': [{'user': 'A002', 'label': '0-Kein'},
   {'user': 'A008', 'label': '0-Kein'},
   {'user': 'A003', 'label': '0-Kein'},
   {'user': 'A012', 'label': '0-Kein'},
   {'user': 'A010', 'label': '0-Kein'}]},
 {'id': '4231ccfc3e386e806633c52988287b1a',
  'text': 'bei den dort üblichen kalaschnikows wärs eher eine zahl mit ein paar nullen mehr ...',
  'annotations': [{'user': 'A002', 'label': '0-Kein'},
   {'user': 'A009', 'label': '0-Kein'},
   {'user': 'A003', 'label': '0-Kein'},
   {'user': 'A012', 'label': '0-Kein'},
   {'user': 'A010', 'label': '0-Kein'}]},
 {'id': 'e0d93773e5e2bf204b665da37fd72547',
  'text': 'Nein.Es war eine Single-Börse,  die den ganzen deutschsprachigen Raum umfasst- ich glaube Lovescout. Hat aber nur zu virtuellen Verbindungen geführt- eben weil die betreffenden Männer hunderte Kilometer von mir entfernt gewohnt haben.Und dann die Websingles mit 2 wi

In [8]:
test_data

[{'id': '37a662ddab92f0700bd10631527932a6',
  'text': 'Stellt sich die Frage, warum Sie junge Frau keine Kinder haben wollen?\nTragen Sie Erbkrankheiten in sich?\nHatten Sie eine schlimme Kindheit?\nArgumentieren Sie so, weil Sie Ihrem Freund nicht trauen?\nWarum "trauen" Sie sich nicht?',
  'annotators': ['A002', 'A007', 'A003', 'A010', 'A012']},
 {'id': '6c6801010ff226351956984cbd930c25',
  'text': 'sorry, aber das betreffende tv-bild ist dermassen harmlos, einfach nur weite schwarze shorts mit handabdrücken drauf. wenn sie das vermeiden will, könnte sie doch einfach welche in einer anderen farbe tragen, dann sieht man genau nix.seltsam ist eher, dass der kletterverband auf sowas eingeht, aber man will halt in "metoo"-zeiten bloss keine angriffsfläche bieten.',
  'annotators': ['A002', 'A009', 'A010', 'A012']},
 {'id': '4ee48355cfa329927cf5158918d560cc',
  'text': 'Wer ist jetzt die Krähe und wer die Taube? Ist Mac schön anzusehen aber unnütz, eher träge und kackt alles voll und Wind

# Let us define the functions that compute the different labels for the germeval task

Version "bin_maj"

In [9]:
def assign_bin_maj(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and computes 1 if a majority of annotators assigned a label other than 0-Kein, predicts 0 if a majority assigned 0-Kein. If there was no majority, either label is considered correct for evaluation.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: dictionary of the form {'id': , 'text': , 'label': }
    """
    text = item['text']
    text = text.replace('\n', ' ')
    if not is_test:
        labels = [ann['label'] for ann in item['annotations']]
        label_counts = Counter(labels)
        majority_label, majority_count = label_counts.most_common(1)[0]
        bin_maj_label = 1 if majority_label != '0-Kein' else 0
    else:
        bin_maj_label = None
    return {'id': item['id'], 'text': text, 'label': bin_maj_label}
    

Version "bin_one"

In [10]:
def assign_bin_one(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and computes 1 if at least one annotator assigned a label other than 0-Kein, 0 otherwise.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: dictionary of the form {'id': , 'text': , 'label': }
    """
    text = item['text']
    text = text.replace('\n', ' ')
    if not is_test:
        bin_one_label = 1 if any(ann['label'] != '0-Kein' for ann in item['annotations']) else 0
    else:
        bin_one_label = None
    return {'id': item['id'], 'text': text, 'label': bin_one_label}

Version "bin_all"

In [11]:
def assign_bin_all(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and computes 1 if all annotators assigned labels other than 0-Kein, 0 otherwise.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: dictionary of the form {'id': , 'text': , 'label': }
    """
    text = item['text']
    text = text.replace('\n', ' ')
    if not is_test:
        bin_all_label = 1 if all(ann['label'] != '0-Kein' for ann in item['annotations']) else 0
    else:
        bin_all_label = None
    return {'id': item['id'], 'text': text, 'label': bin_all_label}

Version "multi_maj"

In [12]:
def assign_multi_maj(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and predicts the majority label if there is one, if there is no majority label, any of the labels assigned is counted as a correct prediction for evaluation.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: dictionary of the form {'id': , 'text': , 'label': }
    """
    text = item['text']
    text = text.replace('\n', ' ')
    if not is_test:
        labels = [ann['label'] for ann in item['annotations']]
        label_counts = Counter(labels)
        majority_label, majority_count = label_counts.most_common(1)[0]
        multi_maj_label = majority_label if majority_count > len(labels) / 2 else labels[0]
        multi_maj_label = int(multi_maj_label.split('-')[0])
    else:
        multi_maj_label = None
    return {'id': item['id'], 'text': text, 'label': multi_maj_label}

Version "disagree bin"

In [13]:
def assign_disagree_bin(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and predicts 1 if there is a disagreement between annotators on 0-Kein versus all other labels and 0 otherwise.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: dictionary of the form {'id': , 'text': , 'label': }
    """
    text = item['text']
    text = text.replace('\n', ' ')
    if not is_test:
        labels = [ann['label'] for ann in item['annotations']]
        unique_labels = set(labels)
        disagree_bin_label = 1 if '0-Kein' in unique_labels and len(unique_labels) > 1 else 0
    else:
        disagree_bin_label = None
    return {'id': item['id'], 'text': text, 'label': disagree_bin_label}

# Let us not define the function, that transforms our data into the suitable objects, i.e., huggingface datasets

In [14]:
def transform(func, data, is_test=False):
    """
    Computes a particular label for a whole set of data
    :param func: one of the five functions defined above
    :param data: list of dictionaries
    :param is_test: if False annotations are available. If True not
    :return: huggingface dataset  
    """
    transformed_data = []
    for item in data:
        transformed_data.append(func(item, is_test))
    return Dataset.from_list(transformed_data)

# Now we have to load the tokenizer and then we can train our models

In [15]:
model_name = "google-bert/bert-base-german-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

In [16]:
def tokenize_seqs(examples):
    return tokenizer(
        examples["text"],
        truncation=True,   # or padding=True
        max_length=512
    )

do we want to train?

In [17]:
training = False

# Preparing for training

Let's define our metrics for evaluation:

In [18]:
def compute_metrics(eval_preds):
    preds = eval_preds.predictions.argmax(-1)
    lbls = eval_preds.label_ids
    f1 = f1_score(lbls, preds, average='weighted')
    return {'f1': f1}

# Let's set up our bin_maj data and model:

In [19]:
tokenized_train_bin_maj = transform(assign_bin_maj, train_data).map(tokenize_seqs, batched=True)
tokenized_dev_bin_maj = transform(assign_bin_maj, dev_data).map(tokenize_seqs, batched=True)
tokenized_test_bin_maj = transform(assign_bin_maj, test_data, is_test= True).map(tokenize_seqs, batched=True)

Map: 100%|██████████| 3588/3588 [00:02<00:00, 1452.12 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 685.52 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1008.24 examples/s]


In [20]:
model_bin_maj = BertForSequenceClassification.from_pretrained("google-bert/bert-base-german-cased", num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
training_args_bin_maj= TrainingArguments(
    output_dir='./logs/run_final_bin_maj/',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=5e-3,
    num_train_epochs=3,
    logging_steps= 30,
    save_strategy='epoch',
    eval_strategy='epoch',
    save_total_limit=1,
    seed=42,
    data_seed=42,
    fp16=True,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  
    dataloader_pin_memory=False
)

In [22]:
trainer_bin_maj = Trainer(
    model=model_bin_maj,
    args=training_args_bin_maj,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_maj,
    eval_dataset=tokenized_dev_bin_maj,
    processing_class = tokenizer
)

In [23]:
if training:
    trainer_bin_maj.train()
    model_path = "models/bin_maj_model"
    trainer_bin_maj.save_model(model_path)

# Let's set up our bin_one data and model:

In [24]:
tokenized_train_bin_one = transform(assign_bin_one, train_data).map(tokenize_seqs, batched=True)
tokenized_dev_bin_one = transform(assign_bin_one, dev_data).map(tokenize_seqs, batched=True)
tokenized_test_bin_one = transform(assign_bin_one, test_data, is_test=True).map(tokenize_seqs, batched=True)

Map: 100%|██████████| 3588/3588 [00:02<00:00, 1408.63 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1985.19 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 2044.94 examples/s]


In [25]:
model_bin_one = BertForSequenceClassification.from_pretrained("google-bert/bert-base-german-cased", num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
training_args_bin_one= TrainingArguments(
    output_dir='./logs/run_final_bin_one/',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=5e-3,
    num_train_epochs=3,
    logging_steps= 30,
    save_strategy='epoch',
    eval_strategy='epoch',
    save_total_limit=1,
    seed=42,
    data_seed=42,
    fp16=True,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  
    dataloader_pin_memory=False
)

In [27]:
trainer_bin_one = Trainer(
    model=model_bin_one,
    args=training_args_bin_one,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_one,
    eval_dataset=tokenized_dev_bin_one,
    processing_class = tokenizer
)

In [28]:
if training:
    trainer_bin_one.train()
    model_path = "models/bin_one_model"
    trainer_bin_one.save_model(model_path)

# Let's set up our bin_all data and model:

In [29]:
tokenized_train_bin_all = transform(assign_bin_all, train_data).map(tokenize_seqs, batched=True)
tokenized_dev_bin_all = transform(assign_bin_all, dev_data).map(tokenize_seqs, batched=True)
tokenized_test_bin_all = transform(assign_bin_all, test_data, is_test=True).map(tokenize_seqs, batched=True)

Map: 100%|██████████| 3588/3588 [00:02<00:00, 1387.87 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1191.41 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1678.94 examples/s]


In [30]:
model_bin_all = BertForSequenceClassification.from_pretrained("google-bert/bert-base-german-cased", num_labels=2)
training_args_bin_all = TrainingArguments(
    output_dir='./logs/run_final_bin_all/',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=5e-3,
    num_train_epochs=3,
    logging_steps=30,
    save_strategy='epoch',
    eval_strategy='epoch',
    save_total_limit=1,
    seed=42,
    data_seed=42,
    fp16=True,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    dataloader_pin_memory=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
trainer_bin_all = Trainer(
    model=model_bin_all,
    args=training_args_bin_all,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_all,
    eval_dataset=tokenized_dev_bin_all,
    processing_class=tokenizer
)

In [32]:
if training:
    trainer_bin_all.train()
    model_path = "models/bin_all_model"
    trainer_bin_all.save_model(model_path)

# Let's set up our multi_maj data and model:

In [33]:
tokenized_train_multi_maj = transform(assign_multi_maj, train_data).map(tokenize_seqs, batched=True)
tokenized_dev_multi_maj = transform(assign_multi_maj, dev_data).map(tokenize_seqs, batched=True)
tokenized_test_multi_maj = transform(assign_multi_maj, test_data, is_test=True).map(tokenize_seqs, batched=True)

Map: 100%|██████████| 3588/3588 [00:01<00:00, 1947.05 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1389.22 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1279.60 examples/s]


In [34]:
model_multi_maj = BertForSequenceClassification.from_pretrained("google-bert/bert-base-german-cased", num_labels=5)
training_args_multi_maj = TrainingArguments(
    output_dir='./logs/run_final_multi_maj/',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=5e-3,
    num_train_epochs=3,
    logging_steps=30,
    save_strategy='epoch',
    eval_strategy='epoch',
    save_total_limit=1,
    seed=42,
    data_seed=42,
    fp16=True,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    dataloader_pin_memory=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
trainer_multi_maj = Trainer(
    model=model_multi_maj,
    args=training_args_multi_maj,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_multi_maj,
    eval_dataset=tokenized_dev_multi_maj,
    processing_class=tokenizer
)

In [36]:
if training:
    trainer_multi_maj.train()
    model_path = "models/multi_maj_model"
    trainer_multi_maj.save_model(model_path)

# Let's set up our disagree_bin data and model:

In [37]:
tokenized_train_disagree_bin = transform(assign_disagree_bin, train_data).map(tokenize_seqs, batched=True)
tokenized_dev_disagree_bin = transform(assign_disagree_bin, dev_data).map(tokenize_seqs, batched=True)
tokenized_test_disagree_bin = transform(assign_disagree_bin, test_data, is_test=True).map(tokenize_seqs, batched=True)

Map: 100%|██████████| 3588/3588 [00:02<00:00, 1757.55 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1156.88 examples/s]
Map: 100%|██████████| 449/449 [00:00<00:00, 1840.66 examples/s]


In [38]:
model_disagree_bin = BertForSequenceClassification.from_pretrained("google-bert/bert-base-german-cased", num_labels=2)
training_args_disagree_bin = TrainingArguments(
    output_dir='./logs/run_final_disagree_bin/',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=5e-3,
    num_train_epochs=3,
    logging_steps=30,
    save_strategy='epoch',
    eval_strategy='epoch',
    save_total_limit=1,
    seed=42,
    data_seed=42,
    fp16=True,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    dataloader_pin_memory=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
trainer_disagree_bin = Trainer(
    model=model_disagree_bin,
    args=training_args_disagree_bin,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_disagree_bin,
    eval_dataset=tokenized_dev_disagree_bin,
    processing_class=tokenizer
)

In [40]:
if training:
    trainer_disagree_bin.train()
    model_path = "models/disagree_bin_model"
    trainer_disagree_bin.save_model(model_path)

# Let us make some predictions!

# First we have to load the different models

In [41]:
model_path = "models/bin_maj_model"
model_bin_maj = BertForSequenceClassification.from_pretrained(model_path)

# rebuild the Trainer (with same args/tokenizer you used before)
trainer_bin_maj = Trainer(
    model=model_bin_maj,
    args=training_args_bin_maj,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_maj,
    eval_dataset=tokenized_dev_bin_maj,
    processing_class = tokenizer
)

model_path = "models/bin_all_model"
model_bin_all = BertForSequenceClassification.from_pretrained(model_path)

# rebuild the Trainer (with same args/tokenizer you used before)
trainer_bin_all = Trainer(
    model=model_bin_all,
    args=training_args_bin_all,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_all,
    eval_dataset=tokenized_dev_bin_all,
    processing_class = tokenizer
)

model_path = "models/bin_one_model"
model_bin_one = BertForSequenceClassification.from_pretrained(model_path)

# rebuild the Trainer (with same args/tokenizer you used before)
trainer_bin_one = Trainer(
    model=model_bin_one,
    args=training_args_bin_one,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_bin_one,
    eval_dataset=tokenized_dev_bin_one,
    processing_class = tokenizer
)

model_path = "models/multi_maj_model"
model_multi_maj = BertForSequenceClassification.from_pretrained(model_path)

# rebuild the Trainer (with same args/tokenizer you used before)
trainer_multi_maj = Trainer(
    model=model_multi_maj,
    args=training_args_multi_maj,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_multi_maj,
    eval_dataset=tokenized_dev_multi_maj,
    processing_class = tokenizer
)

model_path = "models/disagree_bin_model"
model_disagree_bin = BertForSequenceClassification.from_pretrained(model_path)

# rebuild the Trainer (with same args/tokenizer you used before)
trainer_disagree_bin = Trainer(
    model=model_disagree_bin,
    args=training_args_disagree_bin,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_disagree_bin,
    eval_dataset=tokenized_dev_disagree_bin,
    processing_class = tokenizer
)

# Now we can make some predictions

In [42]:
def determine_label(prediction):
  labels = []
  for score in prediction[0]:
    labels.append(np.where(score == max(score))[0][0])
  return labels

# First let's check the dev sets, since here we have reference data (though there is actually no need to do this, as we already know the outcome - but good to see it works)

In [None]:
dev_predictions_bin_maj = trainer_bin_maj.predict(test_dataset=tokenized_dev_bin_maj)
dev_predictions_bin_one = trainer_bin_one.predict(test_dataset=tokenized_dev_bin_one)
dev_predictions_bin_all = trainer_bin_all.predict(test_dataset=tokenized_dev_bin_all)
dev_predictions_multi_maj = trainer_multi_maj.predict(test_dataset=tokenized_dev_multi_maj)
dev_predictions_disagree_bin = trainer_disagree_bin.predict(test_dataset=tokenized_dev_disagree_bin)

In [35]:
print(f"Dev set F1 score Bin Maj: {dev_predictions_bin_maj.metrics['test_f1']:.4f}")
print(f"Dev set F1 score Bin One: {dev_predictions_bin_one.metrics['test_f1']:.4f}")
print(f"Dev set F1 score Bin All: {dev_predictions_bin_all.metrics['test_f1']:.4f}")
print(f"Dev set F1 score Multi Maj: {dev_predictions_multi_maj.metrics['test_f1']:.4f}")
print(f"Dev set F1 score Disagree Bin: {dev_predictions_disagree_bin.metrics['test_f1']:.4f}")

Dev set F1 score Bin Maj: 0.7610
Dev set F1 score Bin One: 0.7572
Dev set F1 score Bin All: 0.8308
Dev set F1 score Multi Maj: 0.6532
Dev set F1 score Disagree Bin: 0.6741


# Let's make predictions for the testset now. 

In [47]:
tokenized_test_bin_maj = tokenized_test_bin_maj.remove_columns(['label'])
test_predictions_bin_maj = trainer_bin_maj.predict(test_dataset=tokenized_test_bin_maj)
tokenized_test_bin_one = tokenized_test_bin_one.remove_columns(['label'])
test_predictions_bin_one = trainer_bin_one.predict(test_dataset=tokenized_test_bin_one)
tokenized_test_bin_all = tokenized_test_bin_all.remove_columns(['label'])
test_predictions_bin_all = trainer_bin_all.predict(test_dataset=tokenized_test_bin_all)
tokenized_test_multi_maj = tokenized_test_multi_maj.remove_columns(['label'])
test_predictions_multi_maj = trainer_multi_maj.predict(test_dataset=tokenized_test_multi_maj)
tokenized_test_disagree_bin = tokenized_test_disagree_bin.remove_columns(['label'])
test_predictions_disagree_bin = trainer_disagree_bin.predict(test_dataset=tokenized_test_disagree_bin)

In [54]:
def determine_label(prediction):
  labels = []
  for score in prediction[0]:
    labels.append(np.where(score == max(score))[0][0])
  return labels

In [55]:
predicted_bin_maj_labels_test_set = determine_label(test_predictions_bin_maj)
predicted_bin_one_labels_test_set = determine_label(test_predictions_bin_one)
predicted_bin_all_labels_test_set = determine_label(test_predictions_bin_all)
predicted_multi_maj_labels_test_set = determine_label(test_predictions_multi_maj)
predicted_disagree_bin_labels_test_set = determine_label(test_predictions_disagree_bin)

In [56]:
predicted_multi_maj_labels_test_set_with_label =[['0-Kein', '1-Gering', '2-Vorhanden', '3-Stark', '4-Extrem'][i] for i in predicted_multi_maj_labels_test_set]

In [59]:
import csv

id_test = tokenized_test_bin_maj['id']

rows = zip(id_test,
           predicted_bin_maj_labels_test_set,
           predicted_bin_one_labels_test_set,
           predicted_bin_all_labels_test_set,
           predicted_multi_maj_labels_test_set_with_label,
           predicted_disagree_bin_labels_test_set)

header = ["id", "bin_maj", "bin_one", "bin_all", "multi_maj", "disagree_bin"]

# Write to CSV file
with open('test.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Write the header
    csvwriter.writerow(header)

    # Write the data
    csvwriter.writerows(rows)