## Setup

In [1]:
import numpy as np
from datasets import load_dataset
from datasets import load_metric
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [7]:
# Use only the first 500 training and first 50 validation examples of the
# supervised Few-NERD configuration (presumably to keep each run short).
few_nerd = ("DFKI-SLT/few-nerd", "supervised")
train_dataset = load_dataset(*few_nerd, split="train").select(range(500))
val_dataset = load_dataset(*few_nerd, split="validation").select(range(50))

# Tag names indexed by label id; displayed as the cell output.
label_names = train_dataset.features['ner_tags'].feature.names
label_names

['O',
 'art',
 'building',
 'event',
 'location',
 'organization',
 'other',
 'person',
 'product']

In [3]:
# Show training example 40 as (token, tag name) pairs.
sample = train_dataset[40]
tag_names = train_dataset.features['ner_tags'].feature.names
[(token, tag_names[tag_id])
 for token, tag_id in zip(sample['tokens'], sample['ner_tags'])]

[('This', 'O'),
 ('song', 'O'),
 ('was', 'O'),
 ('recorded', 'O'),
 ('in', 'O'),
 ('Puerto', 'location'),
 ('Rico', 'location'),
 ('at', 'O'),
 ('the', 'O'),
 ('Alpha', 'building'),
 ('Recording', 'building'),
 ('Studios', 'building'),
 ('.', 'O')]

In [4]:
# The datasets are already split into words and are tokenized with
# `is_split_into_words=True` below; the RoBERTa tokenizer is created with
# `add_prefix_space=True` for that use.
tokenizer = AutoTokenizer.from_pretrained(
    "roberta-base",
    add_prefix_space=True,
)

def tokenize_adjust_labels(all_samples_per_split):
    """Tokenize pre-split sentences and align word-level NER tags to sub-tokens.

    Args:
        all_samples_per_split: A batch exposing "tokens" (one list of words per
            example) and "ner_tags" (one list of per-word label ids per
            example), as supplied by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output with an extra "labels" entry. For every example it
        holds one label id per token position: the label of the word the token
        belongs to (repeated for every sub-token of that word), or -100 for
        positions that map to no word (special tokens and padding). -100 is the
        value `compute_metrics` filters out.
    """
    tokenized_samples = tokenizer(
        all_samples_per_split["tokens"],
        padding=True,
        max_length=512,
        truncation=True,
        is_split_into_words=True,
    )

    total_adjusted_labels = []
    for k, word_labels in enumerate(all_samples_per_split["ner_tags"]):
        word_ids_list = tokenized_samples.word_ids(batch_index=k)
        # Index the labels by the tokenizer's own word id rather than by a
        # running counter: if a word produces no sub-tokens the word ids skip
        # an index, and a counter would shift every following label by one.
        total_adjusted_labels.append(
            [-100 if wid is None else word_labels[wid] for wid in word_ids_list]
        )

    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

In [5]:
# Tokenize both subsets and attach the sub-token aligned "labels" column.
train_tokenized_dataset, val_tokenized_dataset = (
    split.map(tokenize_adjust_labels, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [6]:
# seqeval metric object; `compute_metrics` calls `metric.compute(...)` and
# reads its "overall_*" entries.
# NOTE(review): the captured output of this cell echoes this line, which looks
# like a warning raised by `load_metric` - presumably a deprecation notice;
# confirm against the installed `datasets` version.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into the overall seqeval scores.

    Args:
        p: A `(predictions, labels)` pair. `predictions` holds per-token class
            scores with the classes on axis 2; `labels` holds the reference
            label ids, with -100 marking positions to ignore.

    Returns:
        A dict with the "overall_precision", "overall_recall", "overall_f1"
        and "overall_accuracy" values reported by `metric.compute`.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (-100 marks ignored ones).
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_names[predicted] for predicted, _ in kept])
        true_labels.append([label_names[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    reported_keys = (
        "overall_precision",
        "overall_recall",
        "overall_f1",
        "overall_accuracy",
    )
    return {key: results[key] for key in reported_keys}

  metric = load_metric("seqeval")


## Baseline

In [9]:
# Baseline run: learning rate 2e-5, batch size 16, 10 epochs.
training_args = TrainingArguments(
    output_dir="./fine_tune_roberta_output_baseline",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    logging_steps=10,
)

# Seed before building the model: the classification head is newly initialised
# (see the warning printed below), so without this the run would depend on
# whatever RNG state previously executed cells left behind.
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_names),
    # Record the tag names in the model config so predictions and saved
    # checkpoints carry the real names instead of generic placeholder labels.
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.7894,0.691356,0.392857,0.371622,0.381944,0.806173
2,0.5327,0.417114,0.583333,0.614865,0.598684,0.878395
3,0.302,0.326383,0.634286,0.75,0.687307,0.911728
4,0.2465,0.302427,0.698225,0.797297,0.744479,0.920988
5,0.2211,0.272611,0.731707,0.810811,0.769231,0.924074
6,0.164,0.278643,0.762821,0.804054,0.782895,0.924691
7,0.1246,0.290512,0.751592,0.797297,0.77377,0.924074
8,0.1183,0.307475,0.780645,0.817568,0.79868,0.928395
9,0.0949,0.304871,0.759494,0.810811,0.784314,0.926543
10,0.0867,0.314146,0.759494,0.810811,0.784314,0.925309


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=320, training_loss=0.3002376567572355, metrics={'train_runtime': 97.9775, 'train_samples_per_second': 51.032, 'train_steps_per_second': 3.266, 'total_flos': 265396313520000.0, 'train_loss': 0.3002376567572355, 'epoch': 10.0})

## Hyperparameter tests

In [None]:
# Experiment: higher learning rate (2e-4); batch size 16, 10 epochs.
training_args = TrainingArguments(
    output_dir="./fine_tune_roberta_output",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    logging_steps=10,
)

# Seed before building the model: the classification head is newly initialised
# (see the warning printed below), so without this the run would depend on
# whatever RNG state previously executed cells left behind.
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_names),
    # Record the tag names in the model config so predictions and saved
    # checkpoints carry the real names instead of generic placeholder labels.
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.6608,0.484281,0.646667,0.655405,0.651007,0.868519
2,0.3963,0.414005,0.644737,0.662162,0.653333,0.878395
3,0.2307,0.314959,0.674847,0.743243,0.707395,0.909259
4,0.1723,0.310187,0.741935,0.777027,0.759076,0.926543
5,0.1473,0.339862,0.745223,0.790541,0.767213,0.911728
6,0.0902,0.386161,0.67052,0.783784,0.722741,0.914815
7,0.0534,0.386583,0.762821,0.804054,0.782895,0.918519
8,0.0415,0.386063,0.707317,0.783784,0.74359,0.917901
9,0.0187,0.429739,0.736196,0.810811,0.771704,0.933333
10,0.0083,0.435137,0.72619,0.824324,0.772152,0.929012


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=320, training_loss=0.19466895756777375, metrics={'train_runtime': 159.461, 'train_samples_per_second': 31.356, 'train_steps_per_second': 2.007, 'total_flos': 265396313520000.0, 'train_loss': 0.19466895756777375, 'epoch': 10.0})

In [None]:
# Experiment: lower learning rate (8e-6) trained longer (15 epochs); batch
# size 16, training loss logged every 15 steps.
training_args = TrainingArguments(
    output_dir="./fine_tune_roberta_output",
    eval_strategy="epoch",
    learning_rate=8e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    logging_steps=15,
)

# Seed before building the model: the classification head is newly initialised
# (see the warning printed below), so without this the run would depend on
# whatever RNG state previously executed cells left behind.
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_names),
    # Record the tag names in the model config so predictions and saved
    # checkpoints carry the real names instead of generic placeholder labels.
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,1.0337,0.907664,0.0,0.0,0.0,0.704938
2,0.7643,0.70235,0.435897,0.344595,0.384906,0.806173
3,0.5362,0.547619,0.419162,0.472973,0.444444,0.817901
4,0.4678,0.420629,0.592593,0.648649,0.619355,0.882099
5,0.3602,0.352393,0.629412,0.722973,0.672956,0.903704
6,0.2918,0.304967,0.701863,0.763514,0.731392,0.914198
7,0.2431,0.28727,0.740506,0.790541,0.764706,0.92716
8,0.2337,0.302551,0.734177,0.783784,0.75817,0.922222
9,0.1794,0.286779,0.721212,0.804054,0.760383,0.923457
10,0.1899,0.278781,0.767296,0.824324,0.794788,0.924691


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=480, training_loss=0.3679256968200207, metrics={'train_runtime': 230.4982, 'train_samples_per_second': 32.538, 'train_steps_per_second': 2.082, 'total_flos': 398094470280000.0, 'train_loss': 0.3679256968200207, 'epoch': 15.0})

In [10]:
# Experiment: larger batch size (64); learning rate 2e-5, 10 epochs.
training_args = TrainingArguments(
    output_dir="./fine_tune_roberta_output",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    logging_steps=10,
)

# Seed before building the model: the classification head is newly initialised
# (see the warning printed below), so without this the run would depend on
# whatever RNG state previously executed cells left behind.
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_names),
    # Record the tag names in the model config so predictions and saved
    # checkpoints carry the real names instead of generic placeholder labels.
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,1.085677,0.0,0.0,0.0,0.701235
2,1.533200,0.885848,0.0,0.0,0.0,0.707407
3,0.915100,0.759822,0.289157,0.162162,0.207792,0.778395
4,0.714900,0.662957,0.407895,0.418919,0.413333,0.812346
5,0.573100,0.549714,0.428571,0.486486,0.455696,0.82284
6,0.573100,0.451766,0.524096,0.587838,0.55414,0.859877
7,0.462100,0.403296,0.573099,0.662162,0.61442,0.880864
8,0.393000,0.371909,0.635802,0.695946,0.664516,0.893827
9,0.348700,0.35546,0.634146,0.702703,0.666667,0.896296
10,0.322100,0.349625,0.622754,0.702703,0.660317,0.899383


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=80, training_loss=0.6577754437923431, metrics={'train_runtime': 86.9595, 'train_samples_per_second': 57.498, 'train_steps_per_second': 0.92, 'total_flos': 265396313520000.0, 'train_loss': 0.6577754437923431, 'epoch': 10.0})

In [16]:
# Experiment: batch size 32 with fewer epochs (8); learning rate 2e-5.
training_args = TrainingArguments(
    output_dir="./fine_tune_roberta_output",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    logging_steps=10,
)

# Seed before building the model: the classification head is newly initialised
# (see the warning printed below), so without this the run would depend on
# whatever RNG state previously executed cells left behind.
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_names),
    # Record the tag names in the model config so predictions and saved
    # checkpoints carry the real names instead of generic placeholder labels.
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,0.935375,0.0,0.0,0.0,0.701235
2,No log,0.715688,0.2,0.236486,0.216718,0.775926
3,No log,0.523817,0.383784,0.47973,0.426426,0.820988
4,No log,0.371469,0.678788,0.756757,0.715655,0.910494
5,No log,0.31176,0.751553,0.817568,0.783172,0.920988
6,No log,0.268708,0.763975,0.831081,0.796117,0.940123
7,No log,0.262875,0.751515,0.837838,0.792332,0.941358
8,No log,0.258926,0.771605,0.844595,0.806452,0.942593


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=128, training_loss=0.5476521849632263, metrics={'train_runtime': 72.8157, 'train_samples_per_second': 54.933, 'train_steps_per_second': 1.758, 'total_flos': 212317050816000.0, 'train_loss': 0.5476521849632263, 'epoch': 8.0})

In [18]:
# Write the current `model` and the tokenizer into the same directory.
save_dir = "./results/model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./results/model/tokenizer_config.json',
 './results/model/special_tokens_map.json',
 './results/model/vocab.json',
 './results/model/merges.txt',
 './results/model/added_tokens.json',
 './results/model/tokenizer.json')

In [31]:
# Experiment: add weight decay (0.01); batch size 32, learning rate 2e-5,
# 10 epochs.
training_args = TrainingArguments(
    output_dir="./fine_tune_roberta_output",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    logging_steps=10,
)

# Seed before building the model: the classification head is newly initialised
# (see the warning printed below), so without this the run would depend on
# whatever RNG state previously executed cells left behind.
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_names),
    # Record the tag names in the model config so predictions and saved
    # checkpoints carry the real names instead of generic placeholder labels.
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,1.5321,0.866908,0.0,0.0,0.0,0.703086
2,0.7766,0.624614,0.417808,0.412162,0.414966,0.810494
3,0.5817,0.465799,0.497041,0.567568,0.529968,0.866049
4,0.3851,0.352562,0.701863,0.763514,0.731392,0.911111
5,0.2872,0.315197,0.710692,0.763514,0.736156,0.903086
6,0.24,0.273243,0.740506,0.790541,0.764706,0.925926
7,0.1916,0.282593,0.762821,0.804054,0.782895,0.923457
8,0.1958,0.265951,0.780645,0.817568,0.79868,0.932099
9,0.1695,0.269035,0.779221,0.810811,0.794702,0.930864
10,0.1447,0.266057,0.784314,0.810811,0.797342,0.933951


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=160, training_loss=0.42280238494277, metrics={'train_runtime': 92.0653, 'train_samples_per_second': 54.309, 'train_steps_per_second': 1.738, 'total_flos': 265396313520000.0, 'train_loss': 0.42280238494277, 'epoch': 10.0})

In [33]:
# Experiment: weight decay 0.01 trained longer (15 epochs); batch size 32,
# learning rate 2e-5.
training_args = TrainingArguments(
    output_dir="./fine_tune_roberta_output",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=15,
    logging_steps=10,
)

# Seed before building the model: the classification head is newly initialised
# (see the warning printed below), so without this the run would depend on
# whatever RNG state previously executed cells left behind.
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_names),
    # Record the tag names in the model config so predictions and saved
    # checkpoints carry the real names instead of generic placeholder labels.
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,1.5276,0.904548,0.0,0.0,0.0,0.704938
2,0.7864,0.679105,0.42,0.425676,0.422819,0.812346
3,0.6021,0.493811,0.486034,0.587838,0.53211,0.851852
4,0.395,0.364045,0.652695,0.736486,0.692063,0.904321
5,0.284,0.32434,0.654971,0.756757,0.702194,0.908642
6,0.2281,0.288612,0.742138,0.797297,0.76873,0.931481
7,0.1728,0.282132,0.770701,0.817568,0.793443,0.925926
8,0.1681,0.284048,0.7625,0.824324,0.792208,0.932716
9,0.1359,0.292787,0.767296,0.824324,0.794788,0.932099
10,0.1094,0.303169,0.767296,0.824324,0.794788,0.932099


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=240, training_loss=0.30475450108448665, metrics={'train_runtime': 138.1499, 'train_samples_per_second': 54.289, 'train_steps_per_second': 1.737, 'total_flos': 398094470280000.0, 'train_loss': 0.30475450108448665, 'epoch': 15.0})

In [39]:
# Experiment: batch size 24 with weight decay 0.01; learning rate 2e-5,
# 15 epochs.
training_args = TrainingArguments(
    output_dir="./fine_tune_roberta_output",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    num_train_epochs=15,
    logging_steps=10,
)

# Seed before building the model: the classification head is newly initialised
# (see the warning printed below), so without this the run would depend on
# whatever RNG state previously executed cells left behind.
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_names),
    # Record the tag names in the model config so predictions and saved
    # checkpoints carry the real names instead of generic placeholder labels.
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.906,0.826864,0.0,0.0,0.0,0.701852
2,0.6583,0.574723,0.412791,0.47973,0.44375,0.820988
3,0.3865,0.372466,0.615819,0.736486,0.670769,0.896914
4,0.2859,0.281544,0.735484,0.77027,0.752475,0.917284
5,0.1937,0.26256,0.702381,0.797297,0.746835,0.916049
6,0.1561,0.256247,0.688235,0.790541,0.735849,0.922222
7,0.1192,0.288082,0.730769,0.77027,0.75,0.916667
8,0.109,0.265251,0.784314,0.810811,0.797342,0.92716
9,0.0954,0.302672,0.753165,0.804054,0.777778,0.92037
10,0.0764,0.28302,0.738854,0.783784,0.760656,0.921605


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=315, training_loss=0.2427577869286613, metrics={'train_runtime': 142.1185, 'train_samples_per_second': 52.773, 'train_steps_per_second': 2.216, 'total_flos': 398094470280000.0, 'train_loss': 0.2427577869286613, 'epoch': 15.0})

## END