In [22]:
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer
)

import numpy as np
from datasets import Dataset, load_metric
import json
from preprocessing import preprocess

In [23]:
map_labels = {
    "O": 0,
    "B-PRO": 1,
    "I-PRO": 2
}
label_names = list(map_labels.keys())

In [24]:
def map_profanities_to_token_tags(comment_tokens, examples_tokens):
    tags = [0 for _ in range(len(comment_tokens))]
    for example_tokens in examples_tokens:
        for idx in range(len(comment_tokens) - len(example_tokens) + 1):
            if comment_tokens[idx: idx + len(example_tokens)] == example_tokens:
                tags[idx] = 1
                for i in range(idx+1, idx+len(example_tokens)):
                    tags[i] = 2
    return tags


def get_dataset():
    with open('data/blitz_comments.json', 'r', encoding="utf-8") as f:
        for d in json.load(f):
            if 'label' in d:
                comment_tokens = preprocess(d['comment'])
                yield {
                        'tokens': comment_tokens,
                        'tags': map_profanities_to_token_tags(comment_tokens, [preprocess(e) for e in d['examples']] if 'examples' in d else [])
                    }

In [25]:
dataset = Dataset.from_generator(get_dataset).train_test_split(test_size=0.1)

In [26]:
tokenizer = AutoTokenizer.from_pretrained("usmiva/bert-web-bg")

In [27]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [28]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3631 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/404 [00:00<?, ? examples/s]

In [29]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [30]:
model = AutoModelForTokenClassification.from_pretrained('usmiva/bert-web-bg', num_labels=3)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at usmiva/bert-web-bg and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
        if(k not in flattened_results.keys()):
            flattened_results[k+"_f1"]=results[k]["f1"]

    return flattened_results

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [33]:
training_args = TrainingArguments(
    output_dir="./data/bert-finetuning",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  0%|          | 0/2270 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.16461077332496643, 'eval_overall_precision': 0.3858267716535433, 'eval_overall_recall': 0.2300469483568075, 'eval_overall_f1': 0.28823529411764703, 'eval_overall_accuracy': 0.9577172503242543, 'eval_PRO_f1': 0.28823529411764703, 'eval_runtime': 5.712, 'eval_samples_per_second': 70.728, 'eval_steps_per_second': 4.552, 'epoch': 1.0}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.17202037572860718, 'eval_overall_precision': 0.3471502590673575, 'eval_overall_recall': 0.3145539906103286, 'eval_overall_f1': 0.3300492610837439, 'eval_overall_accuracy': 0.9538261997405967, 'eval_PRO_f1': 0.3300492610837439, 'eval_runtime': 5.9247, 'eval_samples_per_second': 68.189, 'eval_steps_per_second': 4.388, 'epoch': 2.0}


Checkpoint destination directory ./data/bert-finetuning\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0511, 'learning_rate': 1.5594713656387664e-05, 'epoch': 2.2}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.18787802755832672, 'eval_overall_precision': 0.4230769230769231, 'eval_overall_recall': 0.3615023474178404, 'eval_overall_f1': 0.389873417721519, 'eval_overall_accuracy': 0.9583657587548639, 'eval_PRO_f1': 0.389873417721519, 'eval_runtime': 5.1654, 'eval_samples_per_second': 78.213, 'eval_steps_per_second': 5.033, 'epoch': 3.0}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.20336796343326569, 'eval_overall_precision': 0.39325842696629215, 'eval_overall_recall': 0.3286384976525822, 'eval_overall_f1': 0.35805626598465473, 'eval_overall_accuracy': 0.9568093385214008, 'eval_PRO_f1': 0.35805626598465473, 'eval_runtime': 5.9473, 'eval_samples_per_second': 67.93, 'eval_steps_per_second': 4.372, 'epoch': 4.0}
{'loss': 0.0286, 'learning_rate': 1.1189427312775332e-05, 'epoch': 4.41}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.24247276782989502, 'eval_overall_precision': 0.41304347826086957, 'eval_overall_recall': 0.3568075117370892, 'eval_overall_f1': 0.38287153652392947, 'eval_overall_accuracy': 0.9587548638132296, 'eval_PRO_f1': 0.38287153652392947, 'eval_runtime': 5.7282, 'eval_samples_per_second': 70.528, 'eval_steps_per_second': 4.539, 'epoch': 5.0}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.25958701968193054, 'eval_overall_precision': 0.4230769230769231, 'eval_overall_recall': 0.3615023474178404, 'eval_overall_f1': 0.389873417721519, 'eval_overall_accuracy': 0.9600518806744488, 'eval_PRO_f1': 0.389873417721519, 'eval_runtime': 5.2569, 'eval_samples_per_second': 76.851, 'eval_steps_per_second': 4.946, 'epoch': 6.0}
{'loss': 0.0123, 'learning_rate': 6.784140969162997e-06, 'epoch': 6.61}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.2656710743904114, 'eval_overall_precision': 0.4067796610169492, 'eval_overall_recall': 0.3380281690140845, 'eval_overall_f1': 0.36923076923076925, 'eval_overall_accuracy': 0.9587548638132296, 'eval_PRO_f1': 0.36923076923076925, 'eval_runtime': 5.767, 'eval_samples_per_second': 70.054, 'eval_steps_per_second': 4.508, 'epoch': 7.0}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.2697904109954834, 'eval_overall_precision': 0.42424242424242425, 'eval_overall_recall': 0.39436619718309857, 'eval_overall_f1': 0.40875912408759124, 'eval_overall_accuracy': 0.9595330739299611, 'eval_PRO_f1': 0.40875912408759124, 'eval_runtime': 5.7246, 'eval_samples_per_second': 70.572, 'eval_steps_per_second': 4.542, 'epoch': 8.0}
{'loss': 0.0056, 'learning_rate': 2.378854625550661e-06, 'epoch': 8.81}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.2846177816390991, 'eval_overall_precision': 0.4, 'eval_overall_recall': 0.3474178403755869, 'eval_overall_f1': 0.3718592964824121, 'eval_overall_accuracy': 0.9582360570687419, 'eval_PRO_f1': 0.3718592964824121, 'eval_runtime': 5.746, 'eval_samples_per_second': 70.31, 'eval_steps_per_second': 4.525, 'epoch': 9.0}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.2881735861301422, 'eval_overall_precision': 0.4175824175824176, 'eval_overall_recall': 0.3568075117370892, 'eval_overall_f1': 0.3848101265822785, 'eval_overall_accuracy': 0.9586251621271077, 'eval_PRO_f1': 0.3848101265822785, 'eval_runtime': 5.6679, 'eval_samples_per_second': 71.279, 'eval_steps_per_second': 4.587, 'epoch': 10.0}
{'train_runtime': 1213.8069, 'train_samples_per_second': 29.914, 'train_steps_per_second': 1.87, 'train_loss': 0.021882648641317427, 'epoch': 10.0}


TrainOutput(global_step=2270, training_loss=0.021882648641317427, metrics={'train_runtime': 1213.8069, 'train_samples_per_second': 29.914, 'train_steps_per_second': 1.87, 'train_loss': 0.021882648641317427, 'epoch': 10.0})