In [10]:
import evaluate
import datetime
from transformers import AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer, AutoModelForTokenClassification, pipeline, EarlyStoppingCallback

from labels import LABELS
from utils import load_data, create_dataset_dict, generate_tokenized_datasets, compute_metrics

# Configs

In [18]:
# Hugging Face Hub checkpoint used as the starting point for fine-tuning;
# both the tokenizer and the token-classification model are loaded from it.
BASE_MODEL_NAME = "neuralmind/bert-base-portuguese-cased"

# Load Data

The input file must be in the output format of the labelling tool:
a `.txt` file containing a list of JSON objects.

In [3]:
# Labelled examples exported by the labelling tool (path relative to the notebook).
data_path = "data_file.txt"
data = load_data(data_path)

In [4]:
# Sanity check: how many labelled records were loaded.
len(data)

504

In [5]:
# Inspect the first record. In the captured output each record has a 'name'
# string and a 'tags' list of [label, start, end] entries.
data[0]

{'name': 'Kit Shampoo Revitay Óleo de Coco Novex 300ml e Condicionador Revitay Óleo de Coco Novex 300ml',
 'tags': [['O', 0, 3],
  ['B-PRO', 4, 11],
  ['B-ESP', 12, 19],
  ['B-ESP', 20, 24],
  ['I-ESP', 25, 27],
  ['I-ESP', 28, 32],
  ['B-MAR', 33, 38],
  ['B-TAM', 39, 44],
  ['O', 45, 46],
  ['B-PRO', 47, 60],
  ['B-ESP', 61, 68],
  ['B-ESP', 69, 73],
  ['I-ESP', 74, 76],
  ['I-ESP', 77, 81],
  ['B-MAR', 82, 87],
  ['B-TAM', 88, 93]]}

# Create Datasets

In [6]:
# Fraction of the labelled records held out as the "test" split.
# NOTE(review): no seed is passed here, so whether the split is reproducible
# across runs depends on create_dataset_dict — confirm.
holdout_fraction = 0.2
dataset = create_dataset_dict(data, test_size=holdout_fraction)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 403
    })
    test: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 101
    })
})

# Train the model

In [7]:
# Two-way mapping between class indices and tag names; it is stored in the
# model config so predictions are reported with readable labels.
id2label = dict(enumerate(LABELS))
label2id = {label: idx for idx, label in id2label.items()}

# `max_length=` / `truncation=` are call-time encoding options and set no limit
# when given to `from_pretrained`: the tokenizer then warns "Asking to truncate
# to max_length but no maximum length is provided ... Default to no truncation"
# and over-long inputs are not truncated. `model_max_length` sets the limit on
# the tokenizer itself; 512 is the value the original call tried to set
# (BERT-base's position-embedding size).
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, model_max_length=512)

# Pretrained encoder with a freshly initialised classification head sized to LABELS.
model = AutoModelForTokenClassification.from_pretrained(
    BASE_MODEL_NAME,
    id2label=id2label,
    label2id=label2id,
)

# Pads input ids and label sequences per batch at collation time.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# seqeval metric object. It is not referenced by the other visible cells
# (compute_metrics is imported from utils); kept so later cells can rely on it.
metric = evaluate.load("seqeval")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Tokenize every split of `dataset` with the model tokenizer (project helper from utils).
# NOTE(review): presumably this also aligns the word-level NER tags with the
# sub-word tokens — confirm against utils.generate_tokenized_datasets.
tokenized_datasets = generate_tokenized_datasets(tokenizer, dataset)

Map:   0%|          | 0/403 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [11]:
# Timestamped output directory so successive runs do not overwrite each
# other's checkpoints.
run_started_at = datetime.datetime.now()
output_model_dir = "bert_finetuned_ner_" + str(int(run_started_at.timestamp()))

args = TrainingArguments(
    output_dir=output_model_dir,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two strategies have to match
    # for load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Track the checkpoint with the highest eval precision and restore it
    # when training finishes.
    metric_for_best_model="precision",
    load_best_model_at_end=True,
    # Keep everything local.
    push_to_hub=False,
)

In [12]:
# Stop once the tracked metric has failed to improve for 2 consecutive
# evaluations (one evaluation per epoch with the current arguments).
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# NOTE(review): the "test" split is used for evaluation during training, so it
# also drives early stopping and best-checkpoint selection; there is no
# separate untouched hold-out set.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

  0%|          | 0/510 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/13 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6473009586334229, 'eval_precision': 0.6486042692939245, 'eval_recall': 0.7655038759689923, 'eval_f1': 0.7022222222222222, 'eval_accuracy': 0.80548128342246, 'eval_runtime': 2.6106, 'eval_samples_per_second': 38.689, 'eval_steps_per_second': 4.98, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.4726336598396301, 'eval_precision': 0.7368421052631579, 'eval_recall': 0.813953488372093, 'eval_f1': 0.7734806629834253, 'eval_accuracy': 0.8449197860962567, 'eval_runtime': 1.5393, 'eval_samples_per_second': 65.613, 'eval_steps_per_second': 8.445, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.4285411536693573, 'eval_precision': 0.8025594149908593, 'eval_recall': 0.8507751937984496, 'eval_f1': 0.8259642521166509, 'eval_accuracy': 0.8683155080213903, 'eval_runtime': 1.4218, 'eval_samples_per_second': 71.037, 'eval_steps_per_second': 9.143, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.42887091636657715, 'eval_precision': 0.8144712430426716, 'eval_recall': 0.8507751937984496, 'eval_f1': 0.8322274881516587, 'eval_accuracy': 0.8716577540106952, 'eval_runtime': 1.3688, 'eval_samples_per_second': 73.785, 'eval_steps_per_second': 9.497, 'epoch': 4.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.4599468410015106, 'eval_precision': 0.8161764705882353, 'eval_recall': 0.8604651162790697, 'eval_f1': 0.8377358490566038, 'eval_accuracy': 0.8656417112299465, 'eval_runtime': 1.3574, 'eval_samples_per_second': 74.406, 'eval_steps_per_second': 9.577, 'epoch': 5.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.465029776096344, 'eval_precision': 0.8296296296296296, 'eval_recall': 0.8682170542635659, 'eval_f1': 0.8484848484848484, 'eval_accuracy': 0.8770053475935828, 'eval_runtime': 1.3981, 'eval_samples_per_second': 72.241, 'eval_steps_per_second': 9.298, 'epoch': 6.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.48287132382392883, 'eval_precision': 0.8107606679035251, 'eval_recall': 0.8468992248062015, 'eval_f1': 0.828436018957346, 'eval_accuracy': 0.8703208556149733, 'eval_runtime': 1.2633, 'eval_samples_per_second': 79.947, 'eval_steps_per_second': 10.29, 'epoch': 7.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.49582597613334656, 'eval_precision': 0.813780260707635, 'eval_recall': 0.8468992248062015, 'eval_f1': 0.8300094966761633, 'eval_accuracy': 0.8709893048128342, 'eval_runtime': 1.482, 'eval_samples_per_second': 68.151, 'eval_steps_per_second': 8.772, 'epoch': 8.0}
{'train_runtime': 59.0895, 'train_samples_per_second': 68.202, 'train_steps_per_second': 8.631, 'train_loss': 0.4547533147475299, 'epoch': 8.0}


TrainOutput(global_step=408, training_loss=0.4547533147475299, metrics={'train_runtime': 59.0895, 'train_samples_per_second': 68.202, 'train_steps_per_second': 8.631, 'train_loss': 0.4547533147475299, 'epoch': 8.0})

In [13]:
# Evaluate the model currently held by the trainer on its eval dataset
# (tokenized_datasets["test"]). Because load_best_model_at_end=True, these
# metrics should match the best-precision epoch rather than the last one.
trainer.evaluate()

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.465029776096344,
 'eval_precision': 0.8296296296296296,
 'eval_recall': 0.8682170542635659,
 'eval_f1': 0.8484848484848484,
 'eval_accuracy': 0.8770053475935828,
 'eval_runtime': 2.4617,
 'eval_samples_per_second': 41.028,
 'eval_steps_per_second': 5.281,
 'epoch': 8.0}

In [14]:
# Persist the trainer's current model to a fixed directory; the inference
# pipeline in the Validation section loads it from this path.
fine_tunned_model_path = "best_bert_finetuned_ner"
trainer.save_model(fine_tunned_model_path)

# Validation

In [15]:
# Reload the saved model from disk for inference. "simple" aggregation makes
# the pipeline return grouped entities (with an 'entity_group' key) instead of
# one prediction per token.
token_classifier = pipeline(
    task="token-classification",
    model=fine_tunned_model_path,
    aggregation_strategy="simple",
)

In [16]:
def show_predict(text, classifier=None):
    """Print `text`, then one "<span> : <entity_group>" line per predicted entity.

    Args:
        text: Raw string to run through the token-classification pipeline.
        classifier: Optional callable that takes `text` and returns an iterable
            of dicts with at least "word" and "entity_group" keys and, when the
            tokenizer provides offsets, "start" and "end". Defaults to the
            notebook-level `token_classifier`.

    Returns:
        None. The result is written to stdout only.
    """
    if classifier is None:
        classifier = token_classifier

    entities = classifier(text)
    print(text)
    for entity in entities:
        if not entity:
            continue
        start, end = entity.get("start"), entity.get("end")
        # "word" is the tokenizer-decoded string and can differ from the input
        # (e.g. spacing around punctuation). Slice the original text by
        # character offsets when they are available; otherwise keep "word".
        if start is not None and end is not None:
            span = text[start:end]
        else:
            span = entity["word"]
        print(f"{span} : {entity['entity_group']}")

In [17]:
# Smoke-test the fine-tuned model on a short product title.
show_predict("Coca Cola 1L")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Coca Cola 1L
Coca Cola : PRO
1L : TAM
