In [1]:
import evaluate
import datetime
from transformers import AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer, AutoModelForTokenClassification, pipeline, EarlyStoppingCallback

from tools.labeling_tool import LABELS
from utils.train_utils import load_data, create_dataset_dict, generate_tokenized_datasets, compute_metrics

# Configs

In [2]:
BASE_MODEL_NAME = "neuralmind/bert-base-portuguese-cased"

# Load Data

The file must be in format of the output of the labelling tool
a txt file with a list of json

In [3]:
data = load_data("train_data.txt")

In [4]:
len(data)

504

In [5]:
data[0]

{'name': 'Kit Shampoo Revitay Óleo de Coco Novex 300ml e Condicionador Revitay Óleo de Coco Novex 300ml',
 'tags': [['O', 0, 3],
  ['B-PRO', 4, 11],
  ['B-ESP', 12, 19],
  ['B-ESP', 20, 24],
  ['I-ESP', 25, 27],
  ['I-ESP', 28, 32],
  ['B-MAR', 33, 38],
  ['B-TAM', 39, 44],
  ['O', 45, 46],
  ['B-PRO', 47, 60],
  ['B-ESP', 61, 68],
  ['B-ESP', 69, 73],
  ['I-ESP', 74, 76],
  ['I-ESP', 77, 81],
  ['B-MAR', 82, 87],
  ['B-TAM', 88, 93]]}

# Create Datasets

In [6]:
dataset = create_dataset_dict(data, test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 403
    })
    test: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 101
    })
})

# Train the model

In [7]:
id2label = {i: label for i, label in enumerate(LABELS)}
label2id = {v: k for k, v in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, max_length=512, truncation=True)
model = AutoModelForTokenClassification.from_pretrained(
    BASE_MODEL_NAME,
    id2label=id2label,
    label2id=label2id,
)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
tokenized_datasets = generate_tokenized_datasets(tokenizer, dataset)

Map:   0%|          | 0/403 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [9]:
output_model_dir = f"bert_finetuned_ner_{int(datetime.datetime.now().timestamp())}"
args = TrainingArguments(
    output_dir=output_model_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=False,
    metric_for_best_model="precision",
    load_best_model_at_end=True,
)

In [10]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
trainer.train()

  0%|          | 0/510 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/13 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.662858247756958, 'eval_precision': 0.5891472868217055, 'eval_recall': 0.7364341085271318, 'eval_f1': 0.6546080964685616, 'eval_accuracy': 0.7921122994652406, 'eval_runtime': 2.4376, 'eval_samples_per_second': 41.434, 'eval_steps_per_second': 5.333, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.48267507553100586, 'eval_precision': 0.7084048027444254, 'eval_recall': 0.8003875968992248, 'eval_f1': 0.751592356687898, 'eval_accuracy': 0.8348930481283422, 'eval_runtime': 4.3304, 'eval_samples_per_second': 23.323, 'eval_steps_per_second': 3.002, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.4402821362018585, 'eval_precision': 0.7574692442882249, 'eval_recall': 0.8352713178294574, 'eval_f1': 0.7944700460829494, 'eval_accuracy': 0.8542780748663101, 'eval_runtime': 1.2369, 'eval_samples_per_second': 81.653, 'eval_steps_per_second': 10.51, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.4186031222343445, 'eval_precision': 0.7851985559566786, 'eval_recall': 0.8430232558139535, 'eval_f1': 0.8130841121495327, 'eval_accuracy': 0.8656417112299465, 'eval_runtime': 1.2529, 'eval_samples_per_second': 80.612, 'eval_steps_per_second': 10.376, 'epoch': 4.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.43677565455436707, 'eval_precision': 0.7938517179023508, 'eval_recall': 0.8507751937984496, 'eval_f1': 0.8213283442469597, 'eval_accuracy': 0.8676470588235294, 'eval_runtime': 1.9076, 'eval_samples_per_second': 52.946, 'eval_steps_per_second': 6.815, 'epoch': 5.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.4555143713951111, 'eval_precision': 0.8055045871559633, 'eval_recall': 0.8507751937984496, 'eval_f1': 0.827521206409048, 'eval_accuracy': 0.8676470588235294, 'eval_runtime': 1.5544, 'eval_samples_per_second': 64.977, 'eval_steps_per_second': 8.363, 'epoch': 6.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.4986459016799927, 'eval_precision': 0.8022181146025879, 'eval_recall': 0.8410852713178295, 'eval_f1': 0.8211920529801325, 'eval_accuracy': 0.8582887700534759, 'eval_runtime': 1.2289, 'eval_samples_per_second': 82.188, 'eval_steps_per_second': 10.579, 'epoch': 7.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.48684245347976685, 'eval_precision': 0.8264925373134329, 'eval_recall': 0.8585271317829457, 'eval_f1': 0.8422053231939164, 'eval_accuracy': 0.8709893048128342, 'eval_runtime': 1.2756, 'eval_samples_per_second': 79.179, 'eval_steps_per_second': 10.191, 'epoch': 8.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.5034500360488892, 'eval_precision': 0.8114602587800369, 'eval_recall': 0.8507751937984496, 'eval_f1': 0.8306527909176915, 'eval_accuracy': 0.8643048128342246, 'eval_runtime': 2.2729, 'eval_samples_per_second': 44.436, 'eval_steps_per_second': 5.719, 'epoch': 9.0}
{'loss': 0.4004, 'learning_rate': 3.921568627450981e-07, 'epoch': 9.8}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.49399346113204956, 'eval_precision': 0.8121546961325967, 'eval_recall': 0.8546511627906976, 'eval_f1': 0.8328611898016998, 'eval_accuracy': 0.8716577540106952, 'eval_runtime': 1.2592, 'eval_samples_per_second': 80.211, 'eval_steps_per_second': 10.324, 'epoch': 10.0}
{'train_runtime': 76.0616, 'train_samples_per_second': 52.983, 'train_steps_per_second': 6.705, 'train_loss': 0.3945703628016453, 'epoch': 10.0}


TrainOutput(global_step=510, training_loss=0.3945703628016453, metrics={'train_runtime': 76.0616, 'train_samples_per_second': 52.983, 'train_steps_per_second': 6.705, 'train_loss': 0.3945703628016453, 'epoch': 10.0})

In [11]:
trainer.evaluate()

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.48684245347976685,
 'eval_precision': 0.8264925373134329,
 'eval_recall': 0.8585271317829457,
 'eval_f1': 0.8422053231939164,
 'eval_accuracy': 0.8709893048128342,
 'eval_runtime': 1.8056,
 'eval_samples_per_second': 55.936,
 'eval_steps_per_second': 7.2,
 'epoch': 10.0}

In [12]:
fine_tunned_model_path = "best_bert_finetuned_ner"
trainer.save_model(fine_tunned_model_path)

# Validation

In [13]:
token_classifier = pipeline(
    "token-classification", model=fine_tunned_model_path, aggregation_strategy="simple"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
type(token_classifier)

transformers.pipelines.token_classification.TokenClassificationPipeline

In [14]:
def show_predict(text):
    tokens = token_classifier(text)
    print(text)
    for token in tokens:
        if token:
            print(f"{token['word']} : {token['entity_group']}")

In [15]:
show_predict("Coca Cola 1L")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Coca Cola 1L
Coca Cola : PRO
1L : TAM


In [17]:
token_classifier("Coca Cola 1L")

[{'entity_group': 'PRO',
  'score': 0.9833276,
  'word': 'Coca Cola',
  'start': 0,
  'end': 9},
 {'entity_group': 'TAM',
  'score': 0.98347676,
  'word': '1L',
  'start': 10,
  'end': 12}]