In [None]:
# Transformers installation
! pip install transformers datasets
!pip install evaluate
!pip install deepspeed
!pip install accelerate
!pip install wandb
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
import os

os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"

# Fine-tune a pretrained model

There are significant benefits to using a pretrained model. It reduces computation costs, your carbon footprint, and allows you to use state-of-the-art models without having to train one from scratch. 🤗 Transformers provides access to thousands of pretrained models for a wide range of tasks. When you use a pretrained model, you train it on a dataset specific to your task. This is known as fine-tuning, an incredibly powerful training technique. In this tutorial, you will fine-tune a pretrained model with a deep learning framework of your choice:

* Fine-tune a pretrained model with 🤗 Transformers [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer).
* Fine-tune a pretrained model in TensorFlow with Keras.
* Fine-tune a pretrained model in native PyTorch.

<a id='data-processing'></a>

## Prepare a dataset

Before you can fine-tune a pretrained model, download a dataset and prepare it for training. The previous tutorial showed you how to process data for training, and now you get an opportunity to put those skills to the test!

Begin by loading the [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full) dataset:

In [None]:
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
from datasets import load_dataset

dataset = load_dataset('tiagoblima/mec-punctuation-explainable-balanced')
dataset["train"][100]

Downloading readme:   0%|          | 0.00/618 [00:00<?, ?B/s]



Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/tiagoblima___parquet/tiagoblima--mec-punctuation-explainable-balanced-954d3d7c54bc49c5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/24.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/165k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/48 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/428 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/tiagoblima___parquet/tiagoblima--mec-punctuation-explainable-balanced-954d3d7c54bc49c5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

{'text_id': 350,
 'annot_id': 1027,
 'text': 'os outros falaram – talves possa ser.',
 'label': 'error de pontuação',
 'annotation_span': ['os outros falaram – talves possa ser .', '.']}

As you now know, you need a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence lengths. To process your dataset in one step, use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map) method to apply a preprocessing function over the entire dataset:

In [None]:
from transformers import AutoTokenizer
checkpoint = "neuralmind/bert-base-portuguese-cased"
#"tiagoblima/punctuation-nilc-bert-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

max_length = 512
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:

id2label = dict(enumerate(set(dataset['train']['label'])))
label2id = dict(map(reversed, id2label.items()))
def map_label(batch):
  batch["label"] = label2id[batch['label']]
  return batch
tokenized_datasets = tokenized_datasets.map(map_label)
tokenized_datasets

  0%|          | 0/48 [00:00<?, ?ex/s]

  0%|          | 0/428 [00:00<?, ?ex/s]

DatasetDict({
    test: Dataset({
        features: ['text_id', 'annot_id', 'text', 'label', 'annotation_span', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 48
    })
    train: Dataset({
        features: ['text_id', 'annot_id', 'text', 'label', 'annotation_span', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 428
    })
})

In [None]:
id2label

{0: 'pontuação correta',
 1: 'error de pontuação',
 2: 'error de vírgula',
 3: 'error de pontuação e vírgula'}

In [None]:
tokenized_datasets['train']['input_ids'][5][:4]

[101, 5911, 2746, 229]

If you like, you can create a smaller subset of the full dataset to fine-tune on to reduce the time it takes:

In [None]:
from collections import Counter 

Counter(tokenized_datasets["train"]['label'])

Counter({0: 109, 3: 109, 1: 103, 2: 107})

In [None]:
Counter(tokenized_datasets["train"]['label'])

Counter({0: 109, 3: 109, 1: 103, 2: 107})

In [None]:
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

In [None]:
id2label

{0: 'pontuação correta',
 1: 'error de pontuação',
 2: 'error de vírgula',
 3: 'error de pontuação e vírgula'}

In [None]:
train_labels = train_dataset['label']
test_labels = test_dataset['label']
convert_ids = lambda item: (id2label[item[0]], item[1])


In [None]:
stats = {
    'train': dict(map(convert_ids, Counter(train_labels).items())), 
    'test':dict(map(convert_ids, Counter(test_labels).items())), 
}

import pandas as pd 


stats_df = pd.DataFrame.from_dict(stats, orient='index').T
stats_df

Unnamed: 0,train,test
pontuação correta,109,10
error de pontuação e vírgula,109,10
error de pontuação,103,16
error de vírgula,107,12


In [None]:
stats_df.to_csv()

',train,test\npontuação correta,109,10\nerror de pontuação e vírgula,109,10\nerror de pontuação,103,16\nerror de vírgula,107,12\n'

<a id='trainer'></a>

## Train

At this point, you should follow the section corresponding to the framework you want to use. You can use the links
in the right sidebar to jump to the one you want - and if you want to hide all of the content for a given framework,
just use the button at the top-right of that framework's block!

## Train with PyTorch Trainer

🤗 Transformers provides a [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) class optimized for training 🤗 Transformers models, making it easier to start training without manually writing your own training loop. The [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) API supports a wide range of training options and features such as logging, gradient accumulation, and mixed precision.

Start by loading your model and specify the number of expected labels. From the Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields), you know there are five labels:

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(label2id), id2label=id2label,
    label2id=label2id, ignore_mismatched_sizes=True)

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

<Tip>

You will see a warning about some of the pretrained weights not being used and some weights being randomly
initialized. Don't worry, this is completely normal! The pretrained head of the BERT model is discarded, and replaced with a randomly initialized classification head. You will fine-tune this new model head on your sequence classification task, transferring the knowledge of the pretrained model to it.

</Tip>

### Training hyperparameters

Next, create a [TrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments) class which contains all the hyperparameters you can tune as well as flags for activating different training options. For this tutorial you can start with the default training [hyperparameters](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments), but feel free to experiment with these to find your optimal settings.

Specify where to save the checkpoints from your training:

### Evaluate

[Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) does not automatically evaluate model performance during training. You'll need to pass [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) a function to compute and report metrics. The [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) library provides a simple [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) function you can load with the [evaluate.load](https://huggingface.co/docs/evaluate/main/en/package_reference/loading_methods#evaluate.load) (see this [quicktour](https://huggingface.co/docs/evaluate/a_quick_tour) for more information) function:

In [None]:
import numpy as np
import evaluate

metric = evaluate.load('accuracy')

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Call `compute` on `metric` to calculate the accuracy of your predictions. Before passing your predictions to `compute`, you need to convert the predictions to logits (remember all 🤗 Transformers models return logits):

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                        y_true=labels, preds=predictions,
                        class_names=list(id2label.values()))})
   
    return metric.compute(predictions=predictions, references=labels)

If you'd like to monitor your evaluation metrics during fine-tuning, specify the `evaluation_strategy` parameter in your training arguments to report the evaluation metric at the end of each epoch:

In [None]:
import wandb 

wandb.login(key='8e593ae9d0788bae2e0a84d07de0e76f5cf3dcf4')
!env WANDB_PROJECT=mec-multiclass-balanced-punkt
!env WANDB_LOG_MODEL=true

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


NV_LIBCUBLAS_DEV_VERSION=11.4.1.1043-1
NV_CUDA_COMPAT_PACKAGE=cuda-compat-11-2
__EGL_VENDOR_LIBRARY_DIRS=/usr/lib64-nvidia:/usr/share/glvnd/egl_vendor.d/
NV_CUDNN_PACKAGE_DEV=libcudnn8-dev=8.1.1.33-1+cuda11.2
PYDEVD_USE_FRAME_EVAL=NO
WORLD_SIZE=1
LD_LIBRARY_PATH=/usr/lib64-nvidia
NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.8.4-1+cuda11.2
TCLLIBPATH=/usr/share/tcltk/tcllib1.19
CLOUDSDK_PYTHON=python3
LANG=en_US.UTF-8
NV_LIBNPP_DEV_PACKAGE=libnpp-dev-11-2=11.3.2.152-1
ENABLE_DIRECTORYPREFETCHER=1
HOSTNAME=31b65ccf6731
OLDPWD=/
CLOUDSDK_CONFIG=/content/.config
KMP_INIT_AT_FORK=FALSE
USE_AUTH_EPHEM=1
KMP_EXTRA_ARGS=--listen_host=172.28.0.12 --target_host=172.28.0.12 --tunnel_background_save_url=https://colab.research.google.com/tun/m/cc48301118ce562b961b3c22d803539adc1e0c19/gpu-t4-s-rhl8dwvctz6h --tunnel_background_save_delay=10s --tunnel_periodic_background_save_frequency=30m0s --enable_output_coalescing=true --output_coalescing_required=true
NV_LIBNPP_VERSION=11.3.2.152-1
NV_NVPROF_DEV_PACKAGE=

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import TrainingArguments

MODEL_NAME = "mec-multiclass-balanced"
training_args = TrainingArguments(output_dir=MODEL_NAME,
                                  save_strategy="epoch",
                                  auto_find_batch_size=True, 
                                  load_best_model_at_end = True,
                                  evaluation_strategy="epoch", 
                                  num_train_epochs=3,
                                  per_device_train_batch_size=4,
                                  overwrite_output_dir=True,
                                  push_to_hub=True,
                                  run_name=MODEL_NAME,
                                  report_to="wandb")

PyTorch: setting up devices


In [None]:
!rm -r mec-multiclass-balanced

rm: cannot remove 'mec-multiclass-balanced': No such file or directory


### Trainer

Create a [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) object with your model, training arguments, training and test datasets, and evaluation function:

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/tiagoblima/mec-multiclass-balanced into local empty directory.


Then fine-tune your model by calling [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train):

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, annotation_span, annot_id, text_id. If text, annotation_span, annot_id, text_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 428
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 321
  Number of trainable parameters = 108926212
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mtblima[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.162738,0.416667
2,No log,1.286481,0.291667
3,No log,1.268316,0.375


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, annotation_span, annot_id, text_id. If text, annotation_span, annot_id, text_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 48
  Batch size = 8
Saving model checkpoint to mec-multiclass-balanced/checkpoint-107
Configuration saved in mec-multiclass-balanced/checkpoint-107/config.json
Model weights saved in mec-multiclass-balanced/checkpoint-107/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, annotation_span, annot_id, text_id. If text, annotation_span, annot_id, text_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 48
 

TrainOutput(global_step=321, training_loss=1.0230176693925233, metrics={'train_runtime': 171.4808, 'train_samples_per_second': 7.488, 'train_steps_per_second': 1.872, 'total_flos': 337840667885568.0, 'train_loss': 1.0230176693925233, 'epoch': 3.0})

In [None]:
result = trainer.evaluate()
result

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, annotation_span, annot_id, text_id. If text, annotation_span, annot_id, text_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 48
  Batch size = 8


{'eval_loss': 1.1627382040023804,
 'eval_accuracy': 0.4166666666666667,
 'eval_runtime': 1.8652,
 'eval_samples_per_second': 25.734,
 'eval_steps_per_second': 3.217,
 'epoch': 3.0}

In [None]:
import pandas as pd 

pd.DataFrame.from_dict(result, orient='index').to_csv()

',0\neval_loss,1.1627382040023804\neval_accuracy,0.4166666666666667\neval_runtime,1.8652\neval_samples_per_second,25.734\neval_steps_per_second,3.217\nepoch,3.0\n'

In [None]:
trainer.push_to_hub()

Saving model checkpoint to mec-multiclass-balanced
Configuration saved in mec-multiclass-balanced/config.json
Model weights saved in mec-multiclass-balanced/pytorch_model.bin
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.4166666666666667}]}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.30k/416M [00:00<?, ?B/s]

KeyboardInterrupt: ignored

In [None]:
from transformers import pipeline
pipe = pipeline("text-classification", model=trainer.model, tokenizer=tokenizer, device=0)

In [None]:
test_dataset['annotation_span'][:1]

In [None]:
from collections import defaultdict

selections = defaultdict(list)
for examples in test_dataset:
    if len(selections[id2label[examples['label']]]) < 5:
      selections[id2label[examples['label']]].append(examples)


In [None]:
text = test_dataset['text'][10]
text

In [None]:
pipe('Eu vi você ontem mas quero vê hoje.')

In [None]:
for label in selections:
  for example in selections[label]:
      predicted = pipe(example['text'])
     
      predicted = "Predicted Label: {} Score: {}".format(*predicted[0].values())
      print(example['text'])
      true_label =  id2label[example['label']]
      print(f'{predicted} --> {true_label}')
      print()
  print('-'*30)


<a id='pytorch_native'></a>

## Explanaible AI