## Install dependencies



In [2]:
!pip install datasets
!pip install transformers==4.17
!pip install accelerate>=0.20.3

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

## Inspect a dataset’s attributes without committing to downloading it using `load_dataset_builder()`

In [2]:
from datasets import load_dataset_builder

In [3]:
ds_builder = load_dataset_builder("emotion", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

In [4]:
# Inspect dataset

# 1. A description of the dataset
print(ds_builder.info.description)
print("================")

# 2. A URL to the official homepage for the dataset.
print(ds_builder.info.homepage)
print("================")

# 3. The features used to specify the dataset's column types.
print(ds_builder.info.features)
print("================")

# 4. The mapping between split name and metadata.
print(ds_builder.info.splits)
print("================")

Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper.

https://github.com/dair-ai/emotion_dataset
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}
{'train': SplitInfo(name='train', num_bytes=1741597, num_examples=16000, shard_lengths=None, dataset_name=None), 'validation': SplitInfo(name='validation', num_bytes=214703, num_examples=2000, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=217181, num_examples=2000, shard_lengths=None, dataset_name=None)}


## Loading data with `load_dataset()`

In [3]:
from datasets import load_dataset

In [10]:
emotion = load_dataset('emotion', trust_remote_code=True)

In [11]:
emotion

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [5]:
print(emotion['train'].features['label'].names)
print(emotion['validation'].features['label'].names)
print(emotion['test'].features['label'].names)

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


### We can see that our dataset consists of three sets:
1. `train`,
2. `validation`,
3. `test`,

and has six kinds of emotion:
0. `sadness`,
1. `joy`,
2. `love`,
3. `anger`,
4. `fear`,
5. `suprise`,

In [12]:
# Get the first row of train dataset
emotion['train'][0]

{'text': 'i didnt feel humiliated', 'label': 0}

## Tokenization

- Models cannot process raw text, because of that we need to convert the text into numbers. Tokenization provides a way to do this by dividing text into individual words called tokens. Tokens are finally converted to numbers.
- We will use tokenizer corresponding to a pretrained BERT model. Using the same tokenizer as the pretrained model is important because you want to make sure the text is split in the same way.

In [7]:
# Checkpoint for BERT model
checkpoint = 'bert-base-uncased'

In [8]:
from transformers import AutoTokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [10]:
# For example we will tokenize the first text from train
tokenizer.tokenize(emotion['train'][0]['text'])

['i', 'didn', '##t', 'feel', 'humiliated']

In [11]:
# Call our tokenizer on first row of `text` in the train dataset
tokenizer(emotion['train'][0]['text'])

{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

- The tokenizer returns a dictionary with three items:

1. `input_ids`: the numbers representing the tokens in the text.
2. `token_type_ids`: indicates which sequence a token belongs to if there is more than one sequence.
3. `attention_mask`: indicates whether a token should be masked or not.


- These values are actually the model inputs.

In [None]:
# Saving model on computer
# model.save_pretrained("directory_on_my_computer")

In [12]:
# Define function for tokenization
def tokenization(text):
  return tokenizer(text['text'], truncation=True)

In [19]:
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenized_datasets = emotion.map(tokenization, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

In [20]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

We need to apply a bit of postprocessing to our tokenized_datasets. Specifically, we need to:

- Remove the columns corresponding to values the model does not expect (like the text column).
- Rename the column label to labels (because the model expects the argument to be named labels).
- Set the format of the datasets so they return PyTorch tensors instead of lists.

In [15]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [17]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=6)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [21]:
from datasets import load_metric

accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
        "precision": precision["precision"],
        "recall": recall["recall"],
    }

  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

In [22]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [23]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6000


Step,Training Loss
500,0.7407
1000,0.3602
1500,0.2729
2000,0.2542
2500,0.1592
3000,0.1637
3500,0.1457
4000,0.1615
4500,0.1125
5000,0.1171


Saving model checkpoint to test-trainer/checkpoint-500
Configuration saved in test-trainer/checkpoint-500/config.json
Model weights saved in test-trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-1000
Configuration saved in test-trainer/checkpoint-1000/config.json
Model weights saved in test-trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-1500
Configuration saved in test-trainer/checkpoint-1500/config.json
Model weights saved in test-trainer/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1500/tokenizer_config.json
Special tokens file saved

TrainOutput(global_step=6000, training_loss=0.223392058690389, metrics={'train_runtime': 715.2743, 'train_samples_per_second': 67.107, 'train_steps_per_second': 8.388, 'total_flos': 1019275389366624.0, 'train_loss': 0.223392058690389, 'epoch': 3.0})

In [24]:
metrics = trainer.evaluate()
print(metrics)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


{'eval_loss': 0.22483882308006287, 'eval_accuracy': 0.94, 'eval_f1': 0.9399798588434877, 'eval_precision': 0.9404260201235153, 'eval_recall': 0.94, 'eval_runtime': 5.3557, 'eval_samples_per_second': 373.437, 'eval_steps_per_second': 46.68, 'epoch': 3.0}


In [26]:
model.save_pretrained('model')
tokenizer.save_pretrained('tokenizer')

Configuration saved in model/config.json
Model weights saved in model/pytorch_model.bin
tokenizer config file saved in tokenizer/tokenizer_config.json
Special tokens file saved in tokenizer/special_tokens_map.json


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [27]:
from google.colab import files
import shutil

# Zip the model directory
shutil.make_archive('model', 'zip', 'model')
shutil.make_archive('tokenizer', 'zip', 'tokenizer')

# Download the zip files
files.download('model.zip')
files.download('tokenizer.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
# TEST

In [None]:
# Training model without Trainer class (that is the logic under the hood for the Trainer class)
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

6000


In [None]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/6000 [00:00<?, ?it/s]

In [None]:
from datasets import load_metric

# Metrics computation
accuracy_metric = load_metric("accuracy", trust_remote_code=True)
f1_metric = load_metric("f1", trust_remote_code=True)
precision_metric = load_metric("precision", trust_remote_code=True)
recall_metric = load_metric("recall", trust_remote_code=True)

  accuracy_metric = load_metric("accuracy", trust_remote_code=True)


In [None]:
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
        "precision": precision["precision"],
        "recall": recall["recall"],
    }

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Define evaluation arguments
eval_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    report_to="none",  # Avoid logging to files or other services if not needed
)

In [None]:
# Create Trainer instance for evaluation
trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

# Evaluate the model
eval_results = trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


In [None]:
eval_results

{'eval_loss': 0.11157584935426712,
 'eval_accuracy': 0.944,
 'eval_f1': 0.9439939204963176,
 'eval_precision': 0.9440095409926034,
 'eval_recall': 0.944,
 'eval_runtime': 5.8968,
 'eval_samples_per_second': 339.168,
 'eval_steps_per_second': 42.396}