## Install dependencies

In [1]:
!pip install datasets
!pip install transformers==4.17
!pip install accelerate>=0.20.3

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

## Importing modules

In [2]:
import shutil, numpy as np

from google.colab import files
from datasets import load_dataset, load_metric
from transformers import (
    Trainer,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification
)

## Loading and inspecting data

In [3]:
# Download the 'emotion' dataset (train/validation/test splits).
# trust_remote_code=True lets the dataset's own builder script run.
emotion = load_dataset('emotion', trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [4]:
# Summarise the dataset: overall structure, the label names of every split,
# and one raw training example.
separator = "======================================================="
label_splits = ('train', 'validation', 'test')

print(separator)
print(emotion)
print(separator)
for split_name in label_splits:
    print(emotion[split_name].features['label'].names)
print(separator)
print(emotion['train'][0])  # first row of the training split

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})
['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
{'text': 'i didnt feel humiliated', 'label': 0}


### We can see that our dataset consists of three sets:
1. `train`,
2. `validation`,
3. `test`,

and has six kinds of emotion:
0. `sadness`,
1. `joy`,
2. `love`,
3. `anger`,
4. `fear`,
5. `surprise`.

## Loading Tokenizer and Model

In [5]:
# Checkpoint for BERT model
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Take the class names from the dataset instead of hard-coding their count, and
# store the id <-> name mapping in the model config so the saved model reports
# emotion names rather than generic LABEL_0 ... LABEL_5.
label_names = emotion['train'].features['label'].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Tokenization

- Models cannot process raw text, so we first need to convert the text into numbers. Tokenization does this by splitting the text into smaller units called tokens (whole words or sub-word pieces such as `##t`), and each token is then mapped to an integer id.
- We will use the tokenizer that corresponds to the pretrained BERT model. Using the same tokenizer as the pretrained model is important because the text must be split, and mapped to ids, in exactly the same way as during pretraining.

In [6]:
# To see how tokenization works, tokenize the first training sentence.
# .tokenize() returns the token strings themselves, not their integer ids.
tokenizer.tokenize(emotion['train'][0]['text'])

['i', 'didn', '##t', 'feel', 'humiliated']

In [7]:
# Calling the tokenizer directly returns the encoded inputs for the same
# sentence: input_ids, token_type_ids and attention_mask.
tokenizer(emotion['train'][0]['text'])

{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

- The tokenizer returns a dictionary with three items:

1. `input_ids`: the numbers representing the tokens in the text.
2. `token_type_ids`: indicates which sequence a token belongs to if there is more than one sequence.
3. `attention_mask`: indicates which tokens the model should attend to (`1`) and which should be ignored, such as padding (`0`).


- These values are actually the model inputs.

In [8]:
# Tokenization helper, meant to be applied to the dataset with `map`
def tokenization(text):
    """Tokenize the 'text' field of a dataset example or batch.

    Args:
        text: Mapping with a 'text' entry holding the raw string(s) to encode.

    Returns:
        The output of the module-level `tokenizer` for those strings, with
        truncation enabled (no padding is applied here).
    """
    raw_text = text['text']
    encoded = tokenizer(raw_text, truncation=True)
    return encoded

In [9]:
# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Run the tokenization helper over every split, processing examples in batches.
tokenized_datasets = emotion.map(tokenization, batched=True)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [10]:
# Display the tokenized splits to check which columns the tokenizer added.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

We need to apply a bit of postprocessing to our tokenized_datasets. Specifically, we need to:

- Remove the columns corresponding to values the model does not expect (like the text column).
- Rename the column label to labels (because the model expects the argument to be named labels).
- Set the format of the datasets so they return PyTorch tensors instead of lists.

In [11]:
# Drop the raw text column and rename the target column to "labels",
# then switch every split to return PyTorch tensors.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

## Defining Metrics for Evaluation

In [12]:
# Load the four evaluation metrics used by compute_metrics.
accuracy_metric, f1_metric, precision_metric, recall_metric = (
    load_metric(metric_name, trust_remote_code=True)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall for an evaluation pass.

    Args:
        eval_pred: Pair that unpacks into (logits, labels).

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    shared_args = {"predictions": predicted_ids, "references": labels}

    scores = {"accuracy": accuracy_metric.compute(**shared_args)["accuracy"]}

    # The remaining metrics are averaged across classes, weighted by class support.
    weighted_metrics = (
        ("f1", f1_metric),
        ("precision", precision_metric),
        ("recall", recall_metric),
    )
    for key, metric in weighted_metrics:
        scores[key] = metric.compute(average="weighted", **shared_args)[key]

    return scores

  accuracy_metric = load_metric("accuracy", trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

## Training

In [13]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="test-trainer",        # directory where output results are saved
    num_train_epochs=3,               # number of training epochs
    per_device_train_batch_size=8,    # training batch size per device
    per_device_eval_batch_size=8,     # evaluation batch size per device
    evaluation_strategy="epoch",      # evaluate at the end of every epoch
    logging_steps=500,                # logging frequency (in steps)
)

In [14]:
# Wire the model, training arguments, data and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Start fine-tuning; as the cell's last expression, the returned result is displayed.
trainer.train()

***** Running training *****
  Num examples = 16000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6000


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2436,0.248987,0.927,0.926185,0.927769,0.927
2,0.1615,0.169977,0.9375,0.936723,0.937626,0.9375
3,0.1111,0.219575,0.942,0.941915,0.9422,0.942


Saving model checkpoint to test-trainer/checkpoint-500
Configuration saved in test-trainer/checkpoint-500/config.json
Model weights saved in test-trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-1000
Configuration saved in test-trainer/checkpoint-1000/config.json
Model weights saved in test-trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-1500
Configuration saved in test-trainer/checkpoint-1500/config.json
Model weights saved in test-trainer/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1500/tokenizer_config.json
Special tokens file saved

TrainOutput(global_step=6000, training_loss=0.22913156954447428, metrics={'train_runtime': 682.2163, 'train_samples_per_second': 70.359, 'train_steps_per_second': 8.795, 'total_flos': 1019275389366624.0, 'train_loss': 0.22913156954447428, 'epoch': 3.0})

In [15]:
# Evaluate with the Trainer's configured eval_dataset (the validation split)
# and print the resulting metrics dict.
metrics = trainer.evaluate()
print(metrics)

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


{'eval_loss': 0.21957451105117798, 'eval_accuracy': 0.942, 'eval_f1': 0.9419145172846654, 'eval_precision': 0.9422000490081629, 'eval_recall': 0.942, 'eval_runtime': 5.7949, 'eval_samples_per_second': 345.129, 'eval_steps_per_second': 43.141, 'epoch': 3.0}


In [16]:
# Evaluate the fine-tuned model on the held-out test split.
metrics_test = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
# Legacy alias: the original name wrongly suggested training-set metrics.
metrics_train = metrics_test
print("Test Dataset Metrics:", metrics_test)

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


Test Dataset Metrics: {'eval_loss': 0.255603164434433, 'eval_accuracy': 0.926, 'eval_f1': 0.925886352981489, 'eval_precision': 0.9262506105415047, 'eval_recall': 0.926, 'eval_runtime': 6.1313, 'eval_samples_per_second': 326.196, 'eval_steps_per_second': 40.775, 'epoch': 3.0}


## Saving model and tokenizer locally

In [17]:
# Write the trained model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./saved_model")

# Bundle that directory into model_tokenizer.zip
shutil.make_archive("model_tokenizer", 'zip', "./saved_model")

# Hand the archive to Colab's download helper
files.download("model_tokenizer.zip")

Configuration saved in ./saved_model/config.json
Model weights saved in ./saved_model/pytorch_model.bin
tokenizer config file saved in ./saved_model/tokenizer_config.json
Special tokens file saved in ./saved_model/special_tokens_map.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>