# Text classification with TREC Dataset
# using **Transformer Models**

## Library installation and imports

In [1]:
# Install the notebook's dependencies. %pip (rather than a "!" shell escape)
# makes pip target the environment of the running kernel.
%pip install transformers datasets evaluate nvidia-ml-py3 pynvml accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#pip install --upgrade 

In [None]:
#!pip install pynvml

In [2]:
# Imports
import torch
import time
import numpy as np
import matplotlib.pyplot as plt
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, get_scheduler
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from tqdm.auto import tqdm
from sklearn.metrics import confusion_matrix
from transformers import TrainingArguments, Trainer
from pynvml import *


## Function to plot Confusion Matrix

In [3]:
def _draw_confusion_heatmap(matrix, class_names, plot_title, cell_format):
    """Draw one annotated confusion-matrix heatmap and show it.

    Args:
        matrix: Square 2-D array; rows are true classes, columns predictions.
        class_names: Tick labels, one per row/column of ``matrix``.
        plot_title: Title placed above the axes.
        cell_format: ``format()`` spec used for the number written in each cell.
    """
    fig, ax = plt.subplots()
    ax.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set(xticks=np.arange(matrix.shape[1]), yticks=np.arange(matrix.shape[0]),
           xticklabels=class_names, yticklabels=class_names,
           title=plot_title, ylabel='True label', xlabel='Predicted label')
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Cells darker than half of the maximum get white text so it stays readable.
    threshold = matrix.max() / 2.
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            ax.text(j, i, format(matrix[i, j], cell_format),
                    ha="center", va="center",
                    color="white" if matrix[i, j] > threshold else "black")
    fig.tight_layout()
    plt.show()


def plot_confusion_matrix(y_true, y_pred, plot_title='Confusion Matrix', class_names=None):
    """Plot a confusion matrix twice: as raw counts and as row percentages.

    Args:
        y_true: True class ids, integers in ``range(len(class_names))``.
        y_pred: Predicted class ids, same encoding and length as ``y_true``.
        plot_title: Title used for both figures.
        class_names: Tick labels in class-id order. Defaults to the six
            labels this notebook uses: ABBR, ENTY, DESC, HUM, LOC, NUM.
    """
    if class_names is None:
        class_names = ['ABBR', 'ENTY', 'DESC', 'HUM', 'LOC', 'NUM']

    # Pin the label set so the matrix always has one row/column per class name,
    # even when some class never occurs in y_true or y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(class_names)))
    _draw_confusion_heatmap(cm, class_names, plot_title, 'd')

    # With percentages: each row is divided by its number of true samples.
    # Rows with no true samples stay at 0 instead of becoming NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(cm.astype(float), row_totals,
                              out=np.zeros(cm.shape, dtype=float),
                              where=row_totals != 0)
    # '.0%' shows whole-number percentages.
    _draw_confusion_heatmap(cm_normalized, class_names, plot_title, '.0%')

## Load dataset

In [4]:
# Load dataset
# Fetch "trec" through the `datasets` loader. Later cells index the result by
# split ("train", "test") and use its "text", "coarse_label" and "fine_label"
# columns.
dataset = load_dataset("trec")



  0%|          | 0/2 [00:00<?, ?it/s]

## Functions for counting parameters and GPU utilization

In [5]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is truthy are counted.
    """
    trainable_elements = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_elements += param.numel()
    return trainable_elements

def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one NVIDIA GPU, in whole MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device this notebook has always reported on.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with a shutdown so repeated calls from
        # different cells do not keep stacking up NVML initialisations.
        nvmlShutdown()


def print_summary(result):
    """Print training time, throughput and current GPU memory for a finished run.

    Reads 'train_runtime' and 'train_samples_per_second' from
    ``result.metrics`` and prints each with two decimals, then reports GPU
    memory through print_gpu_utilization().
    """
    run_metrics = result.metrics
    runtime = run_metrics['train_runtime']
    print(f"Time: {runtime:.2f}")
    throughput = run_metrics['train_samples_per_second']
    print(f"Samples/second: {throughput:.2f}")
    print_gpu_utilization()
# Report GPU memory at this point in the notebook, before the model is created further down.
print_gpu_utilization()

GPU memory occupied: 258 MB.


In [6]:
# Checkpoint to fine-tune: keep exactly one model_name assignment uncommented.
#model_name = 'bert-base-cased'
#model_name = 'distilbert-base-uncased'
#model_name = 'albert-base-v2'
#model_name = 'xlm-roberta-base'
#model_name = 'roberta-base'
model_name = 'google/electra-small-discriminator'

# Used below as both the per-device training and evaluation batch size.
batch_size = 8
# Intended number of training epochs.
num_epochs = 3

# Load the tokenizer that belongs to the selected checkpoint; tokenize_function
# reads this global when the dataset is mapped.
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook's global tokenizer.

    Sequences are truncated and padded with ``padding="max_length"``, so every
    example comes back with the same length.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns and rename the target column: drop the raw text
# and the fine-grained label, and expose the coarse label as "labels".
tokenized_datasets = tokenized_datasets.remove_columns(["text", "fine_label"])
tokenized_datasets = tokenized_datasets.rename_column("coarse_label", "labels")
tokenized_datasets.set_format("torch")

# Load model with a 6-way classification head (one output per coarse class)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
print(count_parameters(model))
print_gpu_utilization()

# Define training parameters
# Gradient checkpointing is left off for these two checkpoints and enabled for
# every other one; all remaining settings are shared.
use_gradient_checkpointing = model_name not in ['distilbert-base-uncased', 'albert-base-v2']
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    # Wire the configured epoch count into the run; previously num_epochs was
    # defined but never used, so editing it had no effect.
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    gradient_checkpointing=use_gradient_checkpointing,
    fp16=True,
)
# Define evaluation metric
def compute_metrics(eval_pred):
    """Score a batch of predictions with the notebook's global ``metric``.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class predictions
        against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_ids)

# Accuracy metric loaded through `evaluate`; compute_metrics looks up this
# global each time it is called.
metric = evaluate.load("accuracy")

# Train model
# NOTE(review): the "test" split is also the evaluation set passed to the
# Trainer, so the reported accuracy is not from data held out from model
# selection — confirm this is intended.
trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            compute_metrics=compute_metrics,
            )
# Keep the training output: print_summary reads result.metrics afterwards.
result = trainer.train()




Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

13550342
GPU memory occupied: 258 MB.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.773911,0.926
2,No log,0.398637,0.942
2,0.796400,0.325039,0.95


In [7]:
print_summary(result)

Time: 243.41
Samples/second: 67.19
GPU memory occupied: 1873 MB.
