# Training a Text Classification Model using the Huggingface Library

In this notebook we want to train a text classification model, namely the DistilBERT model (see: https://arxiv.org/pdf/1910.01108) to classify sentences.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

from datasets import load_dataset
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from umap.umap_ import UMAP

# Checkpoint shared by the tokenizer, the feature extractor and the fine-tuned model
ckpt = "distilbert-base-uncased"
# Always a torch.device (never a bare string) so both branches have the same type
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = load_dataset("emotion")
tokenizer = AutoTokenizer.from_pretrained(ckpt)
# Move the encoder to `device` so it runs on the GPU when one is available;
# model weights and input tensors must live on the same device
model = AutoModel.from_pretrained(ckpt).to(device)

In [None]:
def tokenize(batch):
    """Run the module-level tokenizer over the "text" column of a batch.

    Padding and truncation are both enabled, so all encoded sequences of one
    batch come back with a common length.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

# Tokenize all splits. batch_size=None passes each split to tokenize() as one
# single batch, so padding is applied across the whole split at once.
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)
# Bare expression: the notebook displays the resulting dataset summary
dataset_encoded

In [None]:
def extract_hidden_states(model):
    """Build a batched ``Dataset.map`` callback that embeds sequences with *model*.

    Args:
        model: Encoder whose forward output exposes ``last_hidden_state``.
            It is switched to evaluation mode as a side effect.

    Returns:
        A function that maps a batch of tensors to ``{"hidden_state": ndarray}``,
        holding the hidden state at sequence position 0 for every example.
    """
    # Feature extraction is inference only: disable dropout so that the
    # extracted vectors are deterministic even right after training
    model.eval()

    def _extract_hidden_states(batch):
        # Follow the model's own device rather than a global, so that model
        # weights and inputs can never end up on different devices
        model_device = next(model.parameters()).device
        inputs = {
            name: tensor.to(model_device)
            for name, tensor in batch.items()
            if name in tokenizer.model_input_names
        }
        # No gradients are needed when only reading activations
        with torch.no_grad():
            last_hidden_state = model(**inputs).last_hidden_state
        # Keep only the vector at position 0 (the [CLS] token) and hand it
        # back as a NumPy array on the CPU
        return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

    return _extract_hidden_states

# Expose the model inputs and the label as torch tensors
tensor_columns = ["input_ids", "attention_mask", "label"]
dataset_encoded.set_format("torch", columns=tensor_columns)

# Run the pretrained encoder over every split, batch by batch
hidden_state_fn = extract_hidden_states(model=model)
dataset_hidden = dataset_encoded.map(hidden_state_fn, batched=True)

In [None]:
# Inspect the hidden states dataset: compared to dataset_encoded it carries the
# additional "hidden_state" column produced by the map call above
dataset_hidden

## Create training, validation and test inputs

In [None]:
# Features (hidden states) and targets (labels) of each split as NumPy arrays
xy_columns = ("hidden_state", "label")

X_train, y_train = (np.array(dataset_hidden["train"][col]) for col in xy_columns)
X_valid, y_valid = (np.array(dataset_hidden["validation"][col]) for col in xy_columns)
X_test, y_test = (np.array(dataset_hidden["test"][col]) for col in xy_columns)


## Create 2D representations of dataset tensors

In [None]:
def create_2d_embeddings(X, y) -> pd.DataFrame:
    """Project feature vectors to two dimensions with UMAP.

    Args:
        X: Feature matrix, one row per example.
        y: Labels aligned with the rows of ``X``.

    Returns:
        DataFrame with the 2D coordinates in columns "x1" and "x2" and the
        labels in column "label".
    """
    # Bring every feature into the [0, 1] range before measuring distances
    scaler = MinMaxScaler()
    features = scaler.fit_transform(X)
    # Initialize UMAP with the cosine metric and fit it to the scaled data
    reducer = UMAP(n_components=2, metric="cosine")
    reducer.fit(features)
    # Collect the learned 2D coordinates together with their labels
    projected = pd.DataFrame(reducer.embedding_, columns=["x1", "x2"])
    projected["label"] = y
    return projected

In [None]:
# Reduce the training-split hidden states to 2D for plotting
df_train_2d = create_2d_embeddings(X=X_train, y=y_train)
# Bare expression: the notebook displays the resulting DataFrame
df_train_2d

In [None]:
def plot_2d_embeddings(df_2d):
    """Draw one hexbin density panel per class on a 2x3 grid.

    Args:
        df_2d: DataFrame with coordinate columns "x1" and "x2" and a "label"
            column holding the class index of each point.
    """
    _, grid = plt.subplots(2, 3, figsize=(12,8))
    panels = grid.flatten()
    colormaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
    # Class names come from the module-level dataset; their position is the class index
    class_names = dataset["train"].features["label"].names

    for class_id, (panel, name, colormap) in enumerate(zip(panels, class_names, colormaps)):
        # Only the points that belong to the current class
        points = df_2d.query(f"label == {class_id}")
        panel.hexbin(points["x1"], points["x2"], cmap=colormap, gridsize=60, linewidths=(0,))
        panel.set_title(name)
        # The axis values of the projection carry no meaning, so hide the ticks
        panel.set_xticks([])
        panel.set_yticks([])

# Plot the 2D projection of the training split, one panel per class
plot_2d_embeddings(df_2d=df_train_2d)

## Random classifier

First we will look at how a completely random classifier would behave. The scikit-learn library has a classifier named DummyClassifier which can simulate such "dumb" models. Why should we use such a "dumb" classifier? To evaluate how our model behaves compared to a model that has learned nothing.

There are multiple strategies a DummyClassifier can follow. We will inspect two of them:

***most_frequent***: The predict method always returns the most frequent class label in the observed y argument passed to fit. This simulates a model that has collapsed to always predicting the same output.

***uniform***: This strategy generates predictions uniformly at random from the list of unique classes observed in y. This simulates a model that has learned nothing about the dataset.

First let's train a model using the "uniform" strategy.

In [None]:
# Create a DummyClassifier model using the 'uniform' strategy
model_uf = DummyClassifier(strategy="uniform")

# Fit the model to the training data
model_uf.fit(X_train, y_train)

# Evaluate the trained model on the held-out validation data and compute the
# accuracy, so the baseline is measured on the same split as the real models
model_uf.score(X_valid, y_valid)

The DummyClassifier using the uniform strategy reaches an accuracy of roughly 17%, which is about chance level for six classes (1/6 ≈ 16.7%). Any model that has learned something about our data should therefore perform much better.

Let's also simulate a model that has learned to always output the most frequent class.

In [None]:
# Create a DummyClassifier model using the 'most_frequent' strategy
model_mf = DummyClassifier(strategy="most_frequent")

# Fit the model to the training data
model_mf.fit(X_train, y_train)

# Evaluate the trained model on the held-out validation data and compute the
# accuracy, so the baseline is measured on the same split as the real models
model_mf.score(X_valid, y_valid)

The "most frequent" strategy does not perform well either, although it beats the uniform strategy. Still, always outputting the most frequent class is not useful behaviour for a model. Let's switch to training a real model instead of simulating dumb models.

## Logistic Regression Model Training

First we do not train the neural network (our DistilBERT model) itself, but use its produced embeddings to train a classification model. We generate embeddings for each sentence using the pretrained model and run these embeddings through a logistic regression (LR) model from the Scikit Learn library. The LR model is trained only with the training data. After training the model we evaluate it using the validation data.

In [None]:
# Train a logistic regression classifier on the training split only;
# max_iter=3000 gives the solver room to converge
model_lr = LogisticRegression(max_iter=3000).fit(X_train, y_train)

# Mean accuracy of the trained classifier on the validation split
model_lr.score(X_valid, y_valid)

The accuracy is roughly 63%, which is not that good. With its pretrained weights alone, the DistilBERT model does not classify emotions well, although it is far better than the dumb models we explored before. The pretrained model evidently already produces somewhat helpful embeddings, but we want to score much better than 63%.

## Plotting a Confusion Matrix

We want to explore the relationships between the true and the predicted labels of our logistic regression model. We can do this by plotting a confusion matrix.

In [None]:
def plot_confusion_matrix(y_preds, y_true, labels):
    """Show a confusion matrix normalized over the true labels.

    Args:
        y_preds: Predicted class indices.
        y_true: Ground-truth class indices.
        labels: Class names used as tick labels on both axes.
    """
    # normalize="true": every row (true class) sums to 1
    normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
    cm_display = ConfusionMatrixDisplay(confusion_matrix=normalized_cm, display_labels=labels)
    _, axis = plt.subplots(figsize=(6,6))
    cm_display.plot(ax=axis, cmap="Blues", values_format=".2f", colorbar=True)
    plt.title("Normalized confusion matrix")
    plt.show()

# Class predictions of the logistic regression model on the validation split
y_preds = model_lr.predict(X_valid)
# Look up the class names for the axis ticks here, so that this cell does not
# rely on a `labels` global being defined elsewhere
labels = dataset["train"].features["label"].names
plot_confusion_matrix(y_preds, y_valid, labels)

Looking at the confusion matrix, 'joy' appears to be the easiest class to predict: the model predicts the correct label for 80% of all sentences labeled with 'joy'. The hardest classes for our model seem to be 'love' and 'surprise'. The model reaches 30% or less accuracy for these classes.

This is actually not a good result. It looks like the pretrained weights of the DistilBERT model alone are not useful enough for our problem. We might need to fine-tune the model on this specific dataset to get better results. This is what we want to do now.

In [None]:
def compute_metrics(pred) -> dict:
    """Compute accuracy and weighted F1 score from a prediction object.

    Args:
        pred: Object with the model outputs in ``predictions`` (classes on the
            last axis) and the true labels in ``label_ids``, the attribute
            name used by ``transformers.EvalPrediction``. A ``labels``
            attribute is accepted as a fallback.

    Returns:
        ``{"accuracy": ..., "f1": ...}``, or an empty dict when no labels are
        available (for example when predicting on unlabeled data).
    """
    # EvalPrediction stores the targets under `label_ids`; checking only for
    # `labels` would silently skip the metrics on every call
    labels = getattr(pred, "label_ids", None)
    if labels is None:
        labels = getattr(pred, "labels", None)
    if labels is None:
        return {}

    # Predicted class = index of the largest score along the last axis
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }


In [None]:
batch_size = 16
# Number of batches in one pass over the training split, so the training loss
# is logged roughly once per epoch
logging_steps = len(dataset_encoded["train"]) // batch_size
# Directory the Trainer writes its output to
model_name = f"../../checkpoints/{ckpt}-finetuned-emotion"
train_feature_extractor = True

# Instantiate a DistilBERT model with a classification head. The head is sized
# from the dataset's label names instead of a hard-coded class count.
num_labels = len(dataset["train"].features["label"].names)
model_finetuned = AutoModelForSequenceClassification.from_pretrained(ckpt, num_labels=num_labels).to(device)

# Freeze parameters of transformer layers (the feature extractor) when
# train_feature_extractor is set to False; only the classification head is
# trained in that case
for param in model_finetuned.distilbert.parameters():
    param.requires_grad = train_feature_extractor

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)

trainer = Trainer(
    model=model_finetuned,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune model_finetuned on the training split with the settings in training_args
trainer.train()

## Evaluating the fine-tuned model

Now that we have fine-tuned the model on our dataset it's time to evaluate its performance. First we will let the model generate labels for the sentences of the validation dataset.

In [None]:
# Run the fine-tuned model over the validation split; the raw model outputs
# are read from preds_output.predictions further down
preds_output = trainer.predict(dataset_encoded["validation"])
# Bare expression: the notebook displays the prediction output
preds_output

## Plot the confusion matrix for the fine-tuned model

Now we want to see how our fine-tuned model performs. Therefore we plot the confusion matrix again.

In [None]:
# Predicted class = index of the largest model output for each validation example
y_preds = np.argmax(preds_output.predictions, axis=1)
# Look up the class names for the axis ticks here, so that this cell does not
# rely on a `labels` global being defined elsewhere
labels = dataset["train"].features["label"].names
plot_confusion_matrix(y_preds, y_valid, labels)

In [None]:
# Re-extract the hidden states, this time with the fine-tuned DistilBERT encoder
finetuned_encoder = model_finetuned.distilbert.to(device)
dataset_hidden_finetuned = dataset_encoded.map(extract_hidden_states(model=finetuned_encoder), batched=True)

# Features and labels of the training split as NumPy arrays
train_split_finetuned = dataset_hidden_finetuned["train"]
X_train_finetuned = np.array(train_split_finetuned["hidden_state"])
y_train_finetuned = np.array(train_split_finetuned["label"])

# Project to 2D and plot one panel per class, as done for the pretrained model
df_train_2d_finetuned = create_2d_embeddings(X=X_train_finetuned, y=y_train_finetuned)
plot_2d_embeddings(df_2d=df_train_2d_finetuned)