In [1]:
!pip install transformers torch datasets evaluate accelerate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3

In [135]:
from datasets import load_dataset

# Load the "go_emotions" dataset through `datasets.load_dataset`.
# NOTE(review): presumably fetched from the Hugging Face Hub or a local cache — confirm.
dataset = load_dataset("go_emotions")

# Inspect the structure. The recorded output shows train / validation / test
# splits, each exposing the 'text', 'labels' and 'id' features.
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})


In [136]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

# Initialize the tokenizer.
# `model_name` is reused later when the classification model is loaded, so the
# tokenizer and the model are taken from the same checkpoint name.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class MultilabelDataset(Dataset):
    """Dataset that tokenizes text and multi-hot encodes its labels on access.

    Args:
        data: Indexable collection whose items expose ``"text"`` (passed to the
            tokenizer) and ``"labels"`` (an iterable of integer class ids).
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")``. It must
            return a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors whose leading dimension is squeezed away here.
        num_labels: Width of the multi-hot target vector.
        max_length: Length requested from the tokenizer for padding/truncation.
    """

    def __init__(self, data, tokenizer, num_labels=28, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.num_labels = num_labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` with a multi-hot label vector.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (tokenizer output
            with the leading dimension removed) and ``"labels"`` (float32 tensor
            of length ``num_labels`` holding 1.0 at every listed class id).

        Raises:
            IndexError: If a label id is outside ``[0, num_labels)``.
        """
        example = self.data[idx]

        tokenized = self.tokenizer(
            example["text"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            "input_ids": tokenized["input_ids"].squeeze(0),
            "attention_mask": tokenized["attention_mask"].squeeze(0),
            "labels": self._multi_hot(example["labels"]),
        }

    def _multi_hot(self, labels):
        """Build the float32 multi-hot target for an iterable of class ids."""
        one_hot_labels = torch.zeros(self.num_labels, dtype=torch.float32)
        for label in labels:
            # Plain tensor indexing would let a negative id wrap around and
            # silently mark the wrong class, so reject it the same way an
            # id that is too large is rejected.
            if not 0 <= label < self.num_labels:
                raise IndexError(
                    f"label {label} is out of range for num_labels={self.num_labels}"
                )
            one_hot_labels[label] = 1.0
        return one_hot_labels


In [137]:
# Wrap each split of the loaded dataset in a MultilabelDataset that shares the
# same tokenizer (default num_labels / max_length).
train_data, val_data, test_data = (
    MultilabelDataset(dataset[split_name], tokenizer)
    for split_name in ("train", "validation", "test")
)




In [138]:
from torch.utils.data import DataLoader

batch_size = 32

# One loader per split; only the training split is shuffled, the evaluation
# splits keep their original order.
train_loader, val_loader, test_loader = (
    DataLoader(split_data, batch_size=batch_size, shuffle=reshuffle)
    for split_data, reshuffle in (
        (train_data, True),
        (val_data, False),
        (test_data, False),
    )
)


In [139]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification
# Must match the width of the multi-hot targets built by MultilabelDataset
# (whose default num_labels is also 28).
num_labels = 28

# The classification head is newly initialised (see the warning printed when the
# model is loaded), so seed torch's RNGs first to make that initialisation
# repeatable from a fresh kernel.
SEED = 42
torch.manual_seed(SEED)

# Fall back to the CPU when no GPU is visible instead of failing on "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)
# Record the task type on the config so it is stored with any saved checkpoint.
model.config.problem_type = "multi_label_classification"
model.to(device)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [140]:
from torch.optim.lr_scheduler import StepLR

# Define optimizer, loss, and scheduler
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
# Multi-label loss computed directly on the raw logits (no class weighting).
criterion = nn.BCEWithLogitsLoss()  # Use class weights if applicable
# step_size=1 with gamma=0.5 halves the learning rate after every epoch.
scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

# Move batches to whatever device the model already lives on rather than
# assuming a GPU is present.
device = next(model.parameters()).device

# Training loop
num_epochs = 5
threshold = 0.30  # Fixed probability cut-off used to binarise sigmoid outputs
# Start below any attainable F1 so the first epoch always writes a checkpoint,
# even when its F1 is exactly 0; later cells rely on ./best_model existing.
best_f1 = float("-inf")
best_epoch = 0

for epoch in range(num_epochs):
    # Training Phase
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} - Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, labels)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Training Loss: {total_loss / len(train_loader):.4f}")

    # Validation Phase
    model.eval()
    all_probabilities = []
    all_labels = []
    total_val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1} - Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probabilities = torch.sigmoid(logits)

            loss = criterion(logits, labels)
            total_val_loss += loss.item()

            # Keep results on the CPU so they can be handed to scikit-learn.
            all_probabilities.append(probabilities.cpu())
            all_labels.append(labels.cpu())

    # Compute metrics
    all_probabilities = torch.cat(all_probabilities)
    all_labels = torch.cat(all_labels)
    predictions = (all_probabilities > threshold).int()  # Fixed threshold

    # For multi-label indicator arrays accuracy_score is subset (exact-match)
    # accuracy; F1 is micro-averaged over every (example, label) decision.
    accuracy = accuracy_score(all_labels.numpy(), predictions.numpy())
    f1 = f1_score(all_labels.numpy(), predictions.numpy(), average="micro")
    print(f"Epoch {epoch + 1}, Validation Loss: {total_val_loss / len(val_loader):.4f}")
    print(f"Epoch {epoch + 1}, Validation Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

    # Save the best model (and its tokenizer) whenever validation F1 improves
    if f1 > best_f1:
        best_f1 = f1
        best_epoch = epoch + 1
        model.save_pretrained("./best_model")
        tokenizer.save_pretrained("./best_model")

    scheduler.step()

print(f"Best F1 Score: {best_f1:.4f} at Epoch {best_epoch}")


Epoch 1 - Training: 100%|██████████| 1357/1357 [06:34<00:00,  3.44it/s]


Epoch 1, Training Loss: 0.1363


Epoch 1 - Validation: 100%|██████████| 170/170 [00:19<00:00,  8.73it/s]


Epoch 1, Validation Loss: 0.0917
Epoch 1, Validation Accuracy: 0.4458, F1 Score: 0.5749


Epoch 2 - Training: 100%|██████████| 1357/1357 [06:34<00:00,  3.44it/s]


Epoch 2, Training Loss: 0.0864


Epoch 2 - Validation: 100%|██████████| 170/170 [00:19<00:00,  8.70it/s]


Epoch 2, Validation Loss: 0.0860
Epoch 2, Validation Accuracy: 0.4539, F1 Score: 0.5932


Epoch 3 - Training: 100%|██████████| 1357/1357 [06:34<00:00,  3.44it/s]


Epoch 3, Training Loss: 0.0786


Epoch 3 - Validation: 100%|██████████| 170/170 [00:19<00:00,  8.72it/s]


Epoch 3, Validation Loss: 0.0849
Epoch 3, Validation Accuracy: 0.4488, F1 Score: 0.5944


Epoch 4 - Training: 100%|██████████| 1357/1357 [06:34<00:00,  3.44it/s]


Epoch 4, Training Loss: 0.0748


Epoch 4 - Validation: 100%|██████████| 170/170 [00:19<00:00,  8.76it/s]


Epoch 4, Validation Loss: 0.0848
Epoch 4, Validation Accuracy: 0.4563, F1 Score: 0.5998


Epoch 5 - Training: 100%|██████████| 1357/1357 [06:34<00:00,  3.44it/s]


Epoch 5, Training Loss: 0.0729


Epoch 5 - Validation: 100%|██████████| 170/170 [00:19<00:00,  8.77it/s]


Epoch 5, Validation Loss: 0.0846
Epoch 5, Validation Accuracy: 0.4602, F1 Score: 0.6041
Best F1 Score: 0.6041 at Epoch 5


In [121]:
import torch

# Report which accelerator, if any, PyTorch can see. The device name is only
# queried when CUDA is actually available.
print(
    f"Using GPU: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "No GPU available, using CPU instead."
)


Using GPU: NVIDIA A100-SXM4-40GB


In [141]:
import os

# Score the checkpoint written at the best validation epoch, which is the
# artifact that gets archived, rather than whatever weights the last epoch left
# in memory. Fall back to the in-memory model if no checkpoint directory exists.
checkpoint_dir = "./best_model"
if os.path.isdir(checkpoint_dir):
    eval_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
else:
    eval_model = model

# Reuse the device the training model lives on instead of assuming a GPU.
device = next(model.parameters()).device
eval_model.to(device)
eval_model.eval()

# Same fixed cut-off that was applied during validation; it has not been tuned.
threshold = 0.30
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = eval_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.sigmoid(logits)

        predictions = (probabilities > threshold).int()
        # Keep results on the CPU so they can be handed to scikit-learn.
        all_predictions.append(predictions.cpu())
        all_labels.append(labels.cpu())

all_predictions = torch.cat(all_predictions)
all_labels = torch.cat(all_labels)

# Compute metrics: subset (exact-match) accuracy and micro-averaged F1
accuracy = accuracy_score(all_labels.numpy(), all_predictions.numpy())
f1 = f1_score(all_labels.numpy(), all_predictions.numpy(), average="micro")
print(f"Test Accuracy: {accuracy:.4f}, Test F1 Score: {f1:.4f}")


Test Accuracy: 0.4559, Test F1 Score: 0.6072


In [142]:
# Recursively archive the saved checkpoint directory into best_model.zip.
!zip -r best_model.zip ./best_model


  adding: best_model/ (stored 0%)
  adding: best_model/model.safetensors (deflated 8%)
  adding: best_model/special_tokens_map.json (deflated 42%)
  adding: best_model/vocab.txt (deflated 53%)
  adding: best_model/config.json (deflated 65%)
  adding: best_model/tokenizer_config.json (deflated 76%)
  adding: best_model/tokenizer.json (deflated 71%)
