In [1]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import pandas as pd
from data_clean import *
import atel
from atel.data import BookCollection

# HuggingFace Trainer

In [2]:
import torch

In [3]:
# Scratch tensor of three example scores.
# NOTE(review): the stored cell output shows tensor([1., 1., 0.]), which a bare
# assignment cannot display — presumably an earlier version of this cell applied
# a threshold to the scores; confirm and restore that expression if it matters.
# NOTE: the name `t` is rebound to a Dataset in a later cell.
t = torch.tensor([0.5, 0.94, 0.3])

tensor([1., 1., 0.])

In [15]:
# Load the tokenizer and the weights of the "Maltehb/danish-bert-botxo"
# checkpoint, wrapping the encoder in a sequence-classification head that is
# configured for multi-label classification with 21 outputs.
# NOTE(review): num_labels=21 is hard-coded; presumably it should match the
# number of label classes produced by get_pandas_dataframe — confirm.
# NOTE(review): with problem_type="multi_label_classification" the targets are
# presumably expected as float multi-hot vectors of length num_labels — confirm
# against the dataset's label column.
tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
model = AutoModelForSequenceClassification.from_pretrained("Maltehb/danish-bert-botxo", 
                                                           num_labels=21, 
                                                           problem_type="multi_label_classification")

loading configuration file config.json from cache at C:\Users\spetr/.cache\huggingface\hub\models--Maltehb--danish-bert-botxo\snapshots\565d9bd5ca0872bec0b2d7af7887607a96416c2f\config.json
Model config BertConfig {
  "_name_or_path": "Maltehb/danish-bert-botxo",
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.2",
  "type_vocab_size

In [16]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``"text"`` key; the value is handed to the
            module-level ``tokenizer`` unchanged.

    Returns:
        Whatever ``tokenizer`` returns when called with truncation enabled and
        padding set to ``"max_length"``.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
    )

In [17]:
# Load the book collection from the data file on disk (the cell output confirms
# it was read from this path).
# NOTE(review): the path is relative to the notebook's working directory, and
# the .pkl extension suggests a pickle — only load files from a trusted source.
book_col = BookCollection(data_file="./data/book_col_271120.pkl")

Loaded from disk: ./data/book_col_271120.pkl


In [18]:
# Turn the book collection into a DataFrame for the 'Semantisk univers' field;
# the call returns the frame plus a second value bound to `labels`
# (presumably the label names/classes — confirm).
# NOTE(review): get_pandas_dataframe is not defined in this notebook; presumably
# it is provided by `from data_clean import *` — confirm.
# tokenize_function later reads a "text" column from the data built here.
df, labels = get_pandas_dataframe(book_col, 'Semantisk univers')

In [19]:
# Convert the DataFrame into a `datasets.Dataset` so it can be mapped/tokenized.
# NOTE: this rebinds `t`, which an earlier cell assigned to a torch tensor.
t = Dataset.from_pandas(df)

In [20]:
# Apply tokenize_function over the dataset; batched=True hands it batches of
# rows instead of one row per call. The result keeps the original columns and
# adds the tokenizer's outputs.
t2 = t.map(tokenize_function, batched=True)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.08ba/s]


In [21]:
# Load the "accuracy" metric from the `evaluate` library.
# NOTE(review): `metric` is not referenced by any other visible cell —
# compute_metrics_multilabel calls `acc_metric` instead; confirm which one is
# intended.
metric = evaluate.load("accuracy")

In [22]:
def compute_metrics_multilabel(eval_pred):
    """Compute the accuracy entry reported during evaluation.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``; both parts
            are converted with ``torch.tensor`` before scoring.

    Returns:
        A dict with the single key ``"accuracy"`` holding the value returned
        by ``acc_metric(logits_tensor, labels_tensor)``.

    NOTE(review): ``acc_metric`` is not defined in this notebook; presumably it
    comes from ``from data_clean import *`` — confirm. The logits are passed on
    as-is, so any sigmoid/thresholding must happen inside ``acc_metric``.
    """
    raw_scores, targets = eval_pred
    accuracy = acc_metric(
        torch.tensor(raw_scores),
        torch.tensor(targets),
    )
    return dict(accuracy=accuracy)

In [23]:
# Training configuration for the HuggingFace Trainer:
#   - outputs are written under "test_trainer"
#   - save_strategy='no' disables checkpoint saving
#   - evaluation_strategy='epoch' runs an evaluation pass after every epoch,
#     which requires the Trainer to be given an eval_dataset
#   - metrics are reported to TensorBoard
#   - training runs for 5 epochs
training_args = TrainingArguments(
    output_dir="test_trainer", 
    save_strategy='no', 
    evaluation_strategy='epoch',
    report_to='tensorboard',
    num_train_epochs=5
)

PyTorch: setting up devices


In [24]:
# Hold out part of the tokenized data for evaluation. `training_args` requests
# evaluation_strategy='epoch', and the Trainer raises at the first end-of-epoch
# evaluation when it was not given an eval_dataset. The fixed seed keeps the
# split reproducible across kernel restarts.
trainer_splits = t2.train_test_split(test_size=0.2, seed=42)

# NOTE(review): the stored output of this cell is a CUDA out-of-memory error on
# a 6 GiB GPU with 5.29 GiB already allocated — presumably memory held by
# earlier objects in the same kernel; restart the kernel before re-running.
# NOTE(review): the Trainer presumably expects the dataset to carry a label
# column it can feed to the model — confirm the column name produced upstream.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainer_splits["train"],
    eval_dataset=trainer_splits["test"],
    compute_metrics=compute_metrics_multilabel
)

RuntimeError: CUDA out of memory. Tried to allocate 94.00 MiB (GPU 0; 6.00 GiB total capacity; 5.29 GiB already allocated; 0 bytes free; 5.33 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

# Native PyTorch

In [None]:
# Start from the tokenized dataset built in the HuggingFace section (`t2`);
# previously this line read `tokenized_datasets` before anything had assigned
# it, which fails with NameError on a fresh kernel.
# The raw "text" strings are dropped so that only model inputs remain.
# NOTE(review): any other non-tensor columns carried over from the DataFrame
# must be removed as well — check `t2.column_names`.
tokenized_datasets = t2.remove_columns(["text"])

In [None]:
# Switch the dataset's output format to "torch" so that indexing it yields
# torch tensors (this mutates the dataset's format in place).
tokenized_datasets.set_format("torch")

In [None]:
from torch.utils.data import DataLoader

# `small_train_dataset` / `small_eval_dataset` were never created in this
# notebook (NameError on a fresh kernel), so build them here by splitting the
# tokenized dataset. The fixed seed keeps the split reproducible.
# NOTE(review): the split is presumed to keep the "torch" output format set on
# `tokenized_datasets` — confirm by inspecting one batch.
dataset_splits = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
small_train_dataset = dataset_splits["train"]
small_eval_dataset = dataset_splits["test"]

# Shuffle only the training data; evaluation order does not need to change.
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler

# Total optimizer steps = epochs x batches per epoch; the scheduler needs this
# up front to shape the "linear" schedule (no warmup steps).
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU, and move
# the model's parameters onto that device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Manual training loop: forward pass, backward pass, optimizer step, scheduler
# step, then reset gradients — once per batch.
model.train()  # put the model in training mode
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every entry of the batch to the same device as the model.
        # Assumes every value in the batch is a tensor — TODO confirm.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # NOTE(review): presumably the batch includes a labels entry so that
        # outputs.loss is populated — confirm against the dataset columns.
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Clear accumulated gradients before the next batch.
        optimizer.zero_grad()
        progress_bar.update(1)