In [1]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import pandas as pd
from data_clean import *
import atel
from atel.data import BookCollection

  from .autonotebook import tqdm as notebook_tqdm


# HuggingFace Trainer

In [2]:
import torch

In [3]:
t = torch.tensor([0.5, 0.94, 0.3])

In [4]:
TARGET     = 'Holistisk vurdering'
set_seed(42)

book_col = BookCollection(data_file="./data/book_col_271120.pkl")
df, labels = get_pandas_dataframe(book_col, TARGET)

NUM_LABELS = len(labels)

Seed has been set to 42
Loaded from disk: ./data/book_col_271120.pkl


In [5]:
NUM_LABELS

6

In [9]:
df

Unnamed: 0,text,labels
0,Jeg er fjollet.,1.0
1,A som i abe. B som i bjørn. C som i cykel. D s...,1.0
2,Her er jeg. Jeg er glad.,0.0
3,Holger er en hund.,1.0
4,Mormor elsker lasagne.,2.0
...,...,...
773,P.S. Krøyer var en dygtig maler som malede man...,3.0
774,Sådan tænder du et bål. 1.Først hugger man nog...,5.0
775,Puppys are so cute and small.,1.0
776,Hej Annette. Jeg har lavet mor men jeg kom til...,2.0


In [27]:
tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
model = AutoModelForSequenceClassification.from_pretrained("Maltehb/danish-bert-botxo",
                                                           num_labels=NUM_LABELS
                                                          )

Some weights of the model checkpoint at Maltehb/danish-bert-botxo were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [28]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [29]:
t = Dataset.from_pandas(df)

In [30]:
t2 = t.map(tokenize_function, batched=True)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.37ba/s]


In [31]:
metric = evaluate.load("accuracy")

In [32]:
def compute_metrics_multilabel(eval_pred):
    logits, labels = eval_pred
    return {"accuracy": acc_metric(torch.tensor(logits), torch.tensor(labels))}

In [33]:
t2

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 778
})

In [34]:
torch.tensor([t2['labels'][0]]), (torch.tensor([t2['labels'][0]]))

(tensor([[0., 1., 0., 0., 0., 0.]]), tensor([[0., 1., 0., 0., 0., 0.]]))

In [35]:
ls = torch.nn.CrossEntropyLoss()
ls(torch.tensor([-0.4755,  0.0560, -1.3267,  0.3594, -0.0170, -0.5276]), torch.tensor(t2['labels'][0]))

tensor(1.5434)

In [37]:
model(input_ids=torch.tensor([t2['input_ids'][0]]), labels=torch.tensor([t2['labels'][0]]))

ValueError: Expected input batch_size (1) to match target batch_size (6).

In [17]:
training_args = TrainingArguments(
    output_dir="test_trainer", 
    save_strategy='no', 
    evaluation_strategy='epoch',
    report_to='none',
    num_train_epochs=1
)

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=t2,
    compute_metrics=compute_metrics_multilabel
)

RuntimeError: CUDA out of memory. Tried to allocate 94.00 MiB (GPU 0; 6.00 GiB total capacity; 5.29 GiB already allocated; 0 bytes free; 5.33 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
trainer.train()

# Native PyTorch

In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])

In [None]:
tokenized_datasets.set_format("torch")

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [None]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)