In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3"


In [2]:
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/joheras/.cache/huggingface/token
Login successful


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
import evaluate
import numpy as np
from transformers import Trainer
import torch
torch.cuda.set_device(0)

In [4]:
dataset = load_dataset("joheras/spanish-suicide-intent")

In [5]:
model_checkpoint = "bertin-project/bertin-roberta-base-spanish"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
def tokenize_function(example):
    return tokenizer(example["Text"], truncation=True)

In [7]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/15131 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text', 'Label', 'dataset', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 136136
    })
    val: Dataset({
        features: ['Text', 'Label', 'dataset', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 15131
    })
    test: Dataset({
        features: ['Text', 'Label', 'dataset', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 37820
    })
})

In [8]:
tokenized_dataset=tokenized_dataset.rename_column("Label","label")

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [10]:
training_args = TrainingArguments(
    output_dir=model_checkpoint[model_checkpoint.find('/')+1:]+"-spanish-suicide-intent-v2",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

In [11]:
def compute_metrics(eval_preds):
    metric_f1 = evaluate.load("f1")
    metric_accuracy = evaluate.load("accuracy")
    metric_precision = evaluate.load("precision")
    metric_recall = evaluate.load("recall")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return {
        "precision": metric_precision.compute(predictions=predictions, references=labels),
        "recall": metric_recall.compute(predictions=predictions, references=labels),
        "f1": metric_f1.compute(predictions=predictions, references=labels),
        "accuracy": metric_accuracy.compute(predictions=predictions, references=labels),
    }


In [12]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=4)

Some weights of the model checkpoint at bertin-project/bertin-roberta-base-spanish were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at bertin-project/bertin-roberta-base-spanish and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classif

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/mnt/hd2/SuicidePrevention/Models/bertin-roberta-base-spanish/bertin-roberta-base-spanish-spanish-suicide-intent-v2 is already a clone of https://huggingface.co/joheras/bertin-roberta-base-spanish-spanish-suicide-intent-v2. Make sure you pull the latest changes with `repo.git_pull()`.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoheras[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666903557876746, max=1.0)…

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1923,0.224675,{'precision': 0.9232340922358435},{'recall': 0.9384364337635366},{'f1': 0.9307731920841609},{'accuracy': 0.9378097944617011}
2,0.1335,0.23756,{'precision': 0.934763181411975},{'recall': 0.9310191366266133},{'f1': 0.9328874024526199},{'accuracy': 0.9403211948978918}
3,0.0627,0.308792,{'precision': 0.9240174672489083},{'recall': 0.9417000445037829},{'f1': 0.9327749614282566},{'accuracy': 0.9395281210759369}
4,0.0215,0.431672,{'precision': 0.9297806565582217},{'recall': 0.9369529743361519},{'f1': 0.933353036796217},{'accuracy': 0.9403872843830546}
5,0.0076,0.501209,{'precision': 0.9262342973999416},{'recall': 0.9406616229046135},{'f1': 0.9333922131449178},{'accuracy': 0.9401890159275659}


TrainOutput(global_step=42545, training_loss=0.09397967778338245, metrics={'train_runtime': 7934.95, 'train_samples_per_second': 85.783, 'train_steps_per_second': 5.362, 'total_flos': 5.14476081591239e+16, 'train_loss': 0.09397967778338245, 'epoch': 5.0})

In [14]:
trainer.evaluate()

{'eval_loss': 0.22467482089996338,
 'eval_precision': {'precision': 0.9232340922358435},
 'eval_recall': {'recall': 0.9384364337635366},
 'eval_f1': {'f1': 0.9307731920841609},
 'eval_accuracy': {'accuracy': 0.9378097944617011},
 'eval_runtime': 39.6873,
 'eval_samples_per_second': 381.256,
 'eval_steps_per_second': 23.836,
 'epoch': 5.0}