In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3"


In [2]:
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/joheras/.cache/huggingface/token
Login successful


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
import evaluate
import numpy as np
from transformers import Trainer
import torch
torch.cuda.set_device(0)

In [4]:
dataset = load_dataset("joheras/spanish-suicide-intent-information")

In [5]:
model_checkpoint = "bertin-project/bertin-roberta-base-spanish"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
def tokenize_function(example):
    return tokenizer(example["Text"], truncation=True)

In [7]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/2983 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text', 'Label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 10736
    })
    val: Dataset({
        features: ['Text', 'Label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1193
    })
    test: Dataset({
        features: ['Text', 'Label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 2983
    })
})

In [8]:
tokenized_dataset=tokenized_dataset.rename_column("Label","label")

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [10]:
training_args = TrainingArguments(
    output_dir=model_checkpoint[model_checkpoint.find('/')+1:]+"-spanish-suicide-intent-information-v2",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

In [11]:
def compute_metrics(eval_preds):
    metric_f1 = evaluate.load("f1")
    metric_accuracy = evaluate.load("accuracy")
    metric_precision = evaluate.load("precision")
    metric_recall = evaluate.load("recall")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return {
        "precision": metric_precision.compute(predictions=predictions, references=labels),
        "recall": metric_recall.compute(predictions=predictions, references=labels),
        "f1": metric_f1.compute(predictions=predictions, references=labels),
        "accuracy": metric_accuracy.compute(predictions=predictions, references=labels),
    }


In [12]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=4)

Some weights of the model checkpoint at bertin-project/bertin-roberta-base-spanish were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at bertin-project/bertin-roberta-base-spanish and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifie

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/mnt/hd2/SuicidePrevention/Models/bertin-roberta-base-spanish/bertin-roberta-base-spanish-spanish-suicide-intent-information-v2 is already a clone of https://huggingface.co/joheras/bertin-roberta-base-spanish-spanish-suicide-intent-information-v2. Make sure you pull the latest changes with `repo.git_pull()`.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoheras[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0557,0.009572,{'precision': 0.994991652754591},{'recall': 1.0},{'f1': 0.9974895397489539},{'accuracy': 0.9974853310980721}
2,0.0118,0.026159,{'precision': 0.9983136593591906},{'recall': 0.9932885906040269},{'f1': 0.9957947855340622},{'accuracy': 0.9958088851634534}
3,0.0014,0.020905,{'precision': 0.9983164983164983},{'recall': 0.9949664429530202},{'f1': 0.9966386554621849},{'accuracy': 0.9966471081307627}
4,0.0014,0.025525,{'precision': 0.9966499162479062},{'recall': 0.9983221476510067},{'f1': 0.997485331098072},{'accuracy': 0.9974853310980721}
5,0.0,0.021451,{'precision': 0.9966499162479062},{'recall': 0.9983221476510067},{'f1': 0.997485331098072},{'accuracy': 0.9974853310980721}
6,0.0,0.021413,{'precision': 0.9966499162479062},{'recall': 0.9983221476510067},{'f1': 0.997485331098072},{'accuracy': 0.9974853310980721}
7,0.0,0.021531,{'precision': 0.9966499162479062},{'recall': 0.9983221476510067},{'f1': 0.997485331098072},{'accuracy': 0.9974853310980721}
8,0.0,0.021787,{'precision': 0.9966499162479062},{'recall': 0.9983221476510067},{'f1': 0.997485331098072},{'accuracy': 0.9974853310980721}
9,0.0,0.022117,{'precision': 0.9966499162479062},{'recall': 0.9983221476510067},{'f1': 0.997485331098072},{'accuracy': 0.9974853310980721}
10,0.0,0.022494,{'precision': 0.9983221476510067},{'recall': 0.9983221476510067},{'f1': 0.9983221476510067},{'accuracy': 0.9983235540653814}


TrainOutput(global_step=20130, training_loss=0.0019489616339066552, metrics={'train_runtime': 4881.6955, 'train_samples_per_second': 65.977, 'train_steps_per_second': 4.124, 'total_flos': 2.3742811537995264e+16, 'train_loss': 0.0019489616339066552, 'epoch': 30.0})

In [15]:
trainer.evaluate()

{'eval_loss': 0.009571865200996399,
 'eval_precision': {'precision': 0.994991652754591},
 'eval_recall': {'recall': 1.0},
 'eval_f1': {'f1': 0.9974895397489539},
 'eval_accuracy': {'accuracy': 0.9974853310980721},
 'eval_runtime': 7.6083,
 'eval_samples_per_second': 156.803,
 'eval_steps_per_second': 9.858,
 'epoch': 30.0}