In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
from sklearn.metrics import f1_score
import torch
import numpy as np

In [16]:
model_name = "DeepPavlov/rubert-base-cased"
num_labels = 14

dataset = load_dataset("json", data_files="semeval_translated_dataset.jsonl", split="train")
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
import os

log_dir = "./my_new_logs_dir"
model_dir = "./my_new_model_dir"

os.makedirs(log_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
print("Dossiers créés :", os.path.isdir(log_dir), os.path.isdir(model_dir))


Dossiers créés : True True


In [18]:
from torch.utils.tensorboard import SummaryWriter

log_dir = "./test_logs_123"
import os
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir=log_dir)
writer.add_scalar('loss', 0.5, 0)
writer.close()


FailedPreconditionError: ./test_logs_123 is not a directory

In [None]:
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)

dataset = load_dataset("json", data_files="semeval_translated_dataset.jsonl", split="train")
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.train_test_split(test_size=0.1)

def compute_metrics(pred):
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs >= 0.5).astype(int)
    f1_micro = f1_score(labels, preds, average="micro")
    f1_macro = f1_score(labels, preds, average="macro")
    return {"f1_micro": f1_micro, "f1_macro": f1_macro}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification"
)

args = TrainingArguments(
    output_dir="./rubert_semeval_model",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=200,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,
    logging_dir="./logs",
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model("./rubert_semeval_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


FailedPreconditionError: ./logs is not a directory

Evaluation VS "SkolkovoInstitute/roberta_large_semeval2020_task11"

In [None]:
def compute_metrics(pred, labels):
    probs = torch.sigmoid(torch.tensor(pred)).numpy()
    preds = (probs >= 0.5).astype(int)
    f1_micro = f1_score(labels, preds, average="micro")
    f1_macro = f1_score(labels, preds, average="macro")
    return {"f1_micro": f1_micro, "f1_macro": f1_macro}

def evaluate_model(model_name_or_path, dataset_path, is_russian):
    model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    ds = load_dataset("json", data_files=dataset_path, split="train")

    if not is_russian:
        from deep_translator import GoogleTranslator
        translator = GoogleTranslator(source="ru", target="en")
        ds = ds.map(lambda x: {"text": translator.translate(x["text"])}, batched=False)

    ds = ds.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=256), batched=True)
    ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    model.eval()
    all_logits, all_labels = [], []

    for batch in torch.utils.data.DataLoader(ds, batch_size=8):
        with torch.no_grad():
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            logits = outputs.logits.cpu().numpy()
        all_logits.append(logits)
        all_labels.append(batch["labels"].numpy())

    preds = np.concatenate(all_logits, axis=0)
    golds = np.concatenate(all_labels, axis=0)
    return compute_metrics(preds, golds)

rubert_scores = evaluate_model("./rubert_semeval_model", "semeval_translated_dataset.jsonl", is_russian=True)
roberta_scores = evaluate_model(
    model_name_or_path="./RoBERTa pretrained", 
    dataset_path="datasets_propaganda/semeval_translated_dataset.jsonl",
    is_russian=False
)


print("rubert:", rubert_scores)
print("roberta:", roberta_scores)


HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './rubert_semeval_model'.