0. Load Data and Libraries

In [None]:
# Load the raw speech corpus and the hand-labeled solidarity sample, then make
# sure the labels are integer class ids (0 = no_solidarity, 1 = solidarity).
import pandas as pd  # `pd` is used below; importing it here keeps this cell runnable on its own
from pandas import read_csv

# Raw speech corpus. Bound to its own name so the labeled frame loaded below
# does not immediately overwrite (and thereby discard) it.
speech_df = read_csv("C:/Users/qhaskovec/OneDrive/GitHub/PhD-Thesis-EU-Solidarity-Statements/EUSpeech_translated_preprocessed.csv")

df = pd.read_csv("solidarity_labels.csv")  # must have 'text' and 'label' columns
label_map = {"no_solidarity": 0, "solidarity": 1}

# Only encode when the labels are still strings: calling .map(label_map) on
# labels that are already 0/1 would replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df["label"]):
    encoded_labels = df["label"].map(label_map)
    if encoded_labels.isna().any():
        # Fail loudly instead of silently training on NaN labels.
        unknown = sorted(df.loc[encoded_labels.isna(), "label"].astype(str).unique())
        raise ValueError(f"Unexpected values in 'label' column: {unknown}")
    df["label"] = encoded_labels.astype(int)

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

1. Train The Model

In [None]:
# === Load your labeled data ===
df = pd.read_csv("solidarity_labels.csv")  # must have 'text' and 'label' columns

# Re-reading the CSV throws away any label encoding applied to an earlier `df`,
# so string labels must be converted to integer class ids again here. Labels
# that are already numeric are left untouched (mapping them would yield NaN).
if not pd.api.types.is_numeric_dtype(df["label"]):
    label_ids = df["label"].map({"no_solidarity": 0, "solidarity": 1})
    if label_ids.isna().any():
        # Fail loudly instead of handing NaN labels to the Trainer.
        unknown = sorted(df.loc[label_ids.isna(), "label"].astype(str).unique())
        raise ValueError(f"Unexpected values in 'label' column: {unknown}")
    df["label"] = label_ids.astype(int)


# === Define model ID and load tokenizer + classification model ===
model_id = "EuroBERT/EuroBERT-2.1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# num_labels=2 matches the binary no_solidarity/solidarity label set;
# trust_remote_code=True allows the model's custom code from the Hub to run.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2, trust_remote_code=True)

# === Tokenize function ===
def tokenize_function(example):
    """Run the module-level ``tokenizer`` over ``example["text"]``.

    Sequences are truncated and padded to ``max_length=128`` so every
    encoded example has the same length. Returns whatever the tokenizer
    returns for that input.
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    return encoded

# === Convert to Hugging Face dataset and tokenize ===
dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize_function, batched=True)

# Drop columns the model does not consume. `remove_columns` raises ValueError
# for names that are not in the dataset, and "__index_level_0__" only exists
# when the DataFrame had a non-default index, so keep only the names present.
unused_columns = [
    name for name in ("text", "__index_level_0__") if name in dataset.column_names
]
dataset = dataset.remove_columns(unused_columns)
dataset.set_format("torch")

# === Split train/eval sets ===
# A fixed seed makes the 90/10 split (and hence the reported accuracy)
# reproducible across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# === Define training arguments ===
training_args = TrainingArguments(
    output_dir="./bert_solidarity_finetuned",
    # `eval_strategy` is the current name of this option; the old
    # `evaluation_strategy` spelling is deprecated and rejected by recent
    # transformers releases.
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # Must match the evaluation strategy for load_best_model_at_end to work.
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    # Has to be a key returned by the compute_metrics function given to the Trainer.
    metric_for_best_model="accuracy",
)

# === Define evaluation metric ===
import evaluate

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the loaded ``accuracy`` metric.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    for each row is the index of the largest logit along dimension 1.
    Returns the result of ``accuracy.compute``.
    """
    scores, gold = eval_pred
    predicted = torch.argmax(torch.tensor(scores), dim=1)
    return accuracy.compute(predictions=predicted, references=gold)

# === Train model ===
# Collect the Trainer inputs in one place, then build the Trainer from them.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

trainer.train()

# === Save final model ===
# Write the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./bert_solidarity_finetuned")

print("Fine-tuning complete.")
