In [3]:
import os 
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
os.chdir("../..")

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np
from task1.config import ProjectPaths
import pandas as pd
import torch

paths = ProjectPaths()

# === 3. Set device ===
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"


# === 4. Load and preprocess data ===
def load_raw_df(path):
    df = pd.read_csv(path, sep='\t')
    df = df[df['label'].isin(['SUBJ', 'OBJ'])].copy()
    df['labels'] = df['label'].map({'OBJ': 0, 'SUBJ': 1})
    df = df[['sentence', 'labels']]
    return df

# Assuming paths.data_dir is a Path object pointing to the directory containing your language folders
langs = ["english", "arabic", "bulgarian", "italian", "german"]
aliases = ["en", "ar", "bg", "it", "de"]

all_dfs = []

for lang, alias in zip(langs, aliases):
    train_df = load_raw_df(paths.data_dir / lang / f"train_{alias}.tsv")
    train2_df   = load_raw_df(paths.data_dir / lang / f"dev_{alias}.tsv")
    train3_df  = load_raw_df(paths.data_dir / lang / f"dev_test_{alias}.tsv")
    # Add a column for language
    # Append all to a single list
    all_dfs.append(train_df)
    all_dfs.append(train2_df)
    all_dfs.append(train3_df)

# Concatenate all DataFrames into a single big DataFrame

train4_df = load_raw_df(paths.data_dir / "multilingual" / "dev_test_multilingual.tsv" )
train_df = pd.concat(all_dfs, ignore_index=True)

# If needed as a HuggingFace Dataset
train_dataset = Dataset.from_pandas(train_df)

# === 5. Tokenization ===
model_name = "google-bert/bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_fn(examples):
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

train_dataset = train_dataset.map(tokenize_fn, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# === 6. Load model and add LoRA ===
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "key", "value"]
)

model = get_peft_model(model, lora_config).to(device)

# === 7. Define metrics ===
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
recall = evaluate.load("recall")
precision = evaluate.load("precision")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "recall": recall.compute(predictions=preds, references=labels)["recall"],
        "precision": precision.compute(predictions=preds, references=labels)["precision"]
    }

# === 8. TrainingArguments ===
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)

# === 9. Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
    eval_dataset=train_dataset
)

# === 10. Train ===
trainer.train()

print("Training complete")



Map:   0%|          | 0/10941 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Recall,Precision
1,0.6546,0.553153,0.716571,0.649917,0.379737,0.719718
2,0.5035,0.542824,0.732383,0.688935,0.486004,0.69698
3,0.3695,0.532246,0.738415,0.685353,0.444142,0.743675
4,0.5935,0.511289,0.751394,0.710802,0.510528,0.734759
5,0.5808,0.511646,0.761174,0.71239,0.473371,0.796914
6,0.3625,0.492404,0.774609,0.742167,0.568987,0.759841
7,0.2403,0.487676,0.784023,0.748979,0.556106,0.79723
8,0.3462,0.470859,0.790147,0.76323,0.613822,0.770762
9,0.5737,0.460562,0.794077,0.760436,0.568244,0.818117
10,0.3953,0.449167,0.802395,0.773615,0.604162,0.812188




Training complete


In [8]:
ml_test_df = load_raw_df(paths.data_dir / "multilingual" / "test_multilingual_labeled.tsv" )
ml_test_ds = Dataset.from_pandas(ml_test_df)
ml_test_ds = ml_test_ds.map(tokenize_fn, batched=True)
ml_test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

print("Evaluating multilingual")

# === 11. Evaluate on test set ===
test_results = trainer.evaluate(eval_dataset=ml_test_ds)
test_results

Map:   0%|          | 0/1982 [00:00<?, ? examples/s]

Evaluating multilingual


{'eval_loss': 0.6697618365287781,
 'eval_accuracy': 0.722502522704339,
 'eval_f1_macro': 0.6651948878413624,
 'eval_recall': 0.49434571890145396,
 'eval_precision': 0.56353591160221,
 'eval_runtime': 21.3929,
 'eval_samples_per_second': 92.648,
 'eval_steps_per_second': 23.185,
 'epoch': 15.0}

Unnamed: 0,sentence,labels
0,لكنهم مازالوا طلقاء حسم الجدل لفترة طويلة ظل ط...,0
1,وأشدد على أهمية عدم التسرع في اتخاذ أي قرارات ...,0
2,Das Fälschen von Totenscheinen kann als Ordnun...,0
3,"From the lack of vision, the lack of hope.",0
4,من المتوقع أن تفقد البلاد حوالي 10 آلاف ثري ، ...,0


In [10]:
ukr_test_df = load_raw_df(paths.data_dir / "ukrainian" / "test_ukr_labeled.tsv" )
ukr_test_ds = Dataset.from_pandas(ukr_test_df)
ukr_test_ds = ukr_test_ds.map(tokenize_fn, batched=True)
ukr_test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("Evaluating zero-shot ukranian")

# === 11. Evaluate on test set ===
test_results = trainer.evaluate(eval_dataset=ukr_test_ds)
test_results

Map:   0%|          | 0/297 [00:00<?, ? examples/s]

Evaluating zero-shot ukranian


{'eval_loss': 1.115573763847351,
 'eval_accuracy': 0.6666666666666666,
 'eval_f1_macro': 0.6016339470796245,
 'eval_recall': 0.5,
 'eval_precision': 0.3939393939393939,
 'eval_runtime': 3.4638,
 'eval_samples_per_second': 85.745,
 'eval_steps_per_second': 21.653,
 'epoch': 15.0}

In [11]:
ro_test_df = load_raw_df(paths.data_dir / "romanian" / "test_ro_labeled.tsv" )
ro_test_ds = Dataset.from_pandas(ro_test_df)
ro_test_ds = ro_test_ds.map(tokenize_fn, batched=True)
ro_test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("Evaluating zero-shot romanian")

# === 11. Evaluate on test set ===
test_results = trainer.evaluate(eval_dataset=ro_test_ds)
test_results

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

Evaluating zero-shot romanian


{'eval_loss': 0.485209196805954,
 'eval_accuracy': 0.8155339805825242,
 'eval_f1_macro': 0.7643588199879591,
 'eval_recall': 0.6923076923076923,
 'eval_precision': 0.6206896551724138,
 'eval_runtime': 2.3597,
 'eval_samples_per_second': 87.3,
 'eval_steps_per_second': 22.037,
 'epoch': 15.0}

In [12]:
pl_test_df = load_raw_df(paths.data_dir / "polish" / "test_pol_labeled.tsv" )
pl_test_ds = Dataset.from_pandas(pl_test_df)
pl_test_ds = pl_test_ds.map(tokenize_fn, batched=True)
pl_test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("Evaluating zero-shot polish")

# === 11. Evaluate on test set ===
test_results = trainer.evaluate(eval_dataset=pl_test_ds)
test_results

Map:   0%|          | 0/351 [00:00<?, ? examples/s]

Evaluating zero-shot polish


{'eval_loss': 1.3035749197006226,
 'eval_accuracy': 0.6495726495726496,
 'eval_f1_macro': 0.5737978419893975,
 'eval_recall': 0.2484472049689441,
 'eval_precision': 0.9523809523809523,
 'eval_runtime': 3.8654,
 'eval_samples_per_second': 90.806,
 'eval_steps_per_second': 22.766,
 'epoch': 15.0}

In [13]:
gk_test_df = load_raw_df(paths.data_dir / "greek" / "test_gr_labeled.tsv" )
gk_test_ds = Dataset.from_pandas(gk_test_df)
gk_test_ds = gk_test_ds.map(tokenize_fn, batched=True)
gk_test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("Evaluating zero-shot greek")

# === 11. Evaluate on test set ===
test_results = trainer.evaluate(eval_dataset=gk_test_ds)
test_results

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Evaluating zero-shot greek


{'eval_loss': 0.48859184980392456,
 'eval_accuracy': 0.8191489361702128,
 'eval_f1_macro': 0.677207945236225,
 'eval_recall': 0.4782608695652174,
 'eval_precision': 0.4489795918367347,
 'eval_runtime': 3.1137,
 'eval_samples_per_second': 90.567,
 'eval_steps_per_second': 22.802,
 'epoch': 15.0}