In [1]:
import os 

os.chdir("../..")

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np
from task1.config import ProjectPaths
import pandas as pd
import torch

paths = ProjectPaths()

# === 3. Set device ===
device = "mps" if torch.backends.mps.is_available() else "cpu"

# === 4. Load and preprocess data ===
def load_dataset(path):
    df = pd.read_csv(path, sep='\t')
    df = df[df['label'].isin(['SUBJ', 'OBJ'])].copy()
    df['labels'] = df['label'].map({'OBJ': 0, 'SUBJ': 1})
    df = df[['sentence', 'labels']]
    return Dataset.from_pandas(df)

train_dataset = load_dataset(paths.english_data_dir / "train_en.tsv")
val_dataset   = load_dataset(paths.english_data_dir / "dev_en.tsv")
test_dataset  = load_dataset(paths.english_data_dir / "dev_test_en.tsv")



# === 5. Tokenization ===
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_fn(examples):
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset = val_dataset.map(tokenize_fn, batched=True)
test_dataset = test_dataset.map(tokenize_fn, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# === 6. Load model and add LoRA ===
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"]
)

model = get_peft_model(model, lora_config).to(device)

# === 7. Define metrics ===
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
recall = evaluate.load("recall")
precision = evaluate.load("precision")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "recall": recall.compute(predictions=preds, references=labels)["recall"],
        "precision": precision.compute(predictions=preds, references=labels)["precision"]
    }

# === 8. TrainingArguments ===
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)

# === 9. Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# === 10. Train ===
trainer.train()

print("Training complete")

print("Evaluating on test set")
# === 11. Evaluate on test set ===
test_results = trainer.evaluate(eval_dataset=test_dataset)
test_results

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Map:   0%|          | 0/462 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Recall,Precision
1,0.5037,0.664474,0.593074,0.551928,0.279167,0.817073
2,0.6104,0.521988,0.748918,0.748687,0.691667,0.798077
3,0.2455,0.56192,0.757576,0.756095,0.654167,0.844086
4,0.2646,0.517118,0.768398,0.768397,0.741667,0.798206
5,0.5366,0.529657,0.785714,0.785085,0.808333,0.785425
6,0.2781,0.560687,0.774892,0.774685,0.716667,0.826923
7,0.2732,0.583825,0.779221,0.778719,0.704167,0.845
8,0.2529,0.642672,0.777056,0.775287,0.6625,0.878453
9,0.1331,0.631,0.779221,0.778956,0.716667,0.834951
10,0.1703,0.710481,0.774892,0.772831,0.654167,0.882022


Training complete
Evaluating on test set


{'eval_loss': 0.6249175071716309,
 'eval_accuracy': 0.7995867768595041,
 'eval_f1_macro': 0.70607348788871,
 'eval_recall': 0.4672131147540984,
 'eval_precision': 0.6404494382022472,
 'eval_runtime': 3.0367,
 'eval_samples_per_second': 159.383,
 'eval_steps_per_second': 39.846,
 'epoch': 15.0}

In [10]:
test_results =trainer.predict(test_dataset)
test_results.metrics

{'test_runtime': 3.1051,
 'test_samples_per_second': 155.871,
 'test_steps_per_second': 38.968}