In [None]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import PreTrainedTokenizerFast
from typing import Dict, Tuple
from transformers import AutoModelForCausalLM

import pandas as pd

In [None]:
def evaluate_llm_classifier(
    model_path: str,
    test_data: pd.DataFrame,
    tokenizer: PreTrainedTokenizerFast,
    tokenizer_template: str,
) -> Tuple[list[list[Dict[str, str]]], Dict[str, float | int]]:
    """Evaluate a causal LM that answers each prompt with "true" or "false".

    For every row of ``test_data`` a two-message chat (system + user) is
    rendered with ``tokenizer_template``, the model generates up to five new
    tokens, and the generated text is compared with the row's reference
    answer.

    Args:
        model_path: Path or identifier passed to
            ``AutoModelForCausalLM.from_pretrained``.
        test_data: Frame with string columns ``"system"``, ``"user"`` and
            ``"assistant"``; ``"assistant"`` holds the reference answer and
            counts as positive when it equals "true" case-insensitively.
        tokenizer: Tokenizer used to render, encode and decode prompts.
            Its ``chat_template`` attribute is overwritten in place.
        tokenizer_template: Chat template string assigned to the tokenizer.

    Returns:
        A ``(format_violations, metrics)`` tuple. ``format_violations`` is the
        list of message lists whose generated answer was not exactly "true"
        or "false" (ignoring case and surrounding whitespace). ``metrics``
        maps ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1_score"``
        to their scores over the whole frame and ``"format_violations"`` to
        the number of violations.

    Raises:
        ValueError: If ``test_data`` has no rows.
    """
    # Fail fast before the (expensive) model load; there is nothing to score.
    if len(test_data) == 0:
        raise ValueError("test_data is empty; nothing to evaluate")

    # Loading options are arguments of from_pretrained itself, not of the
    # model object it returns.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    # chat_template is a plain attribute, so it is assigned rather than called.
    tokenizer.chat_template = tokenizer_template

    true_labels = []
    predicted_labels = []
    format_violations = []

    for _, row in test_data.iterrows():
        messages = [
            {"role": "system", "content": row["system"]},
            {"role": "user", "content": row["user"]},
        ]

        # add_generation_prompt asks the template to open the assistant turn
        # so the model answers instead of continuing the user message.
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # NOTE(review): if the template already emits BOS, the tokenizer may
        # add a second one here; confirm against the template in use.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=5,
            )

        # generate() returns prompt + continuation for decoder-only models;
        # keep only the new tokens so text in the prompt cannot leak into
        # the predicted label.
        generated_ids = outputs[0][prompt_length:]
        prediction = tokenizer.decode(generated_ids, skip_special_tokens=True)
        normalized = prediction.strip().lower()

        pred_label = "true" in normalized
        true_label = row["assistant"].lower() == "true"

        # A well-formed answer is exactly one of the two labels.
        if normalized not in ("true", "false"):
            format_violations.append(messages)

        true_labels.append(true_label)
        predicted_labels.append(pred_label)

    # Metrics are computed once, after every row has been scored.
    metrics = {
        "accuracy": accuracy_score(true_labels, predicted_labels),
        "precision": precision_score(true_labels, predicted_labels),
        "recall": recall_score(true_labels, predicted_labels),
        "f1_score": f1_score(true_labels, predicted_labels),
        "format_violations": len(format_violations),
    }

    return format_violations, metrics
