In [30]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score
import torch
from tqdm import tqdm


In [47]:
# Fine-tuned checkpoints to compare, keyed by the display name used in every report below.
model_paths = {
    "XLM-RoBERTa": "./ner-finetuned-amharic-final",
    "BERT-Multilingual": "./ner-finetuned-bert-multilingual",
}

# Registries of successfully loaded objects; a checkpoint appears in both or in neither.
models, tokenizers = {}, {}

for name in model_paths:
    path = model_paths[name]
    try:
        # Load both pieces before registering either, so a failure while loading
        # the model does not leave a tokenizer registered on its own.
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForTokenClassification.from_pretrained(path)
        models[name], tokenizers[name] = model, tokenizer
        print(f" Loaded {name}")
    except Exception as e:
        # Best-effort: report the problem and carry on with the remaining checkpoints.
        print(f" Error loading {name}: {e}")


 Loaded XLM-RoBERTa
 Loaded BERT-Multilingual


In [None]:
def evaluate_model(model, tokenizer, dataset, label_map=None):
    """
    Evaluate a token-classification model at word level using seqeval metrics.

    Every example is encoded and run through the model on its own (batch of
    one). The arg-max class is taken for each sub-token, and only the first
    sub-token of each word is scored, giving one (true, predicted) label pair
    per word. Because the tokenizer is called with ``truncation=True``, words
    that fall beyond the truncated encoding never appear in ``word_ids`` and
    are therefore not scored.

    Args:
        model: Model called as ``model(**inputs)``; its return value must
            expose a ``logits`` attribute indexed as (batch, position, class).
        tokenizer: Callable invoked with ``is_split_into_words=True`` and
            ``return_tensors="pt"``; the returned encoding must provide
            ``word_ids(0)`` and ``items()``.
        dataset: Iterable of examples with ``"tokens"`` and ``"ner_tags"``
            entries. Each tag may be a label string or an integer class id.
        label_map: Optional mapping from class id to label string, used for
            predicted ids and for integer gold tags. When omitted, the
            notebook-level ``id2label`` is used, so that name must already be
            defined (this is the original behaviour).

    Returns:
        dict with the keys ``"f1"``, ``"precision"``, ``"recall"`` and
        ``"accuracy"`` as computed by the seqeval metric functions.

    Raises:
        NameError: if ``label_map`` is omitted and no ``id2label`` is defined.
        KeyError: if a class id is missing from the mapping in use.
    """
    # Resolve the id -> label mapping once, up front. Falling back to the
    # notebook-level name keeps existing three-argument calls working.
    id_to_label = label_map if label_map is not None else id2label

    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    true_labels = []
    pred_labels = []

    for example in tqdm(dataset):
        tokens = example["tokens"]
        word_labels = example["ner_tags"]

        encoding = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True)
        word_ids = encoding.word_ids(0)
        inputs = {k: v.to(device) for k, v in encoding.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=2)

        # Single-example batch: take row 0 as a plain list of class ids.
        predicted_ids = predictions[0].cpu().tolist()

        true_label_seq = []
        pred_label_seq = []

        previous_word_idx = None
        for i, word_idx in enumerate(word_ids):
            # Skip positions with no word id and repeated sub-tokens of the
            # word that was just scored.
            if word_idx is None or word_idx == previous_word_idx:
                continue
            # Skip word ids that have no corresponding gold tag.
            if word_idx >= len(word_labels):
                continue

            # Gold tags may already be strings; only integer ids need mapping.
            gold = word_labels[word_idx]
            true_label = gold if isinstance(gold, str) else id_to_label[gold]
            pred_label = id_to_label[predicted_ids[i]]

            true_label_seq.append(true_label)
            pred_label_seq.append(pred_label)
            previous_word_idx = word_idx

        true_labels.append(true_label_seq)
        pred_labels.append(pred_label_seq)

    return {
        "f1": f1_score(true_labels, pred_labels),
        "precision": precision_score(true_labels, pred_labels),
        "recall": recall_score(true_labels, pred_labels),
        "accuracy": accuracy_score(true_labels, pred_labels),
    }


In [43]:
# Metric dictionaries per model, keyed by display name; models whose evaluation
# fails with a KeyError are left out.
results = {}

for name in models:
    print(f"\n Evaluating {name}...")

    # Reset first so a failed run does not leave the previous model's scores bound.
    scores = {}

    try:
        scores = evaluate_model(models[name], tokenizers[name], dataset["test"])
    except KeyError as e:
        # Missing-key failures are reported and the loop continues with the next model.
        print(f" KeyError while evaluating {name}: {e}")
    else:
        results[name] = scores
        print(f" {name} Scores:\n{scores}")




 Evaluating XLM-RoBERTa...


100%|██████████| 21/21 [00:00<00:00, 42.16it/s]


 XLM-RoBERTa Scores:
{'f1': np.float64(0.8221574344023324), 'precision': np.float64(0.844311377245509), 'recall': np.float64(0.8011363636363636), 'accuracy': 0.9023109243697479}

 Evaluating BERT-Multilingual...


100%|██████████| 21/21 [00:00<00:00, 81.54it/s]

 BERT-Multilingual Scores:
{'f1': np.float64(0.8079096045197739), 'precision': np.float64(0.8033707865168539), 'recall': np.float64(0.8125), 'accuracy': 0.9275210084033614}





In [45]:
import time
import torch

def measure_inference_time(model, tokenizer, dataset, max_samples=50):
    """
    Measure the average forward-pass time per sample, in seconds.

    Only the model call is timed; tokenization and moving the inputs to the
    device happen outside the timed region. No warm-up pass is run, so any
    one-off cost of the first forward pass is included in the average.

    Args:
        model: Model called as ``model(**inputs)``.
        tokenizer: Callable invoked with ``is_split_into_words=True`` and
            ``return_tensors="pt"``; must return a mapping of tensors.
        dataset: Object supporting ``len()`` and ``select(indices)``, whose
            examples have a ``"tokens"`` entry.
        max_samples: Upper bound on the number of examples timed (the first
            ``min(len(dataset), max_samples)`` examples are used).

    Returns:
        Mean elapsed seconds per timed example, or ``0`` if nothing was timed.
    """
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # CUDA kernels are launched asynchronously; without a synchronize the
    # clock would stop before the GPU has finished the forward pass.
    needs_sync = device.type == "cuda"

    total_time = 0.0
    count = 0

    for example in dataset.select(range(min(len(dataset), max_samples))):
        tokens = example["tokens"]
        encoding = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True)
        inputs = {k: v.to(device) for k, v in encoding.items()}

        if needs_sync:
            # Drain pending work (e.g. the host-to-device copies above).
            torch.cuda.synchronize()
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start_time = time.perf_counter()
        with torch.no_grad():
            model(**inputs)
        if needs_sync:
            torch.cuda.synchronize()
        end_time = time.perf_counter()

        total_time += (end_time - start_time)
        count += 1

    return total_time / count if count > 0 else 0

def get_model_size_mb(model):
    """
    Return the size of the model's parameters in megabytes (1 MB = 1024**2 bytes).

    Each parameter is counted at its actual storage width (``element_size()``)
    rather than an assumed 4 bytes, so half-precision or quantized weights are
    not over-reported. For float32 parameters the result is the same as
    ``param_count * 4 / 1024**2``. Only what ``model.parameters()`` yields is
    counted.

    Args:
        model: Object exposing ``parameters()``, an iterable of tensors.

    Returns:
        Total parameter storage in megabytes as a float.
    """
    total_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    return total_bytes / (1024 ** 2)

# Benchmark every loaded model: mean per-sample latency on the test split,
# followed by the parameter footprint.
speed_and_size_results = {}

for name in models:
    print(f"\n Measuring inference time and size for {name}...")
    avg_inference_time = measure_inference_time(models[name], tokenizers[name], dataset["test"])
    model_size_mb = get_model_size_mb(models[name])

    speed_and_size_results[name] = dict(
        avg_inference_time_sec=avg_inference_time,
        model_size_mb=model_size_mb,
    )
    print(f" {name} - Avg Inference Time: {avg_inference_time:.4f}s, Model Size: {model_size_mb:.2f} MB")

# Side-by-side report: quality metrics first, then latency and size, for each
# model that has an entry in `results`.
print("\n=== Combined Model Comparison ===")
for name in results:
    print(f"\nModel: {name}")
    quality = results[name]
    print(f"F1 Score: {quality['f1']:.4f}")
    print(f"Precision: {quality['precision']:.4f}")
    print(f"Recall: {quality['recall']:.4f}")
    print(f"Accuracy: {quality['accuracy']:.4f}")
    # Looked up only after the metric lines, matching the order in which a
    # missing entry would surface.
    footprint = speed_and_size_results[name]
    print(f"Avg Inference Time (s): {footprint['avg_inference_time_sec']:.4f}")
    print(f"Model Size (MB): {footprint['model_size_mb']:.2f}")



 Measuring inference time and size for XLM-RoBERTa...
 XLM-RoBERTa - Avg Inference Time: 0.0115s, Model Size: 1058.43 MB

 Measuring inference time and size for BERT-Multilingual...
 BERT-Multilingual - Avg Inference Time: 0.0125s, Model Size: 676.24 MB

=== Combined Model Comparison ===

Model: XLM-RoBERTa
F1 Score: 0.8222
Precision: 0.8443
Recall: 0.8011
Accuracy: 0.9023
Avg Inference Time (s): 0.0115
Model Size (MB): 1058.43

Model: BERT-Multilingual
F1 Score: 0.8079
Precision: 0.8034
Recall: 0.8125
Accuracy: 0.9275
Avg Inference Time (s): 0.0125
Model Size (MB): 676.24


In [46]:
# Final write-up: per-model figures followed by a recommendation paragraph.
print("\n Model Comparison & Selection Summary \n")

for name in results:
    print(f"Model: {name}")
    quality = results[name]
    print(f"  - F1 Score: {quality['f1']:.4f}")
    print(f"  - Precision: {quality['precision']:.4f}")
    print(f"  - Recall: {quality['recall']:.4f}")
    print(f"  - Accuracy: {quality['accuracy']:.4f}")
    footprint = speed_and_size_results[name]
    print(f"  - Average Inference Time (sec): {footprint['avg_inference_time_sec']:.4f}")
    print(f"  - Model Size (MB): {footprint['model_size_mb']:.2f}")
    print()

print("Recommendation:")
# NOTE(review): the paragraph below is static text; it is not derived from the
# numbers printed above and will not change if the measurements do.
print("".join((
    "Based on the evaluation metrics and resource considerations, ",
    "the XLM-RoBERTa model offers slightly better F1 score and precision, ",
    "but it is significantly larger in size (~1GB). ",
    "The BERT-Multilingual model provides a more compact solution with comparable performance and better recall and accuracy. ",
    "For deployment scenarios where resources are limited or faster loading times are critical, BERT-Multilingual is recommended. ",
    "However, if the priority is maximizing precision and F1 score without strict size constraints, XLM-RoBERTa is preferred.",
)))



 Model Comparison & Selection Summary 

Model: XLM-RoBERTa
  - F1 Score: 0.8222
  - Precision: 0.8443
  - Recall: 0.8011
  - Accuracy: 0.9023
  - Average Inference Time (sec): 0.0115
  - Model Size (MB): 1058.43

Model: BERT-Multilingual
  - F1 Score: 0.8079
  - Precision: 0.8034
  - Recall: 0.8125
  - Accuracy: 0.9275
  - Average Inference Time (sec): 0.0125
  - Model Size (MB): 676.24

Recommendation:
Based on the evaluation metrics and resource considerations, the XLM-RoBERTa model offers slightly better F1 score and precision, but it is significantly larger in size (~1GB). The BERT-Multilingual model provides a more compact solution with comparable performance and better recall and accuracy. For deployment scenarios where resources are limited or faster loading times are critical, BERT-Multilingual is recommended. However, if the priority is maximizing precision and F1 score without strict size constraints, XLM-RoBERTa is preferred.
