In [1]:
import json
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def load_results(filepath: str) -> dict:
    """Parse a JSON results file.

    Args:
        filepath: Location of the JSON file; it is opened as UTF-8 text.

    Returns:
        The object decoded from the file's JSON content.
    """
    with open(filepath, encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def calculate_metrics(results: dict) -> dict:
    """Compute micro-averaged NER precision, recall and F1 over all samples.

    Each sample's expected and predicted entities are reduced to sets of
    lower-cased ``(text, type)`` pairs, so matching is case-insensitive,
    requires both fields to agree, and counts a repeated pair only once per
    sample. Counts are accumulated across samples before the ratios are taken.

    Args:
        results: Mapping with the keys ``'results'`` (iterable of samples, each
            holding ``'expected_entities'`` and ``'predicted_entities'``
            iterables of dicts with ``'text'`` and ``'type'``), ``'model'``,
            ``'shot_type'`` and ``'num_samples'``.

    Returns:
        Dict with ``'model'``, ``'shot_type'`` and ``'num_samples'`` copied
        from ``results`` plus ``'precision'``, ``'recall'`` and ``'f1_micro'``.
        A metric whose denominator is zero is reported as ``0``.
    """
    def _pairs(entities):
        # The set collapses duplicate (text, type) pairs within one sample.
        return {(ent['text'].lower(), ent['type'].lower()) for ent in entities}

    hits = spurious = missed = 0
    for record in results['results']:
        gold = _pairs(record['expected_entities'])
        guessed = _pairs(record['predicted_entities'])

        hits += len(gold.intersection(guessed))
        spurious += len(guessed.difference(gold))
        missed += len(gold.difference(guessed))

    n_predicted = hits + spurious
    n_expected = hits + missed

    if n_predicted > 0:
        precision = hits / n_predicted
    else:
        precision = 0

    if n_expected > 0:
        recall = hits / n_expected
    else:
        recall = 0

    # Harmonic mean of precision and recall; guarded against both being zero.
    pr_sum = precision + recall
    if pr_sum > 0:
        f1_micro = 2 * precision * recall / pr_sum
    else:
        f1_micro = 0

    summary = {
        'model': results['model'],
        'shot_type': results['shot_type'],
        'num_samples': results['num_samples'],
    }
    summary['precision'] = precision
    summary['recall'] = recall
    summary['f1_micro'] = f1_micro
    return summary

In [3]:
results_dir = Path('../results/ner')

# Every results file listed here sits at
#   <model_dir>/<shot_dir>/<model_dir>_ner_<shot_dir>_test_<timestamp>.json
# so each model only needs its display label, its directory name and the
# timestamps of its (few-shot, zero-shot) runs.
result_files = {
    f'{label} ({shot_label})':
        results_dir / f'{model_dir}/{shot_dir}/{model_dir}_ner_{shot_dir}_test_{stamp}.json'
    for label, model_dir, stamps in (
        ('GPT-4o-mini', 'gpt-4o-mini', ('20251212_135905', '20251212_130743')),
        ('GPT-5.1', 'gpt-5.1-grande', ('20251222_125130', '20251222_124732')),
        ('Gemma-2-9B', 'google_gemma-2-9b-it', ('20251215_125935', '20251215_151733')),
        ('Llama-3.1-8B', 'meta-llama_Meta-Llama-3.1-8B-Instruct', ('20251215_174710', '20251215_193047')),
        ('Qwen-2.5-7B', 'Qwen_Qwen2.5-7B-Instruct', ('20251215_104744', '20251215_103242')),
        ('Mistral-7B', 'mistralai_Mistral-7B-Instruct-v0.3', ('20260103_072628', '20260103_084221')),
    )
    for (shot_label, shot_dir), stamp in zip(
        (('few-shot', 'few_shot_5'), ('zero-shot', 'zero_shot')),
        stamps,
    )
}

In [4]:
all_metrics = []

# Score every run listed in result_files and echo a short per-run report.
for name, filepath in result_files.items():
    results = load_results(filepath)
    # Tag the computed metrics with the run's display label.
    metrics = {**calculate_metrics(results), 'name': name}
    all_metrics.append(metrics)

    print(
        f"\n- {name}",
        f"  Samples: {metrics['num_samples']}",
        f"  Precision: {metrics['precision']:.4f}",
        f"  Recall: {metrics['recall']:.4f}",
        f"  F1 Micro: {metrics['f1_micro']:.4f}",
        sep="\n",
    )


- GPT-4o-mini (few-shot)
  Samples: 1129
  Precision: 0.2913
  Recall: 0.2998
  F1 Micro: 0.2955

- GPT-4o-mini (zero-shot)
  Samples: 1129
  Precision: 0.2455
  Recall: 0.2263
  F1 Micro: 0.2355

- GPT-5.1 (few-shot)
  Samples: 1129
  Precision: 0.0000
  Recall: 0.0000
  F1 Micro: 0.0000

- GPT-5.1 (zero-shot)
  Samples: 1129
  Precision: 0.0000
  Recall: 0.0000
  F1 Micro: 0.0000

- Gemma-2-9B (few-shot)
  Samples: 1129
  Precision: 0.2483
  Recall: 0.3284
  F1 Micro: 0.2828

- Gemma-2-9B (zero-shot)
  Samples: 1129
  Precision: 0.2298
  Recall: 0.3229
  F1 Micro: 0.2685

- Llama-3.1-8B (few-shot)
  Samples: 1129
  Precision: 0.2255
  Recall: 0.3378
  F1 Micro: 0.2704

- Llama-3.1-8B (zero-shot)
  Samples: 1129
  Precision: 0.1556
  Recall: 0.1705
  F1 Micro: 0.1627

- Qwen-2.5-7B (few-shot)
  Samples: 1129
  Precision: 0.2884
  Recall: 0.2835
  F1 Micro: 0.2859

- Qwen-2.5-7B (zero-shot)
  Samples: 1129
  Precision: 0.2549
  Recall: 0.2011
  F1 Micro: 0.2248

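- Mistral-7B (few-shot)
  Samples: 1129
  Precision: 0.2416
  Recall: 0.3166
  F1 Micro: 0.2741

- Mistral-7B (zero-shot)
  Samples: 1129
  Precision: 0.1742
  Recall: 0.3172
  F1 Micro: 0.2249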

In [5]:
# Build the comparison table: keep the report columns and rank runs by F1,
# best first.
df_metrics = (
    pd.DataFrame(all_metrics)[['name', 'num_samples', 'precision', 'recall', 'f1_micro']]
    .sort_values('f1_micro', ascending=False)
)

# Banner, title and the table itself, printed as plain text without the index.
print(
    "\n" + "="*80,
    "RESULTADOS COMPARATIVOS NER",
    "="*80,
    df_metrics.to_string(index=False),
    sep="\n",
)


RESULTADOS COMPARATIVOS NER
                    name  num_samples  precision   recall  f1_micro
  GPT-4o-mini (few-shot)         1129   0.291273 0.299771  0.295461
  Qwen-2.5-7B (few-shot)         1129   0.288417 0.283467  0.285920
   Gemma-2-9B (few-shot)         1129   0.248324 0.328375  0.282793
   Mistral-7B (few-shot)         1129   0.241598 0.316648  0.274078
 Llama-3.1-8B (few-shot)         1129   0.225468 0.337815  0.270437
  Gemma-2-9B (zero-shot)         1129   0.229752 0.322941  0.268490
 GPT-4o-mini (zero-shot)         1129   0.245500 0.226259  0.235487
  Mistral-7B (zero-shot)         1129   0.174234 0.317220  0.224926
 Qwen-2.5-7B (zero-shot)         1129   0.254895 0.201087  0.224816
Llama-3.1-8B (zero-shot)         1129   0.155573 0.170481  0.162686
     GPT-5.1 (zero-shot)         1129   0.000000 0.000000  0.000000
      GPT-5.1 (few-shot)         1129   0.000000 0.000000  0.000000
