In [7]:
import json
import pandas as pd
from pathlib import Path
from sklearn.metrics import (
    f1_score, 
    accuracy_score, 
    classification_report,
    confusion_matrix
)
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
def load_results(filepath: str) -> dict:
    """Read a JSON results file and return its parsed content.

    Args:
        filepath: Location of the JSON file; it is handed straight to
            ``open``, so anything ``open`` accepts works.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    # The file is decoded explicitly as UTF-8 rather than the platform default.
    with open(filepath, mode='r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return payload

def calculate_metrics(results: dict, exclude_filtered: bool = True, labels: list = None) -> tuple:
    """Compute accuracy and F1 scores for one evaluation run.

    Args:
        results: Parsed results payload. The keys 'model', 'shot_type' and
            'results' are read; 'results' is turned into a DataFrame that
            must provide the columns 'expected' and 'prediction'.
        exclude_filtered: NOTE(review): accepted but not read anywhere in
            this function, so it has no effect. Kept so existing callers
            keep working; confirm whether filtering was meant to be
            implemented.
        labels: Optional list of labels forwarded to ``f1_score``. With the
            default ``None`` scikit-learn averages over every label that
            occurs in either the gold or the predicted values, so each
            predicted label that never occurs in the gold data enters the
            macro average with an F1 of 0. Passing the gold label set makes
            macro-F1 comparable between runs. Accuracy is not affected by
            this argument, so micro-F1 may differ from accuracy when a
            restricted label set is given.

    Returns:
        A 3-tuple ``(metrics, y_true, y_pred)``: the metrics dictionary plus
        the gold and predicted label lists the scores were computed from.

    Raises:
        KeyError: If 'expected' or 'prediction' is missing from the
            per-sample records (including when there are no records), or if
            a required top-level key is missing from ``results``.
    """
    df = pd.DataFrame(results['results'])

    # Fail with a message that names every missing field instead of the bare
    # KeyError pandas raises for the first absent column.
    missing = {'expected', 'prediction'} - set(df.columns)
    if missing:
        raise KeyError(
            f"results['results'] is missing required field(s): {sorted(missing)}"
        )

    y_true = df['expected'].tolist()
    y_pred = df['prediction'].tolist()

    # The three F1 variants differ only in the averaging strategy.
    f1_scores = {
        f'f1_{average}': f1_score(
            y_true, y_pred, labels=labels, average=average, zero_division=0
        )
        for average in ('macro', 'micro', 'weighted')
    }

    metrics = {
        'model': results['model'],
        'shot_type': results['shot_type'],
        'num_samples': len(df),
        'accuracy': accuracy_score(y_true, y_pred),
        **f1_scores,
    }

    return metrics, y_true, y_pred

def print_classification_report(y_true, y_pred, model_name: str):
    """Print a titled, per-class classification report to stdout.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Name shown in the report heading.
    """
    rule = '=' * 60
    # A blank line separates this report from whatever was printed before it.
    print(f"\n{rule}")
    print(f"Classification Report: {model_name}")
    print(rule)
    report = classification_report(y_true, y_pred, zero_division=0)
    print(report)

def plot_confusion_matrix(y_true, y_pred, model_name: str, labels: list = None):
    """Draw the confusion matrix of one model as an annotated heatmap.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Name shown in the figure title.
        labels: Labels to show, in axis order. When omitted, the sorted
            union of the labels found in ``y_true`` and ``y_pred`` is used.
    """
    if labels is None:
        labels = sorted(set(y_true).union(y_pred))

    matrix = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix: {model_name}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    # Slant the x tick labels so long class names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

In [9]:
# Directory that holds the multinomial evaluation outputs. The path is
# relative, so it is resolved against the notebook's working directory.
results_dir = Path('../results/multinomial')

# Files to evaluate: maps the display name used in the printed summaries to
# the path of that run's results JSON under results_dir.
# NOTE(review): the file names end in what looks like a run timestamp, so the
# entries presumably pin specific runs and need updating after a re-run - confirm.
result_files = {
    'GPT-4o-mini (few-shot)': results_dir / 'gpt-4o-mini/few_shot_5/gpt-4o-mini_multinomial_few_shot_5_test_20251205_093258.json',
    'GPT-4o-mini (zero-shot)': results_dir / 'gpt-4o-mini/zero_shot/gpt-4o-mini_multinomial_zero_shot_test_20251205_094208.json',
    'Gemma-2-9B (few-shot)': results_dir / 'google_gemma-2-9b-it/few_shot_5/google_gemma-2-9b-it_multinomial_few_shot_5_test_20251205_111701.json',
    'Gemma-2-9B (zero-shot)': results_dir / 'google_gemma-2-9b-it/zero_shot/google_gemma-2-9b-it_multinomial_zero_shot_test_20251205_110744.json',
    'Llama-3.1-8B (few-shot)': results_dir / 'meta-llama_Meta-Llama-3.1-8B-Instruct/few_shot_5/meta-llama_Meta-Llama-3.1-8B-Instruct_multinomial_few_shot_5_test_20251205_120135.json',
    'Llama-3.1-8B (zero-shot)': results_dir / 'meta-llama_Meta-Llama-3.1-8B-Instruct/zero_shot/meta-llama_Meta-Llama-3.1-8B-Instruct_multinomial_zero_shot_test_20251205_115423.json',
    'Qwen-2.5-7B (few-shot)': results_dir / 'Qwen_Qwen2.5-7B-Instruct/few_shot_5/Qwen_Qwen2.5-7B-Instruct_multinomial_few_shot_5_test_20251205_125932.json',
    'Qwen-2.5-7B (zero-shot)': results_dir / 'Qwen_Qwen2.5-7B-Instruct/zero_shot/Qwen_Qwen2.5-7B-Instruct_multinomial_zero_shot_test_20251205_125035.json',
}

In [10]:
# One metrics dictionary per evaluated run, in the order of result_files.
all_metrics = []

for name, filepath in result_files.items():
    results = load_results(filepath)
    metrics, y_true, y_pred = calculate_metrics(results)
    # Tag the metrics with the display name used in the comparison table.
    metrics['name'] = name
    all_metrics.append(metrics)

    # Build the per-run summary first, then emit it with a single print.
    summary_lines = [
        f"\n- {name}",
        f"  Samples: {metrics['num_samples']}",
        f"  Accuracy: {metrics['accuracy']:.4f}",
        f"  F1 Macro: {metrics['f1_macro']:.4f}",
        f"  F1 Micro: {metrics['f1_micro']:.4f}",
        f"  F1 Weighted: {metrics['f1_weighted']:.4f}",
    ]
    print("\n".join(summary_lines))


- GPT-4o-mini (few-shot)
  Samples: 1253
  Accuracy: 0.8364
  F1 Macro: 0.7858
  F1 Micro: 0.8364
  F1 Weighted: 0.8413

- GPT-4o-mini (zero-shot)
  Samples: 1253
  Accuracy: 0.8228
  F1 Macro: 0.6428
  F1 Micro: 0.8228
  F1 Weighted: 0.8302

- Gemma-2-9B (few-shot)
  Samples: 1253
  Accuracy: 0.8117
  F1 Macro: 0.4800
  F1 Micro: 0.8117
  F1 Weighted: 0.8229

- Gemma-2-9B (zero-shot)
  Samples: 1253
  Accuracy: 0.7733
  F1 Macro: 0.3320
  F1 Micro: 0.7733
  F1 Weighted: 0.7927

- Llama-3.1-8B (few-shot)
  Samples: 1253
  Accuracy: 0.8005
  F1 Macro: 0.4737
  F1 Micro: 0.8005
  F1 Weighted: 0.8178

- Llama-3.1-8B (zero-shot)
  Samples: 1253
  Accuracy: 0.7119
  F1 Macro: 0.2122
  F1 Micro: 0.7119
  F1 Weighted: 0.7410

- Qwen-2.5-7B (few-shot)
  Samples: 1253
  Accuracy: 0.7446
  F1 Macro: 0.5087
  F1 Micro: 0.7446
  F1 Weighted: 0.7560

- Qwen-2.5-7B (zero-shot)
  Samples: 1253
  Accuracy: 0.7606
  F1 Macro: 0.3983
  F1 Micro: 0.7606
  F1 Weighted: 0.7756

- Qwen-2.5-7B (zero-shot)
 

In [11]:
# Columns shown in the comparison table, in display order.
summary_columns = ['name', 'num_samples', 'accuracy', 'f1_macro', 'f1_micro', 'f1_weighted']

# One row per evaluated run, highest macro-F1 first.
df_metrics = (
    pd.DataFrame(all_metrics)[summary_columns]
    .sort_values('f1_macro', ascending=False)
)

print("\n" + "="*80)
print("RESULTADOS COMPARATIVOS")
print("="*80)
print(df_metrics.to_string(index=False))


RESULTADOS COMPARATIVOS
                    name  num_samples  accuracy  f1_macro  f1_micro  f1_weighted
  GPT-4o-mini (few-shot)         1253  0.836393  0.785786  0.836393     0.841306
 GPT-4o-mini (zero-shot)         1253  0.822825  0.642772  0.822825     0.830182
  Qwen-2.5-7B (few-shot)         1253  0.744613  0.508684  0.744613     0.756016
   Gemma-2-9B (few-shot)         1253  0.811652  0.480022  0.811652     0.822898
 Llama-3.1-8B (few-shot)         1253  0.800479  0.473717  0.800479     0.817824
 Qwen-2.5-7B (zero-shot)         1253  0.760575  0.398305  0.760575     0.775577
  Gemma-2-9B (zero-shot)         1253  0.773344  0.331966  0.773344     0.792709
Llama-3.1-8B (zero-shot)         1253  0.711891  0.212193  0.711891     0.741009
