In [None]:
import json
from pathlib import Path
import pandas as pd
import argparse


def collect_all_results(results_dir: str = "results/multilabel", split_filter: str = None):
    """
    Collect the metrics of every experiment found under ``results_dir``.

    Every ``*_metrics.json`` file below ``results_dir`` is read (recursively).
    The model and strategy names are taken from the two path components that
    precede the file name (``<model>/<strategy>/<split>_metrics.json``) and
    the split from the file name itself.

    Args:
        results_dir: Directory that holds the results.
        split_filter: 'dev', 'test', or None to keep every split.

    Returns:
        DataFrame with one row per metrics file and the columns ``model``,
        ``strategy``, ``split``, ``exact_match``, ``hamming_loss``,
        ``f1_macro``, ``f1_micro``, ``precision_macro`` and ``recall_macro``.
        The columns are present even when no file matches, so callers can
        sort or select by them on an empty result.

    Warns:
        UserWarning: when a metrics file lacks one of the expected keys. The
        value still defaults to 0 (unchanged behaviour), but a defaulted 0 is
        indistinguishable from a real score of 0, so the gap is reported.
    """
    import warnings

    # Output column -> key expected inside the metrics JSON.
    metric_keys = {
        'exact_match': 'exact_match_ratio',
        'hamming_loss': 'hamming_loss',
        'f1_macro': 'f1_macro',
        'f1_micro': 'f1_micro',
        'precision_macro': 'precision_macro',
        'recall_macro': 'recall_macro',
    }
    columns = ['model', 'strategy', 'split', *metric_keys]

    results = []
    results_path = Path(results_dir)

    # sorted(): rglob yields files in filesystem order, which is not stable
    # across machines; sorting makes the row order reproducible.
    for metrics_file in sorted(results_path.rglob("*_metrics.json")):
        # Extract the run description from the path.
        parts = metrics_file.parts

        model_name = parts[-3]  # e.g. gpt-4o-mini
        strategy = parts[-2]    # e.g. few-shot-3
        split = metrics_file.stem.replace("_metrics", "")  # e.g. dev

        # Skip other splits when a filter was requested.
        if split_filter and split != split_filter:
            continue

        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(metrics_file, 'r', encoding='utf-8') as f:
            metrics = json.load(f)

        missing = [key for key in metric_keys.values() if key not in metrics]
        if missing:
            warnings.warn(
                f"{metrics_file}: missing metric keys {missing}; reported as 0",
                stacklevel=2,
            )

        row = {'model': model_name, 'strategy': strategy, 'split': split}
        for column, key in metric_keys.items():
            row[column] = metrics.get(key, 0)
        results.append(row)

    # Passing `columns` keeps the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=columns)


results_dir = "../results/multilabel"
split_filter = 'test'

# Collect the results
df = collect_all_results(results_dir=results_dir, split_filter=split_filter)

if df.empty:
    # Nothing to report: stop here instead of falling through to
    # sort_values / idxmax, which raise on an empty frame.
    print(f"No se encontraron resultados en {results_dir}/")
    if split_filter:
        print(f"con split '{split_filter}'")
else:
    # Sort by model, strategy and split
    df = df.sort_values(['model', 'strategy', 'split'])

    # Title depends on the selected split
    if split_filter:
        title = f"RESULTADOS - SPLIT: {split_filter.upper()}"
    else:
        title = "RESULTADOS - TODOS LOS SPLITS"

    # Show the full table
    # NOTE(review): collect_all_results reports a metric key that is absent
    # from a metrics file as 0, so a column that is 0 on every row may mean
    # "missing" rather than "scored zero" — confirm against the JSON files.
    print("\n" + "="*100)
    print(title)
    print("="*100)
    print(df.to_string(index=False))

    # Show the best run for each headline metric
    print("\n" + "="*100)
    print("MEJORES RESULTADOS")
    print("="*100)
    best_f1_macro = df.loc[df['f1_macro'].idxmax()]
    best_f1_micro = df.loc[df['f1_micro'].idxmax()]
    best_exact_match = df.loc[df['exact_match'].idxmax()]

    print(f"Mejor F1 Macro: {best_f1_macro['model']} ({best_f1_macro['strategy']}) = {best_f1_macro['f1_macro']:.4f}")
    print(f"Mejor F1 Micro: {best_f1_micro['model']} ({best_f1_micro['strategy']}) = {best_f1_micro['f1_micro']:.4f}")
    print(f"Mejor Exact Match: {best_exact_match['model']} ({best_exact_match['strategy']}) = {best_exact_match['exact_match']:.4f}")




RESULTADOS - SPLIT: TEST
                   model   strategy split  exact_match  hamming_loss  f1_macro  f1_micro  precision_macro  recall_macro
Mistral-7B-Instruct-v0.3 few-shot-5  test            0      0.148768  0.243701  0.343944         0.243338      0.403670
Mistral-7B-Instruct-v0.3  zero-shot  test            0      0.169366  0.195355  0.345133         0.213629      0.287609
     Qwen2.5-7B-Instruct few-shot-5  test            0      0.139877  0.246353  0.360048         0.252739      0.397311
     Qwen2.5-7B-Instruct  zero-shot  test            0      0.140229  0.232942  0.357402         0.267161      0.358981
           gemma-2-9b-it few-shot-5  test            0      0.136532  0.296323  0.451362         0.236386      0.482961
           gemma-2-9b-it  zero-shot  test            0      0.145863  0.277955  0.450415         0.220842      0.470913
             gpt-4o-mini few-shot-5  test            0      0.119542  0.300467  0.456800         0.282948      0.399292
             g

# Calcular exact matching

In [26]:
results_dir = Path('../results/multilabel')

# Prediction files to evaluate, keyed by "<model label> (<strategy label>)".
# Every run follows the layout <model dir>/<strategy dir>/test_predictions.parquet.
# The Llama-3.1-8B runs stay disabled; their original entries were:
#'Llama-3.1-8B (few-shot)': results_dir / 'meta-llama_Meta-Llama-3.1-8B-Instruct/few_shot_5/meta-llama_Meta-Llama-3.1-8B-Instruct_multinomial_few_shot_5_test_20251205_120135.json',
#'Llama-3.1-8B (zero-shot)': results_dir / 'meta-llama_Meta-Llama-3.1-8B-Instruct/zero_shot/meta-llama_Meta-Llama-3.1-8B-Instruct_multinomial_zero_shot_test_20251205_115423.json',
result_files = {
    f'{model_label} ({strategy_label})': results_dir / model_dir / strategy_dir / 'test_predictions.parquet'
    for model_label, model_dir in (
        ('Gemma-2-9B', 'gemma-2-9b-it'),
        ('gpt-4o-mini', 'gpt-4o-mini'),
        ('Mistral-7B-Instruct-v0.3', 'Mistral-7B-Instruct-v0.3'),
        ('Qwen-2.5-7B', 'Qwen2.5-7B-Instruct'),
    )
    for strategy_label, strategy_dir in (
        ('few-shot', 'few-shot-5'),
        ('zero-shot', 'zero-shot'),
    )
}

In [27]:
def exact_match_multilabel(df, true_col="true_labels", pred_col="predicted_labels"):
    """
    Compute the exact-match ratio (subset accuracy) of multilabel predictions.

    A row counts as a match when its true and predicted label collections
    hold the same labels; both are compared as sets, so order and duplicates
    are ignored.

    Args:
        df: Frame with one row per example.
        true_col: Name of the column holding the true labels of each row.
        pred_col: Name of the column holding the predicted labels of each row.

    Returns:
        Fraction of rows whose label sets match, or NaN when ``df`` has no
        rows (the ratio is undefined; previously this raised
        ZeroDivisionError).
    """
    exact = [
        set(true) == set(pred)
        for true, pred in zip(df[true_col], df[pred_col])
    ]
    if not exact:
        return float("nan")
    return sum(exact) / len(exact)

# Score the frame currently bound to `df` and print it with four decimals.
# NOTE(review): nothing in this cell loads `df`, so the value printed depends
# on whichever cell assigned `df` last — confirm that a predictions frame
# (with `true_labels` / `predicted_labels` columns) is loaded before this runs.
exact_match = exact_match_multilabel(df)
print(f"Exact Match (subset accuracy): {exact_match:.4f}")

Exact Match (subset accuracy): 0.0141


In [28]:
# Exact-match percentage of every run listed in `result_files`.
for model, predictions_path in result_files.items():
    df = pd.read_parquet(predictions_path)
    exact_match = exact_match_multilabel(df)
    print(f'{model}: {exact_match*100}')

Gemma-2-9B (few-shot): 3.345070422535211
Gemma-2-9B (zero-shot): 2.464788732394366
gpt-4o-mini (few-shot): 5.105633802816902
gpt-4o-mini (zero-shot): 3.697183098591549
Mistral-7B-Instruct-v0.3 (few-shot): 1.4084507042253522
Mistral-7B-Instruct-v0.3 (zero-shot): 1.4084507042253522
Qwen-2.5-7B (few-shot): 2.112676056338028
Qwen-2.5-7B (zero-shot): 1.7605633802816902


In [15]:
import pandas as pd

# Predictions of the run to inspect. To look at another run, point the path
# at its file instead, e.g.
# '../results/multilabel/gemma-2-9b-it/few-shot-5/test_predictions.parquet'
predictions_path = '../results/multilabel/gpt-4o-mini/few-shot-5/test_predictions.parquet'
df = pd.read_parquet(predictions_path)
df

Unnamed: 0,input_text,true_labels,predicted_labels
0,Reporte de la Infraestructura Financiera e Ins...,"[E, F, N]","[G, H, O]"
1,Informe de Política Monetaria - Julio 2025. En...,[E],"[E, H]"
2,Informe de Política Monetaria - Julio 2025. En...,[E],"[E, H]"
3,Riesgo de balance en los bancos centrales: el ...,"[E, G]","[E, H, G]"
4,El camino hacia la igualdad de género en Colom...,"[J, N]","[I, J, O]"
...,...,...,...
563,La economía de Barranquilla a comienzos del si...,"[I, L, R]","[R, I, D, H, O]"
564,Informe de Coyuntura Económica Regional : Depa...,"[F, O, R]","[E, F, J, R]"
565,Políticas para reducir las desigualdades regio...,"[D, H, P]","[O, R, H]"
566,Un análisis de riesgo de crédito de las empres...,"[C, G]","[G, E]"


In [5]:
# Rows 200-239 by position, for a manual look at true vs. predicted labels.
df.iloc[200:240]

Unnamed: 0,input_text,true_labels,predicted_labels
200,Informe especial de estabilidad financiera : c...,"[E, R]","[E, G, H]"
201,El canal de crédito bancario de la política mo...,"[E, G]","[E, G]"
202,Geografía económica del archipiélago de San An...,"[Q, Z]","[R, O, I, F]"
203,"Boletín Económico Regional : Nororiente, I tri...","[F, O]","[R, O, L, M]"
204,"Boletín Económico Regional : Eje Cafetero, I t...","[F, O]","[E, R, J, H]"
205,Informe especial de estabilidad financiera : a...,"[G, P, R]","[G, R, H]"
206,Impactos de los fenómenos climáticos sobre el ...,"[C, E]","[E, Q]"
207,Nota de los editores invitados. Crecimiento de...,"[C, R]","[O, P, N, L, I]"
208,Compensación entre riesgos de liquidez y contr...,"[D, E, L]","[E, F, G]"
209,Informe del Secretario de Hacienda al Congreso...,"[F, H]","[H, N]"


In [13]:
# Display the frame currently bound to `df` (the notebook truncates long frames).
df

Unnamed: 0,input_text,true_labels,predicted_labels
0,Reporte de la Infraestructura Financiera e Ins...,"[E, F, N]","[G, H, O]"
1,Informe de Política Monetaria - Julio 2025. En...,[E],"[E, H]"
2,Informe de Política Monetaria - Julio 2025. En...,[E],"[E, H]"
3,Riesgo de balance en los bancos centrales: el ...,"[E, G]","[E, H, G]"
4,El camino hacia la igualdad de género en Colom...,"[J, N]","[I, J, O]"
...,...,...,...
563,La economía de Barranquilla a comienzos del si...,"[I, L, R]","[R, I, D, H, O]"
564,Informe de Coyuntura Económica Regional : Depa...,"[F, O, R]","[E, F, J, R]"
565,Políticas para reducir las desigualdades regio...,"[D, H, P]","[O, R, H]"
566,Un análisis de riesgo de crédito de las empres...,"[C, G]","[G, E]"


In [14]:
def exact_match_multilabel(df, true_col="true_labels", pred_col="predicted_labels"):
    """
    Compute the exact-match ratio (subset accuracy) of multilabel predictions.

    A row counts as a match when its true and predicted label collections
    hold the same labels; both are compared as sets, so order and duplicates
    are ignored.

    NOTE: a function with this name is also defined in an earlier cell of
    this notebook; whichever cell runs last wins, so keep the two in sync or
    remove one of them.

    Args:
        df: Frame with one row per example.
        true_col: Name of the column holding the true labels of each row.
        pred_col: Name of the column holding the predicted labels of each row.

    Returns:
        Fraction of rows whose label sets match, or NaN when ``df`` has no
        rows (the ratio is undefined; previously this raised
        ZeroDivisionError).
    """
    exact = [
        set(true) == set(pred)
        for true, pred in zip(df[true_col], df[pred_col])
    ]
    if not exact:
        return float("nan")
    return sum(exact) / len(exact)

# Score the frame currently bound to `df` and print it with four decimals.
# NOTE(review): `df` is not loaded in this cell; the value printed depends on
# the cell that assigned it last — confirm it is the intended predictions frame.
exact_match = exact_match_multilabel(df)
print(f"Exact Match (subset accuracy): {exact_match:.4f}")

Exact Match (subset accuracy): 0.0511
