In [10]:
import sys
from pathlib import Path

try:
    # En algunos entornos de notebook, __file__ no existe
    notebook_path = Path(__file__).resolve().parent
except NameError:
    # Si no existe __file__, usar el directorio actual y buscar hacia arriba
    notebook_path = Path.cwd()
    # Buscar el directorio que contiene 'src'
    while not (notebook_path / 'src').exists() and notebook_path != notebook_path.parent:
        notebook_path = notebook_path.parent

project_root = notebook_path
sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")
print(f"Existe src/: {(project_root / 'src').exists()}")
# Ahora importa SIN los ".." (imports absolutos)
from src.evaluation.multilabel_predictor import MultiLabelPredictor
from src.evaluation.metrics import compute_multilabel_metrics, save_metrics
from src.utils.multilabel_datareader import MultiLabelDataset
from src.utils.jel_categories import get_jel_names
from src.models.huggingface_llm import HuggingFaceLLM
from src.models.llm_multilabel_model import LLMMultilabelModel
from src.prompts.multilabel_prompt import MultiLabelPromptTemplate
import torch


def main():
    print("="*60)
    print("TESTING MULTILABEL CLASSIFICATION FOR JEL GENERAL CATEGORIES")
    print("="*60)
    
    # Load dataset
    print("\n[1/4] Loading dataset...")
    dataset = MultiLabelDataset(data_dir="data/multilabel_banrep")
    print(dataset)
    print(f"Available labels: {dataset.labels}")
    
    # Create LLM
    print("\n[2/4] Loading LLM...")
    llm = HuggingFaceLLM(
        model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        load_in_4bit=False,
        torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.float16
    )
    
    # Create prompt template
    print("\n[3/4] Creating prompt template...")
    label_names = get_jel_names(language="es")
    prompt_template = MultiLabelPromptTemplate(
        available_labels=dataset.labels,
        language="es",
        label_descriptions=label_names,
    )
    
    # Create multi-label model
    model = LLMMultiLabelModel(
        llm=llm,
        available_labels=dataset.labels,
        prompt_template=prompt_template,
        batch_size=4
    )
    print(model)
    
    # Create predictor and save results for all splits
    print("\n[4/4] Generating predictions and computing metrics for splits...")
    predictor = MultiLabelPredictor(model, dataset)

    for split in ["dev", "test"]:
        print(f"\nProcessing split: {split}")
        results_df = predictor.predict_split(split=split, batch_size=4)

        # Save predictions
        output_pred = Path(f"results/multilabel/TinyLlama-1.1B/{split}_predictions.parquet")
        output_pred.parent.mkdir(parents=True, exist_ok=True)
        results_df.to_parquet(output_pred, index=False)
        print(f"Predicciones guardadas en: {output_pred}")

        # Compute and save metrics
        metrics = compute_multilabel_metrics(
            true_labels=results_df["true_labels"].tolist(),
            pred_labels=results_df["predicted_labels"].tolist(),
            all_labels=dataset.labels,
        )
        output_metrics = Path(f"results/multilabel/TinyLlama-1.1B/{split}_metrics.json")
        save_metrics(metrics, str(output_metrics))
        print(f"Métricas guardadas en: {output_metrics}")
    print("\n" + "="*60)
    print("TEST COMPLETED")
    print("="*60)


if __name__ == "__main__":
    main()

Project root: /
Existe src/: False


ModuleNotFoundError: No module named 'src'

In [11]:
!pwd
!ls -la

/content
total 16
drwxr-xr-x 1 root root 4096 Dec  9 14:41 .
drwxr-xr-x 1 root root 4096 Dec 13 07:01 ..
drwxr-xr-x 4 root root 4096 Dec  9 14:41 .config
drwxr-xr-x 1 root root 4096 Dec  9 14:42 sample_data
total 16
drwxr-xr-x 1 root root 4096 Dec  9 14:41 .
drwxr-xr-x 1 root root 4096 Dec 13 07:01 ..
drwxr-xr-x 4 root root 4096 Dec  9 14:41 .config
drwxr-xr-x 1 root root 4096 Dec  9 14:42 sample_data


In [13]:
!cd /content pwd

/bin/bash: line 1: cd: too many arguments


In [None]:
# Rutas de resultados y datos
results_dir = Path('../results/multilabel/TinyLlama-1.1B')
data_dir = Path('../data/multilabel_banrep')

# Cargar dataset para obtener las etiquetas disponibles
dataset = MultiLabelDataset(str(data_dir))
all_labels = dataset.labels
print(dataset)

In [None]:
# Cargar predicciones dev/test
df_dev = pd.read_parquet(results_dir / 'dev_predictions.parquet')
df_test = pd.read_parquet(results_dir / 'test_predictions.parquet')
print('Dev samples:', len(df_dev), 'Test samples:', len(df_test))
df_dev.head()

In [None]:
# Calcular métricas para dev y test
metrics_dev = compute_multilabel_metrics(
    df_dev['true_labels'].tolist(),
    df_dev['predicted_labels'].tolist(),
    all_labels
)
metrics_test = compute_multilabel_metrics(
    df_test['true_labels'].tolist(),
    df_test['predicted_labels'].tolist(),
    all_labels
)
print('OK: métricas calculadas')

In [None]:
# Resumen de métricas
summary = pd.DataFrame([
print('
' + '='*60)
print('RESUMEN DE MÉTRICAS')
print('='*60)
print(summary.to_string(index=False))
summary

In [None]:
# Métricas por etiqueta (test) con nombres JEL
jel_names = get_jel_names('es')
per_label_test_df = pd.DataFrame.from_dict(metrics_test['per_label'], orient='index').reset_index().rename(columns={'index': 'label'})
per_label_test_df['name'] = per_label_test_df['label'].map(jel_names)
per_label_test_df = per_label_test_df.sort_values(['support','f1'], ascending=[False, False])
per_label_test_df.head(20)

In [None]:
# Visualizaciones: soporte y F1 para top etiquetas
top_support = per_label_test_df.head(15)
plt.figure(figsize=(10,6))
sns.barplot(data=top_support, x='label', y='support')
plt.title('Top 15 etiquetas por soporte (test)')
plt.tight_layout()
plt.show()

plt.figure(figsize=(10,6))
sns.barplot(data=top_support, x='label', y='f1')
plt.title('F1 para top 15 etiquetas (test)')
plt.tight_layout()
plt.show()

In [None]:
# Guardar métricas en JSON (opcional)
with open(results_dir / 'dev_metrics_notebook.json', 'w', encoding='utf-8') as f:
    json.dump(metrics_dev, f, ensure_ascii=False, indent=2)
with open(results_dir / 'test_metrics_notebook.json', 'w', encoding='utf-8') as f:
    json.dump(metrics_test, f, ensure_ascii=False, indent=2)
print('Guardadas métricas dev/test en JSON')

In [None]:
# Definir modelos y rutas de predicciones para evaluación comparativa
from pathlib import Path
import pandas as pd

# Ajusta estas rutas según tus carpetas en results/multilabel/
multilabel_results_root = Path('../results/multilabel')

result_files = {
    'TinyLlama-1.1B (zero-shot)': multilabel_results_root / 'TinyLlama-1.1B/test_predictions.parquet',
    # Ejemplos adicionales si ya generaste predicciones con estos modelos:
    # 'Llama-3.1-8B (zero-shot)': multilabel_results_root / 'meta-llama_Meta-Llama-3.1-8B-Instruct/test_predictions.parquet',
    # 'Gemma-2-9B (zero-shot)': multilabel_results_root / 'google_gemma-2-9b-it/test_predictions.parquet',
    # 'Qwen-2.5-7B (zero-shot)': multilabel_results_root / 'Qwen_Qwen2.5-7B-Instruct/test_predictions.parquet',
}

# Cargar dataset (si no existe en variables previas) para obtener etiquetas
if 'all_labels' not in globals() or not all_labels:
    dataset = MultiLabelDataset('../data/multilabel_banrep')
    all_labels = dataset.labels
    print(dataset)

In [None]:
# Evaluación comparativa: cargar y calcular métricas para cada modelo
from src.evaluation.metrics import compute_multilabel_metrics

all_metrics = []
failed = []

for name, filepath in result_files.items():
    if not filepath.exists():
        failed.append((name, str(filepath)))
        print(f"[WARN] No existe: {filepath}")
        continue
    
    df = pd.read_parquet(filepath)
    if not {'true_labels','predicted_labels'}.issubset(df.columns):
        failed.append((name, str(filepath)))
        print(f"[WARN] Faltan columnas en {filepath} (true_labels/predicted_labels)")
        continue
    
    m = compute_multilabel_metrics(
        df['true_labels'].tolist(),
        df['predicted_labels'].tolist(),
        all_labels,
    )
    all_metrics.append({
        'name': name,
        'num_samples': len(df),
        'subset_accuracy': m['subset_accuracy'],
        'hamming_loss': m['hamming_loss'],
        'precision_micro': m['precision_micro'],
        'recall_micro': m['recall_micro'],
        'f1_micro': m['f1_micro'],
        'precision_macro': m['precision_macro'],
        'recall_macro': m['recall_macro'],
        'f1_macro': m['f1_macro'],
    })

print(f"\nEvaluados {len(all_metrics)} modelos. Fallidos: {len(failed)}")
if failed:
    print("Fallidos:")
    for name, path in failed:
        print(f"- {name}: {path}")

In [None]:
# Tabla comparativa ordenada por F1 Macro
if all_metrics:
    df_metrics = pd.DataFrame(all_metrics)
    df_metrics = df_metrics[['name','num_samples','subset_accuracy','hamming_loss','f1_macro','f1_micro','precision_macro','recall_macro']]
    df_metrics = df_metrics.sort_values('f1_macro', ascending=False)
    
    print("\n" + "="*80)
    print("RESULTADOS COMPARATIVOS MULTILABEL (JEL)")
    print("="*80)
    print(df_metrics.to_string(index=False))
else:
    print('[WARN] No hay métricas para mostrar. Verifica rutas en result_files.')