# Ensemble de Modelos LLM (Validación y Test)
Combina las predicciones de los mejores modelos mediante votación mayoritaria.
Procesa tanto el conjunto de validación como el de test por separado.

In [None]:
import json
import pandas as pd
import numpy as np
import os
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Configuración y Carga de Gold Standard

In [None]:
# Validation gold standard: map every sample id (as a string) to its task1 label.
VAL_GOLD_PATH = '../preprocessed_data/val_preprocessed_v2.json'
with open(VAL_GOLD_PATH, encoding='utf-8') as gold_file:
    val_gold_data = json.load(gold_file)
val_gold_dict = {
    str(record['id_EXIST']): record['task1']
    for record in val_gold_data
}

# Test data: only the sample ids are collected here (no label is read).
TEST_DATA_PATH = '../preprocessed_data/test_preprocessed_v2.json'
with open(TEST_DATA_PATH, encoding='utf-8') as test_file:
    test_data = json.load(test_file)
test_ids = {str(record['id_EXIST']) for record in test_data}

print(f"Validation samples: {len(val_gold_dict)}")
print(f"Test samples: {len(test_ids)}")

In [None]:
# Model registry, looked up under results_v2.
# Layout: results_v2/{MODEL_DIR}/predictions/{dev_predictions_temp.json, BeingChillingWeWillWin_*.json}
# Each pair below is (display name, directory name); the two differ for the
# F2LLM models, so both are spelled out explicitly.
model_configs = [
    {'name': display_name, 'path': f'../results_v2/{model_dir}/predictions'}
    for display_name, model_dir in (
        ('Ministral8B_ft', 'Ministral8B_ft'),
        ('Ministral8B', 'Ministral8B'),
        ('f2llm4B', 'F2LLM-4B'),
        ('f2llm4B_clean', 'F2LLM-4B_clean'),
        ('KaLM', 'KaLM'),
        ('KaLM_clean', 'KaLM_clean'),
    )
]

def load_predictions(model_configs, split='val'):
    """Load every model's predictions for one data split.

    For each entry of ``model_configs`` the candidate file names for the
    split are tried in order inside the model's prediction directory, and the
    first file that loads successfully is kept. A missing file is skipped
    silently; a file that is not valid JSON, or whose items lack an ``'id'``
    or ``'value'`` key, is reported and skipped.

    Args:
        model_configs: Iterable of dicts with a ``'name'`` key (model name)
            and a ``'path'`` key (directory containing prediction files).
        split: ``'val'`` selects the validation file names; any other value
            selects the test file names.

    Returns:
        Dict mapping model name to ``{str(id): value}``. Models for which no
        candidate file could be loaded are absent from the result.
    """
    predictions = {}

    for model_cfg in model_configs:
        model_name = model_cfg['name']
        pred_dir = model_cfg['path']

        # Candidate file names, in priority order.
        if split == 'val':
            file_names = (
                'dev_predictions_temp.json',
                'val_predictions_temp.json',
                f'BeingChillingWeWillWin_{model_name}_val.json',
            )
        else:  # test
            file_names = (
                f'BeingChillingWeWillWin_{model_name}.json',
                f'{model_name}_test.json',
            )

        for file_name in file_names:
            path = os.path.join(pred_dir, file_name)
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    predictions[model_name] = {str(item['id']): item['value'] for item in data}
                    print(f"✓ Loaded {model_name} {split}: {len(predictions[model_name])} samples from {os.path.basename(path)}")
                    break
            except FileNotFoundError:
                continue
            except (KeyError, json.JSONDecodeError) as e:
                print(f"✗ Error loading {path}: {e}")
                continue
        else:
            # The loop ran out of candidates without a successful load.
            print(f"✗ Could not find predictions for {model_name} ({split})")

    return predictions

# Load the validation predictions first, then the test predictions, for every
# configured model.
val_predictions = load_predictions(model_configs, 'val')
test_predictions = load_predictions(model_configs, 'test')

print(
    f"\n{len(val_predictions)} models loaded for validation",
    f"{len(test_predictions)} models loaded for test",
    sep="\n",
)

## 2. Evaluar Modelos Individuales en Validación

In [None]:
# Score each model on the validation ids it shares with the gold standard.
val_results = []
binary_metric_kwargs = {'pos_label': 'YES', 'zero_division': 0}

for name, preds in val_predictions.items():
    common_ids = set(preds) & set(val_gold_dict)

    if not common_ids:
        print(f"Warning: No matching IDs for {name} in validation set")
        continue

    # The id set is not modified between the two passes, so gold and
    # predicted labels stay aligned element by element.
    y_true = [val_gold_dict[sample_id] for sample_id in common_ids]
    y_pred = [preds[sample_id] for sample_id in common_ids]

    row = {
        'Model': name,
        'Samples': len(common_ids),
        'Accuracy': accuracy_score(y_true, y_pred),
    }
    row['Precision'] = precision_score(y_true, y_pred, **binary_metric_kwargs)
    row['Recall'] = recall_score(y_true, y_pred, **binary_metric_kwargs)
    row['F1'] = f1_score(y_true, y_pred, **binary_metric_kwargs)
    val_results.append(row)

if not val_results:
    # Keep the name defined (as an empty frame) for the cells that follow.
    df_val_results = pd.DataFrame()
    print("Warning: No validation results available")
else:
    df_val_results = pd.DataFrame(val_results).sort_values('F1', ascending=False)
    banner = "=" * 70
    print("\n" + banner)
    print("VALIDATION - Individual Model Results:")
    print(banner)
    print(df_val_results.to_string(index=False))



## 3. Ensemble por Votación Mayoritaria

In [None]:
def voting_ensemble(predictions_dict, ids):
    """Combine per-model predictions by majority vote.

    Args:
        predictions_dict: Dict mapping model name to ``{id: label}``.
        ids: Iterable of ids to produce an ensemble label for.

    Returns:
        Dict mapping each id that at least one model predicted to the label
        with the most votes. Ids no model predicted are left out. When labels
        tie, the one that was encountered first while iterating the models
        wins (``Counter.most_common`` keeps first-encountered order for equal
        counts).
    """
    ensemble_preds = {}
    for sample_id in ids:
        tally = Counter(
            model_preds[sample_id]
            for model_preds in predictions_dict.values()
            if sample_id in model_preds
        )
        if not tally:
            continue
        (winner, _votes), = tally.most_common(1)
        ensemble_preds[sample_id] = winner
    return ensemble_preds

def evaluate_ensemble(predictions_dict, gold_dict, config_models):
    """Build a majority-vote ensemble from selected models and score it.

    Args:
        predictions_dict: Dict mapping model name to ``{id: label}``.
        gold_dict: Dict mapping id to gold label; may share no ids with the
            predictions.
        config_models: Container of model names to include in the ensemble.

    Returns:
        ``None`` when none of the requested models is in ``predictions_dict``
        or when the selected models have no id in common. Otherwise a dict
        with ``'ensemble_preds'`` (``{id: label}`` over the ids predicted by
        every selected model) and ``'metrics'``, which is ``None`` when no
        gold id overlaps, or a dict with ``Samples``, ``Accuracy``,
        ``Precision``, ``Recall`` and ``F1`` (``'YES'`` as positive label).
    """
    selected = {
        model: preds
        for model, preds in predictions_dict.items()
        if model in config_models
    }
    if not selected:
        return None

    # Only ids predicted by every selected model take part in the vote.
    shared_ids = set(next(iter(selected.values())))
    for preds in selected.values():
        shared_ids &= set(preds)
    if not shared_ids:
        return None

    ensemble_preds = voting_ensemble(selected, shared_ids)
    outcome = {'ensemble_preds': ensemble_preds, 'metrics': None}

    gold_ids = set(gold_dict) & shared_ids
    if gold_ids:
        y_true = [gold_dict[sample_id] for sample_id in gold_ids]
        y_pred = [ensemble_preds[sample_id] for sample_id in gold_ids]
        binary_kwargs = {'pos_label': 'YES', 'zero_division': 0}
        outcome['metrics'] = {
            'Samples': len(gold_ids),
            'Accuracy': accuracy_score(y_true, y_pred),
            'Precision': precision_score(y_true, y_pred, **binary_kwargs),
            'Recall': recall_score(y_true, y_pred, **binary_kwargs),
            'F1': f1_score(y_true, y_pred, **binary_kwargs),
        }

    return outcome

In [None]:
# Ensemble configurations
loaded_model_names = list(val_predictions.keys())

# Top 3 by validation F1 when at least three models were scored; otherwise
# fall back to the first three loaded models.
if len(df_val_results) >= 3:
    top3_models = df_val_results.head(3)['Model'].tolist()
else:
    top3_models = loaded_model_names[:3]

ministral_models = [
    model for model in loaded_model_names
    if 'Ministral' in model or 'ministral' in model
]
fine_tuned_models = [
    model for model in loaded_model_names
    if 'ft' in model or 'clean' in model
]

ensemble_configs = [
    {'name': 'All_Models', 'models': list(loaded_model_names)},
    {'name': 'Top_3_Best_F1', 'models': top3_models},
    {'name': 'Ministral_Only', 'models': ministral_models},
    {'name': 'Fine_tuned_Models', 'models': fine_tuned_models},
]

separator = "=" * 70
print("\n" + separator)
print("Ensemble Configurations:")
print(separator)
for cfg in ensemble_configs:
    print(f"{cfg['name']}: {cfg['models']}")

## 4. Evaluar Ensembles en Validación

In [None]:
# Evaluate every ensemble configuration on the validation set.
val_ensemble_results = []
val_ensemble_preds_dict = {}

for config in ensemble_configs:
    members = config['models']
    if not members:
        continue

    result = evaluate_ensemble(val_predictions, val_gold_dict, members)
    if result is None:
        continue

    # Predictions are kept even when no metrics could be computed.
    val_ensemble_preds_dict[config['name']] = result['ensemble_preds']

    metrics = result['metrics']
    if metrics is None:
        continue

    summary_row = {'Ensemble': config['name'], 'N_Models': len(members)}
    summary_row.update(metrics)
    val_ensemble_results.append(summary_row)

if not val_ensemble_results:
    # Keep the name defined (as an empty frame) for the cells that follow.
    df_val_ensemble = pd.DataFrame()
    print("No validation ensemble results available")
else:
    df_val_ensemble = pd.DataFrame(val_ensemble_results).sort_values('F1', ascending=False)
    rule = "=" * 70
    print("\n" + rule)
    print("VALIDATION - Ensemble Results (Majority Voting):")
    print(rule)
    print(df_val_ensemble.to_string(index=False))

No valid results with F1 scores


## 5. Generar Ensembles para Test

In [None]:
# Build the majority-vote ensembles for the test split (no metrics: only
# predictions are produced here).
test_ensemble_preds_dict = {}

for config in ensemble_configs:
    if not config['models']:
        continue
    ensemble_name = config['name']

    # Keep only the models that actually have test predictions.
    available_models = [m for m in config['models'] if m in test_predictions]
    if not available_models:
        print(f"No test predictions available for {ensemble_name}")
        continue

    filtered_preds = {
        model: model_preds
        for model, model_preds in test_predictions.items()
        if model in available_models
    }

    # Ids predicted by every selected model.
    common_ids = set(next(iter(filtered_preds.values())))
    for model_preds in filtered_preds.values():
        common_ids &= set(model_preds)

    if not common_ids:
        print(f"No common test IDs for {ensemble_name}")
        continue

    # Majority vote over the shared ids.
    ensemble_preds = voting_ensemble(filtered_preds, common_ids)
    test_ensemble_preds_dict[ensemble_name] = ensemble_preds

    print(f"✓ Generated test ensemble '{ensemble_name}': {len(ensemble_preds)} predictions")

print(f"\nTotal test ensembles created: {len(test_ensemble_preds_dict)}")

No common IDs found for confusion matrix


## 6. Comparación Validación: Individual vs Ensemble

In [None]:
# Compare individual models and ensembles on validation F1 (table + bar chart).
# Patch is used to build explicit legend entries: every bar comes from a single
# plt.bar call, so a plain list of labels cannot describe both colours.
from matplotlib.patches import Patch

if len(df_val_results) > 0 or len(df_val_ensemble) > 0:
    dfs_to_concat = []
    if len(df_val_results) > 0:
        dfs_to_concat.append(df_val_results.assign(Type='Individual'))
    if len(df_val_ensemble) > 0:
        # Align the ensemble frame with the individual one: the ensemble name
        # goes into the 'Model' column.
        dfs_to_concat.append(
            df_val_ensemble
            .rename(columns={'Ensemble': 'Model', 'N_Models': 'Note'})
            .assign(Type='Ensemble')
        )

    all_val_results = pd.concat(dfs_to_concat, ignore_index=True)

    # Coerce metric columns to numbers; values that cannot be parsed become NaN.
    numeric_cols = ['Accuracy', 'Precision', 'Recall', 'F1']
    for col in numeric_cols:
        if col in all_val_results.columns:
            all_val_results[col] = pd.to_numeric(all_val_results[col], errors='coerce')

    # Rows without an F1 value cannot be ranked.
    all_val_results = all_val_results.dropna(subset=['F1'])

    if len(all_val_results) > 0:
        # Rank once and reuse the result for both the table and the plot.
        data_to_plot = all_val_results.nlargest(10, 'F1')

        print("\n" + "="*70)
        print("VALIDATION - Top 10 Models (Individual + Ensemble):")
        print("="*70)
        print(data_to_plot[['Type', 'Model', 'F1', 'Accuracy', 'Precision', 'Recall']].to_string(index=False))

        # Visualization
        type_colors = {'Ensemble': 'green', 'Individual': 'blue'}
        bar_alpha = 0.7

        plt.figure(figsize=(12, 6))
        x = range(len(data_to_plot))
        plt.bar(
            x,
            data_to_plot['F1'],
            alpha=bar_alpha,
            color=[type_colors.get(t, 'blue') for t in data_to_plot['Type']],
        )
        plt.xlabel('Model')
        plt.ylabel('F1 Score')
        plt.title('Validation F1 Score: Individual vs Ensemble')
        plt.xticks(x, data_to_plot['Model'], rotation=45, ha='right')

        # One legend entry per type actually plotted, with the matching colour.
        plotted_types = set(data_to_plot['Type'])
        legend_handles = [
            Patch(facecolor=color, alpha=bar_alpha, label=kind)
            for kind, color in type_colors.items()
            if kind in plotted_types
        ]
        plt.legend(handles=legend_handles)

        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()
    else:
        print("No valid validation results with F1 scores")
else:
    print("No validation results available for comparison")

No data available for agreement analysis


## 7. Matriz de Confusión del Mejor Ensemble

In [40]:
# Pick the ensemble with the highest validation F1 and show its confusion
# matrix and classification report.
if len(df_val_ensemble) > 0 and df_val_ensemble['F1'].notna().any():
    # idxmax() returns an index *label*, not a position. df_val_ensemble was
    # re-ordered with sort_values, so labels and positions no longer coincide
    # and the row must be fetched with .loc (label-based), not .iloc.
    best_idx = df_val_ensemble['F1'].idxmax()
    best_config_name = df_val_ensemble.loc[best_idx, 'Ensemble']
    best_config = next((cfg for cfg in ensemble_configs if cfg['name'] == best_config_name), None)
else:
    # No scored ensemble: fall back to the first configuration, if any.
    best_config = ensemble_configs[0] if len(ensemble_configs) > 0 else None

if best_config is None or best_config['name'] not in val_ensemble_preds_dict:
    print("No ensemble predictions available for confusion matrix")
else:
    ensemble_preds = val_ensemble_preds_dict[best_config['name']]
    common_ids = set(ensemble_preds.keys()) & set(val_gold_dict.keys())

    if len(common_ids) == 0:
        print("No common IDs found for confusion matrix")
    else:
        y_true = [val_gold_dict[id_] for id_ in common_ids]
        y_pred = [ensemble_preds[id_] for id_ in common_ids]

        # Fix the label order so rows/columns always read NO, YES.
        cm = confusion_matrix(y_true, y_pred, labels=['NO', 'YES'])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['NO', 'YES'], yticklabels=['NO', 'YES'])
        plt.title(f'Confusion Matrix: {best_config["name"]} (Validation)')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

        print(f"\nClassification Report - {best_config['name']}:")
        print(classification_report(y_true, y_pred, labels=['NO', 'YES']))

Saved: ../results/BeingChillingWeWillWin_ensemble_All_Models.json
Models: Ministral8B_ft, Ministral8B, f2llm4B, f2llm4B_clean, KaLM, KaLM_clean
Predictions: 934


## 8. Guardar Predicciones

In [41]:
# Crear directorios de salida
os.makedirs('../results_v2/ensemble/val', exist_ok=True)
os.makedirs('../results_v2/ensemble/test', exist_ok=True)

saved_files = []

# Guardar ensembles de validación
for config_name, preds in val_ensemble_preds_dict.items():
    if len(preds) > 0:
        output_data = [{'test_case': 'EXIST2025', 'id': id_, 'value': pred} for id_, pred in preds.items()]
        output_path = f'../results_v2/ensemble/val/BeingChillingWeWillWin_ensemble_{config_name}_val.json'
        
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, ensure_ascii=False, indent=2)
        
        saved_files.append(output_path)
        print(f"✓ Saved validation: {output_path} ({len(preds)} predictions)")

# Guardar ensembles de test
for config_name, preds in test_ensemble_preds_dict.items():
    if len(preds) > 0:
        output_data = [{'test_case': 'EXIST2025', 'id': id_, 'value': pred} for id_, pred in preds.items()]
        output_path = f'../results_v2/ensemble/test/BeingChillingWeWillWin_ensemble_{config_name}.json'
        
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, ensure_ascii=False, indent=2)
        
        saved_files.append(output_path)
        print(f"✓ Saved test: {output_path} ({len(preds)} predictions)")

print(f"\nTotal files saved: {len(saved_files)}")

Cannot generate summary: missing individual or ensemble results


## 9. Resumen Final

In [None]:
# Final report: best individual model vs. best ensemble on validation.
# Non-empty result frames were sorted by F1 (descending) when they were built,
# so their first row is the best entry.
best_individual = None
if len(df_val_results) > 0:
    best_individual = df_val_results.iloc[0]

best_ensemble = None
if len(df_val_ensemble) > 0:
    best_ensemble = df_val_ensemble.iloc[0]

summary_metrics = ('F1', 'Accuracy', 'Precision', 'Recall')
summary_rule = "=" * 70

print("\n" + summary_rule)
print("FINAL SUMMARY - VALIDATION SET")
print(summary_rule)

if best_individual is None:
    print("\nNo individual model results available")
else:
    print(f"\nBest Individual Model: {best_individual['Model']}")
    for metric in summary_metrics:
        print(f"  {metric}: {best_individual[metric]:.4f}")

if best_ensemble is None:
    print("\nNo ensemble results available")
else:
    print(f"\nBest Ensemble: {best_ensemble['Ensemble']}")
    print(f"  Models: {best_ensemble['N_Models']}")
    for metric in summary_metrics:
        print(f"  {metric}: {best_ensemble[metric]:.4f}")

if best_individual is not None and best_ensemble is not None:
    improvement = best_ensemble['F1'] - best_individual['F1']
    # Avoid dividing by zero when the best individual F1 is 0.
    if best_individual['F1'] > 0:
        improvement_pct = improvement / best_individual['F1'] * 100
    else:
        improvement_pct = 0
    print("\nEnsemble Improvement over Best Individual:")
    print(f"  F1 gain: {improvement:+.4f} ({improvement_pct:+.2f}%)")
    print(f"  Status: {'✓ MEJORA' if improvement > 0 else '✗ NO mejora'}")

print("\n" + summary_rule)
print(f"Validation ensembles saved: {len(val_ensemble_preds_dict)}")
print(f"Test ensembles saved: {len(test_ensemble_preds_dict)}")
print(summary_rule)