# Ensemble de Modelos
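Combina las predicciones de los mejores modelos mediante votación mayoritaria. Nota: el promedio de probabilidades aún no está implementado en este cuaderno; solo se usan las etiquetas predichas (`value`) de cada modelo.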

In [30]:
import json
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Cargar Predicciones

In [31]:
# Read the gold-standard validation items from disk.
with open('../preprocessed_data/val_preprocessed_v2.json', mode='r', encoding='utf-8') as gold_file:
    gold_data = json.load(gold_file)

# Lookup table: item id (field 'id_EXIST') -> gold label (field 'task1').
gold_dict = dict((entry['id_EXIST'], entry['task1']) for entry in gold_data)

In [32]:
# Display name of each model -> JSON file holding that model's predictions.
models = dict(
    Ministral8B_ft='../results/BeingChillingWeWillWin_3Ministral8B_ft.json',
    Ministral8B='../results/BeingChillingWeWillWin_3Ministral8B.json',
    f2llm4B='../results/BeingChillingWeWillWin_f2llm4B.json',
    f2llm4B_clean='../results/BeingChillingWeWillWin_f2llm4Bclean.json',
    KaLM='../results/BeingChillingWeWillWin_KaLM.json',
    KaLM_clean='../results/BeingChillingWeWillWin_KaLMclean.json',
)

# predictions[model_name] maps each record's 'id' to its predicted 'value'.
# A model whose file is missing, or whose records lack 'id'/'value', is
# reported with a warning and left out entirely.
predictions = {}
for name, path in models.items():
    try:
        with open(path, encoding='utf-8') as handle:
            records = json.load(handle)
        id_to_label = {}
        for record in records:
            id_to_label[record['id']] = record['value']
    except (FileNotFoundError, KeyError) as e:
        print(f"Warning: Could not load {name} - {e}")
    else:
        predictions[name] = id_to_label

## 2. Evaluar Modelos Individuales

In [33]:
# Score every loaded model against the gold labels, using only the ids that
# the model and the gold standard have in common.
results = []

for name, preds in predictions.items():
    common_ids = gold_dict.keys() & preds.keys()
    if not common_ids:
        # Nothing to score for this model (no overlap with the gold ids).
        continue

    # Build both label lists from the same iteration so they stay aligned.
    label_pairs = [(gold_dict[sample_id], preds[sample_id]) for sample_id in common_ids]
    y_true = [gold for gold, _ in label_pairs]
    y_pred = [predicted for _, predicted in label_pairs]

    row = {
        'Model': name,
        'Samples': len(common_ids),
        'Accuracy': accuracy_score(y_true, y_pred),
    }
    # Precision / recall / F1 are all reported for the positive class 'YES'.
    for metric_name, scorer in (('Precision', precision_score),
                                ('Recall', recall_score),
                                ('F1', f1_score)):
        row[metric_name] = scorer(y_true, y_pred, pos_label='YES')
    results.append(row)

if results:
    # Best F1 first; later cells read the top rows of this table.
    df_results = pd.DataFrame(results).sort_values('F1', ascending=False)
    print("Individual Model Results:")
    print(df_results.to_string(index=False))
else:
    df_results = pd.DataFrame()
    print("Warning: No matching IDs between predictions and gold standard (test vs validation set)")



## 3. Ensemble por Votación Mayoritaria

In [34]:
def voting_ensemble(predictions_dict, ids, tie_break=None):
    """Combine per-model labels by majority vote.

    Args:
        predictions_dict: mapping ``model_name -> {id: label}``.
        ids: iterable of ids to produce an ensemble label for.
        tie_break: optional label that wins when several labels share the
            highest vote count and it is one of them. With the default
            ``None`` a tie goes to the tied label that was cast first, i.e.
            the vote of the earliest model in ``predictions_dict`` order
            (this is what ``Counter.most_common`` did in the previous
            implementation, so existing callers see no change).

    Returns:
        dict ``{id: winning_label}``. Ids that no model has a prediction for
        are omitted.
    """
    ensemble_preds = {}
    for id_ in ids:
        # Models that did not predict this id simply do not vote.
        votes = [preds[id_] for preds in predictions_dict.values() if id_ in preds]
        if not votes:
            continue

        counts = Counter(votes)
        top_count = max(counts.values())
        # Counter preserves first-seen order, so winners[0] is the tied label
        # that received its first vote earliest.
        winners = [label for label, count in counts.items() if count == top_count]

        if len(winners) > 1 and tie_break in winners:
            ensemble_preds[id_] = tie_break
        else:
            ensemble_preds[id_] = winners[0]
    return ensemble_preds

In [35]:
# Candidate ensembles, each defined by the subset of loaded models it uses.
available_models = list(predictions.keys())

# Use the three best individual models by F1 when at least three were scored;
# otherwise fall back to the first three loaded models.
if len(results) >= 3 and len(df_results) > 0:
    top3_models = df_results.head(3)['Model'].tolist()
else:
    top3_models = available_models[:3]

ensemble_configs = [
    {'name': 'All_Models', 'models': available_models},
    {'name': 'Top_3_Best_F1', 'models': top3_models},
    {'name': 'Ministral_Only', 'models': [m for m in available_models if 'Ministral' in m or '3M' in m]},
    {'name': 'Fine_tuned_Models', 'models': [m for m in available_models if 'ft' in m or 'clean' in m]},
]

ensemble_results = []

for config in ensemble_configs:
    member_names = config['models']
    if not member_names:
        # No loaded model matches this configuration.
        continue

    filtered_preds = {k: v for k, v in predictions.items() if k in member_names}

    # Vote only on ids that every member model predicted.
    id_sets = [set(model_preds.keys()) for model_preds in filtered_preds.values()]
    common_ids = id_sets[0].intersection(*id_sets[1:])
    if not common_ids:
        continue

    ensemble_preds = voting_ensemble(filtered_preds, common_ids)
    gold_ids = common_ids & set(gold_dict.keys())

    row = {'Ensemble': config['name'], 'N_Models': len(member_names)}
    if gold_ids:
        y_true = [gold_dict[id_] for id_ in gold_ids]
        y_pred = [ensemble_preds[id_] for id_ in gold_ids]
        row['Samples'] = len(gold_ids)
        row['Accuracy'] = accuracy_score(y_true, y_pred)
        row['Precision'] = precision_score(y_true, y_pred, pos_label='YES')
        row['Recall'] = recall_score(y_true, y_pred, pos_label='YES')
        row['F1'] = f1_score(y_true, y_pred, pos_label='YES')
    else:
        # No gold labels for these ids: keep the row, but without metrics.
        row['Samples'] = len(common_ids)
        for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1'):
            row[metric_name] = None
    ensemble_results.append(row)

df_ensemble = pd.DataFrame(ensemble_results)
has_scored_ensemble = (not df_ensemble.empty) and df_ensemble['F1'].notna().any()
if has_scored_ensemble:
    # Sorting keeps the original row labels; only the row order changes.
    df_ensemble = df_ensemble.sort_values('F1', ascending=False)
    print("Ensemble Results (Majority Voting):")
    print(df_ensemble.to_string(index=False))

## 4. Comparación Final

In [36]:
# Build one leaderboard that mixes individual models and ensembles.
dfs_to_concat = []
if len(df_results) > 0:
    dfs_to_concat.append(df_results.assign(Type='Individual'))
if len(df_ensemble) > 0:
    # Rename only 'Ensemble' -> 'Model'. df_ensemble already carries its own
    # 'Samples' column, so also renaming 'N_Models' to 'Samples' would produce
    # two columns with the same label (mixing model counts with sample counts),
    # which pd.concat cannot align against df_results.
    dfs_to_concat.append(df_ensemble.rename(columns={'Ensemble': 'Model'}).assign(Type='Ensemble'))

if dfs_to_concat:
    all_results = pd.concat(dfs_to_concat, ignore_index=True)

    # Ensemble rows without gold labels store None metrics; coerce the metric
    # columns to float so those rows become NaN and can be dropped.
    numeric_cols = ['Accuracy', 'Precision', 'Recall', 'F1']
    for col in numeric_cols:
        if col in all_results.columns:
            all_results[col] = pd.to_numeric(all_results[col], errors='coerce')

    all_results = all_results.dropna(subset=['F1'])

    if len(all_results) > 0:
        print("Top 10 Models (Individual + Ensemble):")
        print(all_results.nlargest(10, 'F1')[['Type', 'Model', 'F1', 'Accuracy', 'Precision', 'Recall']].to_string(index=False))
    else:
        print("No valid results with F1 scores")
else:
    # Always (re)define all_results so a later cell never picks up a stale
    # table left over from a previous run of this cell.
    all_results = pd.DataFrame()
    print("No results available for comparison")

No valid results with F1 scores


In [37]:
# Bar chart of the ten best F1 scores, coloured by result type.
if 'all_results' in dir() and len(all_results) > 0:
    # Reset the index so each row's label is also its x position in the chart.
    data_to_plot = all_results.nlargest(10, 'F1').reset_index(drop=True)
    x = range(len(data_to_plot))
    type_colors = {'Ensemble': 'green', 'Individual': 'blue'}

    fig, ax = plt.subplots(figsize=(12, 6))
    # Draw one labelled bar series per type. Passing a bare list of labels to
    # legend() would attach them to the first two bars drawn, whatever their
    # type, so the legend could contradict the colours.
    for type_name, color in type_colors.items():
        subset = data_to_plot[data_to_plot['Type'] == type_name]
        if len(subset) > 0:
            ax.bar(subset.index, subset['F1'], alpha=0.7, color=color, label=type_name)

    ax.set_xlabel('Model')
    ax.set_ylabel('F1 Score')
    ax.set_title('F1 Score Comparison: Individual vs Ensemble')
    ax.set_xticks(list(x))
    ax.set_xticklabels(data_to_plot['Model'], rotation=45, ha='right')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    plt.show()
else:
    print("No results available for visualization")

No results available for visualization


## 5. Matriz de Confusión del Mejor Ensemble

In [38]:
# Pick the ensemble with the highest F1. The lookup goes through the ensemble
# *name*: df_ensemble's row labels are positions in ensemble_results, which do
# not line up with positions in ensemble_configs whenever a configuration was
# skipped during evaluation.
best_config = None
if len(df_ensemble) > 0 and df_ensemble['F1'].notna().any():
    # F1 may be an object column (None mixed with floats); coerce before idxmax.
    f1_scores = pd.to_numeric(df_ensemble['F1'], errors='coerce')
    best_name = df_ensemble.loc[f1_scores.idxmax(), 'Ensemble']
    best_config = next((cfg for cfg in ensemble_configs if cfg['name'] == best_name), None)
if best_config is None and len(ensemble_configs) > 0:
    # No scored ensemble: fall back to the first configuration.
    best_config = ensemble_configs[0]

if best_config is None:
    print("No ensemble configuration available")
else:
    filtered_preds = {k: v for k, v in predictions.items() if k in best_config['models']}
    # Keep only ids that have a gold label and a prediction from every member.
    common_ids = set(gold_dict.keys())
    for preds in filtered_preds.values():
        common_ids &= set(preds.keys())

    if len(common_ids) == 0:
        print("No common IDs found for confusion matrix")
    else:
        ensemble_preds = voting_ensemble(filtered_preds, common_ids)
        y_true = [gold_dict[id_] for id_ in common_ids]
        y_pred = [ensemble_preds[id_] for id_ in common_ids]

        # Fix the label order so rows/columns always read NO, YES.
        cm = confusion_matrix(y_true, y_pred, labels=['NO', 'YES'])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['NO', 'YES'], yticklabels=['NO', 'YES'])
        plt.title(f'Confusion Matrix: {best_config["name"]}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

        print("\nClassification Report:")
        print(classification_report(y_true, y_pred, labels=['NO', 'YES']))

No common IDs found for confusion matrix


## 6. Análisis de Acuerdo entre Modelos

In [39]:
# How often do the member models agree, and is the ensemble more accurate when
# they do? Relies on common_ids / filtered_preds / ensemble_preds left behind
# by an earlier cell, so check that they exist first.
defined_names = dir()
inputs_ready = all(
    required in defined_names
    for required in ('common_ids', 'filtered_preds', 'ensemble_preds')
)

if inputs_ready and len(common_ids) > 0:
    agreement_analysis = []

    for id_ in common_ids:
        votes = [model_preds[id_] for model_preds in filtered_preds.values()]
        n_yes = sum(1 for vote in votes if vote == 'YES')
        n_votes = len(votes)
        # Unanimous means either nobody or everybody voted 'YES'.
        everyone_agrees = n_yes == 0 or n_yes == n_votes

        agreement_analysis.append({
            'id': id_,
            'true': gold_dict[id_],
            'ensemble': ensemble_preds[id_],
            'yes_votes': n_yes,
            'total_votes': n_votes,
            'agreement': 'unanimous' if everyone_agrees else 'split'
        })

    df_agreement = pd.DataFrame(agreement_analysis)

    # Boolean row masks reused by all the summary lines below.
    unanimous_mask = df_agreement['agreement'] == 'unanimous'
    split_mask = df_agreement['agreement'] == 'split'
    correct_mask = df_agreement['true'] == df_agreement['ensemble']

    print("\nAgreement Analysis:")
    print(f"Unanimous cases: {unanimous_mask.sum()} ({unanimous_mask.mean()*100:.1f}%)")
    print(f"Split cases: {split_mask.sum()} ({split_mask.mean()*100:.1f}%)")

    unanimous_correct = (unanimous_mask & correct_mask).sum()
    split_correct = (split_mask & correct_mask).sum()

    print(f"\nAccuracy in unanimous cases: {unanimous_correct}/{unanimous_mask.sum()}")
    print(f"Accuracy in split cases: {split_correct}/{split_mask.sum()}")
else:
    print("No data available for agreement analysis")

No data available for agreement analysis


## 7. Guardar Mejor Ensemble

In [40]:
# Write the predictions of the best ensemble to a results JSON file.
if len(ensemble_configs) > 0:
    # Resolve the best ensemble by *name*: df_ensemble's row labels index
    # ensemble_results, not ensemble_configs, and the two diverge whenever a
    # configuration was skipped during evaluation.
    best_config = None
    if len(df_ensemble) > 0 and df_ensemble['F1'].notna().any():
        # F1 may be an object column (None mixed with floats); coerce before idxmax.
        f1_scores = pd.to_numeric(df_ensemble['F1'], errors='coerce')
        best_name = df_ensemble.loc[f1_scores.idxmax(), 'Ensemble']
        best_config = next((cfg for cfg in ensemble_configs if cfg['name'] == best_name), None)
    if best_config is None:
        # No scored ensemble (e.g. no gold labels for these ids): use the first config.
        best_config = ensemble_configs[0]

    filtered_preds = {k: v for k, v in predictions.items() if k in best_config['models']}
    # Only ids predicted by every member model are voted on. An empty member
    # list yields an empty id set instead of an IndexError.
    id_sets = [set(model_preds.keys()) for model_preds in filtered_preds.values()]
    common_ids = set.intersection(*id_sets) if id_sets else set()

    if len(common_ids) > 0:
        ensemble_preds = voting_ensemble(filtered_preds, common_ids)
        output_data = [{'test_case': 'EXIST2025', 'id': id_, 'value': pred} for id_, pred in ensemble_preds.items()]
        output_path = f'../results/BeingChillingWeWillWin_ensemble_{best_config["name"]}.json'

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, ensure_ascii=False)

        print(f"Saved: {output_path}")
        print(f"Models: {', '.join(best_config['models'])}")
        print(f"Predictions: {len(ensemble_preds)}")
    else:
        print("No common IDs to save")
else:
    print("No ensemble configurations available")

Saved: ../results/BeingChillingWeWillWin_ensemble_All_Models.json
Models: Ministral8B_ft, Ministral8B, f2llm4B, f2llm4B_clean, KaLM, KaLM_clean
Predictions: 934


## 8. Resumen Final

In [41]:
# df_results is sorted by F1 (descending) when it is non-empty, so its first
# row is the best individual model.
best_individual = df_results.iloc[0] if len(df_results) > 0 else None

# Only an ensemble that actually has an F1 score can be "best": df_ensemble
# may contain rows whose metrics are None (no gold labels), and it is left
# unsorted when every row is like that.
best_ensemble = None
if len(df_ensemble) > 0:
    ensemble_f1 = pd.to_numeric(df_ensemble['F1'], errors='coerce')
    if ensemble_f1.notna().any():
        best_ensemble = df_ensemble.loc[ensemble_f1.idxmax()]

if best_individual is not None and best_ensemble is not None:
    print("\n" + "="*60)
    print("FINAL SUMMARY")
    print("="*60)
    print(f"\nBest Individual: {best_individual['Model']}")
    print(f"F1: {best_individual['F1']:.4f} | Accuracy: {best_individual['Accuracy']:.4f}")
    print(f"\nBest Ensemble: {best_ensemble['Ensemble']}")
    print(f"F1: {best_ensemble['F1']:.4f} | Accuracy: {best_ensemble['Accuracy']:.4f}")
    improvement = best_ensemble['F1'] - best_individual['F1']
    if best_individual['F1'] > 0:
        improvement_pct = improvement / best_individual['F1'] * 100
        # The '+' format flag prints the real sign, so a regression shows as
        # "-1.23%" rather than "+-1.23%".
        print(f"\nImprovement: {improvement:.4f} ({improvement_pct:+.2f}%)")
    else:
        # A relative change is undefined when the baseline F1 is zero.
        print(f"\nImprovement: {improvement:.4f} (relative change undefined: best individual F1 is 0)")
    print("="*60)
else:
    print("Cannot generate summary: missing individual or ensemble results")

Cannot generate summary: missing individual or ensemble results
