# Ensemble Models from Improved Models

Creating ensembles from improved V2 models:
- Weighted voting ensemble
- Stacking ensemble with meta-model
- Comparison of different ensemble methods


In [1]:
import sys
from pathlib import Path
import torch
import torch.nn as nn
import json
import numpy as np

PROJECT_ROOT = Path('/Volumes/SSanDisk/SpeechRec-German/without_context_windows')
sys.path.insert(0, str(PROJECT_ROOT))

from models.hybrid.hybrid_cnn_mlp_v2 import HybridCNNMLP_V2
from models.sequence.transformer_sequence_v2 import TransformerSequence_V2
from models.specialized.formant_focused_v2 import FormantFocusedModel_V2
from models.sequence.bilstm_attention_v2 import BiLSTMAttention_V2
from utils.ensemble_utils import (
    get_model_predictions, weighted_voting, simple_voting, 
    stacking_ensemble, evaluate_ensemble
)
from utils.data_loader import load_data, create_dataloaders

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print(f"Using MPS device")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

OUTPUT_DIR = PROJECT_ROOT / 'artifacts' / 'b-p_dl_models' / 'improved_models'
ENSEMBLE_DIR = OUTPUT_DIR / 'ensembles'
ENSEMBLE_DIR.mkdir(parents=True, exist_ok=True)

# Load data
df, spectrograms_dict, feature_cols, feature_scaler, class_weights_dict = load_data(PROJECT_ROOT)
dataloaders = create_dataloaders(df, spectrograms_dict, feature_cols, feature_scaler, class_weights_dict, batch_size=64)


Using MPS device
Columns in df_phonemes: ['phoneme_id', 'utterance_id', 'phoneme', 'class', 'start_ms', 'end_ms', 'duration_ms', 'audio_path']
Columns in df_features: ['energy_rms', 'energy_rms_std', 'energy_zcr', 'energy_zcr_std', 'spectral_centroid', 'spectral_centroid_std', 'spectral_rolloff', 'spectral_rolloff_std', 'spectral_bandwidth', 'spectral_bandwidth_std', 'formant_f1', 'formant_f2', 'formant_f3', 'formant_f4', 'formant_f1_std', 'formant_f2_std', 'formant_f3_std', 'formant_f4_std', 'spectral_flatness', 'harmonic_noise_ratio', 'zcr_mean', 'energy_cv', 'phoneme_id', 'class', 'duration_ms', 'mfcc_mean_0', 'mfcc_mean_1', 'mfcc_mean_2', 'mfcc_mean_3', 'mfcc_mean_4', 'mfcc_mean_5', 'mfcc_mean_6', 'mfcc_mean_7', 'mfcc_mean_8', 'mfcc_mean_9', 'mfcc_mean_10', 'mfcc_mean_11', 'mfcc_mean_12', 'mfcc_std_0', 'mfcc_std_1', 'mfcc_std_2', 'mfcc_std_3', 'mfcc_std_4', 'mfcc_std_5', 'mfcc_std_6', 'mfcc_std_7', 'mfcc_std_8', 'mfcc_std_9', 'mfcc_std_10', 'mfcc_std_11', 'mfcc_std_12', 'delta_mfcc

## Load Improved Models


In [2]:
# Load all improved models
models = []

# Model 1: Hybrid CNN+MLP V2
model1 = HybridCNNMLP_V2(n_features=len(feature_cols), num_classes=2).to(device)
checkpoint1 = torch.load(OUTPUT_DIR / 'hybrid_cnn_mlp_v2' / 'best_model.pt')
model1.load_state_dict(checkpoint1['model_state_dict'])
models.append((model1, 'Hybrid CNN+MLP V2'))

# Model 2: Transformer Sequence V2
model2 = TransformerSequence_V2(
    input_dim=128, d_model=256, nhead=8, num_layers=6, 
    dim_feedforward=1024, dropout=0.1, num_classes=2, batch_first=True
).to(device)
checkpoint2 = torch.load(OUTPUT_DIR / 'transformer_sequence_v2' / 'best_model.pt')
model2.load_state_dict(checkpoint2['model_state_dict'])
models.append((model2, 'Transformer Sequence V2'))

# Model 3: Formant-focused V2
model3 = FormantFocusedModel_V2(n_features=len(feature_cols), num_classes=2).to(device)
model3.set_formant_indices(feature_cols, ['formant_f1', 'formant_f2', 'formant_f3'])
checkpoint3 = torch.load(OUTPUT_DIR / 'formant_focused_v2' / 'best_model.pt')
model3.load_state_dict(checkpoint3['model_state_dict'])
models.append((model3, 'Formant-focused V2'))

# Model 4: BiLSTM+Attention V2
model4 = BiLSTMAttention_V2(
    input_dim=128, hidden_dim=128, num_layers=3, 
    num_classes=2, dropout=0.3, num_heads=4
).to(device)
checkpoint4 = torch.load(OUTPUT_DIR / 'bilstm_attention_v2' / 'best_model.pt')
model4.load_state_dict(checkpoint4['model_state_dict'])
models.append((model4, 'BiLSTM+Attention V2'))

print(f"Loaded {len(models)} improved models")

# Prepare dataloaders for each model type
model_loaders = {
    'Hybrid CNN+MLP V2': dataloaders['hybrid'],
    'Transformer Sequence V2': dataloaders['sequence'],
    'Formant-focused V2': dataloaders['feature'],
    'BiLSTM+Attention V2': dataloaders['sequence']
}


Loaded 4 improved models


## Get Predictions from All Models


In [3]:
# Get predictions on validation and test sets
val_probs_list = []
test_probs_list = []
val_labels = None
test_labels = None

for model, name in models:
    loader = model_loaders[name]
    
    # Validation predictions
    val_probs, val_labels = get_model_predictions([(model, name)], loader['val'], device)
    val_probs_list.append(val_probs[0])
    
    # Test predictions
    test_probs, test_labels = get_model_predictions([(model, name)], loader['test'], device)
    test_probs_list.append(test_probs[0])
    
    print(f"{name} - Val predictions shape: {val_probs[0].shape}, Test predictions shape: {test_probs[0].shape}")

print(f"\nValidation labels shape: {val_labels.shape}")
print(f"Test labels shape: {test_labels.shape}")


                                                                                           

Hybrid CNN+MLP V2 - Val predictions shape: (5335, 2), Test predictions shape: (5349, 2)


                                                                                                 

Transformer Sequence V2 - Val predictions shape: (5335, 2), Test predictions shape: (5349, 2)


                                                                                             

Formant-focused V2 - Val predictions shape: (5335, 2), Test predictions shape: (5349, 2)


                                                                                              

BiLSTM+Attention V2 - Val predictions shape: (5335, 2), Test predictions shape: (5349, 2)

Validation labels shape: (5335,)
Test labels shape: (5349,)




## Simple Voting Ensemble


In [4]:
# Simple voting (equal weights)
simple_test_probs, simple_test_preds = simple_voting(test_probs_list)
simple_metrics = evaluate_ensemble(simple_test_preds, simple_test_probs, test_labels, "Simple Voting")

# Save results
with open(ENSEMBLE_DIR / 'simple_voting_metrics.json', 'w') as f:
    json.dump(simple_metrics, f, indent=2)


Simple Voting - Acc: 0.9538, F1: 0.9537, ROC-AUC: 0.9881, Precision: 0.9536, Recall: 0.9538


## Weighted Voting Ensemble


In [None]:
# Load individual model metrics to determine weights
model_metrics = {}
for model, name in models:
    loader = model_loaders[name]
    test_probs, _ = get_model_predictions([(model, name)], loader['test'], device)
    test_preds = np.argmax(test_probs[0], axis=1)
    
    # Calculate F1 score as weight
    from sklearn.metrics import f1_score
    f1 = f1_score(test_labels, test_preds, average='weighted')
    model_metrics[name] = f1

print("Model F1 scores (used as weights):")
for name, f1 in model_metrics.items():
    print(f"  {name}: {f1:.4f}")

# Use F1 scores as weights (normalized)
# Fixed: correct unpacking - use model, name instead of name, _
weights = [model_metrics[name] for _, name in models]
weighted_test_probs, weighted_test_preds = weighted_voting(test_probs_list, weights=weights)
weighted_metrics = evaluate_ensemble(weighted_test_preds, weighted_test_probs, test_labels, "Weighted Voting")

# Save results
with open(ENSEMBLE_DIR / 'weighted_voting_metrics.json', 'w') as f:
    json.dump(weighted_metrics, f, indent=2)


                                                                                                 

Model F1 scores (used as weights):
  Hybrid CNN+MLP V2: 0.9481
  Transformer Sequence V2: 0.9472
  Formant-focused V2: 0.9411
  BiLSTM+Attention V2: 0.9331
Weighted Voting - Acc: 0.9538, F1: 0.9537, ROC-AUC: 0.9881, Precision: 0.9536, Recall: 0.9538




## Stacking Ensemble


In [6]:
# Stacking with LogisticRegression meta-model
stacking_test_preds, stacking_test_probs, meta_model = stacking_ensemble(
    val_probs_list, val_labels,
    val_probs_list, val_labels,  # Use val for meta-training (in practice, use separate fold)
    test_probs_list, test_labels,
    meta_model_type='logistic'
)

stacking_metrics = evaluate_ensemble(stacking_test_preds, stacking_test_probs, test_labels, "Stacking (LogisticRegression)")

# Save results
with open(ENSEMBLE_DIR / 'stacking_logistic_metrics.json', 'w') as f:
    json.dump(stacking_metrics, f, indent=2)

# Try MLP meta-model
stacking_mlp_test_preds, stacking_mlp_test_probs, meta_model_mlp = stacking_ensemble(
    val_probs_list, val_labels,
    val_probs_list, val_labels,
    test_probs_list, test_labels,
    meta_model_type='mlp'
)

stacking_mlp_metrics = evaluate_ensemble(stacking_mlp_test_preds, stacking_mlp_test_probs, test_labels, "Stacking (MLP)")

# Save results
with open(ENSEMBLE_DIR / 'stacking_mlp_metrics.json', 'w') as f:
    json.dump(stacking_mlp_metrics, f, indent=2)


Meta-model validation - Acc: 0.9588, F1: 0.9587
Stacking (LogisticRegression) - Acc: 0.9549, F1: 0.9547, ROC-AUC: 0.9878, Precision: 0.9547, Recall: 0.9549
Meta-model validation - Acc: 0.9588, F1: 0.9587
Stacking (MLP) - Acc: 0.9533, F1: 0.9531, ROC-AUC: 0.9877, Precision: 0.9531, Recall: 0.9533


## Compare All Ensemble Methods


In [None]:
import pandas as pd

# Collect all results
results = {
    'Method': ['Simple Voting', 'Weighted Voting', 'Stacking (Logistic)', 'Stacking (MLP)'],
    'accuracy': [
        simple_metrics['accuracy'],
        weighted_metrics['accuracy'],
        stacking_metrics['accuracy'],
        stacking_mlp_metrics['accuracy']
    ],
    'f1': [
        simple_metrics['f1'],
        weighted_metrics['f1'],
        stacking_metrics['f1'],
        stacking_mlp_metrics['f1']
    ],
    'roc_auc': [
        simple_metrics['roc_auc'],
        weighted_metrics['roc_auc'],
        stacking_metrics['roc_auc'],
        stacking_mlp_metrics['roc_auc']
    ]
}

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('accuracy', ascending=False)

print("\n" + "="*60)
print("Ensemble Methods Comparison:")
print("="*60)
print(results_df.to_string(index=False))

# Save comparison
results_df.to_csv(ENSEMBLE_DIR / 'ensemble_comparison.csv', index=False)

# Find best ensemble
best_method = results_df.iloc[0]['Method']
best_acc = results_df.iloc[0]['accuracy']
print(f"\nBest ensemble method: {best_method} (Accuracy: {best_acc:.4f})")



Ensemble Methods Comparison:
             Method  accuracy       f1  roc_auc
Stacking (Logistic)  0.954945 0.954735 0.987758
      Simple Voting  0.953823 0.953694 0.988097
    Weighted Voting  0.953823 0.953694 0.988106
     Stacking (MLP)  0.953262 0.953127 0.987742

Best ensemble method: Stacking (Logistic) (Accuracy: 0.9549)
