# Improved Hybrid CNN+MLP Training (V3) with Probabilities

This notebook trains an enhanced version of the Hybrid CNN+MLP model and saves its per-phoneme class probabilities:
- Enhanced CNN: 64→128→256→512 channels with channel attention
- Enhanced MLP: 512→512→256→128 neurons
- Residual connections with attention in CNN branch
- Improved fusion layers: 512+128→512→256→128→64→2
- Better training: up to 100 epochs with early stopping, 5-epoch warmup, cosine annealing, gradient clipping, label smoothing
- **Saves probabilities for each phoneme for error analysis**


In [1]:
import sys
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import pandas as pd
import numpy as np

# Resolve the project root: when the kernel was started inside one of the known
# notebook folders, step up one level; otherwise use the working directory as is.
_cwd = Path.cwd()
if _cwd.name in ('notebooks', 'b-p_first_experiments'):
    PROJECT_ROOT = _cwd.parent
else:
    PROJECT_ROOT = _cwd
# Put the project root first on the import path before the project-local imports below.
sys.path.insert(0, str(PROJECT_ROOT))

from models.hybrid.hybrid_cnn_mlp_v3 import HybridCNNMLP_V3
from utils.training_utils import train_model, evaluate_model, WarmupCosineScheduler, LabelSmoothingCrossEntropy
from utils.data_loader import load_data, create_dataloaders

# Seed the global NumPy and PyTorch generators up front so that anything drawing
# from them (weight initialisation, default-generator shuffling) is repeatable
# from one Restart & Run All to the next.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute device in order of preference: Apple MPS, then CUDA, then CPU.
# `torch.backends.mps` is looked up defensively because the attribute does not
# exist in every PyTorch build.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report the selected device for every backend (prints "Using MPS device" on MPS).
print(f"Using {device.type.upper()} device")

# Load everything the data loaders need from the project root.
(
    df,
    spectrograms_dict,
    feature_cols,
    feature_scaler,
    class_weights_dict,
) = load_data(PROJECT_ROOT)

dataloaders = create_dataloaders(
    df,
    spectrograms_dict,
    feature_cols,
    feature_scaler,
    class_weights_dict,
    batch_size=64,
)

# Only the 'hybrid' loaders are used in this notebook.
_hybrid_loaders = dataloaders['hybrid']
train_hybrid_loader = _hybrid_loaders['train']
val_hybrid_loader = _hybrid_loaders['val']
test_hybrid_loader = _hybrid_loaders['test']

OUTPUT_DIR = PROJECT_ROOT.joinpath('artifacts', 'b-p_dl_models', 'improved_models')

# Per-class loss weights for classes 0 and 1. The dict may be keyed by the string
# or by the integer label, so try the string key first, then the integer key,
# and fall back to a neutral weight of 1.0.
class_weights = torch.tensor(
    [
        class_weights_dict.get(str(label), class_weights_dict.get(label, 1.0))
        for label in (0, 1)
    ],
    dtype=torch.float32,
).to(device)


Using MPS device
Columns in df_phonemes: ['phoneme_id', 'utterance_id', 'phoneme', 'class', 'start_ms', 'end_ms', 'duration_ms', 'audio_path']
Columns in df_features: ['energy_rms', 'energy_rms_std', 'energy_zcr', 'energy_zcr_std', 'spectral_centroid', 'spectral_centroid_std', 'spectral_rolloff', 'spectral_rolloff_std', 'spectral_bandwidth', 'spectral_bandwidth_std', 'formant_f1', 'formant_f2', 'formant_f3', 'formant_f4', 'formant_f1_std', 'formant_f2_std', 'formant_f3_std', 'formant_f4_std', 'spectral_flatness', 'harmonic_noise_ratio', 'zcr_mean', 'energy_cv', 'phoneme_id', 'class', 'duration_ms', 'mfcc_mean_0', 'mfcc_mean_1', 'mfcc_mean_2', 'mfcc_mean_3', 'mfcc_mean_4', 'mfcc_mean_5', 'mfcc_mean_6', 'mfcc_mean_7', 'mfcc_mean_8', 'mfcc_mean_9', 'mfcc_mean_10', 'mfcc_mean_11', 'mfcc_mean_12', 'mfcc_std_0', 'mfcc_std_1', 'mfcc_std_2', 'mfcc_std_3', 'mfcc_std_4', 'mfcc_std_5', 'mfcc_std_6', 'mfcc_std_7', 'mfcc_std_8', 'mfcc_std_9', 'mfcc_std_10', 'mfcc_std_11', 'mfcc_std_12', 'delta_mfcc

## Model: Enhanced Hybrid CNN+MLP V3


In [2]:
# Hyperparameters for this run, named once so that the objects built below and
# the printed summary cannot drift apart.
DROPOUT = 0.3
LABEL_SMOOTHING = 0.1
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
MIN_LR = 1e-6
num_epochs = 100
warmup_epochs = 5

model = HybridCNNMLP_V3(n_features=len(feature_cols), num_classes=2, dropout=DROPOUT).to(device)

# Print model info
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model: {model.get_config()['model_type']}")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Class-weighted loss with label smoothing
criterion = LabelSmoothingCrossEntropy(smoothing=LABEL_SMOOTHING, weight=class_weights)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Learning rate scheduler with warmup and cosine annealing
scheduler = WarmupCosineScheduler(
    optimizer, warmup_epochs=warmup_epochs, total_epochs=num_epochs, min_lr=MIN_LR
)

# Directory that receives the checkpoint, metrics and prediction files of this run.
save_dir = OUTPUT_DIR / 'hybrid_cnn_mlp_v3'
save_dir.mkdir(parents=True, exist_ok=True)

print("\nTraining configuration:")
print(f"- Epochs: {num_epochs}")
print(f"- Warmup epochs: {warmup_epochs}")
print(f"- Initial LR: {optimizer.param_groups[0]['lr']}")
print(f"- Label smoothing: {LABEL_SMOOTHING}")
# Gradient clipping and early-stopping patience are passed as literals to the
# train_model(...) call in the training cell; keep these two lines in sync with it.
print("- Gradient clipping: 1.0")
print("- Early stopping patience: 15")
print(f"- Dropout: {DROPOUT}")


Model: HybridCNNMLP_V3
Total parameters: 5,857,218
Trainable parameters: 5,857,218

Training configuration:
- Epochs: 100
- Warmup epochs: 5
- Initial LR: 0.001
- Label smoothing: 0.1
- Gradient clipping: 1.0
- Early stopping patience: 15
- Dropout: 0.3


In [3]:
# Train with early stopping and gradient clipping; returns the training history
# and the epoch reported as best.
history, best_epoch = train_model(
    model, train_hybrid_loader, val_hybrid_loader, criterion, optimizer, scheduler,
    device, num_epochs=num_epochs, save_dir=save_dir, model_name='hybrid_cnn_mlp_v3',
    early_stopping_patience=15, max_grad_norm=1.0
)

# Reload the best checkpoint (presumably written by train_model into save_dir)
# before evaluating. map_location remaps the stored tensors onto the current
# device, so the file also loads on a machine whose accelerator differs from the
# one used for training.
checkpoint = torch.load(save_dir / 'best_model.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
test_metrics, test_preds, test_labels, test_probs = evaluate_model(model, test_hybrid_loader, criterion, device)

# Persist the test metrics next to the checkpoint.
with open(save_dir / 'test_metrics.json', 'w') as f:
    json.dump(test_metrics, f, indent=2)

_separator = '=' * 60
print(f"\n{_separator}")
print("Final Test Results:")
print(_separator)
# (label shown to the reader, key in test_metrics)
for _label, _key in (
    ("Accuracy", "accuracy"),
    ("F1-score", "f1"),
    ("ROC-AUC", "roc_auc"),
    ("Precision", "precision"),
    ("Recall", "recall"),
):
    print(f"{_label}: {test_metrics[_key]:.4f}")
print(f"Best epoch: {best_epoch}")



Epoch 1/100
--------------------------------------------------


                                                           

Train Loss: 0.3987, Train Acc: 0.8976
Val Loss: 0.3458, Val Acc: 0.9312
Val F1: 0.9318, Val ROC-AUC: 0.9780
Learning Rate: 0.000200
✓ New best model saved! (F1: 0.9318)

Epoch 2/100
--------------------------------------------------


                                                           

Train Loss: 0.3473, Train Acc: 0.9262
Val Loss: 0.3247, Val Acc: 0.9248
Val F1: 0.9264, Val ROC-AUC: 0.9841
Learning Rate: 0.000400

Epoch 3/100
--------------------------------------------------


                                                           

Train Loss: 0.3378, Train Acc: 0.9311
Val Loss: 0.3210, Val Acc: 0.9428
Val F1: 0.9431, Val ROC-AUC: 0.9844
Learning Rate: 0.000600
✓ New best model saved! (F1: 0.9431)

Epoch 4/100
--------------------------------------------------


                                                           

Train Loss: 0.3359, Train Acc: 0.9371
Val Loss: 0.3268, Val Acc: 0.9301
Val F1: 0.9314, Val ROC-AUC: 0.9849
Learning Rate: 0.000800

Epoch 5/100
--------------------------------------------------


                                                           

Train Loss: 0.3373, Train Acc: 0.9368
Val Loss: 0.3216, Val Acc: 0.9426
Val F1: 0.9431, Val ROC-AUC: 0.9834
Learning Rate: 0.001000
✓ New best model saved! (F1: 0.9431)

Epoch 6/100
--------------------------------------------------


                                                           

Train Loss: 0.3378, Train Acc: 0.9362
Val Loss: 0.3269, Val Acc: 0.9305
Val F1: 0.9315, Val ROC-AUC: 0.9821
Learning Rate: 0.001000

Epoch 7/100
--------------------------------------------------


                                                           

Train Loss: 0.3291, Train Acc: 0.9406
Val Loss: 0.3275, Val Acc: 0.9297
Val F1: 0.9309, Val ROC-AUC: 0.9829
Learning Rate: 0.000999

Epoch 8/100
--------------------------------------------------


                                                           

Train Loss: 0.3262, Train Acc: 0.9429
Val Loss: 0.3339, Val Acc: 0.9162
Val F1: 0.9182, Val ROC-AUC: 0.9847
Learning Rate: 0.000998

Epoch 9/100
--------------------------------------------------


                                                           

Train Loss: 0.3195, Train Acc: 0.9486
Val Loss: 0.3301, Val Acc: 0.9231
Val F1: 0.9248, Val ROC-AUC: 0.9846
Learning Rate: 0.000996

Epoch 10/100
--------------------------------------------------


                                                           

Train Loss: 0.3163, Train Acc: 0.9501
Val Loss: 0.3266, Val Acc: 0.9224
Val F1: 0.9241, Val ROC-AUC: 0.9841
Learning Rate: 0.000993

Epoch 11/100
--------------------------------------------------


                                                           

Train Loss: 0.3131, Train Acc: 0.9504
Val Loss: 0.3265, Val Acc: 0.9378
Val F1: 0.9385, Val ROC-AUC: 0.9838
Learning Rate: 0.000990

Epoch 12/100
--------------------------------------------------


                                                           

Train Loss: 0.3118, Train Acc: 0.9518
Val Loss: 0.3644, Val Acc: 0.8879
Val F1: 0.8916, Val ROC-AUC: 0.9823
Learning Rate: 0.000987

Epoch 13/100
--------------------------------------------------


                                                           

Train Loss: 0.3089, Train Acc: 0.9548
Val Loss: 0.3209, Val Acc: 0.9441
Val F1: 0.9444, Val ROC-AUC: 0.9839
Learning Rate: 0.000983
✓ New best model saved! (F1: 0.9444)

Epoch 14/100
--------------------------------------------------


                                                           

Train Loss: 0.3044, Train Acc: 0.9578
Val Loss: 0.3224, Val Acc: 0.9348
Val F1: 0.9359, Val ROC-AUC: 0.9853
Learning Rate: 0.000978

Epoch 15/100
--------------------------------------------------


                                                           

Train Loss: 0.3048, Train Acc: 0.9578
Val Loss: 0.3184, Val Acc: 0.9400
Val F1: 0.9407, Val ROC-AUC: 0.9841
Learning Rate: 0.000973

Epoch 16/100
--------------------------------------------------


                                                           

Train Loss: 0.2979, Train Acc: 0.9618
Val Loss: 0.3312, Val Acc: 0.9284
Val F1: 0.9297, Val ROC-AUC: 0.9842
Learning Rate: 0.000967

Epoch 17/100
--------------------------------------------------


                                                           

Train Loss: 0.2989, Train Acc: 0.9612
Val Loss: 0.3244, Val Acc: 0.9500
Val F1: 0.9501, Val ROC-AUC: 0.9857
Learning Rate: 0.000961
✓ New best model saved! (F1: 0.9501)

Epoch 18/100
--------------------------------------------------


                                                           

Train Loss: 0.2953, Train Acc: 0.9637
Val Loss: 0.3569, Val Acc: 0.9008
Val F1: 0.9037, Val ROC-AUC: 0.9841
Learning Rate: 0.000955

Epoch 19/100
--------------------------------------------------


                                                           

Train Loss: 0.2928, Train Acc: 0.9640
Val Loss: 0.3300, Val Acc: 0.9261
Val F1: 0.9276, Val ROC-AUC: 0.9834
Learning Rate: 0.000947

Epoch 20/100
--------------------------------------------------


                                                           

Train Loss: 0.2935, Train Acc: 0.9647
Val Loss: 0.3269, Val Acc: 0.9466
Val F1: 0.9468, Val ROC-AUC: 0.9834
Learning Rate: 0.000940

Epoch 21/100
--------------------------------------------------


                                                           

Train Loss: 0.2888, Train Acc: 0.9682
Val Loss: 0.3227, Val Acc: 0.9395
Val F1: 0.9401, Val ROC-AUC: 0.9843
Learning Rate: 0.000932

Epoch 22/100
--------------------------------------------------


                                                           

Train Loss: 0.2902, Train Acc: 0.9666
Val Loss: 0.3195, Val Acc: 0.9488
Val F1: 0.9490, Val ROC-AUC: 0.9849
Learning Rate: 0.000923

Epoch 23/100
--------------------------------------------------


                                                           

Train Loss: 0.2867, Train Acc: 0.9692
Val Loss: 0.3221, Val Acc: 0.9516
Val F1: 0.9516, Val ROC-AUC: 0.9858
Learning Rate: 0.000914
✓ New best model saved! (F1: 0.9516)

Epoch 24/100
--------------------------------------------------


                                                           

Train Loss: 0.2844, Train Acc: 0.9709
Val Loss: 0.3221, Val Acc: 0.9492
Val F1: 0.9495, Val ROC-AUC: 0.9848
Learning Rate: 0.000905

Epoch 25/100
--------------------------------------------------


                                                           

Train Loss: 0.2842, Train Acc: 0.9712
Val Loss: 0.3304, Val Acc: 0.9385
Val F1: 0.9392, Val ROC-AUC: 0.9833
Learning Rate: 0.000895

Epoch 26/100
--------------------------------------------------


                                                           

Train Loss: 0.2798, Train Acc: 0.9724
Val Loss: 0.3291, Val Acc: 0.9410
Val F1: 0.9413, Val ROC-AUC: 0.9836
Learning Rate: 0.000884

Epoch 27/100
--------------------------------------------------


                                                           

Train Loss: 0.2803, Train Acc: 0.9739
Val Loss: 0.3300, Val Acc: 0.9509
Val F1: 0.9508, Val ROC-AUC: 0.9855
Learning Rate: 0.000874

Epoch 28/100
--------------------------------------------------


                                                           

Train Loss: 0.2751, Train Acc: 0.9761
Val Loss: 0.3262, Val Acc: 0.9466
Val F1: 0.9468, Val ROC-AUC: 0.9852
Learning Rate: 0.000862

Epoch 29/100
--------------------------------------------------


                                                           

Train Loss: 0.2740, Train Acc: 0.9770
Val Loss: 0.3242, Val Acc: 0.9466
Val F1: 0.9469, Val ROC-AUC: 0.9843
Learning Rate: 0.000851

Epoch 30/100
--------------------------------------------------


                                                           

Train Loss: 0.2741, Train Acc: 0.9780
Val Loss: 0.3360, Val Acc: 0.9344
Val F1: 0.9352, Val ROC-AUC: 0.9830
Learning Rate: 0.000839

Epoch 31/100
--------------------------------------------------


                                                           

Train Loss: 0.2719, Train Acc: 0.9782
Val Loss: 0.3720, Val Acc: 0.9425
Val F1: 0.9417, Val ROC-AUC: 0.9825
Learning Rate: 0.000826

Epoch 32/100
--------------------------------------------------


                                                           

Train Loss: 0.2713, Train Acc: 0.9782
Val Loss: 0.3208, Val Acc: 0.9438
Val F1: 0.9443, Val ROC-AUC: 0.9851
Learning Rate: 0.000814

Epoch 33/100
--------------------------------------------------


                                                           

Train Loss: 0.2703, Train Acc: 0.9789
Val Loss: 0.3292, Val Acc: 0.9460
Val F1: 0.9463, Val ROC-AUC: 0.9810
Learning Rate: 0.000801

Epoch 34/100
--------------------------------------------------


                                                           

Train Loss: 0.2684, Train Acc: 0.9801
Val Loss: 0.3492, Val Acc: 0.9470
Val F1: 0.9463, Val ROC-AUC: 0.9833
Learning Rate: 0.000787

Epoch 35/100
--------------------------------------------------


                                                           

Train Loss: 0.2671, Train Acc: 0.9812
Val Loss: 0.3295, Val Acc: 0.9455
Val F1: 0.9456, Val ROC-AUC: 0.9835
Learning Rate: 0.000774

Epoch 36/100
--------------------------------------------------


                                                           

Train Loss: 0.2647, Train Acc: 0.9828
Val Loss: 0.3331, Val Acc: 0.9486
Val F1: 0.9488, Val ROC-AUC: 0.9847
Learning Rate: 0.000760

Epoch 37/100
--------------------------------------------------


                                                           

Train Loss: 0.2641, Train Acc: 0.9820
Val Loss: 0.3427, Val Acc: 0.9471
Val F1: 0.9470, Val ROC-AUC: 0.9759
Learning Rate: 0.000745

Epoch 38/100
--------------------------------------------------


                                                           

Train Loss: 0.2641, Train Acc: 0.9831
Val Loss: 0.3327, Val Acc: 0.9425
Val F1: 0.9429, Val ROC-AUC: 0.9849
Learning Rate: 0.000731

Early stopping at epoch 38
Best F1: 0.9516 at epoch 23


                                                           


Final Test Results:
Accuracy: 0.9469
F1-score: 0.9467
ROC-AUC: 0.9849
Precision: 0.9467
Recall: 0.9469
Best epoch: 23




## Save Predictions with Probabilities for Each Phoneme


In [4]:
# Rows of the test split, re-indexed 0..n-1 so that row i can be paired with the
# i-th entry of test_labels / test_preds / test_probs.
# NOTE(review): this positional pairing assumes the test loader yields samples in
# the same order as these rows (no shuffling, no dropped batch) — confirm in
# create_dataloaders.
test_df = df[df['split'] == 'test'].reset_index(drop=True)

test_labels_arr = np.asarray(test_labels).astype(int).reshape(-1)
test_preds_arr = np.asarray(test_preds).astype(int).reshape(-1)
test_probs_arr = np.asarray(test_probs, dtype=float)

# Fail loudly on a mismatch instead of silently truncating or raising a bare
# IndexError halfway through building the table.
if test_probs_arr.ndim != 2 or test_probs_arr.shape[1] != 2:
    raise ValueError(
        f"Expected test_probs with shape (n_samples, 2), got {test_probs_arr.shape}"
    )
if not (len(test_df) == len(test_labels_arr) == len(test_preds_arr) == len(test_probs_arr)):
    raise ValueError(
        "Test metadata and model outputs are misaligned: "
        f"{len(test_df)} rows vs {len(test_labels_arr)} labels, "
        f"{len(test_preds_arr)} predictions, {len(test_probs_arr)} probability rows"
    )

# Build the per-phoneme table in one vectorised step.
predictions_df = pd.DataFrame({
    'phoneme_id': test_df['phoneme_id'],
    'utterance_id': test_df['utterance_id'],
    'phoneme': test_df['phoneme'],
    'true_class': test_df['class'],
    'true_class_encoded': test_labels_arr,
    'predicted_class_encoded': test_preds_arr,
    'predicted_class': np.where(test_preds_arr == 0, 'b', 'p'),
    'prob_class_0': test_probs_arr[:, 0],  # Probability of class 'b'
    'prob_class_1': test_probs_arr[:, 1],  # Probability of class 'p'
    'max_prob': test_probs_arr.max(axis=1),
    'is_correct': (test_labels_arr == test_preds_arr).astype(int),
    # Confidence is the probability the model assigned to the class it predicted,
    # for correct and incorrect predictions alike.
    'confidence': test_probs_arr[np.arange(len(test_preds_arr)), test_preds_arr],
    'duration_ms': test_df['duration_ms'] if 'duration_ms' in test_df.columns else None,
})

# Save to CSV
predictions_path = save_dir / 'test_predictions_with_probs.csv'
predictions_df.to_csv(predictions_path, index=False)

is_wrong = predictions_df['is_correct'] == 0
n_correct = int(predictions_df['is_correct'].sum())
n_incorrect = int(is_wrong.sum())

print(f"Saved predictions with probabilities to: {predictions_path}")
print(f"Total predictions: {len(predictions_df)}")
print(f"Correct predictions: {n_correct}")
print(f"Incorrect predictions: {n_incorrect}")

# Save summary statistics.
# Note: if one of the two groups is empty, pandas returns NaN for its
# mean/min/max and json.dump writes it as the non-standard token NaN.
confidence_correct = predictions_df.loc[~is_wrong, 'confidence']
confidence_incorrect = predictions_df.loc[is_wrong, 'confidence']
summary_stats = {
    'total_samples': len(predictions_df),
    'correct_predictions': n_correct,
    'incorrect_predictions': n_incorrect,
    'accuracy': float(predictions_df['is_correct'].mean()),
    'avg_confidence_correct': float(confidence_correct.mean()),
    'avg_confidence_incorrect': float(confidence_incorrect.mean()),
    'min_confidence_incorrect': float(confidence_incorrect.min()),
    'max_confidence_incorrect': float(confidence_incorrect.max()),
    'high_confidence_errors': int((confidence_incorrect > 0.8).sum()),
    'low_confidence_errors': int((confidence_incorrect < 0.6).sum()),
}

with open(save_dir / 'predictions_summary.json', 'w') as f:
    json.dump(summary_stats, f, indent=2)

print("\nSummary Statistics:")
print(f"- Average confidence (correct): {summary_stats['avg_confidence_correct']:.4f}")
print(f"- Average confidence (incorrect): {summary_stats['avg_confidence_incorrect']:.4f}")
print(f"- High confidence errors (>0.8): {summary_stats['high_confidence_errors']}")
print(f"- Low confidence errors (<0.6): {summary_stats['low_confidence_errors']}")


Saved predictions with probabilities to: /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models/improved_models/hybrid_cnn_mlp_v3/test_predictions_with_probs.csv
Total predictions: 5349
Correct predictions: 5065
Incorrect predictions: 284

Summary Statistics:
- Average confidence (correct): 0.9152
- Average confidence (incorrect): 0.7870
- High confidence errors (>0.8): 153
- Low confidence errors (<0.6): 36


## Save Predictions for Validation Set (for analysis)


In [5]:
# Score the validation split with the model currently held in `model`.
val_metrics, val_preds, val_labels, val_probs = evaluate_model(model, val_hybrid_loader, criterion, device)

# Rows of the validation split, re-indexed 0..n-1 so that row i can be paired with
# the i-th entry of val_labels / val_preds / val_probs.
# NOTE(review): this positional pairing assumes the validation loader yields
# samples in the same order as these rows — confirm in create_dataloaders.
val_df = df[df['split'] == 'val'].reset_index(drop=True)

val_labels_arr = np.asarray(val_labels).astype(int).reshape(-1)
val_preds_arr = np.asarray(val_preds).astype(int).reshape(-1)
val_probs_arr = np.asarray(val_probs, dtype=float)

# Fail loudly on a mismatch instead of silently truncating or raising a bare
# IndexError halfway through building the table.
if val_probs_arr.ndim != 2 or val_probs_arr.shape[1] != 2:
    raise ValueError(
        f"Expected val_probs with shape (n_samples, 2), got {val_probs_arr.shape}"
    )
if not (len(val_df) == len(val_labels_arr) == len(val_preds_arr) == len(val_probs_arr)):
    raise ValueError(
        "Validation metadata and model outputs are misaligned: "
        f"{len(val_df)} rows vs {len(val_labels_arr)} labels, "
        f"{len(val_preds_arr)} predictions, {len(val_probs_arr)} probability rows"
    )

# Build the per-phoneme table in one vectorised step.
val_predictions_df = pd.DataFrame({
    'phoneme_id': val_df['phoneme_id'],
    'utterance_id': val_df['utterance_id'],
    'phoneme': val_df['phoneme'],
    'true_class': val_df['class'],
    'true_class_encoded': val_labels_arr,
    'predicted_class_encoded': val_preds_arr,
    'predicted_class': np.where(val_preds_arr == 0, 'b', 'p'),
    'prob_class_0': val_probs_arr[:, 0],
    'prob_class_1': val_probs_arr[:, 1],
    'max_prob': val_probs_arr.max(axis=1),
    'is_correct': (val_labels_arr == val_preds_arr).astype(int),
    # Confidence is the probability the model assigned to the class it predicted,
    # for correct and incorrect predictions alike.
    'confidence': val_probs_arr[np.arange(len(val_preds_arr)), val_preds_arr],
    'duration_ms': val_df['duration_ms'] if 'duration_ms' in val_df.columns else None,
})

val_predictions_path = save_dir / 'val_predictions_with_probs.csv'
val_predictions_df.to_csv(val_predictions_path, index=False)
print(f"Saved validation predictions to: {val_predictions_path}")


                                                           

Saved validation predictions to: /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models/improved_models/hybrid_cnn_mlp_v3/val_predictions_with_probs.csv
