# Improved Hybrid CNN+MLP Training (V4.3 Enhanced) with Context Windows - g/k Phoneme Pair

**Enhanced version** with Focal Loss, SpecAugment, and improved architecture:

**Key improvements in V4.3:**
1. **Focal Loss**: Replaces LabelSmoothingCrossEntropy to focus on hard examples (70%+ high-confidence errors)
2. **SpecAugment**: Frequency and time masking for spectrogram augmentation during training
3. **Enhanced Architecture**:
   - Multi-head attention in cross-attention fusion (with dropout)
   - Residual connections in MLP branch
   - Enhanced SE blocks in CNN branch
4. **Code improvements**:
   - Fixed file loading duplication
   - Optimized data loading and processing
   - Added error handling and validation
   - Improved reproducibility with seed setting
   - Vectorized operations for better performance

**Expected improvements:**
- Better handling of hard examples (Focal Loss)
- Improved generalization (SpecAugment)
- Better feature fusion (Multi-head attention with dropout)
- More stable training (Residual connections)
- Better code quality and maintainability


In [None]:
import sys
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import pandas as pd
import numpy as np
import h5py
import joblib
import random
import logging
from torch.utils.data import DataLoader, WeightedRandomSampler, Dataset
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
import math

# Setup logging: configure the root logger at INFO level with a
# "timestamp - level - message" line format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the code below.
logger = logging.getLogger(__name__)

# Set seed for reproducibility
def set_seed(seed=42):
    """Seed every random number generator used here so runs are repeatable.

    Seeds Python's ``random``, NumPy and PyTorch (and every CUDA device when
    CUDA is available), then sets cuDNN's ``deterministic`` flag and clears
    its ``benchmark`` flag.

    Args:
        seed: Integer seed shared by all generators.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs once, before any data loading or model construction below.
set_seed(42)
logger.info("Random seed set to 42 for reproducibility")

# Configuration: hyperparameters grouped by the component that reads them.
CONFIG = dict(
    # SpecAugment masking: F / T are the maximum frequency / time mask widths,
    # m_F / m_T the number of frequency / time masks.
    specaugment=dict(F=27, T=40, m_F=2, m_T=2),
    # Training-loop settings (batch_size is used for the DataLoaders below).
    training=dict(
        batch_size=64,
        num_epochs=200,
        warmup_epochs=5,
        initial_lr=5e-4,
        min_lr=1e-6,
        weight_decay=1e-4,
        dropout=0.4,
        max_grad_norm=1.0,
        early_stopping_patience=20,
    ),
    # Focal loss parameters (alpha weighting and focusing exponent gamma).
    focal_loss=dict(alpha=0.25, gamma=2.0),
)

# Project root
PROJECT_ROOT = Path('/Volumes/SSanDisk/SpeechRec-German')

# Artifacts for the g/k models with context windows (v2 - includes VOT, burst features)
DATA_DIR = PROJECT_ROOT / 'artifacts' / 'g-k_dl_models_with_context_v2'
FEATURES_DIR = DATA_DIR / 'features'

# Fail fast when a required directory is missing (checked in the order listed)
for _required_dir, _missing_message in (
    (DATA_DIR, f"Data directory not found: {DATA_DIR}"),
    (FEATURES_DIR, f"Features directory not found: {FEATURES_DIR}"),
):
    if not _required_dir.exists():
        raise FileNotFoundError(_missing_message)

# Device setup: the first available backend wins (MPS, then CUDA); CPU otherwise
for _backend, _is_available, _message in (
    ("mps", torch.backends.mps.is_available, "Using MPS device"),
    ("cuda", torch.cuda.is_available, "Using CUDA device"),
):
    if _is_available():
        device = torch.device(_backend)
        logger.info(_message)
        break
else:
    device = torch.device("cpu")
    logger.info("Using CPU device")

logger.info(f"Data directory: {DATA_DIR}")
logger.info(f"Features directory: {FEATURES_DIR}")


2026-01-04 17:47:48,615 - INFO - Random seed set to 42 for reproducibility
2026-01-04 17:47:48,633 - INFO - Using MPS device
2026-01-04 17:47:48,633 - INFO - Data directory: /Volumes/SSanDisk/SpeechRec-German/artifacts/g-k_dl_models_with_context_v2
2026-01-04 17:47:48,634 - INFO - Features directory: /Volumes/SSanDisk/SpeechRec-German/artifacts/g-k_dl_models_with_context_v2/features


## Load Data with Context Windows (V2 - with VOT and Burst Features)


In [None]:
# Load feature columns
# (a JSON list of column names; it is filtered against df's columns further below)
feature_cols_path = DATA_DIR / 'feature_cols.json'
if not feature_cols_path.exists():
    raise FileNotFoundError(f"Feature columns file not found: {feature_cols_path}")
with open(feature_cols_path, 'r') as f:
    feature_cols = json.load(f)

# Load feature scaler
# NOTE(security): joblib.load unpickles the file and can execute arbitrary code;
# only load scaler files that come from a trusted source.
scaler_path = DATA_DIR / 'feature_scaler.joblib'
if not scaler_path.exists():
    raise FileNotFoundError(f"Scaler file not found: {scaler_path}")
feature_scaler = joblib.load(scaler_path)

# Load class weights
# (used later to build per-sample weights for the WeightedRandomSampler)
class_weights_path = DATA_DIR / 'class_weights.json'
if not class_weights_path.exists():
    raise FileNotFoundError(f"Class weights file not found: {class_weights_path}")
with open(class_weights_path, 'r') as f:
    class_weights_dict = json.load(f)

# Load features DataFrame (from 02.2 - includes VOT, burst features)
features_path = FEATURES_DIR / 'features.parquet'
if not features_path.exists():
    raise FileNotFoundError(f"Features file not found: {features_path}")
df = pd.read_parquet(features_path)
logger.info(f"Dataset shape: {df.shape}")
logger.info(f"Feature columns (loaded): {len(feature_cols)}")

# Keep only the feature columns that are present in df and hold numeric data
original_feature_cols = feature_cols.copy()  # snapshot taken before filtering
original_feature_count = len(original_feature_cols)
feature_cols = [
    name for name in original_feature_cols
    if name in df.columns and pd.api.types.is_numeric_dtype(df[name])
]

if original_feature_count != len(feature_cols):
    # Report what was dropped (at most ten names are printed)
    missing_cols = set(original_feature_cols).difference(feature_cols)
    logger.warning(f"{original_feature_count - len(feature_cols)} feature columns are missing from DataFrame")
    if missing_cols:
        logger.warning(f"Missing columns: {list(missing_cols)[:10]}...")

    if 'duration_ms_features' in missing_cols:
        logger.info("Note: 'duration_ms_features' is missing - this is expected if duration_ms wasn't duplicated during merge.")
        logger.info("      This column is not a real feature and can be safely ignored.")

logger.info(f"Feature columns (filtered): {len(feature_cols)}")

# Compare the filtered feature count with what the loaded scaler was fitted on
if hasattr(feature_scaler, 'n_features_in_') and len(feature_cols) != feature_scaler.n_features_in_:
    logger.warning(f"Feature count mismatch. Scaler expects {feature_scaler.n_features_in_} features, but we have {len(feature_cols)}")
    logger.info("This is OK if some features were removed from the dataset. The scaler will be retrained on available features.")

# Check what metadata columns we have
metadata_cols = ['phoneme_id', 'class', 'duration_ms', 'phoneme', 'utterance_id']
present_metadata = [col for col in metadata_cols if col in df.columns]
logger.info(f"Metadata columns present: {present_metadata}")

# Handle class column: fall back to 'phoneme' when 'class' is absent
if 'class' not in df.columns:
    if 'phoneme' in df.columns:
        df['class'] = df['phoneme']
        logger.info("Created 'class' column from 'phoneme'")
    else:
        raise ValueError("Neither 'class' nor 'phoneme' column found in features.parquet.")

# Filter to only ɡ and k classes
# Note: 'ɡ' is U+0261 (LATIN SMALL LETTER SCRIPT G), not regular 'g' (U+0067)
if not df['class'].isin(['ɡ', 'k']).all():
    df = df[df['class'].isin(['ɡ', 'k'])].copy()
    logger.info(f"Dataset after filtering to ɡ/k: {len(df)} samples")

# Encode target
le = LabelEncoder()
df['class_encoded'] = le.fit_transform(df['class'])  # LabelEncoder sorts its classes: 'k' -> 0, 'ɡ' (U+0261) -> 1
# Mapping from class label to encoded integer, logged for reference
class_encoding = dict(zip(le.classes_, le.transform(le.classes_)))
logger.info(f"Class encoding: {class_encoding}")
logger.info(f"Class distribution:\n{df['class'].value_counts()}")

# Load split indices
split_indices_path = DATA_DIR / 'split_indices.json'
if not split_indices_path.exists():
    raise FileNotFoundError(f"Split indices file not found: {split_indices_path}")
with open(split_indices_path, 'r') as f:
    split_indices = json.load(f)

# Reset index so that row labels are 0..len(df)-1 and the positions stored in
# split_indices.json can be used directly with df.loc.
# NOTE(review): the indices are applied after the ɡ/k filter above; this assumes
# they were computed against the same row order - confirm against the notebook
# that wrote split_indices.json.
df = df.reset_index(drop=True)

val_indices = split_indices['val']
test_indices = split_indices['test']

# Create split column based on indices (every row not listed stays 'train')
df['split'] = 'train'
# default=-1 keeps max() from raising ValueError when a split list is empty
if len(df) > max(val_indices + test_indices, default=-1):
    df.loc[val_indices, 'split'] = 'val'
    df.loc[test_indices, 'split'] = 'test'
else:
    logger.warning("Split indices may not match DataFrame indices. Using phoneme_id matching...")
    # Each split is only honoured when all of its positions fall inside df
    val_ids = set(df.loc[val_indices, 'phoneme_id'].values) if len(df) > max(val_indices, default=-1) else set()
    test_ids = set(df.loc[test_indices, 'phoneme_id'].values) if len(df) > max(test_indices, default=-1) else set()
    df.loc[df['phoneme_id'].isin(val_ids), 'split'] = 'val'
    df.loc[df['phoneme_id'].isin(test_ids), 'split'] = 'test'

# Surface an empty evaluation split instead of letting it pass silently
for split_name in ('val', 'test'):
    if not (df['split'] == split_name).any():
        logger.warning(f"No samples were assigned to the '{split_name}' split")

logger.info(f"Split distribution:\n{df['split'].value_counts()}")

# Load spectrograms with error handling
spectrograms_path = FEATURES_DIR / 'spectrograms.h5'
if not spectrograms_path.exists():
    raise FileNotFoundError(f"Spectrograms file not found: {spectrograms_path}")

# Maps HDF5 dataset name (the phoneme id as a string) -> spectrogram array
spectrograms_dict = {}
try:
    with h5py.File(spectrograms_path, 'r') as f:
        phoneme_ids = list(f.keys())
        for phoneme_id in tqdm(phoneme_ids, desc="Loading spectrograms"):
            try:
                # [:] reads the whole dataset into memory
                spectrograms_dict[phoneme_id] = f[phoneme_id][:]
            except Exception as e:
                # Best effort: skip unreadable entries but keep loading the rest
                logger.warning(f"Failed to load spectrogram for {phoneme_id}: {e}")
                continue
except Exception as e:
    # Chain the original error so its traceback is kept as the explicit cause
    raise RuntimeError(f"Failed to load spectrograms: {e}") from e

logger.info(f"Loaded {len(spectrograms_dict):,} spectrograms")
if spectrograms_dict:
    # Peek at the first entry without copying every value into a list
    sample_shape = next(iter(spectrograms_dict.values())).shape
    logger.info(f"Spectrogram shape: {sample_shape}")

# Filter to only phonemes with spectrograms (ids are compared as strings)
df['phoneme_id_str'] = df['phoneme_id'].astype(str)
df['has_spectrogram'] = df['phoneme_id_str'].isin(spectrograms_dict.keys())
df = df[df['has_spectrogram']].copy()
logger.info(f"Dataset after filtering for spectrograms: {len(df)} samples")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
2026-01-04 17:47:48,917 - INFO - Dataset shape: (37986, 134)
2026-01-04 17:47:48,917 - INFO - Feature columns (loaded): 130
2026-01-04 17:47:48,921 - INFO - Note: 'duration_ms_features' is missing - this is expected if duration_ms wasn't duplicated during merge.
2026-01-04 17:47:48,922 - INFO -       This column is not a real feature and can be safely ignored.
2026-01-04 17:47:48,922 - INFO - Feature columns (filtered): 129
2026-01-04 17:47:48,922 - INFO - This is OK if some features were removed from the dataset. The scaler will be retrained on available features.
2026-01-04 17:47:48,923 - INFO - Metadata columns present: ['phoneme_id', 'class', 'duration_ms']
2026-01-04 17:47:48,930 - INFO - Class encoding: {'k': np.int64(0), 'ɡ': np.int64(1)}
2026-01-04 17:47:48,933 - INFO - Class distribution:
class
ɡ    22645
k    15341
Name: count, dtype: int64
2026-01-04 17:47:48,944 - INFO - Split distri

## Define SpecAugment for Spectrogram Augmentation


In [None]:
class SpecAugment:
    """
    SpecAugment: Simple spectrogram augmentation for speech recognition.

    Zeroes out random bands along the frequency axis and along the time axis.
    Mask widths are clamped to the spectrogram's dimensions, and the input
    array is never modified: masking is always applied to a copy.
    """
    def __init__(self, F=27, T=40, m_F=2, m_T=2):
        """
        Args:
            F: Maximum frequency mask width (will be clamped to H-1)
            T: Maximum time mask width (will be clamped to W-1)
            m_F: Number of frequency masks
            m_T: Number of time masks
        """
        self.F = F
        self.T = T
        self.m_F = m_F
        self.m_T = m_T

    def __call__(self, spectrogram):
        """
        Apply SpecAugment to spectrogram.

        Args:
            spectrogram: numpy array of shape (C, H, W) or (H, W)
        Returns:
            A new, augmented array with the same shape as the input.
        Raises:
            ValueError: If the input is neither 2D nor 3D (shape unpacking fails).
        """
        # Always work on a copy. np.expand_dims returns a view, so masking a
        # 2D input through it would overwrite the caller's array in place.
        spec = np.array(spectrogram, copy=True)
        squeeze_output = spec.ndim == 2
        if squeeze_output:
            # (H, W) -> (1, H, W)
            spec = spec[np.newaxis, ...]

        C, H, W = spec.shape

        # Clamp the maximum mask widths to the spectrogram size: at least 1,
        # at most one less than the axis length.
        max_F = max(1, min(self.F, H - 1))
        max_T = max(1, min(self.T, W - 1))

        # Frequency masking: zero f consecutive rows starting at a random f0
        for _ in range(self.m_F):
            if H > 0:
                f = np.random.randint(0, max_F + 1)
                if 0 < f < H:
                    f0 = np.random.randint(0, H - f + 1)
                    spec[:, f0:f0 + f, :] = 0

        # Time masking: zero t consecutive columns starting at a random t0
        for _ in range(self.m_T):
            if W > 0:
                t = np.random.randint(0, max_T + 1)
                if 0 < t < W:
                    t0 = np.random.randint(0, W - t + 1)
                    spec[:, :, t0:t0 + t] = 0

        if squeeze_output:
            spec = np.squeeze(spec, axis=0)

        return spec

logger.info("SpecAugment class defined successfully!")


2026-01-04 17:47:56,223 - INFO - SpecAugment class defined successfully!


## Create Dataset Classes with SpecAugment


In [None]:
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler

class HybridDataset(Dataset):
    """Dataset for hybrid models using both spectrograms and features with SpecAugment.

    Each item is ``((spectrogram, features), label)`` where the spectrogram is
    min-max normalised per sample and the features are taken from the
    (optionally scaled) feature columns of the selected split.
    """
    def __init__(self, df, spectrograms_dict, feature_cols, scaler=None, split='train', fit_scaler=False, transform=None, use_specaugment=False):
        """
        Args:
            df: DataFrame with 'split', 'phoneme_id', 'class_encoded' and the feature columns.
            spectrograms_dict: Mapping from phoneme id (as a string) to spectrogram array.
            feature_cols: Candidate feature column names; non-numeric or absent ones are dropped.
            scaler: Already-fitted scaler applied to the features (ignored when fit_scaler is True).
            split: Which value of df['split'] this dataset covers.
            fit_scaler: Fit a fresh StandardScaler on this split's features.
            transform: Optional callable applied to the spectrogram after augmentation.
            use_specaugment: Enable SpecAugment; only honoured for the 'train' split.
        """
        self.df = df[df['split'] == split].reset_index(drop=True)
        self.spectrograms_dict = spectrograms_dict
        self.transform = transform
        self.split = split
        self.use_specaugment = use_specaugment and (split == 'train')  # Only apply to training

        # Always define the attribute so it exists even when augmentation is off
        self.specaugment = None
        if self.use_specaugment:
            specaug_config = CONFIG['specaugment']
            self.specaugment = SpecAugment(
                F=specaug_config['F'],
                T=specaug_config['T'],
                m_F=specaug_config['m_F'],
                m_T=specaug_config['m_T']
            )

        # Stringify the ids once, column-wise. This avoids building a row
        # Series on every __getitem__ call and matches how ids are compared
        # against the spectrogram keys elsewhere (df['phoneme_id'].astype(str)).
        self.phoneme_ids = self.df['phoneme_id'].astype(str).tolist()

        self.feature_cols = [col for col in feature_cols if col in self.df.columns and pd.api.types.is_numeric_dtype(self.df[col])]
        if len(self.feature_cols) != len(feature_cols):
            missing = set(feature_cols) - set(self.feature_cols)
            logger.warning(f"Warning: {len(missing)} feature columns missing from DataFrame: {list(missing)[:5]}...")

        # NaN and +/-inf are replaced by 0 before any scaling
        X_features = self.df[self.feature_cols].values.astype(np.float32)
        X_features = np.nan_to_num(X_features, nan=0.0, posinf=0.0, neginf=0.0)

        if fit_scaler:
            self.scaler = StandardScaler()
            X_features = self.scaler.fit_transform(X_features)
        elif scaler is not None:
            if hasattr(scaler, 'n_features_in_') and X_features.shape[1] != scaler.n_features_in_:
                # NOTE(review): this refits on whatever split is being built, so a
                # val/test dataset would get its own statistics - pass a scaler
                # fitted on the matching train features to avoid this branch.
                logger.warning(f"Feature count mismatch ({X_features.shape[1]} vs {scaler.n_features_in_}). Retraining scaler on current features.")
                self.scaler = StandardScaler()
                X_features = self.scaler.fit_transform(X_features)
            else:
                self.scaler = scaler
                X_features = self.scaler.transform(X_features)
        else:
            self.scaler = None

        self.X_features = torch.from_numpy(X_features)
        self.y = torch.from_numpy(self.df['class_encoded'].values).long()

    def __len__(self):
        """Number of samples in the selected split."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``((spectrogram, features), label)`` for sample ``idx``.

        Raises:
            KeyError: If no spectrogram is stored for the sample's phoneme id.
        """
        phoneme_id = self.phoneme_ids[idx]

        if phoneme_id not in self.spectrograms_dict:
            raise KeyError(f"Spectrogram not found for phoneme_id: {phoneme_id}")

        spectrogram = self.spectrograms_dict[phoneme_id].astype(np.float32)
        if len(spectrogram.shape) == 2:
            # (H, W) -> (1, H, W)
            spectrogram = np.expand_dims(spectrogram, axis=0)

        # Normalize spectrogram (per-sample min-max normalization)
        # TODO: Consider using global statistics for better stability
        spec_min = spectrogram.min()
        spec_max = spectrogram.max()
        if spec_max - spec_min > 1e-8:
            spectrogram = (spectrogram - spec_min) / (spec_max - spec_min)
        else:
            # (Near-)constant spectrogram: avoid dividing by ~0
            spectrogram = np.zeros_like(spectrogram)

        # Apply SpecAugment during training
        if self.use_specaugment:
            spectrogram = self.specaugment(spectrogram)

        if self.transform:
            spectrogram = self.transform(spectrogram)

        features = self.X_features[idx]
        label = self.y[idx]

        return (torch.from_numpy(spectrogram), features), label

# Check and retrain scaler if needed
train_df = df[df['split'] == 'train'].reset_index(drop=True)
train_feature_cols = [col for col in feature_cols if col in train_df.columns and pd.api.types.is_numeric_dtype(train_df[col])]
feature_cols = train_feature_cols

# Read the fitted feature count defensively: a scaler that does not expose
# n_features_in_ must not crash the informational log line below.
expected_n_features = getattr(feature_scaler, 'n_features_in_', None)
if expected_n_features is not None and len(feature_cols) != expected_n_features:
    logger.info(f"Feature count mismatch detected: {len(feature_cols)} features in DataFrame vs {expected_n_features} in scaler")
    logger.info("Retraining scaler on train split with current features...")
    X_train_features = train_df[feature_cols].values.astype(np.float32)
    X_train_features = np.nan_to_num(X_train_features, nan=0.0, posinf=0.0, neginf=0.0)
    # Fit on the train split only; val/test reuse these statistics
    feature_scaler = StandardScaler()
    feature_scaler.fit(X_train_features)
    logger.info(f"Scaler retrained on {len(feature_cols)} features")
elif expected_n_features is not None:
    logger.info(f"Using existing scaler with {expected_n_features} features")
else:
    logger.info("Using existing scaler (it does not expose n_features_in_)")

# Create datasets with SpecAugment for training
train_hybrid_ds = HybridDataset(df, spectrograms_dict, feature_cols, scaler=feature_scaler, split='train', use_specaugment=True)
val_hybrid_ds = HybridDataset(df, spectrograms_dict, feature_cols, scaler=feature_scaler, split='val', use_specaugment=False)
test_hybrid_ds = HybridDataset(df, spectrograms_dict, feature_cols, scaler=feature_scaler, split='test', use_specaugment=False)

logger.info(f"Train dataset: {len(train_hybrid_ds)} samples (with SpecAugment)")
logger.info(f"Val dataset: {len(val_hybrid_ds)} samples")
logger.info(f"Test dataset: {len(test_hybrid_ds)} samples")

# Create weighted sampler
# train_labels follows the same row order as train_hybrid_ds (same split filter on df)
train_labels = df[df['split'] == 'train']['class_encoded'].values
# Weights are looked up by encoded class index, as a string key first and an int key second
class_weights_array = np.array([class_weights_dict.get(str(i), class_weights_dict.get(i, 1.0)) for i in range(2)])
unweighted_classes = [i for i in range(2) if str(i) not in class_weights_dict and i not in class_weights_dict]
if unweighted_classes:
    # Make the silent 1.0 fallback visible (e.g. when the file is keyed by class name)
    logger.warning(f"No class weight found for encoded classes {unweighted_classes}; using 1.0")
# Vectorized lookup: one weight per training sample
sample_weights = class_weights_array[train_labels]
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

# Create DataLoaders
BATCH_SIZE = CONFIG['training']['batch_size']
train_hybrid_loader = DataLoader(train_hybrid_ds, batch_size=BATCH_SIZE, sampler=sampler, num_workers=0)
val_hybrid_loader = DataLoader(val_hybrid_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
test_hybrid_loader = DataLoader(test_hybrid_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

logger.info(f"Train batches: {len(train_hybrid_loader)}")
logger.info(f"Val batches: {len(val_hybrid_loader)}")
logger.info(f"Test batches: {len(test_hybrid_loader)}")

# Test a batch
sample_batch = next(iter(train_hybrid_loader))
logger.info(f"Sample batch - Spectrogram shape: {sample_batch[0][0].shape}, Features shape: {sample_batch[0][1].shape}, Labels shape: {sample_batch[1].shape}")


2026-01-04 17:47:56,261 - INFO - Feature count mismatch detected: 129 features in DataFrame vs 130 in scaler
2026-01-04 17:47:56,262 - INFO - Retraining scaler on train split with current features...
2026-01-04 17:47:56,472 - INFO - Scaler retrained on 129 features
2026-01-04 17:47:56,761 - INFO - Train dataset: 26605 samples (with SpecAugment)
2026-01-04 17:47:56,761 - INFO - Val dataset: 5683 samples
2026-01-04 17:47:56,761 - INFO - Test dataset: 5698 samples
2026-01-04 17:47:56,773 - INFO - Train batches: 416
2026-01-04 17:47:56,774 - INFO - Val batches: 89
2026-01-04 17:47:56,774 - INFO - Test batches: 90
2026-01-04 17:47:56,808 - INFO - Sample batch - Spectrogram shape: torch.Size([64, 1, 128, 7]), Features shape: torch.Size([64, 129]), Labels shape: torch.Size([64])


## Define Enhanced Model Architecture V4.3 with Multi-Head Attention and Residual Connections


In [None]:
# Define Residual Block for CNN
class ResidualBlock2D(nn.Module):
    """Two 3x3 conv + batch-norm layers with a skip connection (CNN branch).

    The skip path is an empty ``nn.Sequential`` (identity) unless the stride
    or the channel count changes, in which case it is a strided 1x1
    convolution followed by batch norm.
    """
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        # Main path; only the first convolution applies the stride
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Skip path: project only when the output shape differs from the input
        shape_changes = stride != 1 or in_channels != out_channels
        if shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels)
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return relu(main_path(x) + shortcut(x))."""
        main = self.conv1(x)
        main = F.relu(self.bn1(main))
        main = self.bn2(self.conv2(main))
        return F.relu(main + self.shortcut(x))


# Enhanced Channel Attention Module (SE block with improved design)
class EnhancedChannelAttention(nn.Module):
    """Channel attention gate built from average- and max-pooled descriptors.

    Both pooled descriptors go through the same bottleneck MLP (ending in a
    sigmoid); the two results are summed and used to rescale every channel.
    """
    def __init__(self, channels, reduction=8):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        # Bottleneck width, never smaller than one unit
        bottleneck = max(1, channels // reduction)

        self.fc = nn.Sequential(
            nn.Linear(channels, bottleneck, bias=False),
            nn.ReLU(),
            nn.Linear(bottleneck, channels, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Scale a (batch, channels, H, W) tensor channel-wise by the gate."""
        batch, channels, _height, _width = x.size()
        avg_descriptor = self.avg_pool(x).view(batch, channels)
        max_descriptor = self.max_pool(x).view(batch, channels)
        gate = self.fc(avg_descriptor) + self.fc(max_descriptor)
        return x * gate.view(batch, channels, 1, 1)


# Define Feature Attention Module (SE-like for MLP features)
class FeatureAttention(nn.Module):
    """Squeeze-and-Excitation style gate for flat feature vectors.

    A bias-free bottleneck MLP ending in a sigmoid produces one weight per
    feature; the input is multiplied element-wise by those weights.
    """
    def __init__(self, n_features, reduction=8):
        super().__init__()
        self.reduction = reduction
        # Bottleneck width, never smaller than one unit
        bottleneck = max(1, n_features // reduction)

        self.fc = nn.Sequential(
            nn.Linear(n_features, bottleneck, bias=False),
            nn.ReLU(),
            nn.Linear(bottleneck, n_features, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``x`` scaled element-wise by its own attention weights."""
        return x * self.fc(x)


# Define Multi-Scale Convolution Block
class MultiScaleConvBlock(nn.Module):
    """Multi-scale convolution with parallel 3x3 and 5x5 kernels.

    The two branches keep the spatial size (padding 1 and 2) and their outputs
    are concatenated along the channel axis. The 3x3 branch gets
    ``out_channels // 2`` channels and the 5x5 branch gets the remainder, so
    the block emits exactly ``out_channels`` channels even when that number is
    odd. For even values the split is the same half/half as before.
    """
    def __init__(self, in_channels, out_channels):
        """
        Args:
            in_channels: Number of input channels.
            out_channels: Total number of output channels across both branches.
        """
        super(MultiScaleConvBlock, self).__init__()
        channels_3x3 = out_channels // 2
        # The remainder goes to the 5x5 branch so no channel is lost on odd counts
        channels_5x5 = out_channels - channels_3x3

        self.conv3x3 = nn.Sequential(
            nn.Conv2d(in_channels, channels_3x3, kernel_size=3, padding=1),
            nn.BatchNorm2d(channels_3x3),
            nn.ReLU()
        )
        self.conv5x5 = nn.Sequential(
            nn.Conv2d(in_channels, channels_5x5, kernel_size=5, padding=2),
            nn.BatchNorm2d(channels_5x5),
            nn.ReLU()
        )

    def forward(self, x):
        """Run both branches on ``x`` and concatenate them on the channel axis."""
        out3x3 = self.conv3x3(x)
        out5x5 = self.conv5x5(x)
        return torch.cat([out3x3, out5x5], dim=1)


# Multi-Head Cross-Attention Fusion Module (with dropout)
class MultiHeadCrossAttentionFusion(nn.Module):
    """Bidirectional multi-head cross-attention between CNN and MLP embeddings.

    Each input vector is projected to queries, keys and values that are split
    into ``num_heads`` chunks of ``head_dim``. Because each sample contributes
    a single vector per branch, the softmax runs over the head axis: scores
    have shape (batch, num_heads, num_heads). The attended result is layer
    normalised, projected back to the branch's own width and added to the
    original vector as a residual.
    """
    def __init__(self, cnn_dim, mlp_dim, hidden_dim=256, num_heads=4, dropout=0.1):
        super().__init__()
        self.cnn_dim = cnn_dim
        self.mlp_dim = mlp_dim
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        self.dropout = dropout

        assert hidden_dim % num_heads == 0, "hidden_dim must be divisible by num_heads"

        # Joint query/key/value projections, one per branch
        self.cnn_to_qkv = nn.Linear(cnn_dim, hidden_dim * 3)
        self.mlp_to_qkv = nn.Linear(mlp_dim, hidden_dim * 3)

        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.dropout_layer = nn.Dropout(dropout)

        # Map the attended vectors back to each branch's own width
        self.cnn_proj = nn.Linear(hidden_dim, cnn_dim)
        self.mlp_proj = nn.Linear(hidden_dim, mlp_dim)

    def _split_qkv(self, projected, batch_size):
        """Split a (batch, 3 * hidden_dim) projection into q, k, v of shape (batch, num_heads, head_dim)."""
        stacked = projected.reshape(batch_size, 3, self.num_heads, self.head_dim).permute(1, 0, 2, 3)
        return stacked[0], stacked[1], stacked[2]

    def _attend(self, query, key, value, batch_size):
        """Scaled dot-product attention with dropout on the weights; returns (batch, hidden_dim)."""
        scale = math.sqrt(self.head_dim)
        weights = F.softmax(torch.matmul(query, key.transpose(-2, -1)) / scale, dim=-1)
        weights = self.dropout_layer(weights)
        attended = torch.matmul(weights, value)  # (batch, num_heads, head_dim)
        return attended.transpose(1, 2).contiguous().view(batch_size, self.hidden_dim)

    def forward(self, cnn_out, mlp_out):
        """
        Args:
            cnn_out: (batch, cnn_dim)
            mlp_out: (batch, mlp_dim)
        Returns:
            Tuple ``(cnn_enhanced, mlp_enhanced)`` with the same shapes as the inputs.
        """
        n_samples = cnn_out.size(0)

        cnn_q, cnn_k, cnn_v = self._split_qkv(self.cnn_to_qkv(cnn_out), n_samples)
        mlp_q, mlp_k, mlp_v = self._split_qkv(self.mlp_to_qkv(mlp_out), n_samples)

        # CNN queries attend to MLP keys/values
        from_mlp = self._attend(cnn_q, mlp_k, mlp_v, n_samples)
        cnn_enhanced = cnn_out + self.cnn_proj(self.norm1(from_mlp))

        # MLP queries attend to CNN keys/values
        from_cnn = self._attend(mlp_q, cnn_k, cnn_v, n_samples)
        mlp_enhanced = mlp_out + self.mlp_proj(self.norm2(from_cnn))

        return cnn_enhanced, mlp_enhanced


# Define Hybrid CNN+MLP Model V4.3
class HybridCNNMLP_V4_3(nn.Module):
    """
    Enhanced Hybrid model: CNN for spectrograms + MLP for features
    Version 4.3 Improvements:
    - Multi-Head Cross-Attention Fusion (with dropout)
    - Residual connections in MLP branch
    - Enhanced SE blocks in CNN branch
    - SpecAugment support
    - Focal Loss support
    - Input validation

    The forward pass takes a ``(spectrogram, features)`` pair and returns
    class logits of shape (batch, num_classes).
    """

    def __init__(self, n_features=129, num_classes=2, dropout=0.4):
        """
        Args:
            n_features: Width of the feature vector fed to the MLP branch.
            num_classes: Number of output logits.
            dropout: Base dropout rate; later layers use 0.75x and 0.5x of it.
        """
        super(HybridCNNMLP_V4_3, self).__init__()
        self.n_features = n_features
        self.num_classes = num_classes

        # Multi-Scale CNN branch with enhanced attention
        # (shape comments assume a (1, 128, 7) input spectrogram)
        self.cnn_initial = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)  # (64, 64, 3)
        )

        # Multi-scale block
        self.multiscale = MultiScaleConvBlock(64, 128)

        self.cnn_branch = nn.Sequential(
            ResidualBlock2D(128, 128),
            EnhancedChannelAttention(128, reduction=8),
            nn.MaxPool2d(2, 2),  # (128, 32, 1)

            ResidualBlock2D(128, 256),
            EnhancedChannelAttention(256, reduction=8),
            ResidualBlock2D(256, 512),
            EnhancedChannelAttention(512, reduction=8),
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten()
        )

        # MLP branch with feature attention and residual connections
        self.feature_attention = FeatureAttention(n_features, reduction=8)

        # First layer
        self.mlp_layer1 = nn.Sequential(
            nn.Linear(n_features, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # Second layer with residual
        self.mlp_layer2 = nn.Sequential(
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout * 0.75)
        )
        # Linear skip path: the widths differ, so the residual needs a projection
        self.mlp_residual1 = nn.Linear(256, 512)

        # Third layer with residual
        self.mlp_layer3 = nn.Sequential(
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout * 0.5)
        )
        # Linear skip path for the 512 -> 256 layer
        self.mlp_residual2 = nn.Linear(512, 256)

        # Final layer
        self.mlp_layer4 = nn.Linear(256, 128)

        # Multi-head cross-attention fusion (with dropout)
        self.cross_attention = MultiHeadCrossAttentionFusion(
            cnn_dim=512, mlp_dim=128, hidden_dim=256, num_heads=4, dropout=0.1
        )

        # Enhanced Fusion layer: classifier head over the concatenated embeddings
        self.fusion = nn.Sequential(
            nn.Linear(512 + 128, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout * 0.75),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(dropout * 0.5),

            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        """
        Args:
            x: Tuple ``(spectrogram, features)`` with a 4D single-channel
               spectrogram batch and a 2D (batch, n_features) feature batch.
        Returns:
            Logits of shape (batch, num_classes).
        Raises:
            AssertionError: If either input has an unexpected rank or width.
        """
        spectrogram, features = x

        # Input validation
        assert len(spectrogram.shape) == 4, f"Expected 4D spectrogram, got {len(spectrogram.shape)}D"
        assert spectrogram.shape[1] == 1, f"Expected 1 channel, got {spectrogram.shape[1]}"
        assert len(features.shape) == 2, f"Expected 2D features, got {len(features.shape)}D"
        assert features.shape[1] == self.n_features, f"Expected {self.n_features} features, got {features.shape[1]}"

        # CNN branch with multi-scale
        cnn_init = self.cnn_initial(spectrogram)
        cnn_multiscale = self.multiscale(cnn_init)
        cnn_out = self.cnn_branch(cnn_multiscale)  # (batch, 512)

        # MLP branch with feature attention and residual connections
        features_attended = self.feature_attention(features)

        mlp = self.mlp_layer1(features_attended)  # (batch, 256)
        mlp_input1 = mlp  # Save input for residual
        mlp = self.mlp_layer2(mlp) + self.mlp_residual1(mlp_input1)  # (batch, 512) with residual
        mlp_input2 = mlp  # Save input for residual
        mlp = self.mlp_layer3(mlp) + self.mlp_residual2(mlp_input2)  # (batch, 256) with residual
        mlp_out = self.mlp_layer4(mlp)  # (batch, 128)

        # Multi-head cross-attention fusion
        cnn_enhanced, mlp_enhanced = self.cross_attention(cnn_out, mlp_out)

        # Concatenate enhanced outputs
        fused = torch.cat([cnn_enhanced, mlp_enhanced], dim=1)  # (batch, 640)

        # Final classification
        out = self.fusion(fused)  # (batch, num_classes)

        return out

    def get_config(self):
        """Return model configuration.

        ``num_classes`` and ``n_features`` reflect the values this instance was
        built with. The spectrogram entry of ``input_shapes`` is the fixed
        (1, 128, 7) shape this notebook's data uses.
        """
        return {
            'model_type': 'HybridCNNMLP_V4_3',
            'num_classes': self.num_classes,
            'n_features': self.n_features,
            'input_shapes': {
                'spectrogram': (1, 128, 7),
                'features': (self.n_features,)
            },
            'version': '4.3'
        }

logger.info("Model architecture V4.3 (Enhanced) defined successfully!")


2026-01-04 17:47:56,832 - INFO - Model architecture V4.3 (Enhanced) defined successfully!


## Define Focal Loss for Hard Examples


In [None]:
class FocalLoss(nn.Module):
    """
    Focal loss on logits for class-index targets.

    FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t)

    Args:
        alpha: ``None`` (no alpha weighting), a scalar (multiplies every
            sample's loss by the same factor), or a per-class sequence/tensor
            of length C that is indexed by the target class.
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        weight: Optional per-class weights (tensor or sequence of length C)
            multiplied into the cross-entropy term.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample loss.
    """
    def __init__(self, alpha=0.25, gamma=2.0, weight=None, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

        # Register the class weights as a buffer so that `.to(device)` moves
        # them together with the module; `self.weight` stays readable as before.
        if weight is not None:
            weight = torch.as_tensor(weight)
        self.register_buffer('weight', weight)

        # Lists/tuples used to fall through to "no alpha" silently; they are
        # now honoured as per-class alpha values.
        if isinstance(alpha, torch.Tensor):
            alpha_tensor = alpha
        elif isinstance(alpha, (float, int, list, tuple)):
            alpha_tensor = torch.as_tensor(alpha)
        else:
            # alpha=None (or an unsupported type) disables alpha weighting.
            alpha_tensor = None
        self.register_buffer('alpha_tensor', alpha_tensor)

    def forward(self, pred, target):
        """
        Args:
            pred: (N, C) logits
            target: (N,) class indices
        Returns:
            Focal loss value (scalar for 'mean'/'sum', otherwise shape (N,))
        """
        log_prob = F.log_softmax(pred, dim=1)

        # Log-probability and probability of the true class
        log_prob_t = log_prob.gather(1, target.unsqueeze(1)).squeeze(1)
        prob_t = torch.exp(log_prob_t)

        # Focal modulation: (1 - p_t)^gamma
        focal_weight = (1 - prob_t) ** self.gamma

        # Per-sample cross entropy
        ce_loss = -log_prob_t

        # Apply class weights if provided (aligned to the logits' device so
        # indexing with `target` cannot hit a device mismatch)
        if self.weight is not None:
            ce_loss = ce_loss * self.weight.to(pred.device)[target]

        # Apply alpha weighting
        if self.alpha_tensor is None:
            alpha_t = 1.0
        else:
            alpha_tensor = self.alpha_tensor.to(pred.device)
            if alpha_tensor.dim() == 0:
                # Scalar alpha - applied uniformly to every sample
                alpha_t = alpha_tensor
            else:
                # Per-class alpha, looked up by the true class
                alpha_t = alpha_tensor[target]

        focal_loss = alpha_t * focal_weight * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

logger.info("FocalLoss class defined successfully!")


2026-01-04 17:47:56,843 - INFO - FocalLoss class defined successfully!


## Define Training Utilities


In [None]:
# Training utilities
def train_epoch(model, dataloader, criterion, optimizer, device, max_grad_norm=None):
    """Run one optimisation pass over ``dataloader``.

    Each batch is indexed as ``batch[0]`` (inputs) and ``batch[1]`` (labels).
    A two-element tuple/list of inputs is moved to ``device`` element-wise and
    handed to the model as a tuple; anything else is moved as a single tensor.
    Gradients are clipped to ``max_grad_norm`` when it is not ``None``.

    Returns:
        (avg_loss, accuracy): mean of the per-batch losses and the accuracy of
        the argmax predictions collected during the pass.
    """
    model.train()
    loss_total = 0.0
    predicted, expected = [], []

    for batch in tqdm(dataloader, desc="Training", leave=False):
        raw_inputs = batch[0]
        is_pair = isinstance(raw_inputs, (tuple, list)) and len(raw_inputs) == 2
        if is_pair:
            inputs = tuple(part.to(device) for part in raw_inputs)
        else:
            inputs = raw_inputs.to(device)
        labels = batch[1].to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, labels)
        loss.backward()

        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()

        loss_total += loss.item()
        predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
        expected.extend(labels.cpu().numpy())

    return loss_total / len(dataloader), accuracy_score(expected, predicted)


def validate(model, dataloader, criterion, device):
    """Score the model on ``dataloader`` with gradients disabled.

    Batches are read the same way as in training: ``batch[0]`` holds the
    inputs (a two-element tuple/list is moved to ``device`` element-wise) and
    ``batch[1]`` holds the labels.

    Returns:
        (metrics, all_preds, all_labels, all_probs) where ``metrics`` has the
        keys 'loss', 'accuracy', 'precision', 'recall', 'f1' (weighted
        averages) and 'roc_auc' (computed from the class-1 probability, 0.0 if
        sklearn raises ValueError), and the three remaining items are Python
        lists with one entry per sample.
    """
    model.eval()
    loss_total = 0.0
    predicted, expected, probabilities = [], [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating", leave=False):
            raw_inputs = batch[0]
            if isinstance(raw_inputs, (tuple, list)) and len(raw_inputs) == 2:
                inputs = tuple(part.to(device) for part in raw_inputs)
            else:
                inputs = raw_inputs.to(device)
            labels = batch[1].to(device)

            logits = model(inputs)
            loss_total += criterion(logits, labels).item()

            batch_probs = torch.softmax(logits, dim=1).cpu().numpy()
            batch_preds = torch.argmax(logits, dim=1).cpu().numpy()

            predicted.extend(batch_preds)
            expected.extend(labels.cpu().numpy())
            probabilities.extend(batch_probs)

    weighted = dict(average='weighted', zero_division=0)
    metrics = {
        'loss': loss_total / len(dataloader),
        'accuracy': accuracy_score(expected, predicted),
        'precision': precision_score(expected, predicted, **weighted),
        'recall': recall_score(expected, predicted, **weighted),
        'f1': f1_score(expected, predicted, **weighted),
    }

    try:
        metrics['roc_auc'] = roc_auc_score(expected, np.array(probabilities)[:, 1])
    except ValueError as e:
        logger.warning(f"Could not compute ROC-AUC: {e}")
        metrics['roc_auc'] = 0.0

    return metrics, predicted, expected, probabilities


def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler,
                device, num_epochs, save_dir, model_name, early_stopping_patience=20, max_grad_norm=None):
    """Train model with early stopping and optional gradient clipping.

    Args:
        model: Network to train; ``get_config()`` is used for config.json when
            the model provides it.
        train_loader, val_loader: Loaders passed to ``train_epoch`` / ``validate``.
        criterion: Loss function.
        optimizer: Optimizer; the learning rate of its first param group is
            what gets logged and recorded.
        scheduler: Object with a ``step()`` method called once per epoch, or None.
        device: Device passed through to the epoch helpers.
        num_epochs: Maximum number of epochs.
        save_dir: Directory (created if missing) that receives best_model.pt,
            training_history.json and config.json.
        model_name: Accepted for caller compatibility; not used here.
        early_stopping_patience: Stop after this many consecutive epochs
            without a validation-F1 improvement.
        max_grad_norm: Gradient clipping norm forwarded to ``train_epoch``.

    Returns:
        (training_history, best_epoch): list of per-epoch metric dicts and the
        1-based epoch of the best validation F1 (0 if no epoch ran).
    """
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    best_val_f1 = 0.0
    best_epoch = 0
    patience_counter = 0
    training_history = []

    for epoch in range(num_epochs):
        logger.info(f"\nEpoch {epoch+1}/{num_epochs}")
        logger.info("-" * 50)

        # Read the LR before the scheduler advances so the value that is
        # logged/recorded is the one this epoch was actually trained with.
        current_lr = optimizer.param_groups[0]['lr']

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device, max_grad_norm)
        val_metrics, _, _, _ = validate(model, val_loader, criterion, device)

        if scheduler is not None:
            scheduler.step()

        epoch_metrics = {
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_accuracy': train_acc,
            'val_loss': val_metrics['loss'],
            'val_accuracy': val_metrics['accuracy'],
            'val_precision': val_metrics['precision'],
            'val_recall': val_metrics['recall'],
            'val_f1': val_metrics['f1'],
            'val_roc_auc': val_metrics['roc_auc'],
            'learning_rate': current_lr
        }
        training_history.append(epoch_metrics)

        logger.info(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
        logger.info(f"Val Loss: {val_metrics['loss']:.4f}, Val Acc: {val_metrics['accuracy']:.4f}")
        logger.info(f"Val F1: {val_metrics['f1']:.4f}, Val ROC-AUC: {val_metrics['roc_auc']:.4f}")
        logger.info(f"Learning Rate: {current_lr:.6f}")

        # The first completed epoch always counts as an improvement, so a
        # checkpoint exists even when validation F1 never rises above 0.
        no_checkpoint_yet = best_epoch == 0
        if no_checkpoint_yet or val_metrics['f1'] > best_val_f1:
            best_val_f1 = val_metrics['f1']
            best_epoch = epoch + 1
            patience_counter = 0

            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_f1': best_val_f1,
                'val_metrics': val_metrics
            }, save_dir / 'best_model.pt')

            logger.info(f"✓ New best model saved! (F1: {best_val_f1:.4f})")
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                logger.info(f"\nEarly stopping at epoch {epoch+1}")
                logger.info(f"Best F1: {best_val_f1:.4f} at epoch {best_epoch}")
                break

    with open(save_dir / 'training_history.json', 'w') as f:
        json.dump(training_history, f, indent=2)

    config = model.get_config() if hasattr(model, 'get_config') else {}
    config.update({
        'best_epoch': best_epoch,
        'best_val_f1': best_val_f1,
        'num_epochs': num_epochs
    })
    with open(save_dir / 'config.json', 'w') as f:
        json.dump(config, f, indent=2)

    return training_history, best_epoch


def evaluate_model(model, test_loader, criterion, device):
    """Evaluate model on a held-out loader and add per-class metrics.

    Runs ``validate`` and extends its metrics dict with per-class precision,
    recall and F1 plus the confusion matrix (as nested lists).

    Returns:
        (metrics, preds, labels, probs) exactly as produced by ``validate``,
        with the extra per-class keys added to ``metrics``.
    """
    metrics, preds, labels, probs = validate(model, test_loader, criterion, device)

    # Pin the label set so the per-class arrays always have two entries and
    # the confusion matrix is always 2x2, even if a class is absent from both
    # the labels and the predictions of this split.
    class_ids = [0, 1]
    precision_per_class = precision_score(labels, preds, labels=class_ids, average=None, zero_division=0)
    recall_per_class = recall_score(labels, preds, labels=class_ids, average=None, zero_division=0)
    f1_per_class = f1_score(labels, preds, labels=class_ids, average=None, zero_division=0)

    # NOTE(review): the '_g' keys read class index 0 and the '_k' keys read
    # index 1, while the prediction-export cells comment "k=0, ɡ=1" - confirm
    # which encoding the label encoder actually produced.
    metrics['precision_g'] = float(precision_per_class[0])
    metrics['precision_k'] = float(precision_per_class[1])
    metrics['recall_g'] = float(recall_per_class[0])
    metrics['recall_k'] = float(recall_per_class[1])
    metrics['f1_g'] = float(f1_per_class[0])
    metrics['f1_k'] = float(f1_per_class[1])
    metrics['confusion_matrix'] = confusion_matrix(labels, preds, labels=class_ids).tolist()

    return metrics, preds, labels, probs


class WarmupCosineScheduler:
    """Learning rate scheduler with linear warmup and cosine annealing.

    Epochs are counted from 1. Epoch ``e`` is trained with
    ``base_lr * e / warmup_epochs`` while ``e <= warmup_epochs`` and with a
    cosine decay from ``base_lr`` towards ``min_lr`` afterwards, reaching
    ``min_lr`` at ``total_epochs`` and staying there.

    The rate for the first epoch is applied in the constructor; call
    ``step()`` once after every epoch to install the rate for the next one.
    ``base_lr`` is taken from the optimizer's first param group and the same
    rate is written to every param group.
    """
    def __init__(self, optimizer, warmup_epochs, total_epochs, min_lr=1e-6):
        self.optimizer = optimizer
        self.warmup_epochs = warmup_epochs
        self.total_epochs = total_epochs
        self.min_lr = min_lr
        self.base_lr = optimizer.param_groups[0]['lr']
        self.current_epoch = 0  # number of completed epochs
        # Without this the first epoch would run at the full base LR and the
        # warmup would only start from the second epoch.
        self._set_lr(self._lr_for_epoch(1))

    def _lr_for_epoch(self, epoch):
        """Return the learning rate for the 1-based ``epoch``."""
        if epoch <= self.warmup_epochs:
            return self.base_lr * (epoch / self.warmup_epochs)
        # Guard against total_epochs <= warmup_epochs (division by zero) and
        # clamp so the cosine does not climb back up past total_epochs.
        decay_epochs = max(1, self.total_epochs - self.warmup_epochs)
        progress = min(1.0, (epoch - self.warmup_epochs) / decay_epochs)
        return self.min_lr + (self.base_lr - self.min_lr) * 0.5 * (1 + math.cos(math.pi * progress))

    def _set_lr(self, lr):
        """Write ``lr`` to every param group of the optimizer."""
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def step(self):
        """Mark one epoch as finished and install the next epoch's rate."""
        self.current_epoch += 1
        self._set_lr(self._lr_for_epoch(self.current_epoch + 1))

    def get_last_lr(self):
        """Return the current rate of the first param group as a one-item list."""
        return [self.optimizer.param_groups[0]['lr']]

logger.info("Training utilities defined successfully!")


2026-01-04 17:47:56,865 - INFO - Training utilities defined successfully!


## Create Model and Training Configuration


In [None]:
# Create model V4.3; the feature count is taken from the feature column list
model = HybridCNNMLP_V4_3(
    n_features=len(feature_cols), 
    num_classes=2, 
    dropout=CONFIG['training']['dropout']
).to(device)

# Print model info
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
logger.info(f"Model: {model.get_config()['model_type']}")
logger.info(f"Total parameters: {total_params:,}")
logger.info(f"Trainable parameters: {trainable_params:,}")
logger.info(f"Number of features: {len(feature_cols)}")

# Prepare class weights for loss function.
# The weights dict may be keyed by str ('0'/'1') or int (0/1); fall back to 1.0.
class_weights = torch.tensor([
    class_weights_dict.get('0', class_weights_dict.get(0, 1.0)), 
    class_weights_dict.get('1', class_weights_dict.get(1, 1.0))
], dtype=torch.float32).to(device)

# Loss function: Focal Loss with class weights.
# Move the criterion to the training device as well so its registered
# buffers (alpha) live next to the logits and targets it is applied to.
focal_config = CONFIG['focal_loss']
criterion = FocalLoss(
    alpha=focal_config['alpha'], 
    gamma=focal_config['gamma'], 
    weight=class_weights, 
    reduction='mean'
).to(device)

# Optimizer
train_config = CONFIG['training']
optimizer = torch.optim.Adam(
    model.parameters(), 
    lr=train_config['initial_lr'], 
    weight_decay=train_config['weight_decay']
)

# Learning rate scheduler with warmup and cosine annealing
scheduler = WarmupCosineScheduler(
    optimizer, 
    warmup_epochs=train_config['warmup_epochs'], 
    total_epochs=train_config['num_epochs'], 
    min_lr=train_config['min_lr']
)

# Output directory
OUTPUT_DIR = DATA_DIR / 'improved_models'
save_dir = OUTPUT_DIR / 'hybrid_cnn_mlp_v4_3_enhanced'
save_dir.mkdir(parents=True, exist_ok=True)

logger.info(f"\nTraining configuration:")
logger.info(f"- Epochs: {train_config['num_epochs']}")
logger.info(f"- Warmup epochs: {train_config['warmup_epochs']}")
logger.info(f"- Initial LR: {train_config['initial_lr']}")
logger.info(f"- Loss function: Focal Loss (alpha={focal_config['alpha']}, gamma={focal_config['gamma']})")
logger.info(f"- Gradient clipping: {train_config['max_grad_norm']}")
logger.info(f"- Early stopping patience: {train_config['early_stopping_patience']}")
logger.info(f"- Dropout: {train_config['dropout']}")
logger.info(f"- SpecAugment: Enabled for training")
logger.info(f"- Context windows: ±100ms (V2 with VOT and burst features)")
logger.info(f"- Save directory: {save_dir}")


2026-01-04 17:47:56,978 - INFO - Model: HybridCNNMLP_V4_3
2026-01-04 17:47:56,979 - INFO - Total parameters: 6,579,554
2026-01-04 17:47:56,979 - INFO - Trainable parameters: 6,579,554
2026-01-04 17:47:56,980 - INFO - Number of features: 129
2026-01-04 17:47:58,028 - INFO - 
Training configuration:
2026-01-04 17:47:58,028 - INFO - - Epochs: 200
2026-01-04 17:47:58,028 - INFO - - Warmup epochs: 5
2026-01-04 17:47:58,029 - INFO - - Initial LR: 0.0005
2026-01-04 17:47:58,029 - INFO - - Loss function: Focal Loss (alpha=0.25, gamma=2.0)
2026-01-04 17:47:58,029 - INFO - - Gradient clipping: 1.0
2026-01-04 17:47:58,030 - INFO - - Early stopping patience: 20
2026-01-04 17:47:58,030 - INFO - - Dropout: 0.4
2026-01-04 17:47:58,031 - INFO - - SpecAugment: Enabled for training
2026-01-04 17:47:58,031 - INFO - - Context windows: ±100ms (V2 with VOT and burst features)
2026-01-04 17:47:58,031 - INFO - - Save directory: /Volumes/SSanDisk/SpeechRec-German/artifacts/g-k_dl_models_with_context_v2/improve

## Train Model


In [None]:
# Train model
train_config = CONFIG['training']
history, best_epoch = train_model(
    model, train_hybrid_loader, val_hybrid_loader, criterion, optimizer, scheduler,
    device, num_epochs=train_config['num_epochs'], save_dir=save_dir, 
    model_name='hybrid_cnn_mlp_v4_3_enhanced', 
    early_stopping_patience=train_config['early_stopping_patience'], 
    max_grad_norm=train_config['max_grad_norm']
)

# Load best model and evaluate on test set.
# map_location keeps the restore working when the checkpoint was written on a
# different device than the one used in this session.
checkpoint = torch.load(save_dir / 'best_model.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
test_metrics, test_preds, test_labels, test_probs = evaluate_model(
    model, test_hybrid_loader, criterion, device
)

# Save test metrics
with open(save_dir / 'test_metrics.json', 'w') as f:
    json.dump(test_metrics, f, indent=2)

logger.info(f"\n{'='*60}")
logger.info(f"Final Test Results:")
logger.info(f"{'='*60}")
logger.info(f"Accuracy: {test_metrics['accuracy']:.4f}")
logger.info(f"F1-score: {test_metrics['f1']:.4f}")
logger.info(f"ROC-AUC: {test_metrics['roc_auc']:.4f}")
logger.info(f"Precision: {test_metrics['precision']:.4f}")
logger.info(f"Recall: {test_metrics['recall']:.4f}")
logger.info(f"Best epoch: {best_epoch}")


2026-01-04 17:47:58,042 - INFO - 
Epoch 1/200
2026-01-04 17:47:58,043 - INFO - --------------------------------------------------
2026-01-04 17:49:31,903 - INFO - Train Loss: 0.0222, Train Acc: 0.8628
2026-01-04 17:49:31,904 - INFO - Val Loss: 0.0129, Val Acc: 0.9319
2026-01-04 17:49:31,905 - INFO - Val F1: 0.9319, Val ROC-AUC: 0.9781
2026-01-04 17:49:31,905 - INFO - Learning Rate: 0.000100
2026-01-04 17:49:32,189 - INFO - ✓ New best model saved! (F1: 0.9319)
2026-01-04 17:49:32,191 - INFO - 
Epoch 2/200
2026-01-04 17:49:32,191 - INFO - --------------------------------------------------
2026-01-04 17:50:33,692 - INFO - Train Loss: 0.0148, Train Acc: 0.9158
2026-01-04 17:50:33,692 - INFO - Val Loss: 0.0122, Val Acc: 0.9298
2026-01-04 17:50:33,692 - INFO - Val F1: 0.9301, Val ROC-AUC: 0.9814
2026-01-04 17:50:33,693 - INFO - Learning Rate: 0.000200
2026-01-04 17:50:33,693 - INFO - 
Epoch 3/200
2026-01-04 17:50:33,693 - INFO - --------------------------------------------------
2026-01-04 1

## Save Predictions with Probabilities (Optimized with Vectorized Operations)


In [None]:
# Get test dataset to extract phoneme metadata
# NOTE(review): rows are matched to predictions purely by position; this
# assumes the test loader iterates the 'test' split in dataframe order
# (no shuffling) - confirm against the loader definition.
test_df = df[df['split'] == 'test'].reset_index(drop=True)

# Ensure alignment between test_df and predictions
assert len(test_df) == len(test_labels) == len(test_preds) == len(test_probs), \
    f"Length mismatch: test_df={len(test_df)}, labels={len(test_labels)}, preds={len(test_preds)}, probs={len(test_probs)}"

# evaluate_model() hands back plain Python lists. Comparing lists with `==`
# yields a single bool (so `.astype` fails and `np.where` would label every
# row identically); convert to arrays to get element-wise comparisons.
test_labels_arr = np.asarray(test_labels)
test_preds_arr = np.asarray(test_preds)

# Create predictions dataframe using vectorized operations
predictions_data = {
    'phoneme_id': test_df['phoneme_id'].values,
    'utterance_id': test_df.get('utterance_id', pd.Series([None] * len(test_df))).values,
    'phoneme': test_df.get('phoneme', test_df.get('class', pd.Series([None] * len(test_df)))).values,
    'true_class': test_df['class'].values,
    'true_class_encoded': test_labels_arr,
    'predicted_class_encoded': test_preds_arr,
    'predicted_class': np.where(test_preds_arr == 0, 'k', 'ɡ'),  # k=0, ɡ=1
    'prob_class_0': [p[0] for p in test_probs],
    'prob_class_1': [p[1] for p in test_probs],
    'max_prob': [np.max(p) for p in test_probs],
    'is_correct': (test_labels_arr == test_preds_arr).astype(int),
    'duration_ms': test_df.get('duration_ms', pd.Series([None] * len(test_df))).values
}

# Calculate confidence (max prob for correct, predicted prob for incorrect)
predictions_data['confidence'] = [
    float(np.max(prob)) if label == pred else float(prob[pred])
    for label, pred, prob in zip(test_labels, test_preds, test_probs)
]
predictions_df = pd.DataFrame(predictions_data)

# Save to CSV
predictions_df.to_csv(save_dir / 'test_predictions_with_probs.csv', index=False)
logger.info(f"Saved predictions with probabilities to: {save_dir / 'test_predictions_with_probs.csv'}")
logger.info(f"Total predictions: {len(predictions_df)}")
logger.info(f"Correct predictions: {predictions_df['is_correct'].sum()}")
logger.info(f"Incorrect predictions: {(~predictions_df['is_correct'].astype(bool)).sum()}")

# Save summary statistics
summary_stats = {
    'total_samples': len(predictions_df),
    'correct_predictions': int(predictions_df['is_correct'].sum()),
    'incorrect_predictions': int((~predictions_df['is_correct'].astype(bool)).sum()),
    'accuracy': float(predictions_df['is_correct'].mean()),
    'avg_confidence_correct': float(predictions_df[predictions_df['is_correct'] == 1]['confidence'].mean()),
    'avg_confidence_incorrect': float(predictions_df[predictions_df['is_correct'] == 0]['confidence'].mean()),
    'min_confidence_incorrect': float(predictions_df[predictions_df['is_correct'] == 0]['confidence'].min()),
    'max_confidence_incorrect': float(predictions_df[predictions_df['is_correct'] == 0]['confidence'].max()),
    'high_confidence_errors': int(((predictions_df['is_correct'] == 0) & (predictions_df['confidence'] > 0.8)).sum()),
    'low_confidence_errors': int(((predictions_df['is_correct'] == 0) & (predictions_df['confidence'] < 0.6)).sum()),
}

with open(save_dir / 'predictions_summary.json', 'w') as f:
    json.dump(summary_stats, f, indent=2)

logger.info(f"\nSummary Statistics:")
logger.info(f"- Average confidence (correct): {summary_stats['avg_confidence_correct']:.4f}")
logger.info(f"- Average confidence (incorrect): {summary_stats['avg_confidence_incorrect']:.4f}")
logger.info(f"- High confidence errors (>0.8): {summary_stats['high_confidence_errors']}")
logger.info(f"- Low confidence errors (<0.6): {summary_stats['low_confidence_errors']}")


2026-01-04 21:54:49,415 - INFO - Saved predictions with probabilities to: /Volumes/SSanDisk/SpeechRec-German/artifacts/g-k_dl_models_with_context_v2/improved_models/hybrid_cnn_mlp_v4_3_enhanced/test_predictions_with_probs.csv
2026-01-04 21:54:49,416 - INFO - Total predictions: 5698
2026-01-04 21:54:49,417 - INFO - Correct predictions: 5416
2026-01-04 21:54:49,418 - INFO - Incorrect predictions: 282
2026-01-04 21:54:49,421 - INFO - 
Summary Statistics:
2026-01-04 21:54:49,422 - INFO - - Average confidence (correct): 0.8890
2026-01-04 21:54:49,422 - INFO - - Average confidence (incorrect): 0.6558
2026-01-04 21:54:49,422 - INFO - - High confidence errors (>0.8): 49
2026-01-04 21:54:49,423 - INFO - - Low confidence errors (<0.6): 122


## Save Validation Predictions


In [None]:
# Get validation predictions
# NOTE(review): rows are matched to predictions purely by position; this
# assumes the validation loader iterates the 'val' split in dataframe order
# (no shuffling) - confirm against the loader definition.
val_metrics, val_preds, val_labels, val_probs = evaluate_model(model, val_hybrid_loader, criterion, device)
val_df = df[df['split'] == 'val'].reset_index(drop=True)

# Ensure alignment
assert len(val_df) == len(val_labels) == len(val_preds) == len(val_probs), \
    f"Length mismatch: val_df={len(val_df)}, labels={len(val_labels)}, preds={len(val_preds)}, probs={len(val_probs)}"

# evaluate_model() returns plain Python lists; `list == list` is a single
# bool, which is what raised "'bool' object has no attribute 'astype'".
# Convert to arrays so the comparisons below are element-wise.
val_labels_arr = np.asarray(val_labels)
val_preds_arr = np.asarray(val_preds)

# Use vectorized operations
val_predictions_data = {
    'phoneme_id': val_df['phoneme_id'].values,
    'utterance_id': val_df.get('utterance_id', pd.Series([None] * len(val_df))).values,
    'phoneme': val_df.get('phoneme', val_df.get('class', pd.Series([None] * len(val_df)))).values,
    'true_class': val_df['class'].values,
    'true_class_encoded': val_labels_arr,
    'predicted_class_encoded': val_preds_arr,
    'predicted_class': np.where(val_preds_arr == 0, 'k', 'ɡ'),  # k=0, ɡ=1
    'prob_class_0': [p[0] for p in val_probs],
    'prob_class_1': [p[1] for p in val_probs],
    'max_prob': [np.max(p) for p in val_probs],
    'is_correct': (val_labels_arr == val_preds_arr).astype(int),
    'duration_ms': val_df.get('duration_ms', pd.Series([None] * len(val_df))).values
}

# Calculate confidence (max prob for correct, predicted prob for incorrect)
val_predictions_data['confidence'] = [
    float(np.max(prob)) if label == pred else float(prob[pred])
    for label, pred, prob in zip(val_labels, val_preds, val_probs)
]
val_predictions_df = pd.DataFrame(val_predictions_data)
val_predictions_df.to_csv(save_dir / 'val_predictions_with_probs.csv', index=False)
logger.info(f"Saved validation predictions to: {save_dir / 'val_predictions_with_probs.csv'}")


                                                           

AttributeError: 'bool' object has no attribute 'astype'

## Confusion Matrix Analysis

Visualize confusion matrix to understand model errors per class

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Get class names from label encoder
class_names = le.classes_

# Create confusion matrix for test set.
# Pin the label set so the matrix always has one row/column per class name,
# even if a class never occurs in the test labels or predictions.
# Assumes the encoded labels are 0..n_classes-1 as produced by `le` - TODO confirm.
class_ids = np.arange(len(class_names))
cm = confusion_matrix(test_labels, test_preds, labels=class_ids)

# Create figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Confusion matrix with counts
sns.heatmap(
    cm, 
    annot=True, 
    fmt='d', 
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=axes[0]
)
axes[0].set_xlabel('Predicted Class', fontsize=12)
axes[0].set_ylabel('True Class', fontsize=12)
axes[0].set_title('Confusion Matrix (Counts)', fontsize=14, fontweight='bold')

# Plot 2: Confusion matrix with percentages (row-normalised).
# Rows without any true samples stay at 0 instead of dividing by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_percent = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals > 0
) * 100
sns.heatmap(
    cm_percent, 
    annot=True, 
    fmt='.1f', 
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=axes[1]
)
axes[1].set_xlabel('Predicted Class', fontsize=12)
axes[1].set_ylabel('True Class', fontsize=12)
axes[1].set_title('Confusion Matrix (Percentages)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(save_dir / 'confusion_matrix.png', dpi=300, bbox_inches='tight')
logger.info(f"Confusion matrix saved to: {save_dir / 'confusion_matrix.png'}")
plt.show()

# Print detailed statistics
logger.info(f"\n{'='*60}")
logger.info(f"Confusion Matrix Analysis:")
logger.info(f"{'='*60}")
for i, true_class in enumerate(class_names):
    total_true = cm[i].sum()
    correct = cm[i, i]
    errors = total_true - correct
    accuracy_per_class = (correct / total_true * 100) if total_true > 0 else 0
    
    logger.info(f"\nTrue Class: {true_class}")
    logger.info(f"  Total samples: {total_true}")
    logger.info(f"  Correctly predicted: {correct} ({accuracy_per_class:.2f}%)")
    logger.info(f"  Incorrectly predicted: {errors}")
    
    # Show error breakdown
    for j, pred_class in enumerate(class_names):
        if i != j and cm[i, j] > 0:
            error_pct = (cm[i, j] / total_true * 100) if total_true > 0 else 0
            logger.info(f"    → Misclassified as '{pred_class}': {cm[i, j]} ({error_pct:.2f}%)")

# Calculate overall metrics
total_samples = cm.sum()
correct_predictions = cm.trace()
total_errors = total_samples - correct_predictions

logger.info(f"\n{'='*60}")
logger.info(f"Overall Statistics:")
logger.info(f"  Total test samples: {total_samples}")
logger.info(f"  Correct predictions: {correct_predictions} ({correct_predictions/total_samples*100:.2f}%)")
logger.info(f"  Total errors: {total_errors} ({total_errors/total_samples*100:.2f}%)")
