# B-P Phoneme DL Data Preparation (with Context)

Preparation of PyTorch datasets for deep learning models using phoneme audio with extended context windows:
- **Extract features from context audio files** (±100ms context from `phoneme_wav_with_context`)
- Extract spectrograms from context audio files
- Create PyTorch Dataset classes for different input types
- Train/Val/Test split with stratification
- Data normalization
- DataLoader creation with batch sampling
- Handle class imbalance

**Key difference from 02.2:** 
- Features are extracted **anew** from context audio files (~300ms duration) instead of using old features
- Uses extended context windows (±100ms) for better capture of coarticulation, formant transitions, and VOT
- All features (MFCC, energy, spectral, formants, quality metrics) reflect the extended context


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import h5py
import librosa
import soundfile as sf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import IsolationForest
from tqdm import tqdm
import warnings
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from scipy import signal
warnings.filterwarnings('ignore')

# Try to import optional libraries
try:
    import parselmouth
    HAS_PARSELMOUTH = True
except ImportError:
    HAS_PARSELMOUTH = False
    print("Warning: parselmouth not installed. Will use LPC for formant extraction.")

# Configuration
# The project root is the parent directory when the notebook is launched from
# the `notebooks` folder, otherwise the current working directory itself.
if Path.cwd().name == 'notebooks':
    PROJECT_ROOT = Path.cwd().parent
else:
    PROJECT_ROOT = Path.cwd()

# Inputs: context-extended phoneme clips and the fresh phoneme interval table
PHONEME_WAV_DIR = PROJECT_ROOT / 'artifacts' / 'phoneme_wav_with_context'  # With context!
PHONEMES_FILE = PROJECT_ROOT / 'artifacts' / 'phoneme_intervals.csv'

# Outputs: model artifacts, with extracted features kept in a sub-directory
OUTPUT_DIR = PROJECT_ROOT / 'artifacts' / 'b-p_dl_models_with_context'
FEATURES_OUTPUT_DIR = OUTPUT_DIR / 'features'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
FEATURES_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Files to save/load
FEATURES_FILE = FEATURES_OUTPUT_DIR / 'features.parquet'
SPECTROGRAMS_FILE = FEATURES_OUTPUT_DIR / 'spectrograms.h5'

# Feature extraction parameters
SAMPLE_RATE = 16000
N_MELS = 128
HOP_LENGTH = 512
MFCC_N_COEFFS = 13
# Fixed spectrogram window in ms; clips longer than this are cut to their
# first SPECTROGRAM_WINDOW_MS milliseconds, shorter ones are zero-padded.
SPECTROGRAM_WINDOW_MS = 200

# True: re-extract features and spectrograms (overwrites saved files).
# False: load the previously saved files instead.
EXTRACT_FEATURES = True

# Reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Device setup: prefer MPS, then CUDA, then CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using {device.type.upper()} device")

print(f"Project root: {PROJECT_ROOT}")
print(f"Phoneme audio directory (with context): {PHONEME_WAV_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Features output directory: {FEATURES_OUTPUT_DIR}")
print(f"Extract features: {EXTRACT_FEATURES}")


Using MPS device
Project root: /Volumes/SSanDisk/SpeechRec-German
Phoneme audio directory (with context): /Volumes/SSanDisk/SpeechRec-German/artifacts/phoneme_wav_with_context
Output directory: /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models_with_context
Features output directory: /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models_with_context/features
Extract features: True


## 1. Feature Extraction Functions


In [None]:
# Feature extraction functions (extracted from context audio files)
def extract_mfcc_features(audio, sr=SAMPLE_RATE, n_mfcc=MFCC_N_COEFFS):
    """Extract MFCCs plus first- and second-order deltas, summarised per coefficient.

    Args:
        audio: Audio samples passed to ``librosa.feature.mfcc`` as ``y``.
        sr: Sampling rate in Hz.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        dict with the per-coefficient mean and standard deviation over frames
        of the MFCCs (``mfcc_*``), their deltas (``delta_mfcc_*``) and their
        delta-deltas (``delta2_mfcc_*``); each value is an array of length
        ``n_mfcc``.
    """
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, hop_length=HOP_LENGTH)

    n_frames = mfcc.shape[1]
    default_width = 9  # librosa's default delta window

    if n_frames >= default_width:
        delta_mfcc = librosa.feature.delta(mfcc)
        delta2_mfcc = librosa.feature.delta(mfcc, order=2)
    else:
        # Short clip: shrink the delta window to the largest odd width that
        # fits the available frames, never below 3. 'nearest' edge padding is
        # used instead of the default interpolation mode so that the window
        # may still exceed the frame count when there are fewer than 3 frames.
        width = n_frames if n_frames % 2 == 1 else n_frames - 1
        width = max(3, width)
        delta_mfcc = librosa.feature.delta(mfcc, width=width, mode='nearest')
        delta2_mfcc = librosa.feature.delta(mfcc, order=2, width=width, mode='nearest')

    return {
        'mfcc_mean': np.mean(mfcc, axis=1),
        'mfcc_std': np.std(mfcc, axis=1),
        'delta_mfcc_mean': np.mean(delta_mfcc, axis=1),
        'delta_mfcc_std': np.std(delta_mfcc, axis=1),
        'delta2_mfcc_mean': np.mean(delta2_mfcc, axis=1),
        'delta2_mfcc_std': np.std(delta2_mfcc, axis=1),
    }

def extract_energy_features(audio, sr=SAMPLE_RATE):
    """Summarise frame-wise RMS energy and zero-crossing rate.

    Returns a dict holding the mean and standard deviation over frames of
    both tracks. ``sr`` is accepted for signature parity with the other
    extractors but is not used here.
    """
    frame_tracks = (
        ('energy_rms', librosa.feature.rms(y=audio, hop_length=HOP_LENGTH)[0]),
        ('energy_zcr', librosa.feature.zero_crossing_rate(audio, hop_length=HOP_LENGTH)[0]),
    )

    summary = {}
    for key, track in frame_tracks:
        summary[key] = np.mean(track)
        summary[f'{key}_std'] = np.std(track)
    return summary

def extract_spectral_features(audio, sr=SAMPLE_RATE):
    """Summarise spectral centroid, rolloff, bandwidth and contrast.

    Centroid, rolloff and bandwidth are reduced to their mean and standard
    deviation over frames; spectral contrast is reduced to one mean per band
    (an array under ``spectral_contrast_mean``).
    """
    scalar_extractors = (
        ('spectral_centroid', librosa.feature.spectral_centroid),
        ('spectral_rolloff', librosa.feature.spectral_rolloff),
        ('spectral_bandwidth', librosa.feature.spectral_bandwidth),
    )

    summary = {}
    for key, extractor in scalar_extractors:
        track = extractor(y=audio, sr=sr, hop_length=HOP_LENGTH)[0]
        summary[key] = np.mean(track)
        summary[f'{key}_std'] = np.std(track)

    band_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr, hop_length=HOP_LENGTH)
    summary['spectral_contrast_mean'] = np.mean(band_contrast, axis=1)
    return summary

def extract_formants_lpc(audio, sr=SAMPLE_RATE, n_formants=4, order=10):
    """Estimate formants with LPC (Linear Predictive Coding).

    The pre-emphasised signal is cut into 25 ms Hann-windowed frames with a
    10 ms hop. For each frame the LPC polynomial is obtained from the
    autocorrelation via Levinson-Durbin recursion; its roots with frequency
    in (50 Hz, sr/2) and magnitude above 0.7 are formant candidates. The
    ``n_formants`` strongest candidates are kept and assigned to the slots
    in ascending frequency order, so ``formant_f1`` is the lowest of them.

    Args:
        audio: 1-D array of audio samples.
        sr: Sampling rate in Hz.
        n_formants: Number of formant slots to report.
        order: LPC order.

    Returns:
        dict with ``formant_f{i}`` (mean over frames) followed by
        ``formant_f{i}_std`` for ``i = 1..n_formants``. A slot for which no
        frame produced a positive value is reported as 0.0. Frames where a
        slot is missing contribute 0.0 to that slot's mean and std.
    """
    mean_keys = [f'formant_f{i}' for i in range(1, n_formants + 1)]
    std_keys = [f'{key}_std' for key in mean_keys]
    result = dict.fromkeys(mean_keys + std_keys, 0.0)
    missing = [0.0] * n_formants

    # Pre-emphasis filter
    audio_pre = signal.lfilter([1, -0.97], 1, audio)

    # Frame the signal
    frame_length = int(0.025 * sr)  # 25ms frames
    hop_length_frame = int(0.010 * sr)  # 10ms hop
    window = signal.windows.hann(frame_length)  # identical for every frame

    def _frame_formants(windowed):
        """Return the formant slots (Hz, ascending, zero-padded) of one frame."""
        autocorr = np.correlate(windowed, windowed, mode='full')
        zero_lag = len(autocorr) // 2
        autocorr = autocorr[zero_lag:zero_lag + order + 1]

        # A silent frame has no energy; the recursion would divide by zero.
        if autocorr[0] <= 0:
            return list(missing)

        # Levinson-Durbin recursion
        a = np.zeros(order + 1)
        a[0] = 1.0
        e = autocorr[0]
        for j in range(1, order + 1):
            k = -np.sum(a[:j] * autocorr[j:0:-1]) / e
            a[1:j + 1] = a[1:j + 1] + k * a[j - 1::-1]
            a[j] = k
            e = e * (1 - k * k)

        roots = np.roots(a)
        roots = roots[np.imag(roots) >= 0]  # one root per conjugate pair
        freqs = np.angle(roots) * (sr / (2 * np.pi))
        magnitudes = np.abs(roots)

        # Formant candidates: valid frequency range and high pole magnitude
        candidates = [(f, m) for f, m in zip(freqs, magnitudes)
                      if 50 < f < sr / 2 and m > 0.7]
        candidates.sort(key=lambda pair: pair[1], reverse=True)

        # Keep the strongest poles, then order them by frequency so that the
        # slot index matches the formant number.
        selected = sorted(f for f, _ in candidates[:n_formants])
        return selected + [0.0] * (n_formants - len(selected))

    formants_list = []
    # "+ 1" so that the last full frame (or a signal exactly one frame long)
    # is included.
    for start in range(0, len(audio_pre) - frame_length + 1, hop_length_frame):
        windowed = audio_pre[start:start + frame_length] * window
        try:
            formants_list.append(_frame_formants(windowed))
        except (np.linalg.LinAlgError, ValueError):
            # Degenerate frame (e.g. non-finite LPC coefficients): no formants
            formants_list.append(list(missing))

    if not formants_list:
        return result

    formants_array = np.array(formants_list)
    for col, (mean_key, std_key) in enumerate(zip(mean_keys, std_keys)):
        track = formants_array[:, col]
        if np.any(track > 0):
            result[mean_key] = np.mean(track)
            result[std_key] = np.std(track)
    return result

def extract_formants_parselmouth(audio, sr=SAMPLE_RATE, n_formants=4):
    """Estimate formants with Parselmouth (Praat), falling back to LPC.

    Formants are sampled every 10 ms from a Burg formant analysis. Values
    that are not positive (including NaN) are stored as 0.0.

    Args:
        audio: Audio samples handed to ``parselmouth.Sound``.
        sr: Sampling rate in Hz.
        n_formants: Number of formants to query and report.

    Returns:
        dict with ``formant_f{i}`` (mean over sample times) followed by
        ``formant_f{i}_std`` for ``i = 1..n_formants``; a formant with no
        positive value at any sample time is reported as 0.0. If Parselmouth
        is unavailable or the analysis raises, the result of
        ``extract_formants_lpc`` is returned instead.
    """
    if not HAS_PARSELMOUTH:
        return extract_formants_lpc(audio, sr, n_formants)

    try:
        sound = parselmouth.Sound(audio, sampling_frequency=sr)
        formant = sound.to_formant_burg(time_step=0.01)

        formants_list = []
        for t in np.arange(0, sound.duration, 0.01):
            frame = []
            for i in range(1, n_formants + 1):
                try:
                    f = formant.get_value_at_time(i, t)
                except Exception:
                    f = 0.0
                # `f > 0` is False for NaN, so undefined values become 0.0
                frame.append(f if f > 0 else 0.0)
            formants_list.append(frame)

        formants_array = np.array(formants_list)

        mean_keys = [f'formant_f{i}' for i in range(1, n_formants + 1)]
        std_keys = [f'{key}_std' for key in mean_keys]
        result = dict.fromkeys(mean_keys + std_keys, 0.0)

        if len(formants_array) > 0:
            for col, (mean_key, std_key) in enumerate(zip(mean_keys, std_keys)):
                track = formants_array[:, col]
                if np.any(track > 0):
                    result[mean_key] = np.mean(track)
                    result[std_key] = np.std(track)
        return result
    except Exception:
        # Best-effort: any Praat failure falls back to the LPC estimator
        return extract_formants_lpc(audio, sr, n_formants)

def extract_quality_metrics(audio, sr=SAMPLE_RATE):
    """Compute the four signal-quality indicators used for noise assessment.

    Returns a dict with:
        spectral_flatness: mean over frames of geometric / arithmetic mean of
            the STFT magnitude.
        harmonic_noise_ratio: mean power of librosa's harmonic component
            divided by the mean power of its percussive component.
        zcr_mean: mean zero-crossing rate.
        energy_cv: coefficient of variation (std / mean) of frame RMS energy.
    """
    eps = 1e-10

    # Spectral flatness per frame, then averaged
    spec_mag = np.abs(librosa.stft(audio, hop_length=HOP_LENGTH)) + eps
    geo_mean = np.exp(np.mean(np.log(spec_mag), axis=0))
    arith_mean = np.mean(spec_mag, axis=0)
    flatness_per_frame = geo_mean / (arith_mean + eps)

    # Harmonic vs. percussive power
    harmonic_part = librosa.effects.harmonic(audio)
    percussive_part = librosa.effects.percussive(audio)
    harmonic_power = np.mean(harmonic_part**2)
    percussive_power = np.mean(percussive_part**2)

    # Frame-level zero-crossing rate and RMS energy
    zcr_track = librosa.feature.zero_crossing_rate(audio, hop_length=HOP_LENGTH)[0]
    rms_track = librosa.feature.rms(y=audio, hop_length=HOP_LENGTH)[0]

    return {
        'spectral_flatness': np.mean(flatness_per_frame),
        'harmonic_noise_ratio': harmonic_power / (percussive_power + eps),
        'zcr_mean': np.mean(zcr_track),
        'energy_cv': np.std(rms_track) / (np.mean(rms_track) + eps),
    }

def extract_all_features(audio_path, sr=SAMPLE_RATE):
    """Extract every feature group from one audio file.

    Loads the file as mono at ``sr`` and merges the MFCC, energy, spectral,
    formant (Parselmouth when available, otherwise LPC) and quality-metric
    dictionaries into a single dict.

    Args:
        audio_path: Path of the audio file to load.
        sr: Target sampling rate in Hz.

    Returns:
        dict of features, or ``None`` when loading or extraction fails. The
        failure is reported on stdout rather than dropped silently, so
        missing rows can be traced back to a file.
    """
    try:
        audio, _ = librosa.load(audio_path, sr=sr, mono=True)

        formant_extractor = (
            extract_formants_parselmouth if HAS_PARSELMOUTH else extract_formants_lpc
        )

        features = {}
        features.update(extract_mfcc_features(audio, sr))
        features.update(extract_energy_features(audio, sr))
        features.update(extract_spectral_features(audio, sr))
        features.update(formant_extractor(audio, sr))
        features.update(extract_quality_metrics(audio, sr))

        return features
    except Exception as e:
        print(f"Feature extraction failed for {audio_path}: {e}")
        return None

def extract_spectrogram_window(audio_path, target_duration_ms=SPECTROGRAM_WINDOW_MS, sr=SAMPLE_RATE):
    """Extract a fixed-size log-mel spectrogram from one audio file.

    The audio is loaded as mono at ``sr`` and forced to exactly
    ``target_duration_ms``: shorter clips are zero-padded at the end, longer
    clips keep only their first ``target_duration_ms`` milliseconds.

    Args:
        audio_path: Path of the audio file to load.
        target_duration_ms: Window length in milliseconds.
        sr: Target sampling rate in Hz.

    Returns:
        Mel spectrogram in dB relative to its own maximum, with ``N_MELS``
        rows, or ``None`` when loading or extraction fails (the failure is
        reported on stdout).
    """
    try:
        audio, _ = librosa.load(audio_path, sr=sr, mono=True)
        target_samples = int(target_duration_ms / 1000 * sr)

        # NOTE(review): truncation keeps the START of the clip. If the clips
        # carry symmetric context around the phoneme, this drops trailing
        # context rather than cropping around the centre - confirm intended.
        if len(audio) < target_samples:
            audio = np.pad(audio, (0, target_samples - len(audio)), mode='constant')
        elif len(audio) > target_samples:
            audio = audio[:target_samples]

        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=sr,
            n_mels=N_MELS,
            hop_length=HOP_LENGTH,
            fmax=sr/2
        )

        return librosa.power_to_db(mel_spec, ref=np.max)
    except Exception as e:
        print(f"Spectrogram extraction failed for {audio_path}: {e}")
        return None

# Confirmation that the cell ran and the extraction helpers above are defined
print("Feature extraction functions defined.")


Feature extraction functions defined.


## 2. Extract Features from Context Audio Files


In [None]:
# Read the fresh phoneme interval table and report what it contains
print(f"Loading phoneme metadata from {PHONEMES_FILE.name}...")
df_phonemes = pd.read_csv(PHONEMES_FILE)
print(f"Phonemes shape (before filtering): {df_phonemes.shape}")
print(f"Phonemes columns: {df_phonemes.columns.tolist()}")

# This notebook only deals with the 'b' / 'p' contrast
print("\nFiltering to only 'b' and 'p' phonemes...")
df_phonemes = df_phonemes.loc[df_phonemes['phoneme'].isin(['b', 'p'])].copy()
print(f"Phonemes shape (after filtering b/p): {df_phonemes.shape}")
print("Phoneme distribution:", df_phonemes['phoneme'].value_counts(), sep='\n')

# phoneme_id: running number over the filtered rows, unique per phoneme
print("\nCreating phoneme_id column...")
df_phonemes['phoneme_id'] = range(len(df_phonemes))
print(f"Created {len(df_phonemes)} unique phoneme IDs")

# class: the target label, identical to the phoneme symbol in this notebook
print("\nCreating class column...")
df_phonemes['class'] = df_phonemes['phoneme']
print("Class distribution:", df_phonemes['class'].value_counts(), sep='\n')

# Audio paths are resolved against the context audio directory below
print("\nUpdating audio paths to use context audio files...")
def find_context_audio_path(row):
    """Find the context audio file that belongs to one phoneme row.

    Lookup order:
      1. The exact file ``{utterance_id}__{phoneme}__{start_ms}-{end_ms}.wav``
         in ``PHONEME_WAV_DIR``.
      2. Among files of the same utterance and phoneme, the one whose
         ``start-end`` interval in the file name is closest to the row's
         interval (ties and unparsable names resolve by sorted file name, so
         the choice is deterministic).
      3. The row's existing ``audio_path`` value, if it names a real file.

    Args:
        row: Mapping/Series with ``utterance_id``, ``start_ms``, ``end_ms``
            and ``phoneme`` (or ``class``); ``audio_path`` is optional.

    Returns:
        The path as ``str``, or ``None`` when no file could be found.
    """
    utt_id = row['utterance_id']
    phoneme = row.get('phoneme', row.get('class', ''))
    start_ms = int(row['start_ms'])
    end_ms = int(row['end_ms'])

    audio_path = PHONEME_WAV_DIR / f"{utt_id}__{phoneme}__{start_ms}-{end_ms}.wav"
    if audio_path.exists():
        return str(audio_path)

    if PHONEME_WAV_DIR.exists():
        matching_files = sorted(PHONEME_WAV_DIR.glob(f"{utt_id}__{phoneme}__*.wav"))
        if matching_files:
            def _interval_distance(path):
                """Distance in ms between the file's interval and the row's."""
                try:
                    start_txt, end_txt = path.stem.rsplit('__', 1)[-1].split('-')
                    return abs(int(start_txt) - start_ms) + abs(int(end_txt) - end_ms)
                except ValueError:
                    return float('inf')

            # NOTE(review): the nearest interval may still belong to a
            # different occurrence of the phoneme; consider a tolerance.
            return str(min(matching_files, key=_interval_distance))

    # Only accept a pre-existing path that is a non-empty string/Path naming
    # a file: Path('') resolves to '.', which exists, and a missing value
    # may be NaN rather than a string.
    original = row.get('audio_path', '')
    if isinstance(original, (str, Path)) and str(original):
        original_path = Path(original)
        if original_path.is_file():
            return str(original_path)

    return None

# Resolve the audio path of every phoneme against the context directory.
# When the table already carried an audio_path column, also report how many
# rows could be resolved.
if 'audio_path' not in df_phonemes.columns:
    print("Creating audio_path column...")
    df_phonemes['audio_path'] = df_phonemes.apply(find_context_audio_path, axis=1)
else:
    print("Updating audio_path column...")
    df_phonemes['audio_path'] = df_phonemes.apply(find_context_audio_path, axis=1)
    valid_paths = df_phonemes['audio_path'].notna()
    print(f"Found context audio files: {valid_paths.sum()} / {len(df_phonemes)}")
    if (~valid_paths).any():
        print(f"Warning: {(~valid_paths).sum()} audio files not found in context directory")

# Keep only the phonemes for which an audio file was found
df_phonemes = df_phonemes.loc[df_phonemes['audio_path'].notna()].copy()
print(f"\nFinal dataset size: {len(df_phonemes):,} phonemes with context audio")

# Tell the user that an existing features file is about to be overwritten
if EXTRACT_FEATURES:
    if FEATURES_FILE.exists():
        print(
            f"\nFeatures file already exists: {FEATURES_FILE}",
            "Set EXTRACT_FEATURES=False to skip extraction and load existing features.",
            "Proceeding with extraction (will overwrite existing file)...",
            sep='\n',
        )

# Either (re-)extract features and spectrograms from the context audio and
# save them, or load the previously saved files. Both branches leave
# `df_features` (one row per phoneme) and `spectrograms_dict`
# (integer phoneme_id -> spectrogram array) defined.
if EXTRACT_FEATURES:
    print(f"\n{'='*60}")
    print("EXTRACTING FEATURES FROM CONTEXT AUDIO FILES")
    print(f"{'='*60}")
    print(f"This will process {len(df_phonemes):,} phonemes with context windows")
    print(f"Audio files are longer (~300ms) due to ±100ms context")
    print(f"{'='*60}\n")

    # Helper function for parallel processing
    def process_single_phoneme(row_data):
        """Extract features for one ``(index, row)`` pair.

        Returns the feature dict extended with ``phoneme_id``, ``class`` and
        ``duration_ms`` from the row, or ``None`` when the audio file is
        missing or extraction failed.
        """
        idx, row = row_data
        audio_path = row['audio_path']
        if audio_path is None or not Path(audio_path).exists():
            return None

        features = extract_all_features(audio_path)
        if features is not None:
            features['phoneme_id'] = row['phoneme_id']
            features['class'] = row['class']
            features['duration_ms'] = row['duration_ms']
        return features

    # Use parallel processing
    num_workers = os.cpu_count() or 4
    print(f"Using {num_workers} parallel workers...")

    features_list = []
    rows_to_process = list(df_phonemes.iterrows())

    print("Extracting features from context audio files...")
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_row = {executor.submit(process_single_phoneme, row_data): row_data
                         for row_data in rows_to_process}

        for future in tqdm(as_completed(future_to_row), total=len(rows_to_process), desc="Extracting features"):
            try:
                features = future.result()
                if features is not None:
                    features_list.append(features)
            except Exception as e:
                idx, row = future_to_row[future]
                print(f"Error processing phoneme {row.get('phoneme_id', 'unknown')}: {e}")

    print(f"\nExtracted features for {len(features_list):,} phonemes")

    # Everything below needs at least one row; fail with a clear message
    # instead of an obscure error from the scaler or a missing column.
    if not features_list:
        raise RuntimeError(
            "No features could be extracted; check that the context audio files exist and are readable."
        )

    # Convert to DataFrame
    df_features = pd.DataFrame(features_list)
    print(f"\nFeature columns: {len(df_features.columns)}")
    print(f"Feature shape: {df_features.shape}")

    # Expand array columns into separate columns. A column counts as an
    # array column when it has object dtype and its first value is an ndarray.
    array_columns = []
    for col in df_features.columns:
        if df_features[col].dtype == 'object':
            sample = df_features[col].iloc[0] if len(df_features) > 0 else None
            if sample is not None and isinstance(sample, np.ndarray):
                array_columns.append(col)

    # Expand array columns: element i of column `col` becomes `col_i`
    for col in array_columns:
        array_length = len(df_features[col].iloc[0])
        for i in range(array_length):
            new_col_name = f"{col}_{i}"
            df_features[new_col_name] = df_features[col].apply(
                lambda x: x[i] if isinstance(x, np.ndarray) and len(x) > i else np.nan
            )
        df_features = df_features.drop(columns=[col])

    print(f"\nAfter expanding arrays: {len(df_features.columns)} columns")
    print(f"Feature shape: {df_features.shape}")

    # Quality assessment and outlier detection on the non-constant numeric
    # feature columns (identifiers and duration are left out)
    print("\nPerforming quality assessment...")
    numeric_cols = df_features.select_dtypes(include=[np.number]).columns.tolist()
    numeric_cols = [c for c in numeric_cols if c not in ['phoneme_id', 'duration_ms']]
    numeric_cols = [c for c in numeric_cols if df_features[c].std() > 1e-10]

    X_outlier = df_features[numeric_cols].fillna(0)
    scaler_outlier = StandardScaler()
    X_scaled = scaler_outlier.fit_transform(X_outlier)

    # contamination=0.1 makes the forest flag about 10% of rows as outliers
    iso_forest = IsolationForest(contamination=0.1, random_state=RANDOM_STATE)
    outlier_labels = iso_forest.fit_predict(X_scaled)
    df_features['is_outlier_iso'] = outlier_labels == -1
    print(f"Detected {df_features['is_outlier_iso'].sum():,} outliers ({df_features['is_outlier_iso'].mean()*100:.1f}%)")

    # Quality score: weighted sum (0.3 / 0.3 / 0.2 / 0.2) of the four quality
    # metrics, the last three scaled by their maximum over the dataset
    df_features['quality_score'] = (
        (1 - df_features['spectral_flatness'].fillna(0.5)) * 0.3 +
        (df_features['harmonic_noise_ratio'].fillna(1.0) / (df_features['harmonic_noise_ratio'].fillna(1.0).max() + 1e-10)) * 0.3 +
        (1 - df_features['zcr_mean'].fillna(0.5) / (df_features['zcr_mean'].fillna(0.5).max() + 1e-10)) * 0.2 +
        (1 - df_features['energy_cv'].fillna(1.0) / (df_features['energy_cv'].fillna(1.0).max() + 1e-10)) * 0.2
    )

    # Save features
    print(f"\nSaving features to {FEATURES_FILE}...")
    df_features.to_parquet(FEATURES_FILE, index=False)
    print(f"Features saved! Shape: {df_features.shape}")

    # Extract spectrograms from context audio
    print(f"\nExtracting spectrograms from context audio files...")
    spectrograms_dict = {}

    for idx, row in tqdm(df_phonemes.iterrows(), total=len(df_phonemes), desc="Extracting spectrograms"):
        audio_path = row['audio_path']
        # Plain int keys, the same key type the load branch produces
        phoneme_id = int(row['phoneme_id'])

        if audio_path is None or not Path(audio_path).exists():
            continue

        spec = extract_spectrogram_window(audio_path, target_duration_ms=SPECTROGRAM_WINDOW_MS)
        if spec is not None:
            spectrograms_dict[phoneme_id] = spec

    print(f"\nExtracted {len(spectrograms_dict):,} spectrograms")
    if spectrograms_dict:
        print(f"Spectrogram shape: {next(iter(spectrograms_dict.values())).shape}")

    # Save spectrograms: one gzip-compressed dataset per phoneme, named by
    # the phoneme_id rendered as text
    print(f"\nSaving spectrograms to {SPECTROGRAMS_FILE}...")
    with h5py.File(SPECTROGRAMS_FILE, 'w') as f:
        for phoneme_id, spec in tqdm(spectrograms_dict.items(), desc="Saving spectrograms"):
            f.create_dataset(str(phoneme_id), data=spec, compression='gzip')
    print(f"Spectrograms saved!")

else:
    # Load existing features
    print(f"\nLoading existing features from {FEATURES_FILE}...")
    df_features = pd.read_parquet(FEATURES_FILE)
    print(f"Features shape: {df_features.shape}")
    print(f"Features columns: {len(df_features.columns)}")

    # Load existing spectrograms
    print(f"\nLoading existing spectrograms from {SPECTROGRAMS_FILE}...")
    spectrograms_dict = {}
    with h5py.File(SPECTROGRAMS_FILE, 'r') as f:
        phoneme_ids = list(f.keys())
        for phoneme_id in tqdm(phoneme_ids, desc="Loading spectrograms"):
            # HDF5 dataset names are strings; convert back to the integer
            # phoneme_id so lookups against the integer `phoneme_id` column
            # match, exactly as they do after a fresh extraction.
            spectrograms_dict[int(phoneme_id)] = f[phoneme_id][:]
    print(f"Loaded {len(spectrograms_dict):,} spectrograms")


Loading phoneme metadata from phoneme_intervals.csv...
Phonemes shape (before filtering): (1337749, 5)
Phonemes columns: ['utterance_id', 'phoneme', 'start_ms', 'end_ms', 'duration_ms']

Filtering to only 'b' and 'p' phonemes...
Phonemes shape (after filtering b/p): (36903, 5)
Phoneme distribution:
phoneme
b    25874
p    11029
Name: count, dtype: int64

Creating phoneme_id column...
Created 36903 unique phoneme IDs

Creating class column...
Class distribution:
class
b    25874
p    11029
Name: count, dtype: int64

Updating audio paths to use context audio files...
Creating audio_path column...

Final dataset size: 36,903 phonemes with context audio

Features file already exists: /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models_with_context/features/features.parquet
Set EXTRACT_FEATURES=False to skip extraction and load existing features.
Proceeding with extraction (will overwrite existing file)...

EXTRACTING FEATURES FROM CONTEXT AUDIO FILES
This will process 36,903 phoneme

Extracting features: 100%|██████████| 36903/36903 [08:00<00:00, 76.78it/s]



Extracted features for 36,903 phonemes

Feature columns: 32
Feature shape: (36903, 32)

After expanding arrays: 110 columns
Feature shape: (36903, 110)

Performing quality assessment...
Detected 3,691 outliers (10.0%)

Saving features to /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models_with_context/features/features.parquet...
Features saved! Shape: (36903, 112)

Extracting spectrograms from context audio files...


Extracting spectrograms: 100%|██████████| 36903/36903 [01:11<00:00, 518.76it/s]



Extracted 36,903 spectrograms
Spectrogram shape: (128, 7)

Saving spectrograms to /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models_with_context/features/spectrograms.h5...


Saving spectrograms: 100%|██████████| 36903/36903 [00:04<00:00, 8135.94it/s]


Spectrograms saved!


## 3. Merge Features and Phoneme Data


In [None]:
# Join the phoneme metadata with the extracted features on phoneme_id.
# Columns present on both sides keep their name for the metadata copy and
# get a '_features' suffix for the copy coming from df_features.
print("Merging features with phoneme metadata...")
df = df_phonemes.merge(df_features, on='phoneme_id', how='inner', suffixes=('', '_features'))
print(f"Merged dataset shape: {df.shape}")

# The feature-side copy of the label is redundant
df = df.drop(columns=['class_features'], errors='ignore')

# Make sure a 'class' column exists, deriving it from 'phoneme' if necessary
if 'class' in df.columns:
    print("\n'class' column found in merged DataFrame")
elif 'phoneme' in df.columns:
    print("\n'class' column not found, creating from 'phoneme' column...")
    df['class'] = df['phoneme']
else:
    raise ValueError("Neither 'class' nor 'phoneme' column found in merged DataFrame")

# Keep only the b and p classes (drop pf if it is present)
if df['class'].isin(['pf']).any():
    print("\nFiltering out 'pf' class, keeping only 'b' and 'p'...")
    df = df.loc[df['class'].isin(['b', 'p'])].copy()
    print(f"Dataset after filtering: {len(df)} samples")

# Report absolute and relative class frequencies
print("\nClass distribution:")
print(df['class'].value_counts())
print("\nClass distribution (%):")
print(df['class'].value_counts(normalize=True) * 100)

# Integer-encode the target
le = LabelEncoder()
df['class_encoded'] = le.fit_transform(df['class'])  # b=0, p=1
print(f"\nClass encoding: {dict(zip(le.classes_, le.transform(le.classes_)))}")

# Flag the phonemes for which a spectrogram is available
df['has_spectrogram'] = df['phoneme_id'].isin(spectrograms_dict.keys())
print(f"\nPhonemes with spectrograms: {df['has_spectrogram'].sum()} / {len(df)}")

# Feature columns = every numeric column that is not metadata.
# Note: with the suffixes used in the merge above, the metadata duration is
# 'duration_ms' (excluded) and the feature-side copy is
# 'duration_ms_features', which is NOT in this list and is therefore kept as
# a feature. The '_x' / '_y' entries only matter for pandas' default suffixes.
exclude_cols = ['phoneme_id', 'utterance_id', 'phoneme', 'class', 'class_x', 'class_y',
                'class_encoded', 'start_ms', 'end_ms', 'duration_ms', 'duration_ms_x',
                'audio_path', 'is_outlier_iso', 'split', 'has_spectrogram',
                'class_features']
feature_cols = [
    col for col in df.columns
    if col not in exclude_cols and pd.api.types.is_numeric_dtype(df[col])
]

print(f"\nNumber of feature columns: {len(feature_cols)}")
print(f"First 10 features: {feature_cols[:10]}")


Merging features with phoneme metadata...
Merged dataset shape: (36903, 119)

'class' column found in merged DataFrame

Class distribution:
class
b    25874
p    11029
Name: count, dtype: int64

Class distribution (%):
class
b    70.113541
p    29.886459
Name: proportion, dtype: float64

Class encoding: {'b': np.int64(0), 'p': np.int64(1)}

Phonemes with spectrograms: 36903 / 36903

Number of feature columns: 109
First 10 features: ['energy_rms', 'energy_rms_std', 'energy_zcr', 'energy_zcr_std', 'spectral_centroid', 'spectral_centroid_std', 'spectral_rolloff', 'spectral_rolloff_std', 'spectral_bandwidth', 'spectral_bandwidth_std']


## 4. Train/Val/Test Split


In [None]:
# Persist the ordered list of feature column names; it is needed later to
# rebuild the same model input when loading models.
(OUTPUT_DIR / 'feature_cols.json').write_text(json.dumps(feature_cols, indent=2))
print(f"Feature columns saved to {OUTPUT_DIR / 'feature_cols.json'}")


Feature columns saved to /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models_with_context/feature_cols.json


In [6]:
# Only phonemes that have a spectrogram take part in the split
df = df.loc[df['has_spectrogram']].copy()
print(f"Dataset after filtering: {len(df)} samples")

# Stratified 70/15/15 split on the DataFrame index, done in two steps:
# first hold out 15% for test, then take 0.176 (≈ 15/85) of the remainder
# for validation.
X_temp, X_test, y_temp, y_test = train_test_split(
    df.index,
    df['class_encoded'],
    test_size=0.15,
    stratify=df['class_encoded'],
    random_state=RANDOM_STATE,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.176,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

# Record the split membership of every row
df['split'] = 'train'
for split_name, split_ids in (('val', X_val), ('test', X_test)):
    df.loc[split_ids, 'split'] = split_name

# Size and per-class counts of each split
for heading, split_ids in (("\nTrain", X_train), ("Val", X_val), ("Test", X_test)):
    print(f"{heading} set: {len(split_ids):,} samples ({len(split_ids)/len(df)*100:.1f}%)")
    print(f"  Class distribution: {np.bincount(df.loc[split_ids, 'class_encoded'])}")

# Save split indices (values of df.index, converted to plain ints for JSON)
split_indices = {
    'train': list(map(int, X_train)),
    'val': list(map(int, X_val)),
    'test': list(map(int, X_test)),
}

with open(OUTPUT_DIR / 'split_indices.json', 'w') as f:
    json.dump(split_indices, f)
print(f"\nSplit indices saved to {OUTPUT_DIR / 'split_indices.json'}")


Dataset after filtering: 36903 samples

Train set: 25,846 samples (70.0%)
  Class distribution: [18122  7724]
Val set: 5,521 samples (15.0%)
  Class distribution: [3871 1650]
Test set: 5,536 samples (15.0%)
  Class distribution: [3881 1655]

Split indices saved to /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models_with_context/split_indices.json


## 5. Create PyTorch Dataset Classes


In [None]:
class SpectrogramDataset(Dataset):
    """Spectrogram-only dataset for a single split.

    Keeps the rows of ``df`` whose ``split`` column equals ``split`` and, per
    item, looks the spectrogram up in ``spectrograms_dict`` by ``phoneme_id``.
    Each item is ``(spectrogram, label)`` where the spectrogram is float32,
    min-max scaled per sample, and the label is a ``torch.long`` scalar.
    """
    def __init__(self, df, spectrograms_dict, split='train', transform=None):
        in_split = df['split'] == split
        self.df = df[in_split].reset_index(drop=True)
        self.transform = transform
        self.spectrograms_dict = spectrograms_dict

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]

        spec = self.spectrograms_dict[record['phoneme_id']].astype(np.float32)

        # 2-D input gets a leading channel axis for CNNs, e.g. (128, 7) -> (1, 128, 7)
        if spec.ndim == 2:
            spec = spec[np.newaxis, ...]

        # Per-sample min-max scaling to [0, 1]; the epsilon keeps a constant
        # spectrogram from dividing by zero
        lo = spec.min()
        hi = spec.max()
        spec = (spec - lo) / (hi - lo + 1e-8)

        # The optional transform receives the scaled NumPy array
        if self.transform:
            spec = self.transform(spec)

        label = record['class_encoded']
        return torch.from_numpy(spec), torch.tensor(label, dtype=torch.long)


class FeatureDataset(Dataset):
    """Feature-vector-only dataset for a single split.

    The whole feature matrix is prepared once in the constructor: the columns
    ``feature_cols`` are cast to float32, NaN and +/-inf are replaced by 0 and
    the result is optionally standardised. With ``fit_scaler=True`` a new
    ``StandardScaler`` is fitted on this split; otherwise a given ``scaler`` is
    applied as-is; with neither, the values are left unscaled. The scaler that
    was used (or ``None``) is exposed as ``self.scaler``.
    """
    def __init__(self, df, feature_cols, scaler=None, split='train', fit_scaler=False):
        in_split = df['split'] == split
        self.df = df[in_split].reset_index(drop=True)
        self.feature_cols = feature_cols

        # Feature matrix with non-finite entries zeroed out
        matrix = np.nan_to_num(
            self.df[feature_cols].values.astype(np.float32),
            nan=0.0, posinf=0.0, neginf=0.0,
        )

        # A freshly fitted scaler takes precedence over a supplied one
        self.scaler = StandardScaler() if fit_scaler else scaler
        if fit_scaler:
            matrix = self.scaler.fit_transform(matrix)
        elif self.scaler is not None:
            matrix = self.scaler.transform(matrix)

        self.X = torch.from_numpy(matrix)
        self.y = torch.from_numpy(self.df['class_encoded'].values).long()

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Everything was tensorised up front, so this is a plain lookup
        return self.X[idx], self.y[idx]


class HybridDataset(Dataset):
    """Dataset for hybrid models that consume a spectrogram and a feature vector.

    Feature vectors are prepared once in the constructor (float32 cast,
    NaN/+-inf replaced by 0, optional scaling: a new ``StandardScaler`` when
    ``fit_scaler=True``, else the supplied ``scaler``, else none).
    Spectrograms are fetched per item from ``spectrograms_dict`` by
    ``phoneme_id`` and min-max scaled per sample. Items have the form
    ``((spectrogram, features), label)``.
    """
    def __init__(self, df, spectrograms_dict, feature_cols, scaler=None, split='train', fit_scaler=False, transform=None):
        in_split = df['split'] == split
        self.df = df[in_split].reset_index(drop=True)
        self.spectrograms_dict = spectrograms_dict
        self.feature_cols = feature_cols
        self.transform = transform

        # Feature matrix with non-finite entries zeroed out
        matrix = np.nan_to_num(
            self.df[feature_cols].values.astype(np.float32),
            nan=0.0, posinf=0.0, neginf=0.0,
        )

        # A freshly fitted scaler takes precedence over a supplied one
        self.scaler = StandardScaler() if fit_scaler else scaler
        if fit_scaler:
            matrix = self.scaler.fit_transform(matrix)
        elif self.scaler is not None:
            matrix = self.scaler.transform(matrix)

        self.X_features = torch.from_numpy(matrix)
        self.y = torch.from_numpy(self.df['class_encoded'].values).long()

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]

        spec = self.spectrograms_dict[record['phoneme_id']].astype(np.float32)
        # 2-D input gets a leading channel axis
        if spec.ndim == 2:
            spec = spec[np.newaxis, ...]
        # Per-sample min-max scaling; the epsilon guards constant input
        lo = spec.min()
        hi = spec.max()
        spec = (spec - lo) / (hi - lo + 1e-8)

        # The transform is applied to the spectrogram only
        if self.transform:
            spec = self.transform(spec)

        inputs = (torch.from_numpy(spec), self.X_features[idx])
        return inputs, self.y[idx]


class RawAudioDataset(Dataset):
    """Dataset for models using raw audio waveforms (with context).

    Keeps the rows of ``df`` whose ``split`` column equals ``split``. Each item
    is ``(waveform, label)``: the file at the row's ``audio_path`` is loaded as
    mono audio at ``sample_rate``, peak-normalised, and, when ``max_length`` is
    given, zero-padded at the end or truncated to exactly ``max_length``
    samples. The waveform is returned as a float32 tensor, the label as a
    ``torch.long`` scalar.

    If the path is missing/NaN, does not exist, or loading raises an ordinary
    exception, 200 ms of silence is substituted so that iteration can continue.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately *not* swallowed.
    """
    def __init__(self, df, split='train', sample_rate=16000, max_length=None, transform=None):
        self.df = df[df['split'] == split].reset_index(drop=True)
        self.sample_rate = sample_rate
        self.max_length = max_length
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def _load_waveform(self, audio_path):
        """Return the mono waveform at ``audio_path`` or 200 ms of silence."""
        try:
            if pd.notna(audio_path) and Path(audio_path).exists():
                waveform, _ = librosa.load(audio_path, sr=self.sample_rate, mono=True)
                return waveform
        except Exception as exc:
            # Best-effort fallback, but only for ordinary errors: a bare
            # ``except:`` would also trap Ctrl+C / interpreter shutdown and
            # turn them into silent all-zero training samples.
            warnings.warn(
                f"Could not load audio {audio_path!r} ({exc}); substituting silence",
                RuntimeWarning,
            )
        # Invalid path or failed load: 200ms of silence
        return np.zeros(self.sample_rate // 5)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Load audio (with context - longer duration than the bare phoneme)
        audio = self._load_waveform(row['audio_path'])

        # Peak-normalise; the epsilon keeps all-zero audio from dividing by zero
        if len(audio) > 0:
            audio = audio / (np.abs(audio).max() + 1e-8)

        # Pad or truncate to max_length
        # Note: With context, audio is longer, so max_length should be adjusted accordingly
        if self.max_length is not None:
            if len(audio) < self.max_length:
                audio = np.pad(audio, (0, self.max_length - len(audio)), mode='constant')
            else:
                audio = audio[:self.max_length]

        if self.transform:
            audio = self.transform(audio)

        label = row['class_encoded']

        return torch.from_numpy(audio.astype(np.float32)), torch.tensor(label, dtype=torch.long)


class ContextAudioDataset(Dataset):
    """Dataset for models using raw audio with context from original utterance.

    Keeps the rows of ``df`` whose ``split`` column equals ``split``. Each item
    is ``((phoneme_audio, context_audio), label)``. Both waveforms come from
    the same file (the row's ``audio_path``), so they hold the same
    peak-normalised samples unless ``transform`` makes them differ; the
    transform is invoked once per waveform.

    If the path is missing/NaN, does not exist, or loading raises an ordinary
    exception, 200 ms of silence is substituted. ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately *not* swallowed.

    NOTE(review): waveforms are neither padded nor truncated here, so items
    can differ in length; batching them presumably needs a custom
    ``collate_fn`` - confirm against the DataLoader that consumes this class.
    """
    def __init__(self, df, split='train', sample_rate=16000, transform=None):
        self.df = df[df['split'] == split].reset_index(drop=True)
        self.sample_rate = sample_rate
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def _load_waveform(self, audio_path):
        """Return the mono waveform at ``audio_path`` or 200 ms of silence."""
        try:
            if pd.notna(audio_path) and Path(audio_path).exists():
                waveform, _ = librosa.load(audio_path, sr=self.sample_rate, mono=True)
                return waveform
        except Exception as exc:
            # Best-effort fallback, but only for ordinary errors: a bare
            # ``except:`` would also trap Ctrl+C / interpreter shutdown and
            # turn them into silent all-zero training samples.
            warnings.warn(
                f"Could not load audio {audio_path!r} ({exc}); substituting silence",
                RuntimeWarning,
            )
        return np.zeros(self.sample_rate // 5)  # 200ms

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Load phoneme audio (already includes context from phoneme_wav_with_context)
        loaded = self._load_waveform(row['audio_path'])

        # Peak-normalise; the epsilon keeps all-zero audio from dividing by zero
        if len(loaded) > 0:
            loaded = loaded / (np.abs(loaded).max() + 1e-8)

        # The file already contains the context, so it serves as both inputs.
        # The copy keeps the two arrays independent of each other.
        phoneme_audio = loaded
        context_audio = loaded.copy()

        if self.transform:
            phoneme_audio = self.transform(phoneme_audio)
            context_audio = self.transform(context_audio)

        label = row['class_encoded']

        return (
            torch.from_numpy(phoneme_audio.astype(np.float32)),
            torch.from_numpy(context_audio.astype(np.float32))
        ), torch.tensor(label, dtype=torch.long)


class SequenceDataset(Dataset):
    """Spectrograms served as sequences for LSTM / Transformer style models.

    Keeps the rows of ``df`` whose ``split`` column equals ``split``. Each
    spectrogram is fetched from ``spectrograms_dict`` by ``phoneme_id``,
    transposed, and min-max scaled per sample. Items are
    ``(sequence, label)`` with a float32 sequence tensor and a ``torch.long``
    label.
    """
    def __init__(self, df, spectrograms_dict, split='train', transform=None):
        in_split = df['split'] == split
        self.df = df[in_split].reset_index(drop=True)
        self.transform = transform
        self.spectrograms_dict = spectrograms_dict

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]

        # Swap the axes, e.g. (128, 7) -> (7, 128): 7 time steps of 128 features
        sequence = self.spectrograms_dict[record['phoneme_id']].astype(np.float32).T

        # Per-sample min-max scaling; the epsilon guards constant input
        lo = sequence.min()
        hi = sequence.max()
        sequence = (sequence - lo) / (hi - lo + 1e-8)

        if self.transform:
            sequence = self.transform(sequence)

        label = record['class_encoded']
        return torch.from_numpy(sequence), torch.tensor(label, dtype=torch.long)

print("Dataset classes defined!")


Dataset classes defined!


In [None]:
import joblib

# Fit the feature scaler on the training split only, with non-finite values
# zeroed out first
train_df = df[df['split'] == 'train']
X_train_features = np.nan_to_num(
    train_df[feature_cols].values.astype(np.float32),
    nan=0.0, posinf=0.0, neginf=0.0,
)
feature_scaler = StandardScaler().fit(X_train_features)

# Persist the fitted scaler
scaler_path = OUTPUT_DIR / 'feature_scaler.joblib'
joblib.dump(feature_scaler, scaler_path)
print(f"Feature scaler saved to {scaler_path}")

# 'balanced' class weights derived from the training labels
train_class_encoded = train_df['class_encoded']
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_class_encoded),
    y=train_class_encoded,
)
class_weights_dict = dict(enumerate(class_weights))
print(f"\nClass weights: {class_weights_dict}")

# Persist the class weights
class_weights_path = OUTPUT_DIR / 'class_weights.json'
with open(class_weights_path, 'w') as f:
    json.dump(class_weights_dict, f)
print(f"Class weights saved to {class_weights_path}")

# One dataset per input type and split, always built in train/val/test order
SPLIT_NAMES = ('train', 'val', 'test')

train_spectrogram_ds, val_spectrogram_ds, test_spectrogram_ds = (
    SpectrogramDataset(df, spectrograms_dict, split=name) for name in SPLIT_NAMES
)

# Every split is transformed with the scaler fitted on the training data
train_feature_ds, val_feature_ds, test_feature_ds = (
    FeatureDataset(df, feature_cols, scaler=feature_scaler, split=name)
    for name in SPLIT_NAMES
)

train_hybrid_ds, val_hybrid_ds, test_hybrid_ds = (
    HybridDataset(df, spectrograms_dict, feature_cols, scaler=feature_scaler, split=name)
    for name in SPLIT_NAMES
)

train_sequence_ds, val_sequence_ds, test_sequence_ds = (
    SequenceDataset(df, spectrograms_dict, split=name) for name in SPLIT_NAMES
)

# Note: With context, audio is longer (~300ms instead of ~100ms), so max_length is set accordingly
# Original phoneme ~100ms + 200ms context = ~300ms total
# At 16kHz: 300ms = 4800 samples
train_raw_audio_ds, val_raw_audio_ds, test_raw_audio_ds = (
    RawAudioDataset(df, split=name, sample_rate=16000, max_length=4800)
    for name in SPLIT_NAMES
)

train_context_audio_ds, val_context_audio_ds, test_context_audio_ds = (
    ContextAudioDataset(df, split=name, sample_rate=16000) for name in SPLIT_NAMES
)

print("\nAll datasets created!")
for kind, train_ds in (
    ('spectrogram', train_spectrogram_ds),
    ('feature', train_feature_ds),
    ('hybrid', train_hybrid_ds),
    ('sequence', train_sequence_ds),
    ('raw audio', train_raw_audio_ds),
    ('context audio', train_context_audio_ds),
):
    print(f"Train {kind} dataset: {len(train_ds)} samples")


Feature scaler saved to /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models_with_context/feature_scaler.joblib

Class weights: {0: np.float64(0.7131111356362433), 1: np.float64(1.6730968410150182)}
Class weights saved to /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models_with_context/class_weights.json

All datasets created!
Train spectrogram dataset: 25846 samples
Train feature dataset: 25846 samples
Train hybrid dataset: 25846 samples
Train sequence dataset: 25846 samples
Train raw audio dataset: 25846 samples
Train context audio dataset: 25846 samples


## 6. Create DataLoaders with Weighted Sampling


In [None]:
# Weighted sampling: every training example is drawn with the weight of its
# class, with replacement, for as many draws as there are training examples
train_labels = df[df['split'] == 'train']['class_encoded'].values
sample_weights = np.take(class_weights, train_labels)
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)

# Create DataLoaders
BATCH_SIZE = 64


def _build_loaders(train_ds, val_ds, test_ds):
    """Return (train, val, test) loaders; only the train loader uses the sampler."""
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    return train_loader, val_loader, test_loader


train_spectrogram_loader, val_spectrogram_loader, test_spectrogram_loader = _build_loaders(
    train_spectrogram_ds, val_spectrogram_ds, test_spectrogram_ds
)
train_feature_loader, val_feature_loader, test_feature_loader = _build_loaders(
    train_feature_ds, val_feature_ds, test_feature_ds
)
train_hybrid_loader, val_hybrid_loader, test_hybrid_loader = _build_loaders(
    train_hybrid_ds, val_hybrid_ds, test_hybrid_ds
)
train_sequence_loader, val_sequence_loader, test_sequence_loader = _build_loaders(
    train_sequence_ds, val_sequence_ds, test_sequence_ds
)
train_raw_audio_loader, val_raw_audio_loader, test_raw_audio_loader = _build_loaders(
    train_raw_audio_ds, val_raw_audio_ds, test_raw_audio_ds
)
train_context_audio_loader, val_context_audio_loader, test_context_audio_loader = _build_loaders(
    train_context_audio_ds, val_context_audio_ds, test_context_audio_ds
)

print("All DataLoaders created!")
print(f"\nTrain batches (spectrogram): {len(train_spectrogram_loader)}")
print(f"Train batches (feature): {len(train_feature_loader)}")
print(f"Train batches (hybrid): {len(train_hybrid_loader)}")

# Smoke test: pull one batch from the spectrogram loader
print("\nTesting a batch from spectrogram dataset...")
sample_batch = next(iter(train_spectrogram_loader))
spectrogram_inputs, spectrogram_targets = sample_batch[0], sample_batch[1]
print(f"Batch shape: {spectrogram_inputs.shape}, Labels shape: {spectrogram_targets.shape}")

# Smoke test: pull one raw-audio batch (should be longer due to context)
print("\nTesting a batch from raw audio dataset (with context)...")
sample_audio_batch = next(iter(train_raw_audio_loader))
audio_inputs, audio_targets = sample_audio_batch[0], sample_audio_batch[1]
print(f"Audio batch shape: {audio_inputs.shape}, Labels shape: {audio_targets.shape}")
print(f"Audio duration: {audio_inputs.shape[1] / 16000 * 1000:.1f}ms (expected ~300ms with context)")


All DataLoaders created!

Train batches (spectrogram): 404
Train batches (feature): 404
Train batches (hybrid): 404

Testing a batch from spectrogram dataset...
Batch shape: torch.Size([64, 1, 128, 7]), Labels shape: torch.Size([64])

Testing a batch from raw audio dataset (with context)...
Audio batch shape: torch.Size([64, 4800]), Labels shape: torch.Size([64])
Audio duration: 300.0ms (expected ~300ms with context)


## 7. Save Dataset Information


In [None]:
# Gather the per-split frames once; they feed both the sample counts and the
# class distributions below
split_frames = {name: df[df['split'] == name] for name in ('train', 'val', 'test')}
# Shape is taken from the first spectrogram in the dictionary
first_spectrogram = spectrograms_dict[next(iter(spectrograms_dict))]

dataset_info = {
    'total_samples': len(df),
    'train_samples': len(split_frames['train']),
    'val_samples': len(split_frames['val']),
    'test_samples': len(split_frames['test']),
    'n_features': len(feature_cols),
    'spectrogram_shape': list(first_spectrogram.shape),
    'context_window_ms': 100,  # ±100ms context
    'audio_max_length_samples': 4800,  # ~300ms at 16kHz (100ms phoneme + 200ms context)
    'class_distribution': {
        name: frame['class'].value_counts().to_dict()
        for name, frame in split_frames.items()
    },
    'class_weights': class_weights_dict,
    'feature_columns': feature_cols,
    'audio_source': 'phoneme_wav_with_context'
}

# Write the summary next to the other artifacts
info_path = OUTPUT_DIR / 'dataset_info.json'
with open(info_path, 'w') as f:
    json.dump(dataset_info, f, indent=2)

max_len = dataset_info['audio_max_length_samples']
summary_lines = [
    f"Dataset info saved to {info_path}",
    f"\nDataset summary:",
    f"  Total samples: {dataset_info['total_samples']}",
    f"  Train: {dataset_info['train_samples']}",
    f"  Val: {dataset_info['val_samples']}",
    f"  Test: {dataset_info['test_samples']}",
    f"  Features: {dataset_info['n_features']}",
    f"  Spectrogram shape: {dataset_info['spectrogram_shape']}",
    f"  Context window: ±{dataset_info['context_window_ms']}ms",
    # samples / 16 converts to milliseconds at 16 kHz
    f"  Audio max length: {max_len} samples (~{max_len/16:.0f}ms)",
    f"  Audio source: {dataset_info['audio_source']}",
]
for line in summary_lines:
    print(line)


Dataset info saved to /Volumes/SSanDisk/SpeechRec-German/artifacts/b-p_dl_models_with_context/dataset_info.json

Dataset summary:
  Total samples: 36903
  Train: 25846
  Val: 5521
  Test: 5536
  Features: 109
  Spectrogram shape: [128, 7]
  Context window: ±100ms
  Audio max length: 4800 samples (~300ms)
  Audio source: phoneme_wav_with_context
