# D-T Phoneme DL Data Preparation (with Context) - Version 2

**Version 2** - Extended feature extraction with VOT, burst features, low-frequency energy, and enhanced quality filtering.

Preparation of PyTorch datasets for deep learning models using phoneme audio with extended context windows:
- **Extract features from context audio files** (±100ms context from `phoneme_wav_with_context`)
- Extract spectrograms from context audio files
- Create PyTorch Dataset classes for different input types
- Train/Val/Test split with stratification
- Data normalization
- DataLoader creation with batch sampling
- Handle class imbalance

**New features in Version 2:**
- Voice Onset Time (VOT) extraction
- Burst-specific features (ZCR around burst, burst spectral centroid)
- Low-frequency energy (voicing) features
- Plosive duration validation (closure/burst length checks)
- Enhanced quality filtering (conservative thresholds)
- VAD (Voice Activity Detection) support

**Key difference from 02.1:** 
- Features are re-extracted from the context audio files (~300 ms each) instead of reusing previously computed features
- Uses extended context windows (±100ms) for better capture of coarticulation, formant transitions, and VOT
- All features (MFCC, energy, spectral, formants, quality metrics, VOT, burst features) reflect the extended context


In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import h5py
import librosa
import soundfile as sf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import IsolationForest
from tqdm import tqdm
import warnings
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from scipy import signal
warnings.filterwarnings('ignore')

# Optional dependencies: record their availability in HAS_* flags so that the
# rest of the notebook can branch on them instead of failing at import time.
try:
    import parselmouth
except ImportError:
    HAS_PARSELMOUTH = False
    print("Warning: parselmouth not installed. Will use LPC for formant extraction.")
else:
    HAS_PARSELMOUTH = True

try:
    import webrtcvad
except ImportError:
    HAS_WEBRTCVAD = False
    print("Warning: webrtcvad not installed. VAD features will be limited.")
else:
    HAS_WEBRTCVAD = True

# Configuration
# Project root: the parent directory when the notebook is started from a
# directory named 'notebooks', otherwise the current working directory.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
# Audio files with context are in the main artifacts directory
PHONEME_WAV_DIR = PROJECT_ROOT / 'artifacts' / 'phoneme_wav_with_context'  # With context!
PHONEMES_FILE = PROJECT_ROOT / 'artifacts' / 'phoneme_intervals.csv'  # Load from fresh phoneme intervals file

# Output directories (created on import of this cell if they do not exist yet)
OUTPUT_DIR = PROJECT_ROOT / 'artifacts' / 'd-t_dl_models_with_context_v2'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
FEATURES_OUTPUT_DIR = OUTPUT_DIR / 'features'
FEATURES_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

FEATURES_FILE = FEATURES_OUTPUT_DIR / 'features.parquet'
SPECTROGRAMS_FILE = FEATURES_OUTPUT_DIR / 'spectrograms.h5'

# Flag to control whether to extract features or load existing
EXTRACT_FEATURES = True  # Set to False to skip extraction and load existing features

# Seed NumPy and PyTorch so that the random steps below are repeatable
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Device setup: prefer MPS, then CUDA, then CPU.
# `torch.backends.mps` is looked up defensively because PyTorch builds that
# predate the MPS backend do not expose that attribute at all.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
    print("Using MPS device")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device")
else:
    device = torch.device("cpu")
    print("Using CPU device")

print(f"Project root: {PROJECT_ROOT}")
print(f"Phoneme audio directory (with context): {PHONEME_WAV_DIR}")
print(f"Output directory (Version 2): {OUTPUT_DIR}")
print(f"Features output directory: {FEATURES_OUTPUT_DIR}")
print(f"Extract features: {EXTRACT_FEATURES}")


Using MPS device
Project root: /Volumes/SSanDisk/SpeechRec-German
Phoneme audio directory (with context): /Volumes/SSanDisk/SpeechRec-German/artifacts/phoneme_wav_with_context
Output directory (Version 2): /Volumes/SSanDisk/SpeechRec-German/artifacts/d-t_dl_models_with_context_v2
Features output directory: /Volumes/SSanDisk/SpeechRec-German/artifacts/d-t_dl_models_with_context_v2/features
Extract features: True


## 1. Feature Extraction Functions


In [2]:
# Import feature extraction functions from utils module
from utils.dl_data_preparation import (
    extract_all_features,
    extract_spectrogram_window,
    SAMPLE_RATE, N_MELS, HOP_LENGTH, MFCC_N_COEFFS, SPECTROGRAM_WINDOW_MS
)

print("Feature extraction functions imported from utils.dl_data_preparation")


Feature extraction functions imported from utils.dl_data_preparation


## 2. Extract Features from Context Audio Files


In [None]:
from utils.dl_data_preparation import find_context_audio_path
# Read the phoneme interval table (phoneme_intervals.csv)
print(f"Loading phoneme metadata from {PHONEMES_FILE.name}...")
df_phonemes = pd.read_csv(PHONEMES_FILE)
print(f"Phonemes shape (before filtering): {df_phonemes.shape}")
print(f"Phonemes columns: {list(df_phonemes.columns)}")

# This notebook only handles the d/t contrast, so every other phoneme is dropped
print("\nFiltering to only 'd' and 't' phonemes...")
is_d_or_t = df_phonemes['phoneme'].isin(['d', 't'])
df_phonemes = df_phonemes[is_d_or_t].copy()
print(f"Phonemes shape (after filtering d/t): {df_phonemes.shape}")
print(f"Phoneme distribution:")
print(df_phonemes['phoneme'].value_counts())

# phoneme_id is a running number over the filtered rows; it is the key used
# later to join features and spectrograms back onto this table
print("\nCreating phoneme_id column...")
n_phonemes = len(df_phonemes)
df_phonemes['phoneme_id'] = range(n_phonemes)
print(f"Created {len(df_phonemes)} unique phoneme IDs")

# The classification target is the phoneme label itself
print("\nCreating class column...")
df_phonemes['class'] = df_phonemes['phoneme']
print(f"Class distribution:")
print(df_phonemes['class'].value_counts())

# Point every row at its audio file inside the context-audio directory
print("\nUpdating audio paths to use context audio files...")

# The lookup is the same whether or not the column already exists; only the
# progress messages (and the found/missing report) depend on that.
had_audio_path_column = 'audio_path' in df_phonemes.columns
if had_audio_path_column:
    print("Updating audio_path column...")
else:
    print("Creating audio_path column...")

df_phonemes['audio_path'] = df_phonemes.apply(
    lambda row: find_context_audio_path(row, PHONEME_WAV_DIR),
    axis=1,
)

if had_audio_path_column:
    valid_paths = df_phonemes['audio_path'].notna()
    print(f"Found context audio files: {valid_paths.sum()} / {len(df_phonemes)}")
    if not valid_paths.all():
        print(f"Warning: {len(df_phonemes) - valid_paths.sum()} audio files not found in context directory")

# Keep only rows for which a context audio path was returned
df_phonemes = df_phonemes.dropna(subset=['audio_path']).copy()
print(f"\nFinal dataset size: {len(df_phonemes):,} phonemes with context audio")

# Extract (or load) the tabular features and the spectrograms for every phoneme.
#
# Results of this cell:
#   df_features        - one row per phoneme, keyed by 'phoneme_id'
#   spectrograms_dict  - maps phoneme_id -> spectrogram array
#
# Warn that an earlier extraction result is about to be overwritten.
if EXTRACT_FEATURES and FEATURES_FILE.exists():
    print(f"\nFeatures file already exists: {FEATURES_FILE}")
    print("Set EXTRACT_FEATURES=False to skip extraction and load existing features.")
    print("Proceeding with extraction (will overwrite existing file)...")

if EXTRACT_FEATURES:
    print(f"\n{'='*60}")
    print("EXTRACTING FEATURES FROM CONTEXT AUDIO FILES")
    print(f"{'='*60}")
    print(f"This will process {len(df_phonemes):,} phonemes with context windows")
    print(f"Audio files are longer (~300ms) due to ±100ms context")
    print(f"{'='*60}\n")

    # Helper function for parallel processing
    def process_single_phoneme(row_data):
        """Extract the features of one phoneme.

        Args:
            row_data: ``(index, row)`` pair as produced by ``DataFrame.iterrows()``;
                the row must provide 'audio_path', 'phoneme_id', 'class' and
                'duration_ms'.

        Returns:
            The object returned by ``extract_all_features`` with 'phoneme_id',
            'class' and 'duration_ms' added, or ``None`` when the audio file is
            missing or the extractor returned ``None``.
        """
        _, row = row_data
        audio_path = row['audio_path']
        if audio_path is None or not Path(audio_path).exists():
            return None

        features = extract_all_features(audio_path, phoneme_type='d-t')
        if features is not None:
            features['phoneme_id'] = row['phoneme_id']
            features['class'] = row['class']
            features['duration_ms'] = row['duration_ms']
        return features

    # Use parallel processing
    # NOTE(review): a thread pool only speeds this up if extract_all_features
    # spends its time in code that releases the GIL - confirm by profiling.
    num_workers = os.cpu_count() or 4
    print(f"Using {num_workers} parallel workers...")

    features_list = []
    rows_to_process = list(df_phonemes.iterrows())

    print("Extracting features from context audio files...")
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_row = {executor.submit(process_single_phoneme, row_data): row_data
                         for row_data in rows_to_process}

        for future in tqdm(as_completed(future_to_row), total=len(rows_to_process), desc="Extracting features"):
            try:
                features = future.result()
            except Exception as e:
                # Best effort: report the failing phoneme and keep going
                _, row = future_to_row[future]
                print(f"Error processing phoneme {row.get('phoneme_id', 'unknown')}: {e}")
            else:
                if features is not None:
                    features_list.append(features)

    print(f"\nExtracted features for {len(features_list):,} phonemes")

    # Fail early with a clear message instead of an obscure error further down
    if not features_list:
        raise RuntimeError(
            f"No features could be extracted from {PHONEME_WAV_DIR}; "
            "check the audio paths and the extraction errors printed above."
        )

    # Convert to DataFrame.
    # as_completed() yields results in completion order, which differs from run
    # to run. Sorting by phoneme_id makes the row order - and with it the saved
    # parquet file and the seeded outlier detection below - reproducible.
    df_features = (
        pd.DataFrame(features_list)
        .sort_values('phoneme_id')
        .reset_index(drop=True)
    )
    print(f"\nFeature columns: {len(df_features.columns)}")
    print(f"Feature shape: {df_features.shape}")

    # Find the columns that hold one NumPy array per row. The first array found
    # in the column defines the expanded width, so a missing value in the first
    # row does not hide the column.
    array_columns = []
    array_lengths = {}
    for col in df_features.columns:
        if df_features[col].dtype == 'object':
            sample = next((v for v in df_features[col] if isinstance(v, np.ndarray)), None)
            if sample is not None:
                array_columns.append(col)
                array_lengths[col] = len(sample)

    # Expand every array column into scalar columns named '<col>_<i>'.
    # All new columns of one source column are built first and attached with a
    # single concat, which avoids fragmenting the frame by repeated insertion.
    for col in array_columns:
        expanded = pd.DataFrame(
            {
                f"{col}_{i}": df_features[col].apply(
                    # i is bound as a default so each column reads its own index;
                    # rows without an array (or with a shorter one) become NaN
                    lambda x, i=i: x[i] if isinstance(x, np.ndarray) and len(x) > i else np.nan
                )
                for i in range(array_lengths[col])
            },
            index=df_features.index,
        )
        df_features = pd.concat([df_features.drop(columns=[col]), expanded], axis=1)

    print(f"\nAfter expanding arrays: {len(df_features.columns)} columns")
    print(f"Feature shape: {df_features.shape}")

    # Quality assessment and outlier detection
    print("\nPerforming quality assessment...")
    numeric_cols = df_features.select_dtypes(include=[np.number]).columns.tolist()
    numeric_cols = [c for c in numeric_cols if c not in ['phoneme_id', 'duration_ms']]
    # Drop (near-)constant columns; they carry no information for the detector
    numeric_cols = [c for c in numeric_cols if df_features[c].std() > 1e-10]

    X_outlier = df_features[numeric_cols].fillna(0)
    scaler_outlier = StandardScaler()
    X_scaled = scaler_outlier.fit_transform(X_outlier)

    # contamination=0.1 means the detector flags 10% of the rows as outliers
    iso_forest = IsolationForest(contamination=0.1, random_state=RANDOM_STATE)
    outlier_labels = iso_forest.fit_predict(X_scaled)
    df_features['is_outlier_iso'] = outlier_labels == -1
    print(f"Detected {df_features['is_outlier_iso'].sum():,} outliers ({df_features['is_outlier_iso'].mean()*100:.1f}%)")

    # Quality score: weighted sum (0.3 / 0.3 / 0.2 / 0.2) of four terms; the
    # last three are normalised by the column maximum over the whole table.
    df_features['quality_score'] = (
        (1 - df_features['spectral_flatness'].fillna(0.5)) * 0.3 +
        (df_features['harmonic_noise_ratio'].fillna(1.0) / (df_features['harmonic_noise_ratio'].fillna(1.0).max() + 1e-10)) * 0.3 +
        (1 - df_features['zcr_mean'].fillna(0.5) / (df_features['zcr_mean'].fillna(0.5).max() + 1e-10)) * 0.2 +
        (1 - df_features['energy_cv'].fillna(1.0) / (df_features['energy_cv'].fillna(1.0).max() + 1e-10)) * 0.2
    )

    # Save features
    print(f"\nSaving features to {FEATURES_FILE}...")
    df_features.to_parquet(FEATURES_FILE, index=False)
    print(f"Features saved! Shape: {df_features.shape}")

    # Extract spectrograms from context audio
    print(f"\nExtracting spectrograms from context audio files...")
    spectrograms_dict = {}

    for _, row in tqdm(df_phonemes.iterrows(), total=len(df_phonemes), desc="Extracting spectrograms"):
        audio_path = row['audio_path']
        phoneme_id = row['phoneme_id']

        if audio_path is None or not Path(audio_path).exists():
            continue

        spec = extract_spectrogram_window(audio_path, target_duration_ms=SPECTROGRAM_WINDOW_MS)
        if spec is not None:
            spectrograms_dict[phoneme_id] = spec

    print(f"\nExtracted {len(spectrograms_dict):,} spectrograms")
    if spectrograms_dict:
        print(f"Spectrogram shape: {next(iter(spectrograms_dict.values())).shape}")

    # Save spectrograms: one gzip-compressed HDF5 dataset per phoneme, named by
    # the string form of its phoneme_id
    print(f"\nSaving spectrograms to {SPECTROGRAMS_FILE}...")
    with h5py.File(SPECTROGRAMS_FILE, 'w') as f:
        for phoneme_id, spec in tqdm(spectrograms_dict.items(), desc="Saving spectrograms"):
            f.create_dataset(str(phoneme_id), data=spec, compression='gzip')
    print(f"Spectrograms saved!")

else:
    # Load existing features
    print(f"\nLoading existing features from {FEATURES_FILE}...")
    df_features = pd.read_parquet(FEATURES_FILE)
    print(f"Features shape: {df_features.shape}")
    print(f"Features columns: {len(df_features.columns)}")

    # Load existing spectrograms
    print(f"\nLoading existing spectrograms from {SPECTROGRAMS_FILE}...")
    spectrograms_dict = {}
    with h5py.File(SPECTROGRAMS_FILE, 'r') as f:
        phoneme_ids = list(f.keys())
        for phoneme_id in tqdm(phoneme_ids, desc="Loading spectrograms"):
            # Convert string keys to int to match DataFrame phoneme_id type
            spectrograms_dict[int(phoneme_id)] = f[phoneme_id][:]
    print(f"Loaded {len(spectrograms_dict):,} spectrograms")


Loading phoneme metadata from phoneme_intervals.csv...
Phonemes shape (before filtering): (1337749, 5)
Phonemes columns: ['utterance_id', 'phoneme', 'start_ms', 'end_ms', 'duration_ms']

Filtering to only 'd' and 't' phonemes...
Phonemes shape (after filtering d/t): (132992, 5)
Phoneme distribution:
phoneme
t    74454
d    58538
Name: count, dtype: int64

Creating phoneme_id column...
Created 132992 unique phoneme IDs

Creating class column...
Class distribution:
class
t    74454
d    58538
Name: count, dtype: int64

Updating audio paths to use context audio files...
Creating audio_path column...

Final dataset size: 132,992 phonemes with context audio

Features file already exists: /Volumes/SSanDisk/SpeechRec-German/artifacts/d-t_dl_models_with_context_v2/features/features.parquet
Set EXTRACT_FEATURES=False to skip extraction and load existing features.
Proceeding with extraction (will overwrite existing file)...

EXTRACTING FEATURES FROM CONTEXT AUDIO FILES
This will process 132,992 

Extracting features: 100%|██████████| 132992/132992 [36:04<00:00, 61.43it/s]



Extracted features for 132,992 phonemes

Feature columns: 54
Feature shape: (132992, 54)

After expanding arrays: 132 columns
Feature shape: (132992, 132)

Performing quality assessment...
Detected 13,300 outliers (10.0%)

Saving features to /Volumes/SSanDisk/SpeechRec-German/artifacts/d-t_dl_models_with_context_v2/features/features.parquet...
Features saved! Shape: (132992, 134)

Extracting spectrograms from context audio files...


Extracting spectrograms: 100%|██████████| 132992/132992 [37:01<00:00, 59.85it/s]   



Extracted 132,992 spectrograms
Spectrogram shape: (128, 7)

Saving spectrograms to /Volumes/SSanDisk/SpeechRec-German/artifacts/d-t_dl_models_with_context_v2/features/spectrograms.h5...


Saving spectrograms: 100%|██████████| 132992/132992 [00:15<00:00, 8562.27it/s]


Spectrograms saved!


## 3. Merge Features and Phoneme Data


In [4]:
# Join the extracted features onto the phoneme metadata via phoneme_id.
# Columns present on both sides keep their name for the metadata copy and get
# the '_features' suffix for the feature-table copy.
print("Merging features with phoneme metadata...")
df = df_phonemes.merge(df_features, on='phoneme_id', how='inner', suffixes=('', '_features'))
print(f"Merged dataset shape: {df.shape}")

# The feature-side copy of the label is redundant; drop it when it exists
df = df.drop(columns=['class_features'], errors='ignore')

# Make sure a 'class' column is available, falling back to 'phoneme'
if 'class' in df.columns:
    print("\n'class' column found in merged DataFrame")
elif 'phoneme' in df.columns:
    print("\n'class' column not found, creating from 'phoneme' column...")
    df['class'] = df['phoneme']
else:
    raise ValueError("Neither 'class' nor 'phoneme' column found in merged DataFrame")

# Restrict to the d and t classes, but only when 'pf' rows are present
if (df['class'] == 'pf').any():
    print("\nFiltering out 'pf' class, keeping only 'd' and 't'...")
    keep_rows = df['class'].isin(['d', 't'])
    df = df[keep_rows].copy()
    print(f"Dataset after filtering: {len(df)} samples")

# Report how the classes are balanced
class_counts = df['class'].value_counts()
print(f"\nClass distribution:")
print(class_counts)
print(f"\nClass distribution (%):")
print(df['class'].value_counts(normalize=True) * 100)

# Integer-encode the target (LabelEncoder sorts the labels, so d=0, t=1)
le = LabelEncoder()
df['class_encoded'] = le.fit_transform(df['class'])
print(f"\nClass encoding: {dict(zip(le.classes_, le.transform(le.classes_)))}")

# Flag the rows for which a spectrogram is available
df['has_spectrogram'] = df['phoneme_id'].isin(spectrograms_dict.keys())
print(f"\nPhonemes with spectrograms: {df['has_spectrogram'].sum()} / {len(df)}")

# Model inputs are all numeric columns that are not metadata, labels or flags.
# The metadata 'duration_ms' is excluded. A feature-side duration column that
# the merge renamed to 'duration_ms_features' is not in this list and therefore
# stays in as a feature. The '_x' / '_y' names only match merges that use
# pandas' default suffixes.
exclude_cols = ['phoneme_id', 'utterance_id', 'phoneme', 'class', 'class_x', 'class_y',
                'class_encoded', 'start_ms', 'end_ms', 'duration_ms', 'duration_ms_x',
                'audio_path', 'is_outlier_iso', 'split', 'has_spectrogram',
                'class_features']
feature_cols = [
    col
    for col in df.columns
    # pd.api.types is used for consistency with data_loader.py
    if col not in exclude_cols and pd.api.types.is_numeric_dtype(df[col])
]

print(f"\nNumber of feature columns: {len(feature_cols)}")
print(f"First 10 features: {feature_cols[:10]}")


Merging features with phoneme metadata...
Merged dataset shape: (132992, 141)

'class' column found in merged DataFrame

Class distribution:
class
t    74454
d    58538
Name: count, dtype: int64

Class distribution (%):
class
t    55.983819
d    44.016181
Name: proportion, dtype: float64

Class encoding: {'d': np.int64(0), 't': np.int64(1)}

Phonemes with spectrograms: 132992 / 132992

Number of feature columns: 130
First 10 features: ['energy_rms', 'energy_rms_std', 'energy_zcr', 'energy_zcr_std', 'spectral_centroid', 'spectral_centroid_std', 'spectral_rolloff', 'spectral_rolloff_std', 'spectral_bandwidth', 'spectral_bandwidth_std']


## 4. Train/Val/Test Split


In [5]:
# Persist the ordered list of feature column names; it is needed to rebuild the
# same model input when models are loaded later.
feature_cols_json = json.dumps(feature_cols, indent=2)
with open(OUTPUT_DIR / 'feature_cols.json', 'w') as f:
    f.write(feature_cols_json)
print(f"Feature columns saved to {OUTPUT_DIR / 'feature_cols.json'}")


Feature columns saved to /Volumes/SSanDisk/SpeechRec-German/artifacts/d-t_dl_models_with_context_v2/feature_cols.json


In [6]:
# Only rows that have a spectrogram take part in the split
df = df.loc[df['has_spectrogram']].copy()
print(f"Dataset after filtering: {len(df)} samples")

# Stratified 70/15/15 split, done in two steps on the row index of df.
# NOTE(review): the split is per phoneme; if several phonemes come from the same
# utterance, their context windows may overlap across splits - confirm whether a
# grouped split is needed.
encoded_labels = df['class_encoded']

# Step 1: hold out 15% of all rows as the test set
X_temp, X_test, y_temp, y_test = train_test_split(
    df.index,
    encoded_labels,
    test_size=0.15,
    stratify=encoded_labels,
    random_state=RANDOM_STATE,
)

# Step 2: take 0.176 (≈ 15/85) of the remainder as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.176,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

# Record the assignment in a 'split' column ('train' unless overridden)
df['split'] = 'train'
for split_name, split_index in (('val', X_val), ('test', X_test)):
    df.loc[split_index, 'split'] = split_name


def _class_counts(index):
    """Return the number of rows per encoded class for the given index labels."""
    return np.bincount(df.loc[index, 'class_encoded'])


n_total = len(df)
print(f"\nTrain set: {len(X_train):,} samples ({len(X_train)/n_total*100:.1f}%)")
print(f"  Class distribution: {_class_counts(X_train)}")
print(f"Val set: {len(X_val):,} samples ({len(X_val)/n_total*100:.1f}%)")
print(f"  Class distribution: {_class_counts(X_val)}")
print(f"Test set: {len(X_test):,} samples ({len(X_test)/n_total*100:.1f}%)")
print(f"  Class distribution: {_class_counts(X_test)}")

# Save the index labels of each split as plain ints so they are JSON-serialisable
split_indices = {
    'train': list(map(int, X_train)),
    'val': list(map(int, X_val)),
    'test': list(map(int, X_test)),
}

split_indices_path = OUTPUT_DIR / 'split_indices.json'
with open(split_indices_path, 'w') as f:
    json.dump(split_indices, f)
print(f"\nSplit indices saved to {OUTPUT_DIR / 'split_indices.json'}")


Dataset after filtering: 132992 samples

Train set: 93,147 samples (70.0%)
  Class distribution: [41000 52147]
Val set: 19,896 samples (15.0%)
  Class distribution: [ 8757 11139]
Test set: 19,949 samples (15.0%)
  Class distribution: [ 8781 11168]

Split indices saved to /Volumes/SSanDisk/SpeechRec-German/artifacts/d-t_dl_models_with_context_v2/split_indices.json


## 5. Create PyTorch Dataset Classes


In [7]:
# Import Dataset classes from utils module
from utils.dl_data_preparation import (
    SpectrogramDataset,
    FeatureDataset,
    HybridDataset,
    RawAudioDataset,
    ContextAudioDataset,
    SequenceDataset
)

print("Dataset classes imported from utils.dl_data_preparation")


Dataset classes imported from utils.dl_data_preparation


In [8]:
import joblib

# Fit the feature scaler on the training rows only; NaN and +/-inf become 0
train_df = df[df['split'] == 'train']
X_train_features = np.nan_to_num(
    train_df[feature_cols].values.astype(np.float32),
    nan=0.0,
    posinf=0.0,
    neginf=0.0,
)
feature_scaler = StandardScaler().fit(X_train_features)

# Persist the fitted scaler next to the other artifacts
joblib.dump(feature_scaler, OUTPUT_DIR / 'feature_scaler.joblib')
print(f"Feature scaler saved to {OUTPUT_DIR / 'feature_scaler.joblib'}")

# 'balanced' class weights, computed from the training labels
train_targets = train_df['class_encoded']
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_targets),
    y=train_targets,
)
class_weights_dict = dict(enumerate(class_weights))
print(f"\nClass weights: {class_weights_dict}")

# Persist the class weights
with open(OUTPUT_DIR / 'class_weights.json', 'w') as f:
    json.dump(class_weights_dict, f)
print(f"Class weights saved to {OUTPUT_DIR / 'class_weights.json'}")

# Build one dataset per (input type, split). Every generator below yields the
# train, val and test dataset in that order.
_SPLITS = ('train', 'val', 'test')

train_spectrogram_ds, val_spectrogram_ds, test_spectrogram_ds = (
    SpectrogramDataset(df, spectrograms_dict, split=name) for name in _SPLITS
)

train_feature_ds, val_feature_ds, test_feature_ds = (
    FeatureDataset(df, feature_cols, scaler=feature_scaler, split=name) for name in _SPLITS
)

train_hybrid_ds, val_hybrid_ds, test_hybrid_ds = (
    HybridDataset(df, spectrograms_dict, feature_cols, scaler=feature_scaler, split=name)
    for name in _SPLITS
)

train_sequence_ds, val_sequence_ds, test_sequence_ds = (
    SequenceDataset(df, spectrograms_dict, split=name) for name in _SPLITS
)

# With context the clips are longer: ~100ms phoneme + 200ms context = ~300ms,
# which is 4800 samples at 16kHz, so max_length is set accordingly.
train_raw_audio_ds, val_raw_audio_ds, test_raw_audio_ds = (
    RawAudioDataset(df, split=name, sample_rate=16000, max_length=4800) for name in _SPLITS
)

train_context_audio_ds, val_context_audio_ds, test_context_audio_ds = (
    ContextAudioDataset(df, split=name, sample_rate=16000) for name in _SPLITS
)

print("\nAll datasets created!")
print(f"Train spectrogram dataset: {len(train_spectrogram_ds)} samples")
print(f"Train feature dataset: {len(train_feature_ds)} samples")
print(f"Train hybrid dataset: {len(train_hybrid_ds)} samples")
print(f"Train sequence dataset: {len(train_sequence_ds)} samples")
print(f"Train raw audio dataset: {len(train_raw_audio_ds)} samples")
print(f"Train context audio dataset: {len(train_context_audio_ds)} samples")


Feature scaler saved to /Volumes/SSanDisk/SpeechRec-German/artifacts/d-t_dl_models_with_context_v2/feature_scaler.joblib

Class weights: {0: np.float64(1.1359390243902439), 1: np.float64(0.8931194507833624)}
Class weights saved to /Volumes/SSanDisk/SpeechRec-German/artifacts/d-t_dl_models_with_context_v2/class_weights.json

All datasets created!
Train spectrogram dataset: 93147 samples
Train feature dataset: 93147 samples
Train hybrid dataset: 93147 samples
Train sequence dataset: 93147 samples
Train raw audio dataset: 93147 samples
Train context audio dataset: 93147 samples


## 6. Create DataLoaders with Weighted Sampling


In [9]:
# Per-sample weights for the training rows: each row gets the weight of its class.
# NOTE(review): this assumes every training Dataset enumerates its samples in the
# same order as df[df['split'] == 'train'] - confirm in utils.dl_data_preparation.
train_labels = df[df['split'] == 'train']['class_encoded'].values
sample_weights = np.asarray(class_weights)[train_labels]
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)

# Create DataLoaders
BATCH_SIZE = 64


def _train_loader(dataset):
    """Training loader: draws samples through the shared weighted sampler."""
    return DataLoader(dataset, batch_size=BATCH_SIZE, sampler=sampler, num_workers=0)


def _eval_loader(dataset):
    """Validation/test loader: fixed order, no shuffling."""
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)


train_spectrogram_loader = _train_loader(train_spectrogram_ds)
val_spectrogram_loader = _eval_loader(val_spectrogram_ds)
test_spectrogram_loader = _eval_loader(test_spectrogram_ds)

train_feature_loader = _train_loader(train_feature_ds)
val_feature_loader = _eval_loader(val_feature_ds)
test_feature_loader = _eval_loader(test_feature_ds)

train_hybrid_loader = _train_loader(train_hybrid_ds)
val_hybrid_loader = _eval_loader(val_hybrid_ds)
test_hybrid_loader = _eval_loader(test_hybrid_ds)

train_sequence_loader = _train_loader(train_sequence_ds)
val_sequence_loader = _eval_loader(val_sequence_ds)
test_sequence_loader = _eval_loader(test_sequence_ds)

train_raw_audio_loader = _train_loader(train_raw_audio_ds)
val_raw_audio_loader = _eval_loader(val_raw_audio_ds)
test_raw_audio_loader = _eval_loader(test_raw_audio_ds)

train_context_audio_loader = _train_loader(train_context_audio_ds)
val_context_audio_loader = _eval_loader(val_context_audio_ds)
test_context_audio_loader = _eval_loader(test_context_audio_ds)

print("All DataLoaders created!")
print(f"\nTrain batches (spectrogram): {len(train_spectrogram_loader)}")
print(f"Train batches (feature): {len(train_feature_loader)}")
print(f"Train batches (hybrid): {len(train_hybrid_loader)}")

# Smoke test: pull one batch from the spectrogram loader
print("\nTesting a batch from spectrogram dataset...")
sample_batch = next(iter(train_spectrogram_loader))
print(f"Batch shape: {sample_batch[0].shape}, Labels shape: {sample_batch[1].shape}")

# Smoke test: pull one raw-audio batch (clips are longer because of the context)
print("\nTesting a batch from raw audio dataset (with context)...")
sample_audio_batch = next(iter(train_raw_audio_loader))
print(f"Audio batch shape: {sample_audio_batch[0].shape}, Labels shape: {sample_audio_batch[1].shape}")
print(f"Audio duration: {sample_audio_batch[0].shape[1] / 16000 * 1000:.1f}ms (expected ~300ms with context)")


All DataLoaders created!

Train batches (spectrogram): 1456
Train batches (feature): 1456
Train batches (hybrid): 1456

Testing a batch from spectrogram dataset...
Batch shape: torch.Size([64, 1, 128, 7]), Labels shape: torch.Size([64])

Testing a batch from raw audio dataset (with context)...
Audio batch shape: torch.Size([64, 4800]), Labels shape: torch.Size([64])
Audio duration: 300.0ms (expected ~300ms with context)


## 7. Save Dataset Information


In [None]:
# Collect a summary of the prepared dataset and write it to dataset_info.json
_split_names = ('train', 'val', 'test')
_rows_by_split = {name: df[df['split'] == name] for name in _split_names}

# Any stored spectrogram serves as the shape reference; take the first key
_first_spectrogram_id = list(spectrograms_dict)[0]

dataset_info = {
    'total_samples': len(df),
    'train_samples': len(_rows_by_split['train']),
    'val_samples': len(_rows_by_split['val']),
    'test_samples': len(_rows_by_split['test']),
    'n_features': len(feature_cols),
    'spectrogram_shape': list(spectrograms_dict[_first_spectrogram_id].shape),
    'context_window_ms': 100,  # ±100ms context
    'audio_max_length_samples': 4800,  # ~300ms at 16kHz (100ms phoneme + 200ms context)
    'class_distribution': {
        name: _rows_by_split[name]['class'].value_counts().to_dict()
        for name in _split_names
    },
    'class_weights': class_weights_dict,
    'feature_columns': feature_cols,
    'audio_source': 'phoneme_wav_with_context'
}

with open(OUTPUT_DIR / 'dataset_info.json', 'w') as f:
    json.dump(dataset_info, f, indent=2)

print(f"Dataset info saved to {OUTPUT_DIR / 'dataset_info.json'}")
print(f"\nDataset summary:")
print(f"  Total samples: {dataset_info['total_samples']}")
print(f"  Train: {dataset_info['train_samples']}")
print(f"  Val: {dataset_info['val_samples']}")
print(f"  Test: {dataset_info['test_samples']}")
print(f"  Features: {dataset_info['n_features']}")
print(f"  Spectrogram shape: {dataset_info['spectrogram_shape']}")
print(f"  Context window: ±{dataset_info['context_window_ms']}ms")
# 16 samples per millisecond at 16kHz, hence the division by 16
print(f"  Audio max length: {dataset_info['audio_max_length_samples']} samples (~{dataset_info['audio_max_length_samples']/16:.0f}ms)")
print(f"  Audio source: {dataset_info['audio_source']}")


Dataset info saved to /Volumes/SSanDisk/SpeechRec-German/artifacts/d-t_dl_models_with_context_v2/dataset_info.json

Dataset summary:
  Total samples: 132992
  Train: 93147
  Val: 19896
  Test: 19949
  Features: 130
  Spectrogram shape: [128, 7]
  Context window: ±100ms
  Audio max length: 4800 samples (~300ms)
  Audio source: phoneme_wav_with_context


: 