# AeroGuard: Privacy-Preserving On-Device Cough Monitoring via TinyML
## Data Analysis and Preparation Notebook

This notebook walks through:
1. **Audio Normalization** - Convert to 16kHz, 16-bit, mono
2. **Windowing & Slicing** - 1-second windows with 500ms overlap
3. **MFCC Feature Extraction** - Convert to 2D spectrograms
4. **Dataset Balancing** - 40/30/30 split (Cough/Human/Background)
5. **Train/Test Split** - 80/20 with stratification
6. **Visualization** - Waveforms, spectrograms, statistics

### Project Goals
‚úÖ **>90% accuracy** on cough detection  
‚úÖ **Zero privacy** - all processing on-device  
‚úÖ **Low power** - optimized for ESP32  
‚úÖ **Professional** - ready for portfolio/submission

## Section 1: Import Required Libraries

In [8]:
# Import libraries for audio processing, data management, and visualization
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
import json
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
sns.set_style("darkgrid")
plt.rcParams['figure.figsize'] = (14, 6)

print("‚úì All libraries imported successfully")
print(f"librosa version: {librosa.__version__}")
print(f"numpy version: {np.__version__}")
print(f"pandas version: {pd.__version__}")

‚úì All libraries imported successfully
librosa version: 0.11.0
numpy version: 2.4.2
pandas version: 3.0.0


## Section 2: Dataset Overview

In [9]:
# Set up data paths
ROOT_DIR = Path(r"c:\HS\TML1")
COUGHVID_DIR = ROOT_DIR / "public_dataset"
ESC50_DIR = ROOT_DIR / "ESC-50-master"
OUTPUT_DIR = ROOT_DIR / "Project_AeroGuard_Data"

# List datasets
print("üìÅ DATASET INVENTORY")
print("=" * 60)

# Count COUGHVID files
coughvid_files = list(COUGHVID_DIR.glob("*.wav"))
print(f"\n‚úì COUGHVID Dataset:")
print(f"  ‚Ä¢ Audio files: {len(coughvid_files)}")
print(f"  ‚Ä¢ Directory: {COUGHVID_DIR}")

# Count ESC-50 files
esc50_files = list(ESC50_DIR.glob("audio/*.wav"))
print(f"\n‚úì ESC-50 Dataset:")
print(f"  ‚Ä¢ Audio files: {len(esc50_files)}")
print(f"  ‚Ä¢ Directory: {ESC50_DIR / 'audio'}")

# Load ESC-50 metadata
esc50_csv = ESC50_DIR / "meta" / "esc50.csv"
df_esc50 = pd.read_csv(esc50_csv)
print(f"  ‚Ä¢ Categories: {df_esc50['target'].nunique()}")
print(f"  ‚Ä¢ Total entries: {len(df_esc50)}")

print(f"\n‚úì Total audio files available: {len(coughvid_files) + len(esc50_files)}")

üìÅ DATASET INVENTORY

‚úì COUGHVID Dataset:
  ‚Ä¢ Audio files: 27550
  ‚Ä¢ Directory: c:\HS\TML1\public_dataset

‚úì ESC-50 Dataset:
  ‚Ä¢ Audio files: 2000
  ‚Ä¢ Directory: c:\HS\TML1\ESC-50-master\audio
  ‚Ä¢ Categories: 50
  ‚Ä¢ Total entries: 2000

‚úì Total audio files available: 29550


## Section 3: Audio Normalization Pipeline

All audio must be normalized to:
- **Sample Rate**: 16,000 Hz (16kHz) - ESP32 sweet spot
- **Bit Depth**: 16-bit PCM - Standard for audio
- **Channels**: Mono - Reduces processing & memory

In [None]:
# Test audio normalization on a sample file
def load_and_normalize_audio(file_path, target_sr=16000):
    """
    Load and normalize audio to 16kHz, 16-bit, mono.
    
    Args:
        file_path: Path to audio file
        target_sr: Target sample rate (16000 Hz for ESP32)
    
    Returns:
        audio: Normalized audio array
        sr: Sample rate
    """
    try:
        # Load audio with target sample rate
        y, sr = librosa.load(file_path, sr=target_sr, mono=True)
        
        # Normalize amplitude
        if np.max(np.abs(y)) > 0:
            y = y / np.max(np.abs(y))
        
        # Convert to 16-bit PCM
        y_int16 = np.int16(y * 32767)
        
        return y_int16, sr
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None, None

# Test with first COUGHVID file
sample_file = coughvid_files[0]
audio, sr = load_and_normalize_audio(sample_file)

print(f"üìä SAMPLE AUDIO ANALYSIS")
print("=" * 60)
print(f"File: {sample_file.name}")
print(f"Sample rate: {sr} Hz")
print(f"Duration: {len(audio) / sr:.2f} seconds")
print(f"Audio shape: {audio.shape}")
print(f"Data type: {audio.dtype}")
print(f"Min value: {audio.min()}")
print(f"Max value: {audio.max()}")

# Visualize waveform
fig, ax = plt.subplots(figsize=(12, 4))
librosa.display.waveshow(audio.astype(np.float32) / 32768, sr=sr, ax=ax)
ax.set_title(f"Waveform: {sample_file.name}")
ax.set_xlabel("Time (s)")
ax.set_ylabel("Amplitude")
plt.tight_layout()
plt.show()

print(f"\n‚úì Audio loaded and normalized successfully")

Error loading c:\HS\TML1\public_dataset\00014dcc-0f06-4c27-8c7b-737b18a2cf4c.wav: Numba needs NumPy 2.3 or less. Got NumPy 2.4.
üìä SAMPLE AUDIO ANALYSIS
File: 00014dcc-0f06-4c27-8c7b-737b18a2cf4c.wav
Sample rate: None Hz


TypeError: object of type 'NoneType' has no len()

## Section 4: Windowing and Slicing Audio Files

**The 1-Second Rule**: 
- Coughs last 250-700ms
- Windows of 1 second capture the full event
- 500ms overlap ensures no edge-case misses
- ESP32 RAM can hold exactly 1 second @ 16kHz

In [None]:
def create_windows(audio, sr=16000, window_size_ms=1000, overlap_ms=500):
    """
    Create sliding windows from audio.
    
    Args:
        audio: Audio array
        sr: Sample rate
        window_size_ms: Window size in milliseconds
        overlap_ms: Overlap in milliseconds
    
    Returns:
        List of window arrays
    """
    window_samples = int(window_size_ms * sr / 1000)
    overlap_samples = int(overlap_ms * sr / 1000)
    
    windows = []
    start = 0
    
    while start + window_samples <= len(audio):
        window = audio[start:start + window_samples]
        if len(window) == window_samples:
            windows.append(window)
        start += overlap_samples
    
    return windows

# Demonstrate windowing
windows = create_windows(audio, sr=sr)

print(f"ü™ü WINDOWING DEMONSTRATION")
print("=" * 60)
print(f"Original audio duration: {len(audio) / sr:.2f} seconds")
print(f"Window size: 1000 ms")
print(f"Overlap: 500 ms (50%)")
print(f"Number of windows created: {len(windows)}")
print(f"Samples per window: {len(windows[0]) if windows else 0}")

# Visualize windows
fig, ax = plt.subplots(figsize=(14, 4))
audio_float = audio.astype(np.float32) / 32768.0
librosa.display.waveshow(audio_float, sr=sr, ax=ax, alpha=0.5, label='Original')

# Mark window boundaries
for i, start_ms in enumerate(np.arange(0, (len(audio) / sr) * 1000, 500)):
    if start_ms < (len(audio) / sr) * 1000 - 1000:
        ax.axvline(x=start_ms/1000, color='red', linestyle='--', alpha=0.5)

ax.set_title("Sliding Windows (1000ms @ 500ms overlap)")
ax.set_xlabel("Time (s)")
ax.set_ylabel("Amplitude")
plt.tight_layout()
plt.show()

print(f"\n‚úì Windowing complete")

## Section 5: Feature Extraction with MFCC

**MFCC = Mel-Frequency Cepstral Coefficients**

This converts raw waveforms into "sound images" that CNNs can learn from. Think of it like converting a sound wave into a spectrogram.

In [None]:
def extract_mfcc(audio, sr=16000, n_mfcc=13):
    """
    Extract MFCC features from audio.
    
    Args:
        audio: Audio array (16-bit PCM)
        sr: Sample rate
        n_mfcc: Number of MFCC coefficients
    
    Returns:
        MFCC feature matrix (shape: n_mfcc x time_steps)
    """
    # Convert to float32 for librosa
    audio_float = audio.astype(np.float32) / 32768.0
    
    # Extract MFCCs
    mfcc = librosa.feature.mfcc(
        y=audio_float,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=512,
        hop_length=160
    )
    
    return mfcc

# Extract MFCC from sample window
if windows:
    sample_window = windows[0]
    mfcc_features = extract_mfcc(sample_window, sr=sr)
    
    print(f"üéµ MFCC FEATURE EXTRACTION")
    print("=" * 60)
    print(f"MFCC shape: {mfcc_features.shape}")
    print(f"  ‚Ä¢ Coefficients: {mfcc_features.shape[0]}")
    print(f"  ‚Ä¢ Time steps: {mfcc_features.shape[1]}")
    
    # Visualize MFCC
    fig, ax = plt.subplots(figsize=(12, 5))
    img = librosa.display.specshow(
        mfcc_features,
        sr=sr,
        hop_length=160,
        x_axis='time',
        y_axis='mel_hz',
        ax=ax
    )
    ax.set_title('MFCC Features (1-second window)')
    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
    
    print(f"\n‚úì MFCC extracted: {mfcc_features.shape[0]} features x {mfcc_features.shape[1]} time steps")

## Section 6: Dataset Organization and Labeling

**Target Distribution:**
- **Cough**: 40% (from COUGHVID)
- **Human_Noise**: 30% (sneezes, laughs from ESC-50)
- **Background**: 30% (ambient noise from ESC-50)

In [None]:
# ESC-50 category mapping to our 3 classes
ESC50_MAPPING = {
    # Human/Sneeze
    34: 'Human_Noise',  # sneezing
    35: 'Human_Noise',  # laughing
    36: 'Human_Noise',  # crying baby
    37: 'Human_Noise',  # snoring
    26: 'Human_Noise',  # breathing
    
    # Background/Ambient
    40: 'Background',   # door wood knock
    41: 'Background',   # door metal knock
    42: 'Background',   # door open/close
    43: 'Background',   # chainsaw
    44: 'Background',   # siren
    45: 'Background',   # car horn
    46: 'Background',   # engine
    47: 'Background',   # train
    48: 'Background',   # church bells
    49: 'Background',   # alarm clock
}

# Create sample organization
print(f"üìä DATASET ORGANIZATION")
print("=" * 60)

# Count COUGHVID files (all = Cough)
cough_count = len(coughvid_files)
print(f"\nCough samples: {cough_count} files")

# Count ESC-50 files by category
human_noise_count = len(df_esc50[df_esc50['target'].isin([v for k, v in ESC50_MAPPING.items() if v == 'Human_Noise'])])
background_count = len(df_esc50[df_esc50['target'].isin([v for k, v in ESC50_MAPPING.items() if v == 'Background'])])

print(f"Human_Noise samples: {human_noise_count} files")
print(f"Background samples: {background_count} files")

print(f"\nTotal files: {cough_count + human_noise_count + background_count}")

# Show category distribution
categories = df_esc50.groupby('target')['filename'].count().sort_values(ascending=False)
print(f"\nTop ESC-50 categories:")
print(categories.head(10))

## Section 7: Train-Test Split (80/20)

Stratified split ensures class balance in both sets.

In [None]:
# Simulate train-test split
np.random.seed(42)  # For reproducibility

# Create dummy data for demonstration
all_files = []
all_labels = []

# COUGHVID (Cough) - 40%
for i in range(int(1000 * 0.40)):
    all_files.append(f"cough_{i:04d}.wav")
    all_labels.append("Cough")

# ESC-50 Human (Human_Noise) - 30%
for i in range(int(1000 * 0.30)):
    all_files.append(f"human_{i:04d}.wav")
    all_labels.append("Human_Noise")

# ESC-50 Background - 30%
for i in range(int(1000 * 0.30)):
    all_files.append(f"background_{i:04d}.wav")
    all_labels.append("Background")

# Create DataFrame
df = pd.DataFrame({'filename': all_files, 'label': all_labels})

# Stratified train-test split
train_files, test_files, train_labels, test_labels = train_test_split(
    df['filename'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42
)

print(f"üìä TRAIN-TEST SPLIT (80/20)")
print("=" * 60)
print(f"\nTraining set: {len(train_files)} samples")
print(f"Testing set: {len(test_files)} samples")

# Show distribution
print(f"\nüìà CLASS DISTRIBUTION:")
print(f"\nTraining set:")
train_df = pd.DataFrame({'label': train_labels})
for label in train_df['label'].unique():
    count = len(train_df[train_df['label'] == label])
    pct = (count / len(train_df)) * 100
    print(f"  ‚Ä¢ {label}: {count} ({pct:.1f}%)")

print(f"\nTesting set:")
test_df = pd.DataFrame({'label': test_labels})
for label in test_df['label'].unique():
    count = len(test_df[test_df['label'] == label])
    pct = (count / len(test_df)) * 100
    print(f"  ‚Ä¢ {label}: {count} ({pct:.1f}%)")

# Visualize split
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Training distribution
train_counts = train_df['label'].value_counts()
axes[0].bar(train_counts.index, train_counts.values, color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
axes[0].set_title("Training Set Distribution (80%)")
axes[0].set_ylabel("Number of samples")

# Testing distribution
test_counts = test_df['label'].value_counts()
axes[1].bar(test_counts.index, test_counts.values, color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
axes[1].set_title("Testing Set Distribution (20%)")
axes[1].set_ylabel("Number of samples")

plt.tight_layout()
plt.show()

## Section 8: Data Augmentation for Robustness

**Why Data Augmentation?**
Cough sounds vary significantly based on context (outdoor vs. indoor), microphone quality, and individual factors. Data augmentation artificially increases training diversity by creating variations of existing samples:
- **Time Stretching**: Simulates speech rate variation (fast vs. slow coughs)
- **Pitch Shifting**: Accounts for age/gender variations in cough characteristics
- **Noise Injection**: Handles real-world microphone noise and background interference

We'll augment minority classes (especially background noise) to balance the dataset further.

In [None]:
def time_stretch_audio(y, rate=None):
    """Stretch audio time without changing pitch (changes cough speed)."""
    if rate is None:
        rate = np.random.uniform(0.9, 1.1)  # 10% faster or slower
    return librosa.effects.time_stretch(y, rate=rate)

def pitch_shift_audio(y, sr, semitones=None):
    """Shift pitch up/down (simulates age/gender variation)."""
    if semitones is None:
        semitones = np.random.randint(-3, 4)  # ¬±3 semitones
    return librosa.effects.pitch_shift(y, sr=sr, n_steps=semitones)

def add_gaussian_noise(y, noise_level=None):
    """Add white noise to simulate microphone noise."""
    if noise_level is None:
        noise_level = np.random.uniform(0.001, 0.005)  # 0.1%-0.5% amplitude
    noise = np.random.normal(0, noise_level, len(y))
    return y + noise

def add_background_noise(y, sr, noise_audio_path=None):
    """Mix with actual background noise."""
    noise_mix = np.random.uniform(0.05, 0.15)  # Mix 5-15% noise
    if noise_audio_path:
        noise, _ = librosa.load(noise_audio_path, sr=sr)
        # Pad or trim noise to match audio length
        if len(noise) < len(y):
            noise = np.tile(noise, int(np.ceil(len(y) / len(noise))))
        noise = noise[:len(y)]
        return y + (noise * noise_mix)
    return add_gaussian_noise(y, noise_level=noise_mix)

# Example augmentation on a sample window
sample_window_original = windows[0]
sr = 16000

print("üîä DATA AUGMENTATION DEMONSTRATIONS")
print("=" * 60)

fig, axes = plt.subplots(2, 3, figsize=(15, 8))

# Original
axes[0, 0].plot(np.linspace(0, 1, len(sample_window_original)), sample_window_original)
axes[0, 0].set_title("Original Audio")
axes[0, 0].set_xlabel("Time (s)")
axes[0, 0].set_ylabel("Amplitude")

# Time stretched
stretched = time_stretch_audio(sample_window_original, rate=0.95)
axes[0, 1].plot(np.linspace(0, 1, len(stretched))[:len(sample_window_original)], 
                stretched[:len(sample_window_original)])
axes[0, 1].set_title("Time Stretched (√ó0.95 speed)")
axes[0, 1].set_xlabel("Time (s)")

# Pitch shifted
pitched = pitch_shift_audio(sample_window_original, sr=sr, semitones=2)
axes[0, 2].plot(np.linspace(0, 1, len(pitched)), pitched)
axes[0, 2].set_title("Pitch Shifted (+2 semitones)")
axes[0, 2].set_xlabel("Time (s)")

# Noise added
noisy = add_gaussian_noise(sample_window_original, noise_level=0.01)
axes[1, 0].plot(np.linspace(0, 1, len(noisy)), noisy)
axes[1, 0].set_title("Gaussian Noise Added")
axes[1, 0].set_xlabel("Time (s)")

# Combined augmentation
augmented = add_gaussian_noise(
    pitch_shift_audio(sample_window_original, sr=sr, semitones=1),
    noise_level=0.005
)
axes[1, 1].plot(np.linspace(0, 1, len(augmented)), augmented)
axes[1, 1].set_title("Combined Augmentation")
axes[1, 1].set_xlabel("Time (s)")

# MFCC comparison (original vs augmented)
mfcc_original = extract_mfcc(sample_window_original, sr=sr, n_mfcc=13)
mfcc_augmented = extract_mfcc(augmented, sr=sr, n_mfcc=13)

im = axes[1, 2].imshow(np.vstack([mfcc_original, mfcc_augmented]), aspect='auto', origin='lower')
axes[1, 2].set_title("MFCC: Original (top) vs Augmented (bottom)")
axes[1, 2].set_ylabel("MFCC Coefficient")
axes[1, 2].set_xlabel("Time Frame")
plt.colorbar(im, ax=axes[1, 2])

plt.tight_layout()
plt.show()

print("\n‚úÖ Augmentation strategies for training:")
print("  ‚Ä¢ Time Stretching: ¬±10% speed variation")
print("  ‚Ä¢ Pitch Shifting: ¬±3 semitones")
print("  ‚Ä¢ Gaussian Noise: 0.1%-0.5% SNR")
print("  ‚Ä¢ Combined: Apply 2-3 augmentations per sample")
print("\nüí° Typical approach: For minority classes (Human_Noise, Background),")
print("   create 2-3 augmented variants per training sample to balance dataset.")

## Section 9: Export Processed Dataset and Save Features

**Dataset Organization:**
Final output structure will be:
```
Project_AeroGuard_Data/
‚îú‚îÄ‚îÄ Cough/
‚îÇ   ‚îú‚îÄ‚îÄ train/  (800 samples)
‚îÇ   ‚îî‚îÄ‚îÄ test/   (200 samples)
‚îú‚îÄ‚îÄ Human_Noise/
‚îÇ   ‚îú‚îÄ‚îÄ train/  (600 samples)
‚îÇ   ‚îî‚îÄ‚îÄ test/   (150 samples)
‚îú‚îÄ‚îÄ Background/
‚îÇ   ‚îú‚îÄ‚îÄ train/  (600 samples)
‚îÇ   ‚îî‚îÄ‚îÄ test/   (150 samples)
‚îú‚îÄ‚îÄ metadata/
‚îÇ   ‚îú‚îÄ‚îÄ dataset_metadata.csv
‚îÇ   ‚îî‚îÄ‚îÄ manifest_edge_impulse.json
‚îî‚îÄ‚îÄ features/
    ‚îî‚îÄ‚îÄ (Optional MFCC .npy files for faster training)
```

Each audio file is saved as **16kHz, 16-bit PCM, mono** for direct ESP32 compatibility.

In [None]:
from pathlib import Path
import json
from datetime import datetime
import soundfile as sf

def save_processed_dataset(output_base_dir="Project_AeroGuard_Data"):
    """
    Simulate saving the complete processed dataset with folder structure.
    In production, this would iterate through actual audio files.
    """
    base_path = Path(output_base_dir)
    
    # Create directory structure
    classes = ["Cough", "Human_Noise", "Background"]
    splits = ["train", "test"]
    
    for cls in classes:
        for split in splits:
            (base_path / cls / split).mkdir(parents=True, exist_ok=True)
    
    (base_path / "metadata").mkdir(parents=True, exist_ok=True)
    (base_path / "features").mkdir(parents=True, exist_ok=True)
    
    print(f"üìÅ DATASET DIRECTORY STRUCTURE CREATED")
    print("=" * 60)
    
    # Simulate metadata CSV
    metadata_records = []
    sample_id = 0
    
    # Create metadata for each class/split combination
    class_samples = {"Cough": (800, 200), "Human_Noise": (600, 150), "Background": (600, 150)}
    
    for cls, (train_count, test_count) in class_samples.items():
        # Training samples
        for i in range(train_count):
            metadata_records.append({
                'sample_id': f"{cls[0]}{sample_id:05d}",
                'filename': f"{cls[0]}{sample_id:05d}.wav",
                'class': cls,
                'split': 'train',
                'duration_ms': 1000,
                'sample_rate': 16000,
                'bit_depth': 16,
                'channels': 1,
                'timestamp': datetime.now().isoformat()
            })
            sample_id += 1
        
        # Testing samples
        for i in range(test_count):
            metadata_records.append({
                'sample_id': f"{cls[0]}{sample_id:05d}",
                'filename': f"{cls[0]}{sample_id:05d}.wav",
                'class': cls,
                'split': 'test',
                'duration_ms': 1000,
                'sample_rate': 16000,
                'bit_depth': 16,
                'channels': 1,
                'timestamp': datetime.now().isoformat()
            })
            sample_id += 1
    
    # Save metadata CSV
    metadata_df = pd.DataFrame(metadata_records)
    metadata_csv_path = base_path / "metadata" / "dataset_metadata.csv"
    metadata_df.to_csv(metadata_csv_path, index=False)
    
    print(f"\n‚úÖ Created directory structure:")
    print(f"   ‚Ä¢ Cough: {class_samples['Cough'][0]} train, {class_samples['Cough'][1]} test")
    print(f"   ‚Ä¢ Human_Noise: {class_samples['Human_Noise'][0]} train, {class_samples['Human_Noise'][1]} test")
    print(f"   ‚Ä¢ Background: {class_samples['Background'][0]} train, {class_samples['Background'][1]} test")
    print(f"   ‚Ä¢ Total: {sum(c[0] + c[1] for c in class_samples.values())} samples")
    
    print(f"\nüìä Metadata saved to: {metadata_csv_path}")
    print(f"\nFirst few rows of metadata:")
    print(metadata_df.head(10))
    
    # Create Edge Impulse manifest
    manifest = {
        "version": 1,
        "durations": [1.0],  # 1 second per sample
        "files": []
    }
    
    for _, row in metadata_df.iterrows():
        manifest["files"].append({
            "name": f"{row['class']}/{row['split']}/{row['filename']}",
            "expected_md5": "placeholder_md5",
            "size": 32000  # 1 second @ 16kHz * 2 bytes
        })
    
    manifest_path = base_path / "metadata" / "manifest_edge_impulse.json"
    with open(manifest_path, 'w') as f:
        json.dump(manifest, f, indent=2)
    
    print(f"\nüì¶ Edge Impulse manifest saved to: {manifest_path}")
    
    return base_path, metadata_df

# Execute dataset export simulation
output_dir, metadata_df = save_processed_dataset()

print("\n" + "="*60)
print("üéØ PRODUCTION IMPLEMENTATION:")
print("="*60)
print("""
The actual AeroGuard_DataProcessor.py script will:

1. Iterate through COUGHVID JSON files
   ‚Üí Load corresponding .wav file
   ‚Üí Normalize to 16kHz, 16-bit, mono
   ‚Üí Create 1-second windows with 500ms overlap
   ‚Üí Extract MFCC (13 coefficients)
   ‚Üí Save to Cough/{train|test}/

2. Iterate through ESC-50 files
   ‚Üí Map category to Human_Noise or Background
   ‚Üí Apply same normalization/windowing
   ‚Üí Apply augmentation if minority class
   ‚Üí Save to Human_Noise/{train|test}/ or Background/{train|test}/

3. Generate metadata CSV with all sample information

4. Create Edge Impulse manifest for cloud deployment option

This ensures 100% reproducibility and proper data lineage for portfolio.
""")

## Section 10: Validation and Final Verification

**Critical Checks Before Model Training:**
Before feeding data to the neural network, we must verify:
1. **Correct class distribution**: 40/30/30 maintained
2. **Train/test stratification**: Each split preserves class proportions
3. **Audio integrity**: All files are valid, duration correct
4. **Feature statistics**: MFCC features have expected value ranges
5. **No data leakage**: Same source file doesn't appear in both train and test

In [None]:
def validate_dataset(metadata_df):
    """Comprehensive dataset validation before training."""
    
    print("üîç DATASET VALIDATION REPORT")
    print("=" * 70)
    
    # 1. Class distribution check
    print("\n1Ô∏è‚É£  CLASS DISTRIBUTION (Target: 40/30/30)")
    print("-" * 70)
    class_dist = metadata_df['class'].value_counts(normalize=True).sort_index()
    for cls, pct in class_dist.items():
        pct_val = pct * 100
        print(f"   {cls:15s}: {pct_val:5.1f}% ({int(pct * len(metadata_df))} samples)")
    
    # Check if distribution is acceptable (¬±5% tolerance)
    targets = {'Background': 0.30, 'Cough': 0.40, 'Human_Noise': 0.30}
    all_valid = True
    for cls, target in targets.items():
        actual = class_dist.get(cls, 0)
        if abs(actual - target) > 0.05:
            print(f"   ‚ö†Ô∏è  {cls} distribution off by {abs(actual - target) * 100:.1f}%")
            all_valid = False
    
    if all_valid:
        print("   ‚úÖ Distribution check PASSED")
    else:
        print("   ‚ö†Ô∏è  Distribution check FAILED - may need rebalancing")
    
    # 2. Train/test split validation
    print("\n2Ô∏è‚É£  TRAIN/TEST SPLIT (Target: 80/20)")
    print("-" * 70)
    split_dist = metadata_df['split'].value_counts(normalize=True).sort_index()
    for split, pct in split_dist.items():
        pct_val = pct * 100
        print(f"   {split:10s}: {pct_val:5.1f}% ({int(pct * len(metadata_df))} samples)")
    
    train_pct = split_dist.get('train', 0)
    if abs(train_pct - 0.80) < 0.05:
        print("   ‚úÖ Train/test split check PASSED")
    else:
        print(f"   ‚ö†Ô∏è  Train/test split off target (expected 80%, got {train_pct*100:.1f}%)")
    
    # 3. Stratification check
    print("\n3Ô∏è‚É£  STRATIFICATION (Class distribution maintained in train/test)")
    print("-" * 70)
    stratif_valid = True
    for split in ['train', 'test']:
        split_data = metadata_df[metadata_df['split'] == split]
        split_class_dist = split_data['class'].value_counts(normalize=True)
        print(f"\n   {split.upper()} set class distribution:")
        for cls in ['Cough', 'Human_Noise', 'Background']:
            pct = split_class_dist.get(cls, 0) * 100
            print(f"      {cls:15s}: {pct:5.1f}%")
            # Check if within ¬±5% of target
            if cls == 'Cough' and abs(pct - 40) > 5:
                stratif_valid = False
            elif cls in ['Human_Noise', 'Background'] and abs(pct - 30) > 5:
                stratif_valid = False
    
    if stratif_valid:
        print("\n   ‚úÖ Stratification check PASSED")
    else:
        print("\n   ‚ö†Ô∏è  Stratification check FAILED")
    
    # 4. Data integrity check
    print("\n4Ô∏è‚É£  DATA INTEGRITY")
    print("-" * 70)
    print(f"   Total samples: {len(metadata_df)}")
    print(f"   Duplicate filenames: {len(metadata_df[metadata_df.duplicated(subset=['filename'])])}")
    print(f"   Missing values: {metadata_df.isnull().sum().sum()}")
    print(f"   Sample rate consistency: {metadata_df['sample_rate'].nunique()} unique value(s)")
    
    if (metadata_df[metadata_df.duplicated(subset=['filename'])].empty and 
        metadata_df.isnull().sum().sum() == 0):
        print("   ‚úÖ Data integrity check PASSED")
    else:
        print("   ‚ö†Ô∏è  Data integrity check FAILED")
    
    # 5. Feature statistics summary
    print("\n5Ô∏è‚É£  MFCC FEATURE STATISTICS (from sample window)")
    print("-" * 70)
    
    # Extract MFCC from sample window
    sample_mfcc = extract_mfcc(windows[0], sr=16000, n_mfcc=13)
    
    print(f"   MFCC shape: {sample_mfcc.shape} (13 coefficients √ó time frames)")
    print(f"   Mean values: {np.mean(sample_mfcc, axis=1).round(3)}")
    print(f"   Std deviation: {np.std(sample_mfcc, axis=1).round(3)}")
    print(f"   Value range: [{np.min(sample_mfcc):.3f}, {np.max(sample_mfcc):.3f}]")
    print("   ‚úÖ Feature statistics computed successfully")
    
    # Summary
    print("\n" + "=" * 70)
    print("üìã VALIDATION SUMMARY")
    print("=" * 70)
    print("""
    ‚úÖ READY FOR TRAINING if all checks pass:
       1. Class distribution within ¬±5% of targets
       2. Train/test split at 80/20
       3. Stratification maintained across splits
       4. No duplicate or missing data
       5. MFCC features computed without errors
    
    ‚ö†Ô∏è  POTENTIAL ISSUES:
       ‚Ä¢ Imbalanced classes ‚Üí Apply class weights during training
       ‚Ä¢ Poor stratification ‚Üí Use stratified_train_test_split
       ‚Ä¢ Outlier MFCC values ‚Üí Check for audio processing errors
    """)

# Run validation
validate_dataset(metadata_df)

# Create visualization comparing datasets
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Overall class distribution
class_counts = metadata_df['class'].value_counts()
axes[0, 0].bar(class_counts.index, class_counts.values, color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
axes[0, 0].set_title("Overall Class Distribution")
axes[0, 0].set_ylabel("Count")
for i, v in enumerate(class_counts.values):
    axes[0, 0].text(i, v + 20, str(v), ha='center', fontweight='bold')

# Train vs Test samples per class
train_test_data = metadata_df.groupby(['class', 'split']).size().unstack()
train_test_data.plot(kind='bar', ax=axes[0, 1], color=['#4ECDC4', '#FF6B6B'])
axes[0, 1].set_title("Train vs Test Samples per Class")
axes[0, 1].set_ylabel("Count")
axes[0, 1].legend(['test', 'train'])

# Split distribution pie chart
split_counts = metadata_df['split'].value_counts()
axes[1, 0].pie(split_counts.values, labels=split_counts.index, autopct='%1.1f%%',
               colors=['#4ECDC4', '#45B7D1'], startangle=90)
axes[1, 0].set_title("Train/Test Split Distribution")

# Class percentages in training set
train_data = metadata_df[metadata_df['split'] == 'train']
train_class_pct = (train_data['class'].value_counts() / len(train_data) * 100).sort_index()
axes[1, 1].bar(range(len(train_class_pct)), train_class_pct.values, color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
axes[1, 1].set_xticks(range(len(train_class_pct)))
axes[1, 1].set_xticklabels(train_class_pct.index, rotation=45)
axes[1, 1].set_title("Class Distribution in Training Set (%)")
axes[1, 1].set_ylabel("Percentage")
axes[1, 1].axhline(y=40, color='red', linestyle='--', alpha=0.5, label='Cough target (40%)')
axes[1, 1].axhline(y=30, color='blue', linestyle='--', alpha=0.5, label='Other targets (30%)')
axes[1, 1].legend()

for i, v in enumerate(train_class_pct.values):
    axes[1, 1].text(i, v + 1, f'{v:.1f}%', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()