# SereneSense Data Exploration & Analysis

Explore and understand the MAD dataset structure and characteristics.

- **Duration**: ~15 minutes
- **Topics**: Dataset loading, EDA, audio statistics, spectrograms

## Load Dataset Configuration

In [None]:
import yaml
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import librosa
from pathlib import Path
from collections import Counter

# Load MAD dataset config.
# The path is relative to the notebook's working directory. The file is opened
# explicitly as UTF-8 so that parsing does not depend on the platform's default
# locale encoding.
config_path = Path('../configs/data/mad_dataset.yaml')
with config_path.open('r', encoding='utf-8') as f:
    # safe_load only builds plain Python objects (dicts, lists, scalars).
    config = yaml.safe_load(f)

# Print a short summary of the fields this notebook relies on.
dataset_cfg = config['dataset']
dataset_stats = dataset_cfg['statistics']

print('📊 MAD Dataset Configuration:')
print(f"  Name: {dataset_cfg['name']}")
print(f"  Total samples: {dataset_stats['total_samples']}")
print(f"  Classes: {dataset_stats['classes']}")
print(f"  Total duration: {dataset_stats['total_duration_hours']}h")
print(f"  Sample rate: {dataset_stats['sample_rate']} Hz")
print(f"  License: {dataset_cfg['license']['type']}")
print(f"  Download: {config['source']['download']['url']}")

## Class Distribution

Visualize how the training-split samples are distributed across classes:

In [None]:
# Per-class sample counts of the training subset, as stored in the config
class_dist = config['subsets']['train']['class_distribution']
classes = [label for label in class_dist]
samples = [class_dist[label] for label in classes]

# Bar chart with one bar per class
bar_style = dict(color='lightblue', line=dict(color='darkblue', width=1.5))
bar_trace = go.Bar(x=classes, y=samples, marker=bar_style)
fig = go.Figure(data=[bar_trace])

layout_options = dict(
    title='MAD Dataset: Class Distribution',
    xaxis_title='Sound Class',
    yaxis_title='Number of Samples',
    height=400,
    template='plotly_white',
)
fig.update_layout(**layout_options)
fig.show()

# Summary statistics over the per-class counts (population std, ddof=0)
counts = np.asarray(samples)
print(f'✓ Total samples: {sum(samples)}')
print(f'✓ Average samples per class: {counts.mean():.0f}')
print(f'✓ Std deviation: {counts.std():.0f}')

## Audio Feature Statistics

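Print the audio format recorded in the dataset config (sample rate, bit depth, clip duration) together with the preprocessing targets: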

In [None]:
# Report the audio format fields stored in the dataset config
stats = config['dataset']['statistics']
sr = stats['sample_rate']

print('📈 Audio Statistics:')
print(f"  Sample rate: {sr:,} Hz")
print(f"  Bit depth: {stats['bit_depth']} bits")

# Preprocessing targets live in a separate top-level section of the config
processing = config['processing']
clip_seconds = processing['duration_seconds']

print(f"  Duration per sample: {clip_seconds} seconds")
print(f"  Target sample rate: {processing['target_sample_rate']} Hz")
print(f"  Target channels: {processing['target_channels']}")
# Sample count of one clip at the dataset's native sample rate
print(f"  Expected samples per file: {sr * clip_seconds:,}")

## Synthetic Data Exploration

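The raw MAD recordings may not be available in this notebook environment, so we synthesize one example per class (a few sine tones plus noise) and inspect its mel spectrogram: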

In [None]:
# Create one synthetic clip per class and plot its mel spectrogram.
# Explicit submodule import: older librosa releases do not expose
# librosa.display through a bare `import librosa`.
import librosa.display

sr = 16000
duration = 5
n_samples = int(sr * duration)
# endpoint=False keeps the sample spacing at exactly 1/sr; including the
# endpoint would stretch the spacing to duration/(n_samples - 1) and slightly
# detune every generated tone.
t = np.linspace(0, duration, n_samples, endpoint=False)

# Seeded generator so the noise (and therefore the figure) is reproducible
# across runs without touching NumPy's global random state.
rng = np.random.default_rng(42)

# Different frequency characteristics for each class:
# 'freqs' are the sine components (Hz) mixed into the clip,
# 'name' is a short description shown in the subplot title.
class_characteristics = {
    'Helicopter': {'freqs': [100, 300, 600], 'name': 'Low-frequency rotations'},
    'Fighter Aircraft': {'freqs': [500, 1500, 3000], 'name': 'High-frequency turbines'},
    'Military Vehicle': {'freqs': [150, 400, 900], 'name': 'Engine rumble'},
    'Truck': {'freqs': [100, 250, 500], 'name': 'Heavy engine'},
    'Footsteps': {'freqs': [100, 200], 'name': 'Rhythmic impacts'},
    'Speech': {'freqs': [300, 500, 2000], 'name': 'Vocal frequencies'},
    'Background': {'freqs': [50, 100, 200], 'name': 'Ambient noise'}
}

# Size the grid from the number of classes instead of hard-coding 4x2.
n_cols = 2
n_rows = -(-len(class_characteristics) // n_cols)  # ceiling division
fmax = 4000  # upper edge of the mel filterbank; reused for the axis scaling

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 3 * n_rows), squeeze=False)
fig.suptitle('Audio Characteristics by Class', fontsize=14, fontweight='bold')

for ax, (class_name, info) in zip(axes.flat, class_characteristics.items()):
    # Create synthetic audio: sum of sines plus a little Gaussian noise,
    # then peak-normalise to 0.9 to leave headroom.
    audio = np.zeros_like(t)
    for freq in info['freqs']:
        audio += 0.3 * np.sin(2 * np.pi * freq * t)
    audio += 0.05 * rng.standard_normal(len(audio))
    audio = audio / np.max(np.abs(audio)) * 0.9

    # Mel spectrogram in dB relative to the clip's peak power
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=32, fmax=fmax)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # specshow maps frame and mel-bin indices to seconds and Hz, so the axis
    # labels below match the tick values (plain imshow shows raw indices).
    im = librosa.display.specshow(
        mel_spec_db,
        sr=sr,
        fmax=fmax,
        x_axis='time',
        y_axis='mel',
        cmap='viridis',
        ax=ax,
    )
    ax.set_title(f'{class_name}\n({info["name"]})', fontweight='bold')
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Freq (Hz)')
    fig.colorbar(im, ax=ax, label='dB')

# Hide any grid cells left over when the class count does not fill the grid.
for unused_ax in axes.flat[len(class_characteristics):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

print('✓ Spectrogram analysis complete')

## Data Splits

Understand the train/val/test splits:

In [None]:
# Summarise how many samples fall into each of the train/validation/test
# subsets, using the per-class counts stored in the config.
splits_info = config['subsets']

train_samples = sum(splits_info['train']['class_distribution'].values())
val_samples = sum(splits_info['validation']['class_distribution'].values())
test_samples = sum(splits_info['test']['class_distribution'].values())
# The total must cover all three subsets; using only the training count would
# report train as 100% and make the percentages sum to more than 100.
total_samples = train_samples + val_samples + test_samples

print('📋 Data Splits:')
print(f'  Training: {train_samples:,} samples ({train_samples/total_samples*100:.1f}%)')
print(f'  Validation: {val_samples:,} samples ({val_samples/total_samples*100:.1f}%)')
print(f'  Test: {test_samples:,} samples ({test_samples/total_samples*100:.1f}%)')
print(f'  Total: {total_samples:,} samples')

# Visualize splits
fig = go.Figure(data=[go.Pie(
    labels=['Train', 'Validation', 'Test'],
    values=[train_samples, val_samples, test_samples],
    hole=0.3,
    marker=dict(colors=['lightgreen', 'lightyellow', 'lightcoral'])
)])
fig.update_layout(title='Data Split Distribution', height=400)
fig.show()

## Key Takeaways

- ✓ MAD dataset contains 8,075 samples across 7 military-relevant sound classes
- ✓ Balanced class distribution ensures fair model training
- ✓ 16 kHz sample rate is standard for audio classification
- ✓ 10-second clips allow temporal pattern learning
- ✓ Multiple domain sources provide diversity

Next: See `03_model_training.ipynb` to train on this data!