# Notebook 01: Exploratory Data Analysis

This notebook performs exploratory analysis of the VoxCeleb1 dataset.

## Objectives
1. Load and explore the dataset structure
2. Analyze audio file characteristics
3. Visualize audio samples (waveforms, spectrograms)
4. Examine speaker distribution
5. Assess audio quality metrics

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
from pathlib import Path
import json
from tqdm import tqdm

import sys
sys.path.append('..')

from src.utils.helpers import load_config, print_system_info
from src.data.dataset import SpeakerDataset

%matplotlib inline
# Use seaborn's 'whitegrid' style for the figures drawn in this notebook
sns.set_style('whitegrid')

# Project helper imported from src.utils.helpers; presumably prints details of
# the runtime environment (its implementation is not visible here) — TODO confirm
print_system_info()

## 1. Load Configuration

In [None]:
# Read the project settings from the shared YAML file
config = load_config('../config/config.yaml')

# The audio location is declared under the 'dataset' section of the config
dataset_section = config['dataset']
data_dir = dataset_section['data_dir']
print(f"Dataset directory: {data_dir}")

## 2. Load Dataset

In [None]:
# Build the dataset object from the configured directory
dataset = SpeakerDataset(data_dir)

# Headline counts, followed by the dataset's speaker_to_id mapping
speaker_total = dataset.get_num_speakers()
file_total = len(dataset.audio_files)

print(f"\nNumber of speakers: {speaker_total}")
print(f"Total audio files: {file_total}")
print(f"\nSpeaker mapping: {dataset.speaker_to_id}")

## 3. Analyze Dataset Statistics

In [None]:
# One record per audio file: its path, the speaker name resolved from the
# label via the dataset, and the label itself
data = [
    {
        'audio_file': path,
        'speaker_id': dataset.get_speaker_name(speaker_label),
        'label': speaker_label,
    }
    for path, speaker_label in zip(dataset.audio_files, dataset.labels)
]

df = pd.DataFrame(data)

# Preview the first rows of the table
print("\nDataset sample:")
print(df.head())

# Number of files per speaker (value_counts orders by descending count)
speaker_counts = df['speaker_id'].value_counts()
print(f"\nSamples per speaker:\n{speaker_counts}")

In [None]:
# Bar chart of the number of samples recorded for each speaker
fig, ax = plt.subplots(figsize=(10, 6))
speaker_counts.plot.bar(ax=ax, color='steelblue', alpha=0.7)
ax.set(
    xlabel='Speaker ID',
    ylabel='Number of Samples',
    title='Speaker Distribution',
)
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

# Summary statistics of the per-speaker sample counts
count_mean = speaker_counts.mean()
count_std = speaker_counts.std()
count_min = speaker_counts.min()
count_max = speaker_counts.max()

print(f"\nStatistics:")
print(f"  Mean: {count_mean:.1f}")
print(f"  Std: {count_std:.1f}")
print(f"  Min: {count_min}")
print(f"  Max: {count_max}")

## 4. Analyze Audio Characteristics

In [None]:
# Analyze audio duration and sample rate on a reproducible random subset
print("Analyzing audio files (this may take a while)...\n")

# Tunable knobs for this analysis
MAX_ANALYSIS_FILES = 50   # upper bound on how many files are decoded
RANDOM_SEED = 42          # fixed so the same subset is drawn on every run

durations = []
sample_rates = []

# Sample a subset for faster analysis. Drawing from a seeded generator keeps
# the subset, and every statistic and plot derived from it, identical across
# "Restart Kernel -> Run All".
sample_size = min(MAX_ANALYSIS_FILES, len(dataset.audio_files))
rng = np.random.default_rng(RANDOM_SEED)
sample_indices = rng.choice(len(dataset.audio_files), sample_size, replace=False)

for idx in tqdm(sample_indices):
    audio_file = dataset.audio_files[idx]
    try:
        # sr=None keeps the file's native sample rate instead of resampling
        y, sr = librosa.load(audio_file, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)
        durations.append(duration)
        sample_rates.append(sr)
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest
        print(f"Error loading {audio_file}: {e}")

print(f"\nAudio duration statistics (seconds):")
if durations:
    print(f"  Mean: {np.mean(durations):.2f}")
    print(f"  Std: {np.std(durations):.2f}")
    print(f"  Min: {np.min(durations):.2f}")
    print(f"  Max: {np.max(durations):.2f}")
else:
    # np.min / np.max raise ValueError on an empty sequence, so report the
    # situation explicitly when no file could be decoded.
    print("  No audio files could be loaded; statistics unavailable.")

print(f"\nSample rates: {np.unique(sample_rates)}")

In [None]:
# Histogram of the sampled clip durations, with the mean marked in red
mean_duration = np.mean(durations)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(durations, bins=30, color='steelblue', alpha=0.7, edgecolor='black')
ax.set(
    xlabel='Duration (seconds)',
    ylabel='Frequency',
    title='Audio Duration Distribution',
)
ax.axvline(
    mean_duration,
    color='red',
    linestyle='--',
    label=f'Mean: {mean_duration:.2f}s',
)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

## 5. Visualize Audio Samples

In [None]:
# Select one audio per speaker for visualization.
# A single pass records the first file seen for each label, instead of
# rescanning every label once per speaker (speakers x files comparisons).
# Assumes labels are hashable integers matching range(num_speakers), as the
# label-to-speaker lookup elsewhere in this notebook implies — TODO confirm.
first_file_by_label = {}
for audio_path, label in zip(dataset.audio_files, dataset.labels):
    first_file_by_label.setdefault(label, audio_path)

# Keep speaker-id order; speakers without any file are skipped
speakers_to_viz = [
    first_file_by_label[speaker_id]
    for speaker_id in range(dataset.get_num_speakers())
    if speaker_id in first_file_by_label
]

print(f"Visualizing {len(speakers_to_viz)} audio samples (one per speaker)...")

In [None]:
# Plot waveforms: one panel per selected file, showing at most the first
# 3 seconds loaded at 16 kHz
n_speakers = len(speakers_to_viz)

if n_speakers == 0:
    # plt.subplots cannot build a grid with zero rows; skip instead of crashing
    print("No audio samples to visualize.")
else:
    # Resolve titles through the dataset's own label -> name lookup rather than
    # the parent directory name, which is only the speaker when files sit
    # directly inside a per-speaker folder.
    label_by_file = dict(zip(dataset.audio_files, dataset.labels))

    # squeeze=False always yields a 2-D array of axes, so a single speaker
    # needs no special case
    fig, axes = plt.subplots(n_speakers, 1, figsize=(12, 2*n_speakers), squeeze=False)
    axes = axes[:, 0]

    for idx, audio_file in enumerate(speakers_to_viz):
        y, sr = librosa.load(audio_file, sr=16000, duration=3.0)
        librosa.display.waveshow(y, sr=sr, ax=axes[idx])
        speaker_name = dataset.get_speaker_name(label_by_file[audio_file])
        axes[idx].set_title(f'Speaker {speaker_name}')
        axes[idx].set_xlabel('Time (s)')
        axes[idx].set_ylabel('Amplitude')

    plt.tight_layout()
    plt.show()

In [None]:
# Plot spectrograms: one panel per selected file, showing at most the first
# 3 seconds loaded at 16 kHz
# Recomputed here so this cell does not rely on a variable set by another cell
n_speakers = len(speakers_to_viz)

if n_speakers == 0:
    # plt.subplots cannot build a grid with zero rows; skip instead of crashing
    print("No audio samples to visualize.")
else:
    # Resolve titles through the dataset's own label -> name lookup rather than
    # the parent directory name, which is only the speaker when files sit
    # directly inside a per-speaker folder.
    label_by_file = dict(zip(dataset.audio_files, dataset.labels))

    # squeeze=False always yields a 2-D array of axes, so a single speaker
    # needs no special case
    fig, axes = plt.subplots(n_speakers, 1, figsize=(12, 3*n_speakers), squeeze=False)
    axes = axes[:, 0]

    for idx, audio_file in enumerate(speakers_to_viz):
        y, sr = librosa.load(audio_file, sr=16000, duration=3.0)
        # STFT magnitude in decibels, relative to the clip's own peak
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
        img = librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='hz', ax=axes[idx])
        speaker_name = dataset.get_speaker_name(label_by_file[audio_file])
        axes[idx].set_title(f'Spectrogram - Speaker {speaker_name}')
        fig.colorbar(img, ax=axes[idx], format='%+2.0f dB')

    plt.tight_layout()
    plt.show()

## 6. Summary and Conclusions

In [None]:
# Recap of the figures computed above, framed by a 60-character rule
rule = "=" * 60

print(rule)
print("SUMMARY")
print(rule)

# Dataset size
print(f"\nDataset: {data_dir}")
print(f"Number of speakers: {dataset.get_num_speakers()}")
print(f"Total audio files: {len(dataset.audio_files)}")

# Per-speaker sample counts
print(f"\nSamples per speaker:")
print(f"  Min: {speaker_counts.min()}")
print(f"  Max: {speaker_counts.max()}")
print(f"  Mean: {speaker_counts.mean():.1f}")

# Audio statistics from the sampled subset
duration_mean = np.mean(durations)
duration_std = np.std(durations)
print(f"\nAudio characteristics (from sample):")
print(f"  Duration: {duration_mean:.2f} ± {duration_std:.2f} seconds")
print(f"  Sample rates: {np.unique(sample_rates)}")
print("\n" + rule)

# Pointers to the follow-up notebooks
print("\nNext steps:")
for step in (
    "1. Feature extraction (Notebook 02)",
    "2. Model training (Notebooks 03-04)",
    "3. Comparative analysis (Notebook 05)",
):
    print(step)