# Notebook 02: Feature Extraction

## Objectives
1. Load preprocessed audio data
2. Extract MFCCs, pitch, and spectral features
3. Generate both sequential and aggregated features
4. Save features for model training

In [None]:
import sys
sys.path.append('..')

from src.utils.helpers import load_config, set_random_seeds
from src.data.preprocessing import preprocess_audio_for_features
from src.features.audio_features import extract_all_features, pad_features_to_max_length
from src.features.feature_aggregation import aggregate_features
from src.data.dataset import SpeakerDataset, FeatureDataset
import numpy as np
from tqdm import tqdm

# Load the project configuration; the path is relative to the current
# working directory.
config = load_config('../config/config.yaml')
# Hand the configured seed (config['seeds']['numpy']) to set_random_seeds.
# The same value is reused later as random_state for the dataset split.
set_random_seeds(config['seeds']['numpy'])

In [None]:
# Build the speaker dataset from the configured data directory, then
# partition it into train / validation / test files and labels using the
# ratios and seed taken from the config.
dataset = SpeakerDataset(config['dataset']['data_dir'])
(
    train_files, train_labels,
    val_files, val_labels,
    test_files, test_labels,
) = dataset.split_dataset(
    train_ratio=config['splits']['train'],
    val_ratio=config['splits']['val'],
    test_ratio=config['splits']['test'],
    random_state=config['seeds']['numpy'],
)

In [None]:
# Feature extraction for a single dataset split
def extract_dataset_features(audio_files, labels, config):
    """Build sequential and aggregated feature arrays for one dataset split.

    Each entry of ``audio_files`` is preprocessed, passed through
    ``extract_all_features``, and its ``'sequential'`` output is padded to
    ``config['preprocessing']['padding']['max_frames']``. The padded result
    is then summarised by ``aggregate_features`` using
    ``config['aggregation']['statistics']``.

    Args:
        audio_files: Iterable of audio file references accepted by
            ``preprocess_audio_for_features``. A tqdm progress bar is
            shown while iterating.
        labels: Labels for the split. They are only converted to a NumPy
            array; no alignment check against ``audio_files`` is done here.
        config: Project configuration mapping. The keys read are
            ``features``, ``preprocessing -> padding -> max_frames`` and
            ``aggregation -> statistics``; the whole mapping is also
            forwarded to ``preprocess_audio_for_features``.

    Returns:
        A tuple ``(sequential, aggregated, labels)`` of NumPy arrays, with
        the first two built from the per-file results in input order.
    """

    def _featurize(path):
        # Produce the (padded sequence, aggregated statistics) pair for one file.
        waveform, sample_rate = preprocess_audio_for_features(path, config)
        extracted = extract_all_features(waveform, sample_rate, config['features'])

        # Sequential representation (for CNN)
        padded = pad_features_to_max_length(
            extracted['sequential'],
            max_frames=config['preprocessing']['padding']['max_frames'],
        )

        # Aggregated representation (for RF)
        # NOTE(review): the statistics are computed on the padded array, so
        # any padding frames would contribute to them - confirm this is
        # intended rather than aggregating the unpadded features.
        summary = aggregate_features(padded, config['aggregation']['statistics'])
        return padded, summary

    per_file = [_featurize(path) for path in tqdm(audio_files)]
    cnn_inputs = [padded for padded, _ in per_file]
    rf_inputs = [summary for _, summary in per_file]

    return np.array(cnn_inputs), np.array(rf_inputs), np.array(labels)

In [None]:
# Run the extraction pipeline once per split, announcing each stage.
# Every call yields (sequential features, aggregated features, labels).
print('Extracting training features...')
X_train_seq, X_train_agg, y_train = extract_dataset_features(
    audio_files=train_files, labels=train_labels, config=config
)

print('Extracting validation features...')
X_val_seq, X_val_agg, y_val = extract_dataset_features(
    audio_files=val_files, labels=val_labels, config=config
)

print('Extracting test features...')
X_test_seq, X_test_agg, y_test = extract_dataset_features(
    audio_files=test_files, labels=test_labels, config=config
)

In [None]:
# Persist the extracted features under the configured processed-data directory
import h5py
import pickle
from pathlib import Path

processed_dir = config['dataset']['processed_dir']
Path(processed_dir).mkdir(parents=True, exist_ok=True)

# Sequential features (for CNN): one HDF5 file per split, each holding a
# 'features' dataset and a 'labels' dataset.
sequential_splits = (
    ('train', X_train_seq, y_train),
    ('val', X_val_seq, y_val),
    ('test', X_test_seq, y_test),
)
for split_name, split_features, split_labels in sequential_splits:
    with h5py.File(f"{processed_dir}/{split_name}_sequential.h5", 'w') as f:
        f.create_dataset('features', data=split_features)
        f.create_dataset('labels', data=split_labels)

# Aggregated features (for RF): one pickle per split containing a dict
# with 'features' and 'labels' entries.
aggregated_splits = (
    ('train', X_train_agg, y_train),
    ('val', X_val_agg, y_val),
    ('test', X_test_agg, y_test),
)
for split_name, split_features, split_labels in aggregated_splits:
    with open(f"{processed_dir}/{split_name}_aggregated.pkl", 'wb') as f:
        pickle.dump({'features': split_features, 'labels': split_labels}, f)

print('Features saved successfully!')