# Enhanced Preprocessing for Ensemble Learning

This notebook extends the existing preprocessing to generate multiple feature types for ensemble learning.

In [None]:
import os
import uuid
from tqdm import tqdm
import random
import numpy as np
import librosa
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
from modules.PostgresDBHandler import PostgresDBHandler

In [2]:
# --- Notebook-wide configuration -------------------------------------------
# Root folder of the project and the folder that receives the saved features.
base_dir = "./"
intermediate_dir = "ensemble_intermediate_results"

# Length value recorded with every processed-feature row in the database.
fixedLength = 128

# Keyword arguments forwarded to PostgresDBHandler(**dbParams).
dbParams = dict(
    dbname="mydatabase",
    user="myuser",
    password="mypassword",
    host="postgres_server",
    port="5432",
)

# Names of the feature representations generated for the ensemble.
FEATURE_TYPES = [
    'mel_spectrogram',
    'mfcc',
    'chromagram',
    'spectral_contrast',
    'tonnetz',
    'constant_q',
    'cqt',
    'stft',
    'harmonic_percussive',
    'onset_strength',
]

# Every feature type gets its own output sub-folder; existing folders are kept.
for feature_type in FEATURE_TYPES:
    feature_dir = os.path.join(intermediate_dir, feature_type)
    os.makedirs(feature_dir, exist_ok=True)

In [None]:
# Initialize database and ensure feature types exist
db = PostgresDBHandler(**dbParams)
db.connect()

# Parameters stored in the database next to each feature type.
# NOTE(review): these values are only passed to db.insert_feature_type; the
# extraction code (extract_feature_in_chunks) hard-codes its own settings and
# several differ from what is registered here (e.g. spectral_contrast uses
# n_bands=3, stft and onset_strength use n_fft=2048 / hop_length=1024, and
# 'cqt' is computed with chroma_cqt). Confirm which side is intended.
feature_type_params = {
    'mel_spectrogram': {'n_mels': 64, 'fmin': 0, 'fmax': None},
    'mfcc': {'n_mfcc': 8, 'n_mels': 64},
    'chromagram': {'n_chroma': 8},
    'spectral_contrast': {'n_bands': 2},  # To get 3 bands in output (n_bands+1)
    'tonnetz': {},
    'constant_q': {'bins_per_octave': 6, 'n_bins': 42},  # 7 octaves x 6 bins/octave
    'cqt': {'bins_per_octave': 12, 'n_bins': 84},  # 7 octaves x 12 bins/octave (standard musical scale)
    'stft': {'n_fft': 512, 'hop_length': 256},
    'harmonic_percussive': {'margin': 3.0, 'n_fft': 2048, 'hop_length': 1024},
    'onset_strength': {'hop_length': 128}
}

# The try/finally guarantees the connection is released even if a query fails.
try:
    # Check if feature types exist, if not create them
    existing_feature_types = [ft['name'] for ft in db.get_all_feature_types()]

    for feature_type, params in feature_type_params.items():
        if feature_type not in existing_feature_types:
            description = f"{feature_type.replace('_', ' ').title()} features"
            db.insert_feature_type(feature_type, description, params)
            print(f"Created feature type: {feature_type}")
finally:
    db.close()

In [4]:
def pad_or_truncate(array, fixed_length):
    """Force the second axis of a 2-D array to exactly ``fixed_length`` columns.

    Longer inputs are cut after ``fixed_length`` columns, shorter inputs are
    zero-padded on the right, and inputs that already match are returned
    unchanged (the same object, not a copy).
    """
    n_columns = array.shape[1]

    if n_columns == fixed_length:
        return array
    if n_columns > fixed_length:
        return array[:, :fixed_length]

    # Only the end of the second axis is padded; the first axis is untouched.
    missing = fixed_length - n_columns
    return np.pad(array, ((0, 0), (0, missing)), mode="constant")

In [5]:
def augment_audio(audio_data, sr):
    """Build three augmented variants of a waveform.

    Returns a dict with the keys ``"time_stretch"`` (stretched with rate 1.1),
    ``"pitch_shifting"`` (shifted by 2 steps at sample rate ``sr``) and
    ``"noise"`` (the input plus standard-normal noise scaled by 0.005).
    Every variant is cast to float32 so they all share one dtype.
    """
    variants = {
        "time_stretch": librosa.effects.time_stretch(audio_data, rate=1.1),
        "pitch_shifting": librosa.effects.pitch_shift(audio_data, sr=sr, n_steps=2),
        "noise": audio_data + 0.005 * np.random.randn(len(audio_data)),
    }

    return {name: signal.astype(np.float32) for name, signal in variants.items()}

In [None]:
def _mel_spectrogram_feature(chunk, sr, n_fft, hop_length):
    """Return a 64-band mel power spectrogram of the chunk in dB (ref = chunk maximum)."""
    mel_spec = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=64, n_fft=n_fft, hop_length=hop_length)
    return librosa.power_to_db(mel_spec, ref=np.max)


def _mfcc_feature(chunk, sr, n_fft, hop_length):
    """Return 8 MFCCs per frame of the chunk."""
    return librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=8, n_fft=n_fft, hop_length=hop_length)


def _chromagram_feature(chunk, sr, n_fft, hop_length):
    """Return an STFT-based chromagram of the chunk with 8 chroma bins."""
    return librosa.feature.chroma_stft(y=chunk, sr=sr, n_chroma=8, n_fft=n_fft, hop_length=hop_length)


def _spectral_contrast_feature(chunk, sr, n_fft, hop_length):
    """Return the spectral contrast of the chunk computed with n_bands=3."""
    return librosa.feature.spectral_contrast(y=chunk, sr=sr, n_bands=3, n_fft=n_fft, hop_length=hop_length)


def _tonnetz_feature(chunk, sr, n_fft, hop_length):
    """Return tonnetz features computed on the harmonic part of the chunk."""
    y_harmonic = librosa.effects.harmonic(chunk)
    return librosa.feature.tonnetz(y=y_harmonic, sr=sr)


def _constant_q_feature(chunk, sr, n_fft, hop_length):
    """Return the constant-Q magnitude (42 bins, 6 per octave) of the chunk in dB."""
    cqt = librosa.cqt(y=chunk, sr=sr, bins_per_octave=6, n_bins=42)
    return librosa.amplitude_to_db(np.abs(cqt), ref=np.max)


def _chroma_cqt_feature(chunk, sr, n_fft, hop_length):
    """Return the CQT-based chromagram stored under the 'cqt' feature name.

    NOTE(review): despite the feature name this calls chroma_cqt with
    bins_per_octave=84 / n_chroma=42, not a plain constant-Q transform.
    Confirm this is the intended representation.
    """
    return librosa.feature.chroma_cqt(y=chunk, sr=sr, bins_per_octave=84, n_chroma=42)


def _stft_feature(chunk, sr, n_fft, hop_length):
    """Return the STFT magnitude of the chunk in dB (ref = chunk maximum)."""
    stft = librosa.stft(y=chunk, n_fft=n_fft, hop_length=hop_length)
    return librosa.amplitude_to_db(np.abs(stft), ref=np.max)


def _harmonic_stft_feature(chunk, sr, n_fft, hop_length):
    """Return the dB STFT magnitude of the harmonic component (HPSS, margin 3.0)."""
    y_harmonic, _ = librosa.effects.hpss(chunk, margin=3.0)
    harmonic_stft = librosa.stft(y_harmonic, n_fft=n_fft, hop_length=hop_length)
    return librosa.amplitude_to_db(np.abs(harmonic_stft), ref=np.max)


def _onset_strength_feature(chunk, sr, n_fft, hop_length):
    """Return the onset-strength envelope of the chunk as a single-row 2-D array."""
    onset_env = librosa.onset.onset_strength(y=chunk, sr=sr, hop_length=hop_length)
    # Reshape to (1, frames) so it concatenates along the last axis like the others.
    return onset_env.reshape(1, -1)


# Maps each supported feature name to the helper that computes it for one chunk.
_CHUNK_FEATURE_EXTRACTORS = {
    'mel_spectrogram': _mel_spectrogram_feature,
    'mfcc': _mfcc_feature,
    'chromagram': _chromagram_feature,
    'spectral_contrast': _spectral_contrast_feature,
    'tonnetz': _tonnetz_feature,
    'constant_q': _constant_q_feature,
    'cqt': _chroma_cqt_feature,
    'stft': _stft_feature,
    'harmonic_percussive': _harmonic_stft_feature,
    'onset_strength': _onset_strength_feature,
}


def extract_feature_in_chunks(audio_path, feature_type, sr, chunk_duration=0.5):
    """
    Extract features from an audio file in chunks of chunk_duration seconds.

    The file is loaded at sample rate ``sr`` and cut into consecutive chunks
    of ``int(sr * chunk_duration)`` samples (at least 2048). The last chunk is
    zero-padded to full size. Chunks for which ``librosa.effects.split`` (with
    ``top_db=30``) reports no non-silent interval are skipped. The selected
    feature is computed per chunk and the results are concatenated along the
    last axis.

    Args:
        audio_path: Path of the audio file to load.
        feature_type: One of the keys of ``_CHUNK_FEATURE_EXTRACTORS``.
        sr: Sample rate passed to ``librosa.load`` and the feature functions.
        chunk_duration: Chunk length in seconds.

    Returns:
        The concatenated feature array, or ``None`` when the feature type is
        unknown, no chunk produced a feature, or any error occurred (the
        error is printed, not raised).
    """
    extractor = _CHUNK_FEATURE_EXTRACTORS.get(feature_type)
    if extractor is None:
        # Report the problem before paying for loading and scanning the file.
        print(f"Error extracting features from {audio_path}: unknown feature type {feature_type!r}")
        return None

    try:
        # Load full audio
        audio_data, _ = librosa.load(audio_path, sr=sr, offset=0, duration=None)

        # Chunk size in samples; 2048 is the minimum so n_fft always fits.
        chunk_size = max(int(sr * chunk_duration), 2048)

        # Every chunk is padded to chunk_size, so these are the same for all chunks.
        n_fft = min(2048, chunk_size)
        hop_length = n_fft // 2

        chunk_features = []
        for start in range(0, len(audio_data), chunk_size):
            chunk = audio_data[start:start + chunk_size]

            # Pad last chunk if smaller than chunk_size
            if len(chunk) < chunk_size:
                chunk = np.pad(chunk, (0, chunk_size - len(chunk)), mode="constant")

            # Skip chunks with too little signal
            if len(librosa.effects.split(chunk, top_db=30)) == 0:
                continue

            chunk_features.append(extractor(chunk, sr, n_fft, hop_length))

        if not chunk_features:
            return None

        # Concatenate features along time axis
        return np.concatenate(chunk_features, axis=-1)

    except Exception as e:
        # Best-effort by design: one bad file must not stop a batch run.
        print(f"Error extracting features from {audio_path}: {e}")
        return None


In [7]:
def process_file_ensemble(audio_index, db_params):
    """Extract, save and register every feature type for one audio file.

    Opens its own database connection from ``db_params``, looks up the audio
    record for ``audio_index``, and for each entry of ``FEATURE_TYPES`` saves
    the extracted array as a ``.npy`` file under ``intermediate_dir`` and
    inserts a matching row tagged ``"original"``. Feature types whose
    extraction returns ``None`` are skipped.

    Returns ``True`` when the loop finishes without an exception, ``False``
    (after printing the error) otherwise. The connection is always closed.
    """
    handler = PostgresDBHandler(**db_params)
    handler.connect()

    try:
        record = handler.get_audio_file(audio_index)
        audio_path, sr = record['filePath'], record["sampleRate"]

        # Resolve all feature type IDs up front, before any extraction starts.
        feature_type_ids = {
            name: handler.get_feature_type_id(name) for name in FEATURE_TYPES
        }

        for name in FEATURE_TYPES:
            print(f"Processing feature: {name}")
            feature_data = extract_feature_in_chunks(audio_path, name, sr)
            if feature_data is None:
                continue

            # A random UUID keeps file names unique across files and workers.
            file_name = f"{uuid.uuid4()}_{name}.npy"
            feature_path = os.path.join(intermediate_dir, name, file_name)
            np.save(feature_path, feature_data)

            # NOTE(review): fixedLength is recorded here, but the saved array
            # is not resized to it in this function - confirm downstream use.
            handler.insert_processed_audio(
                record["instrumentID"],
                record["audioID"],
                fixedLength,
                feature_type_ids[name],
                feature_path,
                "original",
            )
    except Exception as e:
        print(f"Error processing audio {audio_index}: {e}")
        return False
    else:
        return True
    finally:
        handler.close()

In [None]:
def apply_ensemble_preprocessing(audios_ids, db_params, n_jobs=-1):
    """Run ``process_file_ensemble`` for every audio ID via joblib.

    ``n_jobs`` is passed straight to ``joblib.Parallel``. Prints how many
    files reported success and returns the list of per-file results.
    """
    pool = Parallel(n_jobs=n_jobs)
    worker = delayed(process_file_ensemble)

    # tqdm wraps the ID iterable, so the bar advances as tasks are dispatched.
    tasks = (worker(audio_index, db_params) for audio_index in tqdm(audios_ids))
    results = pool(tasks)

    print(f"Successfully processed {sum(results)}/{len(audios_ids)} audio files")
    return results

In [9]:
# Get all audio IDs and process them
db = PostgresDBHandler(**dbParams)
db.connect()
try:
    audio_ids = db.get_all_audio_ids()
finally:
    # Release the connection even if the query fails.
    db.close()
print(f"Found {len(audio_ids)} audio files to process")

Found 16310 audio files to process


In [None]:
# Process every audio file found above, using the default n_jobs.
apply_ensemble_preprocessing(audios_ids=audio_ids, db_params=dbParams)


  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  return pitch_tuning(
  return pitch_tuning(
  return pitch_tuning(
  return pitch_tuning(
  return pitch_tuning(
  return pitch_tuning(
  return pitch_tuning(
  return pitch_tuning(
  return pitch_tuning(
  return pitch_tuning(
  return pitch_tuning(
  return pi

Processing feature: mel_spectrogram
Processing feature: mfcc
Processing feature: chromagram
Processing feature: spectral_contrast
Processing feature: tonnetz
Processing feature: constant_q
Processing feature: cqt
Error extracting features from ./good-sounds/sound_files/violin_raquel_richness/akg/0051.wav: bins_per_octave=6 must be an integer multiple of n_chroma=42
Processing feature: stft
Processing feature: harmonic_percussive
Processing feature: onset_strength
Processing feature: mel_spectrogram
Processing feature: mfcc
Processing feature: chromagram
Processing feature: spectral_contrast
Processing feature: tonnetz
Processing feature: constant_q
Processing feature: cqt
Error extracting features from ./good-sounds/sound_files/violin_raquel_richness/akg/0027.wav: bins_per_octave=6 must be an integer multiple of n_chroma=42
Processing feature: stft
Processing feature: harmonic_percussive
Processing feature: onset_strength
Processing feature: mel_spectrogram
Processing feature: mfcc
Pro

In [None]:
# Visualization function to compare different feature types
def visualize_features_comparison(audio_file_path, sr=22050):
    """Visualize all feature types for a single audio file.

    Extracts every entry of ``FEATURE_TYPES`` with
    ``extract_feature_in_chunks`` and draws the ones that succeeded on a grid
    three columns wide. Onset strength is drawn as a line plot; every other
    feature is drawn with ``librosa.display.specshow``.

    Args:
        audio_file_path: Path of the audio file to analyse.
        sr: Sample rate used for extraction and for the frequency axes.

    Returns:
        None. Prints a message and returns early if nothing could be extracted.
    """
    # `import librosa` alone does not guarantee the display submodule is loaded.
    import librosa.display

    features = {}
    for feature_type in FEATURE_TYPES:
        feature_data = extract_feature_in_chunks(audio_file_path, feature_type, sr)
        if feature_data is not None:
            features[feature_type] = feature_data

    n_features = len(features)
    if n_features == 0:
        # plt.subplots cannot build a grid with zero rows.
        print(f"No features could be extracted from {audio_file_path}")
        return

    cols = 3
    rows = (n_features + cols - 1) // cols

    # squeeze=False always yields a 2-D axes array, so ravel() works for any grid.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
    axes = axes.ravel()

    # Frequency-axis settings per feature. STFT-based features have linearly
    # spaced bins and the constant-Q feature has 6 bins per octave, so neither
    # should be drawn on a mel axis.
    specshow_options = {
        'mel_spectrogram': {'sr': sr, 'y_axis': 'mel'},
        'stft': {'sr': sr, 'y_axis': 'linear'},
        'harmonic_percussive': {'sr': sr, 'y_axis': 'linear'},
        'constant_q': {'sr': sr, 'y_axis': 'cqt_hz', 'bins_per_octave': 6},
        'chromagram': {'y_axis': 'chroma'},
        'cqt': {'y_axis': 'chroma'},
    }
    # Titles that are not simply the title-cased feature name.
    title_overrides = {'mfcc': 'MFCC', 'tonnetz': 'Tonal Centroids'}

    for ax, (feature_name, feature_data) in zip(axes, features.items()):
        if feature_name == 'onset_strength':
            ax.plot(feature_data[0])
            ax.set_ylabel('Strength')
            ax.set_xlabel('Time')
        else:
            options = specshow_options.get(feature_name, {})
            librosa.display.specshow(feature_data, x_axis='time', ax=ax, **options)

        default_title = feature_name.replace("_", " ").title()
        ax.set_title(title_overrides.get(feature_name, default_title))

    # Hide empty subplots if any
    for ax in axes[n_features:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Collect the path of every audio file, then visualise one picked at random.
db = PostgresDBHandler(**dbParams)
db.connect()
try:
    # NOTE(review): this issues one get_audio_file call per audio ID just to
    # sample a single path; choosing a random ID first would need one lookup.
    audio_files = [db.get_audio_file(audio_id)['filePath'] for audio_id in db.get_all_audio_ids()]
finally:
    # Release the connection even if a lookup fails.
    db.close()

random_audio_path = random.choice(audio_files)
visualize_features_comparison(random_audio_path, sr=22050)

In [None]:
# Statistics about processed data
def print_processing_stats():
    """Print statistics about the processed data.

    Opens a connection from the module-level ``dbParams``, prints the number
    of processed samples for each entry of ``FEATURE_TYPES``, then prints the
    row count per ``augmentation`` value of the ``Processed`` table. The
    connection is closed even if a query raises.
    """
    # Get augmentation statistics
    query = """
    SELECT augmentation, COUNT(*) as count 
    FROM Processed 
    GROUP BY augmentation
    """

    db = PostgresDBHandler(**dbParams)
    db.connect()

    try:
        print("=== Processing Statistics ===")

        # Get counts for each feature type
        for feature_type in FEATURE_TYPES:
            processed_data = db.get_processed_data_by_feature_type(feature_type)
            print(f"{feature_type}: {len(processed_data)} samples")

        db.execute_query(query)
        augmentation_stats = db.fetchall()

        print("\n=== Augmentation Statistics ===")
        for aug, count in augmentation_stats:
            print(f"{aug}: {count} samples")
    finally:
        db.close()

# Print the per-feature-type and per-augmentation counts.
print_processing_stats()