# **BirdCLEF 2025 Data Preprocessing Notebook**
This notebook demonstrates how we can transform audio data into mel-spectrogram data. This transformation is essential for training 2D Convolutional Neural Networks (CNNs) on audio data, as it converts the one-dimensional audio signals into two-dimensional image-like representations.
I ran this public notebook in debug mode (processing only a few samples). You can find the fully preprocessed mel-spectrogram training dataset here --> [BirdCLEF'25 | Mel Spectrograms](https://www.kaggle.com/datasets/kadircandrisolu/birdclef25-mel-spectrograms).


In [None]:
# Imports
import os
import cv2
import math
import time
import librosa
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd

import torch
import warnings
warnings.filterwarnings("ignore")

**Step 1: Load the Silero VAD model**

In [None]:
# The first run needs internet access to fetch the model; afterwards it is
# cached locally, so force_reload stays off.
model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=False,
)
# Keep only the speech-timestamp detector and the audio reader from the
# five-element helper tuple; the remaining entries are discarded.
get_speech_timestamps, _, read_audio, _, _ = utils

In [None]:
# Function for removing human voice
def remove_human_voice(audio: np.ndarray, sr: int) -> np.ndarray:
    """Silence every segment of `audio` that the VAD model flags as speech.

    Args:
        audio: Waveform samples as a NumPy array.
        sr: Sampling rate of `audio`, forwarded to the speech detector.

    Returns:
        The input array itself when no speech is detected; otherwise a
        float32 array in which the detected speech sample ranges are
        multiplied by zero.
    """
    waveform = torch.from_numpy(audio).float()
    speech_segments = get_speech_timestamps(waveform, model, sampling_rate=sr)

    if speech_segments:
        # 1.0 keeps a sample, 0.0 mutes it; segment bounds are sample indices.
        keep = torch.ones_like(waveform)
        for segment in speech_segments:
            keep[segment['start']:segment['end']] = 0.0
        return (waveform * keep).numpy()

    # Nothing detected: hand back the caller's array untouched.
    return audio

In [None]:
# Configuration class: every tunable of the preprocessing run lives here
class Config:
    # --- Run mode -----------------------------------------------------
    # When True, only a small subset of files is processed (see N_MAX)
    DEBUG_MODE = False

    # --- Paths --------------------------------------------------------
    OUTPUT_DIR = '/kaggle/working/'
    DATA_ROOT = '/kaggle/input/birdclef-2025'

    # --- Audio --------------------------------------------------------
    FS = 32000  # Sampling rate in Hz

    # --- Mel spectrogram ----------------------------------------------
    N_FFT = 1024       # FFT window size
    HOP_LENGTH = 512   # Audio samples between consecutive frames
    N_MELS = 128       # Number of mel bands
    FMIN = 50          # Lowest included frequency (Hz)
    FMAX = 14000       # Highest included frequency (Hz)

    # --- Clip / image geometry ----------------------------------------
    TARGET_DURATION = 5.0      # Length of each audio clip in seconds
    TARGET_SHAPE = (256, 256)  # Shape the spectrogram is resized to

    # Cap on the number of audio files; None means "process everything"
    N_MAX = None if not DEBUG_MODE else 50

    # --- MFCC ---------------------------------------------------------
    n_mfcc = 40  # Number of MFCC coefficients

    # --- CQT ----------------------------------------------------------
    n_bins = 84           # Number of CQT frequency bins
    bins_per_octave = 12  # CQT bins per octave


config = Config()

In [None]:
# Report the run mode and the cap on processed samples
debug_state = 'ON' if config.DEBUG_MODE else 'OFF'
sample_cap = 'ALL' if config.N_MAX is None else config.N_MAX
print(f"Debug mode: {debug_state}")
print(f"Max samples to process: {sample_cap}")

# Read the taxonomy table and build a primary_label -> class_name lookup
print("Loading taxonomy data...")
taxonomy_path = f'{config.DATA_ROOT}/taxonomy.csv'
taxonomy_df = pd.read_csv(taxonomy_path)
species_class_map = {
    label: class_name
    for label, class_name in zip(taxonomy_df['primary_label'], taxonomy_df['class_name'])
}

# Read the per-recording training metadata
print("Loading training metadata...")
train_path = f'{config.DATA_ROOT}/train.csv'
train_df = pd.read_csv(train_path)

In [None]:
# Build the label <-> integer id mappings from the sorted unique labels
label_list = sorted(train_df['primary_label'].unique())
label_id_list = list(range(len(label_list)))
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = dict(enumerate(label_list))


def _build_samplename(filename):
    # "<first path component>-<file name up to its first dot>"
    parts = filename.split('/')
    return parts[0] + '-' + parts[-1].split('.')[0]


# Assemble working_df: one row per recording with target id, path and class
print(f'Found {len(label_list)} unique species')
working_df = train_df[['primary_label', 'rating', 'filename']].copy()
working_df['target'] = working_df['primary_label'].map(label2id)
working_df['filepath'] = f'{config.DATA_ROOT}/train_audio/' + working_df['filename']
working_df['samplename'] = working_df['filename'].map(_build_samplename)
working_df['class'] = working_df['primary_label'].map(
    lambda label: species_class_map.get(label, 'Unknown')  # labels missing from the taxonomy
)

# Number of rows the processing loop will visit (all rows when N_MAX is unset)
n_available = len(working_df)
total_samples = min(n_available, config.N_MAX or n_available)
print(f'Total samples to process: {total_samples} out of {n_available} available')
print('Samples by class:')
print(working_df['class'].value_counts())

In [None]:
# Function to calculate the mel-spectrogram
def audio2melspec(audio_data):
    """Turn a waveform into a min-max normalized dB mel-spectrogram.

    NaN samples are replaced by the mean of the non-NaN samples before the
    power mel-spectrogram is computed with the parameters from `config`.
    The result is converted to dB relative to its maximum and then rescaled
    with min-max normalization (a small epsilon guards the division).

    Args:
        audio_data: Waveform samples as a NumPy array.

    Returns:
        The normalized mel-spectrogram array.
    """
    # Patch NaN samples so the spectrogram computation sees finite input
    nan_mask = np.isnan(audio_data)
    if nan_mask.any():
        audio_data = np.nan_to_num(audio_data, nan=np.nanmean(audio_data))

    # Power (magnitude squared) mel-spectrogram via librosa
    mel_params = dict(
        sr=config.FS,
        n_fft=config.N_FFT,
        hop_length=config.HOP_LENGTH,
        n_mels=config.N_MELS,
        fmin=config.FMIN,
        fmax=config.FMAX,
        power=2.0,
    )
    power_spec = librosa.feature.melspectrogram(y=audio_data, **mel_params)

    # dB scale relative to the peak, then min-max rescaling
    spec_db = librosa.power_to_db(power_spec, ref=np.max)
    db_min = spec_db.min()
    db_max = spec_db.max()
    return (spec_db - db_min) / (db_max - db_min + 1e-8)

In [None]:
# Announce the run mode before the (potentially long) extraction loop
print("Starting audio processing...")
if config.DEBUG_MODE:
    # Report the actual cap instead of a hard-coded number
    print(f"DEBUG MODE - Processing only {config.N_MAX} samples")
else:
    print("FULL MODE - Processing all samples")
# Timer
start_time = time.time()

# Storage: samplename -> float32 array of shape (1, H, W); failures are
# recorded as (filepath, reason) tuples
all_bird_data = {}
errors = []

# Loop-invariant values, computed once instead of once per file
target_samples = int(config.TARGET_DURATION * config.FS)
# cv2.resize takes dsize as (width, height), the reverse of a NumPy
# (rows, cols) shape, so TARGET_SHAPE must be flipped for non-square targets
resize_dsize = (config.TARGET_SHAPE[1], config.TARGET_SHAPE[0])
# Cap by position so the limit holds even if the index is not 0..n-1
rows_to_process = working_df if config.N_MAX is None else working_df.iloc[:config.N_MAX]

# Fill all_bird_data with the mel-spectrogram of each recording
for _, row in tqdm(rows_to_process.iterrows(), total=len(rows_to_process)):
    try:
        audio_data, _ = librosa.load(row.filepath, sr=config.FS)
        if audio_data is None or audio_data.size == 0:
            print(f"Skipping empty file {row.filepath}")
            errors.append((row.filepath, "Empty audio_data"))
            continue
        audio_novoice = remove_human_voice(audio_data, config.FS)

        # Recordings shorter than the target are repeated until long enough
        if len(audio_novoice) < target_samples:
            n_copy = math.ceil(target_samples / len(audio_novoice))
            audio_novoice = np.tile(audio_novoice, n_copy)

        # Take the centered window of target_samples; zero-pad as a safety net
        start = max(0, len(audio_novoice) // 2 - target_samples // 2)
        clip = audio_novoice[start:start + target_samples]
        if len(clip) < target_samples:
            clip = np.pad(clip, (0, target_samples - len(clip)))

        mel_spec = audio2melspec(clip)

        if mel_spec.shape != config.TARGET_SHAPE:
            mel_spec = cv2.resize(mel_spec, resize_dsize, interpolation=cv2.INTER_LINEAR)
        # Add a leading channel axis: (H, W) -> (1, H, W)
        features = mel_spec[np.newaxis, ...].astype(np.float32)
        all_bird_data[row.samplename] = features

    except Exception as e:
        # Best-effort batch job: log the failure, remember it, keep going
        print(f"Error processing {row.filepath}: {e}")
        errors.append((row.filepath, str(e)))

# Print processing time and how many files did and did not succeed
end_time = time.time()
print(f"Processing completed in {end_time - start_time:.2f} seconds")
print(f"Successfully processed {len(all_bird_data)} files out of {len(rows_to_process)} total")
print(f"Failed to process {len(errors)} files")

In [None]:
# Display the generated sample names as the cell output (sanity check of the
# "<first path component>-<file stem>" naming built when working_df was created)
working_df['samplename']

In [None]:
# Plot the mel-spectrogram of one processed sample
import matplotlib.pyplot as plt

samples = []
displayed_classes = set()

max_samples = min(4, len(all_bird_data))
# Rows at or beyond this index value were never visited by the processing loop
row_limit = config.N_MAX or len(working_df)

# Collect processed samples, at most one per class, up to max_samples
for i, row in working_df.iterrows():
    if i >= row_limit:
        break
    if row['samplename'] not in all_bird_data:
        continue

    is_new_class = row['class'] not in displayed_classes
    if is_new_class:
        samples.append((row['samplename'], row['class'], row['primary_label']))
        displayed_classes.add(row['class'])
    if len(samples) >= max_samples:
        break

if samples:
    # Only the first collected sample is drawn
    samplename, class_name, species = samples[0]

    # Stored features are (1, H, W); index 0 yields the 2D (H, W) image
    mel_image = all_bird_data[samplename][0]

    plt.figure(figsize=(6, 4))
    plt.imshow(mel_image, cmap='viridis', origin='lower', aspect='auto')
    plt.title(f"Mel-Spec — {class_name}:{species}")
    plt.colorbar(fraction=0.046, pad=0.04)
    plt.tight_layout()
    plt.show()

In [None]:
# Visualize a grid of spectrograms for the first few processed samples
# Upper bound on how many to show, and how many panels go in one row
n_show = 10
n_cols = 5
# Grab the first n_show sample names (fewer if less data was processed)
sample_names = list(all_bird_data)[:n_show]
# Size the grid from what will actually be drawn; keep at least one row so
# plt.subplots stays valid when nothing was processed
n_rows = max(1, math.ceil(len(sample_names) / n_cols))

fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(3 * n_cols, 3 * n_rows),
    constrained_layout=True,
    squeeze=False,  # always a 2D array of axes, even for a single row
)
fig.suptitle(f'Top {len(sample_names)} samples — Mel-spec', fontsize=16)

# Hide every panel first so unused cells do not show empty frames
for ax in axes.flat:
    ax.axis('off')

for ax, samplename in zip(axes.flat, sample_names):
    # Index 0 selects the single channel of the stored (1, H, W) features
    ax.imshow(all_bird_data[samplename][0],
              origin='lower',
              aspect='auto',
              cmap='viridis')
    ax.set_title(samplename, fontsize=8)

plt.show()

In [None]:
# Persist the metadata table and the extracted features as pickle files
import os
import pickle

# Write into the configured output directory instead of a duplicated
# hard-coded path; create it if it does not exist yet
os.makedirs(config.OUTPUT_DIR, exist_ok=True)

# 1) Save the DataFrame (metadata)
df_path = os.path.join(config.OUTPUT_DIR, 'working_df.pkl')
working_df.to_pickle(df_path)
print(f"✅ working_df saved to {df_path}")

# 2) Save the feature dict (samplename -> feature array)
data_path = os.path.join(config.OUTPUT_DIR, 'all_bird_data.pkl')
with open(data_path, 'wb') as f:
    pickle.dump(all_bird_data, f)
print(f"✅ all_bird_data saved to {data_path}")
