In [28]:
import pandas as pd
import numpy as np
import librosa
import os
import time
import warnings
import logging
import math
import cv2
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [11]:

import torch
import warnings
warnings.filterwarnings("ignore")

In [35]:
class CFG:
    
    seed=42
    apex=True
    DEBUG_MODE=True
    FS=32000
    OUTPUT_DIR='/kaggle/working'
    data = '/kaggle/input/birdclef-2025'
    N_FFT = 1024
    HOP_LENGTH = 512
    N_MELS = 128
    FMIN = 50
    FMAX = 14000
    
    TARGET_DURATION = 5.0
    TARGET_SHAPE = (256, 256)  
    
    N_MAX = 50 if DEBUG_MODE else None  
    TARGET_SHAPE = (256, 256) 
cfg=CFG()

In [36]:
taxonomy_csv=pd.read_csv(f'{cfg.data}/taxonomy.csv')
train_csv=pd.read_csv(f'{cfg.data}/train.csv')
#converting_text data to num data 
le=LabelEncoder()
train_csv['label_id']=le.fit_transform(train_csv["primary_label"])
labeltoid={label: idx for idx ,label in enumerate(le.classes_)}
idtolabel={idx: label for idx ,label in enumerate(le.classes_)}
species_class_map = dict(zip(taxonomy_csv['primary_label'], taxonomy_csv['class_name']))

In [42]:
code_df=train_csv[['primary_label','filename','rating']]
code_df
code_df['targ']=code_df.primary_label.map(labeltoid)
code_df['file_path']=cfg.data+"/train_audio/"+code_df['filename']
code_df['samplename'] = code_df.filename.map(
    lambda x: x.split('/')[0] + '-' + x.split('/')[-1].split('.')[0]
)
code_df['class'] = code_df.primary_label.map(lambda x: species_class_map.get(x, 'Unknown'))
total_samples = min(len(code_df), cfg.N_MAX or len(code_df))


In [46]:
def audio2melspec(audio_data):
    if np.isnan(audio_data).any():
        mean_sig=np.nanmean(audio_data)
        audio_data=np.nantonum(audio_data,nan=mean_sig)
    mel_spec = librosa.feature.melspectrogram(
    y=audio_data,
    sr=32000,
    n_fft=1024,
    hop_length=512,
    n_mels=128,
    fmin=50,
    fmax=14000,
    power=2.0,
)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min() + 1e-8)
    
    return mel_spec_norm


In [None]:
start_time = time.time()  # Start timer to measure processing duration

all_bird_data = {}  # Dictionary to store mel spectrograms for each sample
errors = []         # List to store any errors that occur during processing

# Loop through each row in the DataFrame with a progress bar
for i, row in tqdm(code.iterrows(), total=total_samples):

    # If a maximum number of samples is defined and reached, stop early
    if cfg.N_MAX is not None and i >= cfg.N_MAX:
        break

    try:
        # Load the audio file using librosa, resample to config.FS
        audio_data, _ = librosa.load(row.filepath, sr=config.FS)

        # Calculate how many samples should be in a fixed-duration clip
        target_samples = int(config.TARGET_DURATION * config.FS)

        # If audio is shorter than target duration, repeat (tile) it until it's long enough
        if len(audio_data) < target_samples:
            n_copy = math.ceil(target_samples / len(audio_data))  # How many times to repeat
            if n_copy > 1:
                audio_data = np.concatenate([audio_data] * n_copy)

        # Extract the center of the audio clip
        start_idx = max(0, int(len(audio_data) / 2 - target_samples / 2))
        end_idx = min(len(audio_data), start_idx + target_samples)
        center_audio = audio_data[start_idx:end_idx]

        # If center audio is still a bit short, pad with silence (zeros) at the end
        if len(center_audio) < target_samples:
            center_audio = np.pad(
                center_audio,
                (0, target_samples - len(center_audio)),
                mode='constant'
            )

        # Convert the centered audio to a Mel spectrogram using your custom function
        mel_spec = audio2melspec(center_audio)

        # Resize mel spectrogram to match desired shape (e.g., for CNN input)
        if mel_spec.shape != config.TARGET_SHAPE:
            mel_spec = cv2.resize(mel_spec, config.TARGET_SHAPE, interpolation=cv2.INTER_LINEAR)

        # Save the final processed spectrogram in the dictionary, keyed by sample name
        all_bird_data[row.samplename] = mel_spec.astype(np.float32)

    except Exception as e:
        # Log and store any errors that occur (e.g., file missing or audio read error)
        print(f"Error processing {row.filepath}: {e}")
        errors.append((row.filepath, str(e)))

# Print summary of processing
end_time = time.time()
print(f"Processing completed in {end_time - start_time:.2f} seconds")
print(f"Successfully processed {len(all_bird_data)} files out of {total_samples} total")
print(f"Failed to process {len(errors)} files")


100%|██████████| 28564/28564 [30:05<00:00, 15.82it/s]  
