In [2]:
import os
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")


## Dataset Downloading & Preprocessing

In [7]:
import kagglehub

In [8]:
# Fetch the BirdCLEF 2025 competition files through kagglehub and keep the
# returned path; later cells join "train_audio" and "train.csv" onto it.
birdclef_2025_path = kagglehub.competition_download("birdclef-2025")

In [9]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import librosa

# Data locations, all derived from the kagglehub competition root.
# NOTE(review): this repeats the download call made in the previous cell;
# presumably it resolves to the same already-fetched directory — confirm.
birdclef_2025_path = kagglehub.competition_download("birdclef-2025")
csv_path = os.path.join(birdclef_2025_path, "train.csv")
audio_dir = os.path.join(birdclef_2025_path, "train_audio")

# Audio / spectrogram settings
SAMPLE_RATE = 32000                  # sampling rate passed to librosa.load
DURATION = 5                         # clip length in seconds
AUDIO_LEN = DURATION * SAMPLE_RATE   # clip length in samples
N_MELS = 128                         # number of mel bands
FREQ_MAX = 16000                     # upper frequency bound (fmax) of the mel filter bank

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")


### Convert Audio to Log-Mel Spectrogram

This function takes an audio file path and returns its log-mel spectrogram representation:

- Loads the audio at 32kHz sampling rate
- Pads or trims to 5 seconds (exact length)
- Computes the mel spectrogram with `N_MELS` bands (128), up to `FREQ_MAX` (16 kHz)
- Converts it to log scale (decibels) using `librosa.power_to_db`
- Returns a `(n_mels, time)` shaped matrix used as CNN input

In [10]:
def load_log_mel(filepath):
    """Load an audio file and return its log-mel spectrogram.

    The clip is loaded at ``SAMPLE_RATE``, forced to exactly ``AUDIO_LEN``
    samples (zero-padded at the end when shorter, truncated when longer),
    converted to a mel power spectrogram with ``N_MELS`` bands up to
    ``FREQ_MAX``, and expressed in decibels relative to the spectrogram's
    own maximum.

    Args:
        filepath: Path to an audio file readable by ``librosa.load``.

    Returns:
        np.ndarray of shape ``(N_MELS, time)`` containing dB values.
    """
    # Decode only the first DURATION seconds. Everything past that point was
    # discarded anyway, and this function runs for every dataset item, so
    # reading (and resampling) whole recordings is wasted work.
    y, sr = librosa.load(filepath, sr=SAMPLE_RATE, duration=DURATION)

    # Zero-pad short clips / truncate long ones to a fixed sample count. This
    # also absorbs any off-by-a-few-samples length from resampling.
    y = librosa.util.fix_length(y, size=AUDIO_LEN)

    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=N_MELS, fmax=FREQ_MAX
    )
    # ref=np.max scales each clip against its own peak power.
    return librosa.power_to_db(mel, ref=np.max)


### Data Loading
Loaded training audio files from `train_audio/` and metadata from `train.csv`. Only high-quality clips were used (rating ≥ 4).

---

In [11]:
class BirdLogMelDataset(Dataset):
    """Dataset yielding ``(log-mel spectrogram, label)`` pairs, one per audio file.

    Spectrograms are computed on demand by ``load_log_mel`` every time an item
    is requested; nothing is cached.

    Args:
        filepaths: Sequence of audio file paths (list, NumPy array, or pandas
            Series/Index).
        labels: Sequence of labels in the same order as ``filepaths``. Each
            entry must be convertible to a float32 tensor.

    Raises:
        ValueError: If ``filepaths`` and ``labels`` differ in length.
    """

    def __init__(self, filepaths, labels):
        # ``obj[idx]`` on a pandas object looks up by index *label*, not by
        # position. After a shuffle or train/test split the index is no longer
        # 0..n-1, so positional access would raise KeyError or return the
        # wrong row. Converting pandas inputs to NumPy makes idx positional.
        self.filepaths = self._positional(filepaths)
        self.labels = self._positional(labels)
        if len(self.filepaths) != len(self.labels):
            raise ValueError(
                f"filepaths and labels must have the same length, "
                f"got {len(self.filepaths)} and {len(self.labels)}"
            )

    @staticmethod
    def _positional(seq):
        """Return pandas objects as NumPy arrays; leave other sequences untouched."""
        to_numpy = getattr(seq, "to_numpy", None)
        return to_numpy() if callable(to_numpy) else seq

    def __len__(self):
        """Number of audio files in the dataset."""
        return len(self.filepaths)

    def __getitem__(self, idx):
        """Return ``(features, target)`` float32 tensors for the item at position ``idx``."""
        log_mel = load_log_mel(self.filepaths[idx])
        log_mel = np.expand_dims(log_mel, axis=0)  # add channel axis -> (1, n_mels, time)
        features = torch.tensor(log_mel, dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return features, target
