In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import pyloudnorm as pyln

import cv2 as cv
import librosa
import IPython.display as ipd
import librosa.display as lid
import scipy.signal as signal

from PIL import Image

from tqdm import tqdm
from typing import List, Tuple, Dict

import torch

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms, models

In [2]:
DATA_PATH: str = "birdclef-2025"

In [3]:
class EfficientNet(nn.Module):
    def __init__(self, num_classes: int = 1, depth: int = 3, infer_mode: bool = False, grayscale: bool = False):
        super().__init__()
        depth_models = {
            0: models.efficientnet_b0,
            1: models.efficientnet_b1,
            2: models.efficientnet_b2,
            3: models.efficientnet_b3,
            4: models.efficientnet_b4,
            5: models.efficientnet_b5,
            6: models.efficientnet_b6,
            7: models.efficientnet_b7,
        }

        self.infer_mode = infer_mode
        self.base_model = depth_models[depth](pretrained=True)

        if grayscale:
            original_conv: nn.Conv2d = self.base_model.features[0][0]
            self.base_model.features[0][0] = nn.Conv2d(
                in_channels=1,
                out_channels=original_conv.out_channels,
                kernel_size=original_conv.kernel_size,
                stride=original_conv.stride,
                padding=original_conv.padding,
                bias=original_conv.bias
            )

        head_in_features = self.base_model.classifier[1].in_features
        self.base_model.classifier = nn.Sequential(
            nn.Linear(head_in_features, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Dropout(p=0.3),
            nn.Linear(1024, num_classes)
        )

    def forward(self, x):
        x = self.base_model(x)
        if self.infer_mode:
            x = torch.softmax(x, dim=1)
        return x


# Utils

In [4]:
class CFG:
    seed: int = 42
    
    img_size: Tuple[int, int] = [128, 384]
    batch_size: int = 64
    
    # Audio duration, sample rate, and length
    duration: int = 15 # second
    sample_rate: int = 32000
    audio_len: int = duration * sample_rate
    
    # STFT parameters
    nfft: int = 1024
    window: int = 1024
    hop_length: int = audio_len // (img_size[1] - 1)
    fmin: int = 20
    fmax: int = 16000
    n_mels: int = 128
    
    # Number of epochs, model name
    epochs: int = 10
    preset: str = 'efficientnetv2_b2_imagenet'
    
    # Data augmentation parameters
    augment=True

    # Class Labels for BirdCLEF 25
    train_path: str = os.path.join(DATA_PATH, "train_audio")
    class_names: List[str] = sorted(os.listdir(train_path))
    num_classes: int = len(class_names)
    class_labels: List[int] = list(range(num_classes))
    label2name: Dict[int, str] = dict(zip(class_labels, class_names))
    name2label: Dict[str, int] = {v:k for k,v in label2name.items()}

In [5]:
cmap = matplotlib.cm.get_cmap('coolwarm')

def bandpass_filter(y, sr, lowcut=500, highcut=10000, order=8):
    sos = signal.butter(order, [lowcut, highcut], btype='band', fs=sr, output='sos')
    filtered = signal.sosfilt(sos, y)
    return filtered


def load_audio(filepath: str, sr: int | None = None) -> Tuple[np.ndarray, int]:
    audio, sr = librosa.load(filepath, sr=sr)
    return audio, sr


def get_spectrogram(audio: np.ndarray) -> np.ndarray:
    spec = librosa.feature.melspectrogram(y=audio, 
                                   sr=CFG.sample_rate, 
                                   n_mels=CFG.n_mels,
                                   n_fft=CFG.nfft,
                                   hop_length=CFG.hop_length,
                                   win_length=CFG.window,
                                   fmax=CFG.fmax,
                                   fmin=CFG.fmin,
                                   )
    spec = librosa.power_to_db(spec, ref=1.0)
    min_ = spec.min()
    max_ = spec.max()
    if max_ != min_:
        spec = (spec - min_)/(max_ - min_)
    return spec


def display_audio(row: pd.DataFrame, LUFS: float = 1, filtering: bool = False, seconds: int = -1) -> None:
    # Caption for viz
    caption = f'Id: {row["filename"]} | Name: {row["common_name"]} | Sci.Name: {row["scientific_name"]} | Rating: {row["rating"]}'
    # Read audio file
    audio, sr = load_audio(row["filepath"])
    if seconds != -1:
        audio = audio[:seconds * CFG.sample_rate]
    
    if LUFS != 1:
        meter = pyln.Meter(sr)
        loudness = meter.integrated_loudness(audio)
        audio = pyln.normalize.loudness(audio, loudness, LUFS)

    if filtering:
        audio = bandpass_filter(audio, sr)
    # Keep fixed length audio
    # audio = audio[:CFG.audio_len]
    # Spectrogram from audio
    spec = get_spectrogram(audio)
    # Display audio
    print(sr)
    print(len(audio), audio.shape)
    print("# Audio:")
    display(ipd.Audio(audio, rate=CFG.sample_rate))
    print('# Visualization:')
    
    # Calculate the time values based on audio length and sample rate
    duration = len(audio) / CFG.sample_rate
    time = np.linspace(0, duration, len(audio))
    
    # Create figure with subplots
    fig, ax = plt.subplots(2, 1, figsize=(12, 2*3), sharex=False, tight_layout=True)
    fig.suptitle(caption)
    
    # Plot waveplot with time axis
    ax[0].plot(time, audio, color=cmap(0.1))
    ax[0].set_ylabel('Amplitude')
    # ax[0].set_xlabel('Time (s)')
    ax[0].set_xlim(0, duration)

    # Plot spectrogram
    lid.specshow(spec, 
                 sr=CFG.sample_rate, 
                 hop_length=CFG.hop_length,
                 win_length=CFG.window,
                 n_fft=CFG.nfft,
                 fmin=CFG.fmin,
                 fmax=CFG.fmax,
                 x_axis='time', 
                 y_axis='mel',
                 cmap='coolwarm',
                 ax=ax[1])
    ax[1].set_xlabel('Time (s)')
    ax[1].set_xlim(0, duration)
    
    fig.show()


def mirror_pad_audio(audio, target_len):
    if len(audio) >= target_len:
        return audio[:target_len]

    pad_len = target_len - len(audio)
    pad_left = pad_len // 2
    pad_right = pad_len - pad_left

    audio_padded = np.pad(audio, (pad_left, pad_right), mode='reflect')
    return audio_padded

  cmap = matplotlib.cm.get_cmap('coolwarm')


In [6]:
unlabelled_data = os.path.join(DATA_PATH, "train_soundscapes")

paths = os.listdir(unlabelled_data)
paths = [os.path.join(unlabelled_data, path) for path in paths]

In [7]:
audio, sr = load_audio(paths[0])

In [8]:
audio = mirror_pad_audio(audio, len(audio) + sr * 5)

In [9]:
IMAGE_SIZE = (260, 260)

In [10]:
transform_audio = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(IMAGE_SIZE)
])


def transform_audio_to_image(audio: np.ndarray, transform: transforms.Compose) -> np.ndarray:
    spec = get_spectrogram(audio)
    spec = (spec * 255).astype(np.int16)
    image = Image.fromarray(spec)
    image = transform(image)
    return image.float()


def predict_audio_sample(model: nn.Module, path: str, window_length: int, sr: int, device: str = "cuda"):
    audio, sr = load_audio(path, sr)
    audio = mirror_pad_audio(audio, len(audio) + sr * window_length)

    res: List[np.ndarray] = []
    windows =  len(audio) // (window_length * sr) - 1
    with torch.no_grad():
        for i in range(windows):
            input_image = transform_audio_to_image(audio[i * sr * window_length: (i + 2) * sr * window_length], transform_audio)
            input_image = input_image.unsqueeze(0)
            input_image = input_image.to(device)
            output = model(input_image)
            res.append(output.to("cpu").numpy())
    return res

In [11]:
efficientnet = EfficientNet(num_classes=206, depth=2, grayscale=True, infer_mode=True)
efficientnet.load_state_dict(torch.load("efficientnetv2_b2.pth"))
efficientnet = efficientnet.to("cuda")
efficientnet.eval()



EfficientNet(
  (base_model): EfficientNet(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): SiLU(inplace=True)
      )
      (1): Sequential(
        (0): MBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
              (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): SiLU(inplace=True)
            )
            (1): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
              (activation): SiLU(inplace=True)
              (sca

In [12]:
res_columns = ["row_id"] + [CFG.label2name[label] for label in range(206)]
submission = pd.DataFrame(columns=res_columns)

In [None]:
from copy import deepcopy

rows = []

new_row = {"row_id": ""}
for i in range(206):
    new_row[CFG.label2name[i]] = 0.0

for path in tqdm(paths[:50]):
    row_id = path.split("/")[-1][:-4]
    predictions = predict_audio_sample(efficientnet, path, 5, 32000, device='cuda')
    secs = 5
    for prediction in predictions:
        new_row["row_id"] = row_id + f"_{secs}"
        for i in range(206):
            new_row[CFG.label2name[i]] = prediction[0][i]
        secs += 5

        rows.append(deepcopy(new_row))

100%|██████████| 50/50 [00:41<00:00,  1.21it/s]


In [20]:
result = pd.DataFrame(rows)

In [26]:
result[:13]

Unnamed: 0,row_id,1139490,1192948,1194042,126247,1346504,134933,135045,1462711,1462737,...,yebfly1,yebsee1,yecspi2,yectyr1,yehbla2,yehcar1,yelori1,yeofly1,yercac1,ywcpar
0,H93_20230511_002000_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,H93_20230511_002000_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,H93_20230511_002000_15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,H93_20230511_002000_20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,H93_20230511_002000_25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,H93_20230511_002000_30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,H93_20230511_002000_35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,H93_20230511_002000_40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,H93_20230511_002000_45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,H93_20230511_002000_50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
result.to_csv("sample_submission.csv", index=False)

In [14]:
torch.save(efficientnet.state_dict(), "efficientnetv2_b2.pt")

In [29]:
test = pd.read_csv("./birdclef-2025/sample_submission.csv")

In [30]:
test

Unnamed: 0,row_id,1139490,1192948,1194042,126247,1346504,134933,135045,1462711,1462737,...,yebfly1,yebsee1,yecspi2,yectyr1,yehbla2,yehcar1,yelori1,yeofly1,yercac1,ywcpar
0,soundscape_8358733_5,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,...,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854
1,soundscape_8358733_10,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,...,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854
2,soundscape_8358733_15,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,...,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854


In [32]:
test2 = pd.read_csv("sample_submission.csv")

In [33]:
test2

Unnamed: 0,row_id,1139490,1192948,1194042,126247,1346504,134933,135045,1462711,1462737,...,yebfly1,yebsee1,yecspi2,yectyr1,yehbla2,yehcar1,yelori1,yeofly1,yercac1,ywcpar
0,H93_20230511_002000_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,H93_20230511_002000_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,H93_20230511_002000_15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,H93_20230511_002000_20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,H93_20230511_002000_25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,H17_20230514_005500_40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
596,H17_20230514_005500_45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
597,H17_20230514_005500_50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598,H17_20230514_005500_55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
print(test.columns == test2.columns)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  T