Please refer to: https://github.com/Sai-Sam-N/BirdCLEF_2025_KaggleCompetition

Separate notebooks were used for EDA, for feature engineering (extracting the pre-processed mel spectrograms), and for training a custom model for this use case on top of a pre-trained ResNet model. A detailed walkthrough is available in the Readme.md file.

Thank you!

In [1]:
import os
import torch
import torchaudio
import pandas as pd
import numpy as np
from tqdm import tqdm
import torchaudio.transforms as T
from pathlib import Path
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import models

In [2]:
# Root folder of the competition data.
BASE_DIR = '/kaggle/input/birdclef-2025/'
# Checkpoint file that is later read with torch.load.
MODEL_PATH = '/kaggle/input/birdclef_trained_resnet_model/pytorch/v1/1/baseline_model.pt'

# Locations derived from BASE_DIR: the test soundscape folder and the sample submission file.
TEST_DIR, SAMPLE_SUBMISSION_PATH = (
    os.path.join(BASE_DIR, leaf)
    for leaf in ('test_soundscapes', "sample_submission.csv")
)

In [3]:
class BirdCLEFModel(nn.Module):
    """ResNet-18 classifier adapted to single-channel inputs.

    Two layers of the torchvision ResNet-18 are replaced: ``conv1`` becomes a
    convolution with one input channel, and ``fc`` becomes a linear layer with
    ``num_classes`` outputs. ``forward`` returns the backbone's raw output; no
    activation is applied inside the module.

    Args:
        num_classes: Number of outputs of the final linear layer. When omitted
            (``None``), the module-level ``NUM_CLASSES`` is looked up at
            construction time, so ``BirdCLEFModel()`` behaves as before.
    """

    def __init__(self, num_classes=None):
        super().__init__()
        # Resolve the default lazily: NUM_CLASSES is defined in a later cell,
        # so it only has to exist when the model is actually instantiated.
        if num_classes is None:
            num_classes = NUM_CLASSES
        self.backbone = models.resnet18()
        # The inputs fed to the model are single-channel, so swap the stock
        # first convolution for one with a single input channel.
        self.backbone.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Replace the classification head, keeping its input width.
        self.backbone.fc = nn.Linear(self.backbone.fc.in_features, num_classes)

    def forward(self, x):
        """Run ``x`` through the backbone and return its output unchanged."""
        return self.backbone(x)

In [4]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Audio framing: every prediction window is DURATION seconds at SAMPLE_RATE.
SAMPLE_RATE = 32000
DURATION = 5
NUM_SAMPLES = SAMPLE_RATE * DURATION  # samples per prediction window
N_MELS = 128  # number of mel bands requested from the mel-spectrogram transform

# Same folder as TEST_DIR; reuse it instead of repeating the literal path so the
# two names cannot drift apart.
TEST_AUDIO_DIR = TEST_DIR
LABELS_CSV = os.path.join(BASE_DIR, 'train.csv')
SUBMISSION_PATH = os.path.join(r'/kaggle/working/', "submission.csv")
# Default size of the model's output layer (read by BirdCLEFModel).
NUM_CLASSES = 206

In [5]:
def convert_base_to_backbone(state_dict):
    """Return a copy of ``state_dict`` with the leading ``base.`` prefix renamed.

    Keys that start with ``"base."`` get that prefix replaced by
    ``"backbone."``; all other keys are kept as they are. Only the leading
    prefix is rewritten, so a later ``"base."`` inside a key (for example
    ``"base.layer.base.weight"``) is left untouched.

    Args:
        state_dict: Mapping from parameter name to value.

    Returns:
        A new ``dict`` with the renamed keys, in the original iteration order.
        The input mapping is not modified.
    """
    old_prefix, new_prefix = "base.", "backbone."
    return {
        (new_prefix + key[len(old_prefix):] if key.startswith(old_prefix) else key): value
        for key, value in state_dict.items()
    }

In [6]:
# Read the saved checkpoint onto DEVICE and rename its "base." keys to the
# "backbone." names that BirdCLEFModel uses.
# NOTE(review): depending on the installed torch version, torch.load may
# unpickle arbitrary objects from the file — confirm the checkpoint is trusted
# (or pass weights_only=True if the torch version supports it).
state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
converted_state_dict = convert_base_to_backbone(state_dict)

# Build the network on DEVICE and load the renamed weights into it.
model = BirdCLEFModel().to(DEVICE)
model.load_state_dict(converted_state_dict)
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure.
model.eval()

BirdCLEFModel(
  (backbone): ResNet(
    (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, tr

In [7]:
# Load the taxonomy table and collect its distinct primary labels, in order of
# first appearance.
taxonomy_csv = os.path.join(BASE_DIR, "taxonomy.csv")
taxonomy_df = pd.read_csv(taxonomy_csv)
species_list = taxonomy_df['primary_label'].drop_duplicates().tolist()

In [8]:
# Build the class-index -> label lookup from the training metadata.
labels_df = pd.read_csv(LABELS_CSV)

# Category codes give every distinct primary_label an integer index.
labels_df['class_index'] = labels_df['primary_label'].astype('category').cat.codes

# One row per label, ordered by its index, so position i holds the label of class i.
class_map = (
    labels_df.loc[:, ['primary_label', 'class_index']]
    .drop_duplicates()
    .sort_values(by='class_index')
)
idx_to_label = class_map['primary_label'].tolist()

In [9]:
# Mel-spectrogram transform applied to every audio segment by waveform_to_logmel.
# NOTE(review): these settings presumably have to match the ones used when the
# training mels were extracted — confirm against the feature-engineering notebook.
mel_transform = T.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,  # half of n_fft
    n_mels=N_MELS
)

In [10]:
# Decibel conversion applied to the mel spectrogram in waveform_to_logmel;
# constructed with the library's default arguments.
amplitude_to_db = T.AmplitudeToDB()

In [11]:
def preprocess_waveform(waveform, sr):
    """Bring a loaded waveform to SAMPLE_RATE and collapse it to one dimension.

    Args:
        waveform: Audio tensor; the caller passes the tensor returned by
            ``torchaudio.load`` (assumed to be channels-first — TODO confirm).
        sr: Sample rate of ``waveform``.

    Returns:
        The waveform resampled to ``SAMPLE_RATE`` when ``sr`` differs from it,
        and averaged over dimension 0 when it has more than one dimension.
    """
    needs_resampling = sr != SAMPLE_RATE
    if needs_resampling:
        waveform = T.Resample(sr, SAMPLE_RATE)(waveform)

    # Average over the leading dimension only when there is one to collapse.
    return waveform.mean(dim=0) if waveform.ndim > 1 else waveform

def waveform_to_logmel(waveform, eps=1e-6):
    """Convert a waveform into a standardised log-mel spectrogram.

    The waveform is passed through ``mel_transform`` and ``amplitude_to_db``,
    then shifted by its mean and divided by its standard deviation, and a new
    leading dimension is added.

    Args:
        waveform: Audio tensor accepted by ``mel_transform``.
        eps: Lower bound applied to the standard deviation before dividing.
            Without it a constant log-mel (for example an all-zero segment)
            has a standard deviation of 0 and the division yields NaN.
            Spectrograms whose standard deviation is above ``eps`` are
            normalised exactly as before.

    Returns:
        The normalised log-mel tensor with one extra leading dimension.
    """
    mel_spec = mel_transform(waveform)
    logmel = amplitude_to_db(mel_spec)
    # Guard the divisor so degenerate (constant) inputs map to zeros, not NaN.
    std = logmel.std().clamp_min(eps)
    logmel = (logmel - logmel.mean()) / std
    return logmel.unsqueeze(0)

In [12]:
# Run the model over every test soundscape, one fixed-length window at a time,
# and collect one submission row ([row_id, score_0, ..., score_n]) per window.
submission_rows = []
segment_duration = DURATION  # seconds per window; also the step used in row ids
stride = segment_duration  # non-overlapping windows (kept for reference; not read by the loop)

# os.listdir returns entries in arbitrary order; sort so the row order is
# reproducible, and drop non-audio entries before handing the list to tqdm.
test_files = sorted(name for name in os.listdir(TEST_AUDIO_DIR) if name.endswith(".ogg"))

# Inference only: disable gradient tracking once for the whole loop.
with torch.no_grad():
    for file in tqdm(test_files):
        path = os.path.join(TEST_AUDIO_DIR, file)
        waveform, sr = torchaudio.load(path)
        waveform = preprocess_waveform(waveform, sr)

        total_samples = waveform.shape[0]
        # Floor division: a trailing partial window is not scored, but a file
        # shorter than one window still yields a single (padded) segment.
        num_segments = max(1, total_samples // NUM_SAMPLES)

        # Strip only the file extension; str.replace would also remove ".ogg"
        # occurring elsewhere in the name.
        file_stem = Path(file).stem

        for i in range(num_segments):
            start_sample = i * NUM_SAMPLES
            segment = waveform[start_sample:start_sample + NUM_SAMPLES]

            # Right-pad with zeros when the segment is shorter than a full window.
            if segment.shape[0] < NUM_SAMPLES:
                pad = NUM_SAMPLES - segment.shape[0]
                segment = F.pad(segment, (0, pad))

            # Add the batch dimension expected by the model and move to DEVICE.
            logmel = waveform_to_logmel(segment).to(DEVICE).unsqueeze(0)
            preds = model(logmel).sigmoid().cpu().numpy()[0]

            # Row id is "<file stem>_<end time of the window in seconds>".
            row_id = f"{file_stem}_{(i + 1) * segment_duration}"
            submission_rows.append([row_id] + preds.tolist())

100%|██████████| 1/1 [00:00<00:00, 11428.62it/s]


In [13]:
# Read the sample submission to learn the expected column layout; the first
# column is the row id and the remaining ones are the species columns.
# SAMPLE_SUBMISSION_PATH (defined with the other input paths) points at the same file
# that was previously spelled out as a literal here.
sample_sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)
species_cols = sample_sub.columns[1:]

# Fail early, with a readable message, if the submission expects a species the
# model has no output for; otherwise the later column selection would raise a
# bare KeyError.
missing_species = [col for col in species_cols if col not in set(idx_to_label)]
if missing_species:
    raise ValueError(
        f"{len(missing_species)} submission column(s) have no matching model class, "
        f"e.g. {missing_species[:5]}"
    )

# Columns follow the model's class order here (row id first, then one column per class).
submission_df = pd.DataFrame(submission_rows, columns=['row_id'] + idx_to_label)

In [14]:
# Sanity check of the collected rows. Show only the first few rows, truncated to
# their first few values, instead of dumping every row with all of its scores.
[row[:6] for row in submission_rows[:3]]

[]

In [15]:
# Put the columns in the order of the sample submission: row id first, then
# the species columns exactly as listed there.
submission_df = submission_df.loc[:, ['row_id', *species_cols]]

In [16]:
# Preview the first rows of the submission as plain text.
print(submission_df.head())

Empty DataFrame
Columns: [row_id, 1139490, 1192948, 1194042, 126247, 1346504, 134933, 135045, 1462711, 1462737, 1564122, 21038, 21116, 21211, 22333, 22973, 22976, 24272, 24292, 24322, 41663, 41778, 41970, 42007, 42087, 42113, 46010, 47067, 476537, 476538, 48124, 50186, 517119, 523060, 528041, 52884, 548639, 555086, 555142, 566513, 64862, 65336, 65344, 65349, 65373, 65419, 65448, 65547, 65962, 66016, 66531, 66578, 66893, 67082, 67252, 714022, 715170, 787625, 81930, 868458, 963335, amakin1, amekes, ampkin1, anhing, babwar, bafibi1, banana, baymac, bbwduc, bicwre1, bkcdon, bkmtou1, blbgra1, blbwre1, blcant4, blchaw1, blcjay1, blctit1, blhpar1, blkvul, bobfly1, bobher1, brtpar1, bubcur1, bubwre1, bucmot3, bugtan, butsal1, cargra1, cattyr, chbant1, chfmac1, cinbec1, cocher1, cocwoo1, colara1, colcha1, compau, compot1, ...]
Index: []

[0 rows x 207 columns]


In [17]:
# Write the submission to SUBMISSION_PATH; index=False keeps the DataFrame index
# out of the file so row_id stays the first column.
submission_df.to_csv(SUBMISSION_PATH, index=False) 

In [18]:
# Report where the submission file was written.
print("Submission file saved to :", SUBMISSION_PATH)

Submission file saved to : /kaggle/working/submission.csv
