In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")


## Dataset Downloading & Precprocessing

In [7]:
import kagglehub

In [8]:
birdclef_2025_path = kagglehub.competition_download('birdclef-2025')

In [9]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import librosa

# Base path from kagglehub
birdclef_2025_path = kagglehub.competition_download('birdclef-2025')
audio_dir = os.path.join(birdclef_2025_path, "train_audio")
csv_path = os.path.join(birdclef_2025_path, "train.csv")

# Constants
SAMPLE_RATE = 32000
DURATION = 5
N_MELS = 128
FREQ_MAX = 16000
AUDIO_LEN = SAMPLE_RATE * DURATION
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


### Convert Audio to Log-Mel Spectrogram

This function takes an audio file path and returns its log-mel spectrogram representation:

- Loads the audio at 32kHz sampling rate
- Pads or trims to 5 seconds (exact length)
- Computes the mel spectrogram with `n_mels` bands (default 128)
- Converts it to log scale (decibels) using `librosa.power_to_db`
- Returns a `(n_mels, time)` shaped matrix used as CNN input

In [10]:
def load_log_mel(filepath):
    y, sr = librosa.load(filepath, sr=SAMPLE_RATE)
    if len(y) < AUDIO_LEN:
        y = np.pad(y, (0, AUDIO_LEN - len(y)))
    else:
        y = y[:AUDIO_LEN]
    
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=N_MELS, fmax=FREQ_MAX
    )
    log_mel = librosa.power_to_db(mel, ref=np.max)
    return log_mel


### Data Loading
Loaded training audio files from `train_audio/` and metadata from `train.csv`. Only high-quality clips were used (rating ≥ 4).

---

In [11]:
class BirdLogMelDataset(Dataset):
    def __init__(self, filepaths, labels):
        self.filepaths = filepaths
        self.labels = labels

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        log_mel = load_log_mel(self.filepaths[idx])
        log_mel = np.expand_dims(log_mel, axis=0)  # (1, 128, time)
        return torch.tensor(log_mel, dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.float32)


## 🧠 BirdCNN – Simple Binary Classifier

- Takes a log-mel spectrogram as input (1 × 128 × time)
- 3 convolution layers with ReLU + pooling
- Uses global average pooling to reduce size
- Final layer gives 1 probability (bird present or not)

In [12]:
class BirdCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)


In [None]:
import tqdm.notebook as tqdm
df = pd.read_csv(csv_path)

# Choose top-N most common species to demo
top_species = df["primary_label"].value_counts().index[:10]
models = {}

for bird in top_species:
    print(f"Training model for: {bird}")

    # Positive = bird, Negative = others (same size)
    pos_df = df[df["primary_label"] == bird]
    neg_df = df[df["primary_label"] != bird].sample(len(pos_df))

    combined_df = pd.concat([pos_df, neg_df])
    labels = (combined_df["primary_label"] == bird).astype(int).values
    filepaths = [os.path.join(audio_dir, fname) for fname in combined_df["filename"]]

    print("Done Loading")
    # Train/val split
    train_fp, val_fp, train_labels, val_labels = train_test_split(
        filepaths, labels, test_size=0.2, random_state=42
    )

    train_ds = BirdLogMelDataset(train_fp, train_labels)
    val_ds = BirdLogMelDataset(val_fp, val_labels)
    train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=16)

    # Model init
    model = BirdCNN().to(DEVICE)
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    print("Done Model")
    # Training
    for epoch in range(3):  # Increase this for better performance
        model.train()
        for x, y in tqdm.tqdm(train_dl):
            x, y = x.to(DEVICE), y.to(DEVICE).unsqueeze(1)
            optimizer.zero_grad()
            preds = model(x)
            loss = loss_fn(preds, y)
            loss.backward()
            optimizer.step()

    models[bird] = model
    torch.save(model.state_dict(), f"{bird}_logmel_cnn.pth")


Training model for: grekis
Done Loading
Done Model


  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

Training model for: compau
Done Loading
Done Model


  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/81 [00:00<?, ?it/s]

Training model for: trokin
Done Loading
Done Model


  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Training model for: roahaw
Done Loading
Done Model


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

Training model for: banana
Done Loading
Done Model


  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

Training model for: whtdov
Done Loading
Done Model


  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Training model for: socfly1
Done Loading
Done Model


  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

Training model for: yeofly1
Done Loading
Done Model


  0%|          | 0/53 [00:00<?, ?it/s]

In [19]:
from sklearn.metrics import f1_score


In [20]:
def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(DEVICE)
            preds = model(x).cpu().numpy().flatten()
            preds = (preds > 0.5).astype(int)
            all_preds.extend(preds)
            all_targets.extend(y.numpy())

    return f1_score(all_targets, all_preds)


In [None]:
    train_f1 = evaluate_model(model, train_dl)
    val_f1 = evaluate_model(model, val_dl)
    
    print(f"F1 Score - Train: {train_f1:.4f}, Val: {val_f1:.4f}")
