## Code for sample submission

In [None]:
import os
import librosa
import numpy as np
import pandas as pd

# Set seed so the placeholder random scores are repeatable
np.random.seed(42)

# Class labels from train audio (one sub-directory per class)
class_labels = sorted(os.listdir('/kaggle/input/birdclef-2025/train_audio/'))

# List of test soundscapes (only visible during submission)
test_soundscape_path = '/kaggle/input/birdclef-2025/test_soundscapes/'
test_soundscapes = [os.path.join(test_soundscape_path, afile) for afile in sorted(os.listdir(test_soundscape_path)) if afile.endswith('.ogg')]

# Open each soundscape and make predictions for 5-second segments.
# Rows are collected in a plain list and turned into a DataFrame once at the end;
# concatenating onto the frame inside the loop copies it on every iteration.
submission_columns = ['row_id'] + class_labels
rows = []
for soundscape in test_soundscapes:

    # Load audio at its native sample rate
    sig, rate = librosa.load(path=soundscape, sr=None)

    # Split into 5-second chunks (the last chunk is shorter when the length
    # is not an exact multiple of 5 seconds)
    chunk_len = rate * 5
    chunks = [sig[start:start + chunk_len] for start in range(0, len(sig), chunk_len)]

    # Make predictions for each chunk
    for i, chunk in enumerate(chunks):

        # Get row id  (soundscape id + end time of 5s chunk)
        row_id = os.path.basename(soundscape).split('.')[0] + f'_{i * 5 + 5}'

        # Make prediction (let's use random scores for now)
        # scores = model.predict...
        scores = np.random.rand(len(class_labels))

        rows.append([row_id] + list(scores))

# Use pandas df with 'row_id' plus class labels as columns
predictions = pd.DataFrame(rows, columns=submission_columns)

# Save prediction as csv
predictions.to_csv('submission.csv', index=False)
predictions.head()

## My Code

In [73]:
import os
import pickle
import pandas as pd
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader 
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [74]:

# ======================
# 1. Configuration
# ======================

# Pick the first available accelerator: CUDA, then Apple MPS, then fall back to CPU.
# On a machine without CUDA this selects exactly what the MPS-only check selected.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

print(f"MPS support: {torch.backends.mps.is_available()}")

AUDIO_DIR = '../data/train_audio/'  # root folder joined with each row's 'filename'
SR = 32000                          # sample rate (Hz) the mel transform is configured for
DURATION = 5                        # clip length in seconds
N_MELS = 128                        # number of mel bands

MPS support: True


In [75]:
# ======================
# 2. Data Preparation
# ======================

class BirdCLEFDataset(Dataset):
    """Dataset yielding (normalised log-mel spectrogram, label index) pairs.

    Each item is loaded from ``audio_dir/<filename>``, down-mixed to mono,
    resampled to ``sr`` if needed, cropped or zero-padded to ``duration``
    seconds, and converted to a standardised dB-scaled mel spectrogram.

    Args:
        df: DataFrame with at least 'filename' and 'primary_label' columns.
        audio_dir: Directory that the 'filename' values are relative to.
        label_encoder: Fitted encoder whose ``transform`` maps 'primary_label'
            values to integer class indices.
        sr: Target sample rate in Hz.
        duration: Clip length in seconds.
        n_mels: Number of mel bands.
    """

    def __init__(self, df, audio_dir, label_encoder, sr=SR, duration=DURATION, n_mels=N_MELS):
        self.df = df
        self.audio_dir = audio_dir
        self.label_encoder = label_encoder
        self.labels = self.label_encoder.transform(df['primary_label'])
        self.sr = sr
        self.audio_len = sr * duration

        # Pre-compute file paths (column iteration avoids the per-row Series of iterrows)
        self.file_paths = [
            os.path.join(audio_dir, filename)
            for filename in df['filename']
        ]

        # Audio transforms
        self.mel_spec = MelSpectrogram(
            sample_rate=sr,
            n_fft=2048,
            win_length=1024,
            hop_length=512,
            n_mels=n_mels,
            f_min=500,
            f_max=14000
        )
        self.to_db = AmplitudeToDB()

    def __len__(self):
        """Return the number of rows in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load item ``idx`` and return ``(spec, label)``.

        ``spec`` is the standardised dB mel spectrogram of a single-channel
        clip; ``label`` is a scalar ``torch.long`` tensor.
        """
        # Load audio
        waveform, file_sr = torchaudio.load(self.file_paths[idx])

        # Down-mix multi-channel recordings so every item has exactly one channel;
        # the model's first convolution takes a single input channel.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # The mel transform is configured for self.sr, so bring other rates in line.
        if file_sr != self.sr:
            waveform = torchaudio.functional.resample(waveform, file_sr, self.sr)

        waveform = self._fix_length(waveform)

        # Feature extraction
        spec = self.to_db(self.mel_spec(waveform))
        # Per-clip standardisation; the epsilon guards against a zero std (e.g. silence)
        spec = (spec - spec.mean()) / (spec.std() + 1e-6)

        return spec, torch.tensor(self.labels[idx], dtype=torch.long)

    def _fix_length(self, waveform):
        """Crop or right-pad with zeros so the time axis has ``self.audio_len`` samples."""
        if waveform.shape[1] > self.audio_len:
            return waveform[:, :self.audio_len]
        return F.pad(waveform, (0, self.audio_len - waveform.shape[1]))


In [76]:
# ======================
# 3. Model Architecture
# ======================

class BasicCNN(nn.Module):
    """Small three-stage convolutional classifier for single-channel inputs.

    Each stage is a 3x3 convolution (padding 1) followed by ReLU and a pooling
    layer: 2x2 max-pooling after the first two stages, global average pooling
    after the third. A linear layer maps the 64 pooled features to
    ``n_classes`` logits.
    """

    def __init__(self, n_classes):
        super().__init__()
        channel_plan = (1, 16, 32, 64)
        last_stage = len(channel_plan) - 2

        stack = []
        for stage, (c_in, c_out) in enumerate(zip(channel_plan[:-1], channel_plan[1:])):
            stack.append(nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
            stack.append(nn.ReLU())
            if stage == last_stage:
                # Collapse whatever spatial size remains to a single value per channel
                stack.append(nn.AdaptiveAvgPool2d((1, 1)))
            else:
                stack.append(nn.MaxPool2d(2))

        self.features = nn.Sequential(*stack)
        self.classifier = nn.Linear(channel_plan[-1], n_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, n_classes)`` for input ``x``."""
        pooled = self.features(x)
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(flat)


## Use both train_audio and train_soundscapes for fitting and validating the model

## Make sure the file paths cover the soundscapes as well as the train audio

In [None]:
# ======================
# 4. Training Pipeline
# ======================

# Seed torch so weight initialisation and DataLoader shuffling are repeatable,
# matching the fixed random_state used for the split below
torch.manual_seed(42)

# Load metadata
df = pd.read_csv("../data/train.csv")  # Update path for Kaggle

# Initialize encoder (fitted on every label so train and val share one index space)
le = LabelEncoder()
df['label_idx'] = le.fit_transform(df['primary_label'])

# Split data (stratified)
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['primary_label'],
    random_state=42
)

# Create datasets
train_dataset = BirdCLEFDataset(train_df, AUDIO_DIR, le)
val_dataset = BirdCLEFDataset(val_df, AUDIO_DIR, le)

# Create loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Initialize model with one output per encoded class
model = BasicCNN(len(le.classes_)).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Training loop: five passes, each followed by an accuracy check on the validation set
for epoch in range(5):

    # Training phase
    model.train()
    train_loss = 0
    for x, y in train_loader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)

        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss; averaged over batches when printed
        train_loss += loss.item()

    # Validation phase (no gradients needed)
    model.eval()
    val_correct = 0
    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(DEVICE)
            y = y.to(DEVICE)
            hits = model(x).argmax(1) == y
            val_correct += hits.sum().item()

    print(f"Epoch {epoch+1}: "
          f"Train Loss={train_loss/len(train_loader):.4f}, "
          f"Val Acc={val_correct/len(val_dataset):.4f}")

# Save for submission: model weights plus the fitted label encoder
torch.save(model.state_dict(), "model.pth")
with open("label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(le, encoder_file)

In [66]:
# 3. Split into train/validation sets (stratified by species):
#    20% held out for validation, class balance preserved, fixed seed for reproducibility
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['primary_label'], random_state=42)

# Report the size and class coverage of each partition, one line per figure
print(
    f"Training samples: {len(train_df)}",
    f"Validation samples: {len(val_df)}",
    f"Unique species in train: {train_df['primary_label'].nunique()}",
    f"Unique species in val: {val_df['primary_label'].nunique()}",
    sep="\n",
)

Training samples: 22851
Validation samples: 5713
Unique species in train: 206
Unique species in val: 191


In [71]:
# Use the SAME audio_dir for both train and val (the AUDIO_DIR constant from the configuration cell)
train_dataset = BirdCLEFDataset(train_df, audio_dir=AUDIO_DIR, label_encoder=le)
val_dataset = BirdCLEFDataset(val_df, audio_dir=AUDIO_DIR, label_encoder=le)

# Save encoder for inverse mapping later
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)


train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


# One output per encoded class. DEVICE is the only device name this notebook defines;
# the lowercase `device` used here previously raised NameError on a fresh kernel.
model = BasicCNN(n_classes=len(le.classes_)).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)





In [68]:
# Improved validation loop
def validate(model, val_loader):
    """Return the fraction of samples in ``val_loader`` that ``model`` classifies correctly.

    Batches are moved to the device holding the model's parameters, so the
    function does not depend on any notebook-level device variable.

    Args:
        model: Module mapping a batch ``x`` to per-class scores of shape (batch, classes).
        val_loader: Iterable of ``(x, y)`` batches, with ``y`` holding integer class indices.

    Returns:
        Accuracy in [0, 1]; 0.0 when the loader yields no samples.

    Note:
        The model is left in eval mode; call ``model.train()`` before training again.
    """
    model.eval()

    # Follow the model's own device; a parameter-free model is treated as living on CPU
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    correct = 0
    total = 0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(target_device), y.to(target_device)
            outputs = model(x)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    # Avoid ZeroDivisionError on an empty loader
    return correct / total if total else 0.0




In [69]:
# Training loop: reports the mean per-batch loss after every epoch
for epoch in range(5):  # adjust as needed
    model.train()
    train_loss = 0
    for x, y in train_loader:
        # DEVICE is the device constant from the configuration cell
        # (a lowercase `device` is never defined in this notebook)
        x, y = x.to(DEVICE), y.to(DEVICE)

        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {train_loss/len(train_loader):.4f}")


Epoch 1, Loss: 4.7716
Epoch 2, Loss: 4.6761
Epoch 3, Loss: 4.4630
Epoch 4, Loss: 4.2598
Epoch 5, Loss: 4.0495


In [36]:
# Accuracy of the current model over the whole validation loader
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for x, y in val_loader:
        # The dataset already yields integer class indices, so the targets are compared
        # with the predictions directly; no label-encoder transform is needed.
        x, y = x.to(DEVICE), y.to(DEVICE)
        outputs = model(x)
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == y).sum().item()
        total += y.size(0)

print(f"Validation Accuracy: {correct / total:.4f}")


RuntimeError: Error loading audio file: failed to open file ../data/train_soundscapes/pirfly1/XC333930.ogg