## Code for sample submission

In [None]:
import os
import librosa
import numpy as np
import pandas as pd

# Set seed
np.random.seed(42)

# Class labels from train audio
class_labels = sorted(os.listdir('/kaggle/input/birdclef-2025/train_audio/'))

# List of test soundscapes (only visible during submission)
test_soundscape_path = '/kaggle/input/birdclef-2025/test_soundscapes/'
test_soundscapes = [os.path.join(test_soundscape_path, afile) for afile in sorted(os.listdir(test_soundscape_path)) if afile.endswith('.ogg')]

# Open each soundscape and make predictions for 5-second segments
# Use pandas df with 'row_id' plus class labels as columns
predictions = pd.DataFrame(columns=['row_id'] + class_labels)
for soundscape in test_soundscapes:

    # Load audio
    sig, rate = librosa.load(path=soundscape, sr=None)

    # Split into 5-second chunks
    chunks = []
    for i in range(0, len(sig), rate*5):
        chunk = sig[i:i+rate*5]
        chunks.append(chunk)
        
    # Make predictions for each chunk
    for i, chunk in enumerate(chunks):
        
        # Get row id  (soundscape id + end time of 5s chunk)      
        row_id = os.path.basename(soundscape).split('.')[0] + f'_{i * 5 + 5}'
        
        # Make prediction (let's use random scores for now)
        # scores = model.predict...
        scores = np.random.rand(len(class_labels))
        
        # Append to predictions as new row
        new_row = pd.DataFrame([[row_id] + list(scores)], columns=['row_id'] + class_labels)
        predictions = pd.concat([predictions, new_row], axis=0, ignore_index=True)
        
# Save prediction as csv
predictions.to_csv('submission.csv', index=False)
predictions.head()

## My Code

In [88]:
import os
import pickle
import pandas as pd
import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [81]:

# ======================
# 1. Configuration
# ======================

DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

print(f"MPS support: {torch.backends.mps.is_available()}")

AUDIO_DIR = '../data/train_audio/'
SR = 32000
DURATION = 5
N_MELS = 128

MPS support: True


In [82]:
# ======================
# 2. Data Preparation
# ======================

class BirdCLEFDataset(Dataset):
    def __init__(self, df, audio_dir, label_encoder, sr=SR, duration=DURATION, n_mels=N_MELS):
        self.df = df
        self.audio_dir = audio_dir
        self.label_encoder = label_encoder
        self.labels = self.label_encoder.transform(df['primary_label'])
        self.sr = sr
        self.audio_len = sr * duration
        
        # Pre-compute file paths
        self.file_paths = [
            os.path.join(audio_dir, row['filename'])
            for _, row in df.iterrows()
        ]
        
        # Audio transforms
        self.mel_spec = MelSpectrogram(
            sample_rate=sr,
            n_fft=2048,
            win_length=1024,
            hop_length=512,
            n_mels=n_mels,
            f_min=500,
            f_max=14000
        )
        self.to_db = AmplitudeToDB()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Load audio
        waveform, _ = torchaudio.load(self.file_paths[idx])
        waveform = self._fix_length(waveform)
        
        # Feature extraction
        spec = self.to_db(self.mel_spec(waveform))
        spec = (spec - spec.mean()) / (spec.std() + 1e-6)
        
        return spec, torch.tensor(self.labels[idx], dtype=torch.long)

    def _fix_length(self, waveform):
        if waveform.shape[1] > self.audio_len:
            return waveform[:, :self.audio_len]
        return F.pad(waveform, (0, self.audio_len - waveform.shape[1]))


In [83]:
# ======================
# 3. Model Architecture
# ======================

class BasicCNN(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1))
        )
        self.classifier = nn.Linear(64, n_classes)

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        return self.classifier(x)


In [86]:
# ======================
# 4. Training Class
# ======================

class BirdCLEFTrainer:
    def __init__(self, model, train_loader, val_loader, device):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(model.parameters(), lr=3e-4)
        self.scheduler = ReduceLROnPlateau(self.optimizer, 'min', patience=2)
        self.best_loss = float('inf')
        self.train_losses = []
        self.val_losses = []
        
    def train_epoch(self):
        self.model.train()
        running_loss = 0.0
        for inputs, labels in self.train_loader:
            inputs, labels = inputs.to(self.device), labels.to(self.device)
            
            self.optimizer.zero_grad()
            
            # Mixed precision training
            with torch.cuda.amp.autocast():
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
            
            loss.backward()
            self.optimizer.step()
            running_loss += loss.item()
        
        return running_loss / len(self.train_loader)
    
    def validate(self):
        self.model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for inputs, labels in self.val_loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                outputs = self.model(inputs)
                val_loss += self.criterion(outputs, labels).item()
                
                _, predicted = outputs.max(1)
                correct += predicted.eq(labels).sum().item()
                total += labels.size(0)
        
        return val_loss / len(self.val_loader), correct / total
    
    def run(self, max_epochs=50, patience=5):
        no_improve = 0
        best_epoch = 0
        
        for epoch in range(max_epochs):
            train_loss = self.train_epoch()
            val_loss, val_acc = self.validate()
            self.scheduler.step(val_loss)
            
            self.train_losses.append(train_loss)
            self.val_losses.append(val_loss)
            
            # Save best model
            if val_loss < self.best_loss:
                self.best_loss = val_loss
                no_improve = 0
                best_epoch = epoch
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'loss': val_loss,
                    'accuracy': val_acc,
                }, 'best_model.pth')
            else:
                no_improve += 1
            
            # Save periodic checkpoints
            if epoch % 5 == 0:
                torch.save(self.model.state_dict(), f'checkpoint_epoch_{epoch}.pth')
            
            print(f'Epoch {epoch+1}/{max_epochs}: '
                  f'Train Loss: {train_loss:.4f} | '
                  f'Val Loss: {val_loss:.4f} | '
                  f'Val Acc: {val_acc:.4f} | '
                  f'LR: {self.optimizer.param_groups[0]["lr"]:.2e}')
            
            # Early stopping
            if no_improve >= patience:
                print(f'Early stopping at epoch {epoch}')
                break
        
        self.plot_progress(best_epoch)
        return best_epoch
    
    def plot_progress(self, best_epoch):
        plt.figure(figsize=(10, 5))
        plt.plot(self.train_losses, label='Train Loss')
        plt.plot(self.val_losses, label='Val Loss')
        plt.axvline(best_epoch, color='r', linestyle='--', label='Best Epoch')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.savefig('training_curve.png')
        plt.close()



In [89]:
# ======================
# 5. Training Pipeline
# ======================

# Load metadata
df = pd.read_csv("../data/train.csv")  # Update path for Kaggle

# Initialize encoder
le = LabelEncoder()
df['label_idx'] = le.fit_transform(df['primary_label'])

# Split data (stratified)
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['primary_label'],
    random_state=42
)

# Create datasets
train_dataset = BirdCLEFDataset(train_df, AUDIO_DIR, le)
val_dataset = BirdCLEFDataset(val_df, AUDIO_DIR, le)

# Create loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Initialize model
model = BasicCNN(len(le.classes_)).to(DEVICE)

# Train
trainer = BirdCLEFTrainer(model, train_loader, val_loader, DEVICE)
best_epoch = trainer.run(max_epochs=50, patience=5)

# Save final artifacts
torch.save(model.state_dict(), "model.pth")
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# Quantize for inference (optional)
quantized_model = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8
)
torch.save(quantized_model.state_dict(), "quantized_model.pth")

  with torch.cuda.amp.autocast():


Epoch 1/50: Train Loss: 4.7830 | Val Loss: 4.7281 | Val Acc: 0.0282 | LR: 3.00e-04
Epoch 2/50: Train Loss: 4.7362 | Val Loss: 4.7294 | Val Acc: 0.0347 | LR: 3.00e-04
Epoch 3/50: Train Loss: 4.7283 | Val Loss: 4.7182 | Val Acc: 0.0371 | LR: 3.00e-04
Epoch 4/50: Train Loss: 4.7049 | Val Loss: 4.6802 | Val Acc: 0.0410 | LR: 3.00e-04
Epoch 5/50: Train Loss: 4.6418 | Val Loss: 4.5857 | Val Acc: 0.0488 | LR: 3.00e-04
Epoch 6/50: Train Loss: 4.5474 | Val Loss: 4.5255 | Val Acc: 0.0637 | LR: 3.00e-04
Epoch 7/50: Train Loss: 4.4819 | Val Loss: 4.4537 | Val Acc: 0.0725 | LR: 3.00e-04
Epoch 8/50: Train Loss: 4.4282 | Val Loss: 4.4057 | Val Acc: 0.0812 | LR: 3.00e-04
Epoch 9/50: Train Loss: 4.3801 | Val Loss: 4.3612 | Val Acc: 0.0866 | LR: 3.00e-04
Epoch 10/50: Train Loss: 4.3295 | Val Loss: 4.3239 | Val Acc: 0.0917 | LR: 3.00e-04
Epoch 11/50: Train Loss: 4.2753 | Val Loss: 4.2739 | Val Acc: 0.1019 | LR: 3.00e-04
Epoch 12/50: Train Loss: 4.2250 | Val Loss: 4.2155 | Val Acc: 0.1189 | LR: 3.00e-04
E

NotImplementedError: The operator 'aten::quantize_per_tensor' is not currently implemented for the MPS device. If you want this op to be considered for addition please comment on https://github.com/pytorch/pytorch/issues/141287 and mention use-case, that resulted in missing op as well as commit hash 2236df1770800ffea5697b11b0bb0d910b2e59e1. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.