In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F

# ----------------------------
# Configuration
# ----------------------------
class Config:
    def __init__(self):
        self.test_soundscapes = "/kaggle/input/birdclef-2025/test_soundscapes"   # folder with test audio files
        self.submission_csv = "/kaggle/input/birdclef-2025/sample_submission.csv"  # sample submission file
        self.debug = False
        self.debug_count = 10  # use only in debug mode

cfg = Config()

# Set up device
device = torch.device("mps" if torch.backends.mps.is_available() 
                      else "cuda" if torch.cuda.is_available() 
                      else "cpu")
print("Using device:", device)

# ----------------------------
# Model Architecture: BiLSTMDenseNet
# ----------------------------
class DenseBlock(nn.Module):
    def __init__(self, in_channels, growth_rate, n_layers):
        super(DenseBlock, self).__init__()
        self.layers = nn.ModuleList()
        for i in range(n_layers):
            self.layers.append(nn.Sequential(
                nn.BatchNorm2d(in_channels + i * growth_rate),
                nn.ReLU(inplace=True),
                nn.Conv2d(in_channels + i * growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
            ))
    def forward(self, x):
        features = [x]
        for layer in self.layers:
            new_feature = layer(torch.cat(features, dim=1))
            features.append(new_feature)
        return torch.cat(features, dim=1)

class TransitionLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(TransitionLayer, self).__init__()
        self.layer = nn.Sequential(
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.AvgPool2d(kernel_size=2, stride=2)
        )
    def forward(self, x):
        return self.layer(x)

class BiLSTMDenseNet(nn.Module):
    def __init__(self, num_classes, growth_rate=16, num_dense_layers=4, 
                 lstm_hidden_size=64, lstm_layers=2):
        super(BiLSTMDenseNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu = nn.ReLU(inplace=True)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.denseblock = DenseBlock(32, growth_rate, num_dense_layers)
        in_channels_after_dense = 32 + num_dense_layers * growth_rate
        self.trans = TransitionLayer(in_channels_after_dense, in_channels_after_dense // 2)
        self.conv2 = nn.Conv2d(in_channels_after_dense // 2, 64, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(64)
        self.lstm_hidden_size = lstm_hidden_size
        self.bi_lstm = nn.LSTM(input_size=64, hidden_size=lstm_hidden_size, 
                               num_layers=lstm_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * lstm_hidden_size, num_classes)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.pool1(x)
        x = self.denseblock(x)
        x = self.trans(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        # Average over frequency axis (axis=2)
        x = torch.mean(x, dim=2)  # shape: (batch, 64, time)
        x = x.permute(0, 2, 1)     # reshape to (batch, time, channels) for LSTM
        self.bi_lstm.flatten_parameters()
        lstm_out, _ = self.bi_lstm(x)
        lstm_out = torch.mean(lstm_out, dim=1)
        return self.fc(lstm_out)

# ----------------------------
# Load Models
# ----------------------------
# Load binary classifier (2 classes: non-bird, bird)
model_binary = BiLSTMDenseNet(num_classes=2).to(device)
model_binary.load_state_dict(torch.load("/kaggle/input/bi-lstm-densenet/model_binary_state_dict.pth", map_location=device))
model_binary.eval()

# Load multiclass classifier (trained only on bird species)
# We'll re-instantiate based on taxonomy later.
model_multiclass = None

# ----------------------------
# Load Taxonomy and Build Species IDs
# ----------------------------
taxonomy_df = pd.read_csv("/kaggle/input/birdclef-2025/taxonomy.csv")
# Get bird species (Aves) as primary_label; these are the classes the multiclass model predicts.
bird_species = sorted(taxonomy_df[taxonomy_df['class_name'] == 'Aves']['primary_label'].astype(str).unique())
nBird = len(bird_species)
print("Number of bird species (trained):", nBird)

# Load sample submission to get the full set of 206 species
sample_sub = pd.read_csv(cfg.submission_csv)
submission_species = list(sample_sub.columns[1:])  # skip row_id
nTotal = len(submission_species)
print("Total species in submission:", nTotal)

# Non-bird species: those in submission but not in bird_species.
nonbird_species = [sp for sp in submission_species if sp not in bird_species]
nNonBird = len(nonbird_species)
print("Number of non-bird species:", nNonBird)

species_ids = {
    'bird': bird_species,
    'nonbird': nonbird_species,
    'submission': submission_species
}

# Re-instantiate the multiclass model with correct number of classes (nBird)
model_multiclass = BiLSTMDenseNet(num_classes=nBird).to(device)
model_multiclass.load_state_dict(torch.load("/kaggle/input/bi-lstm-densenet/model_multiclass_state_dict.pth", map_location=device))
model_multiclass.eval()

print("Models loaded.")

# ----------------------------
# Define Mel-Spectrogram Transform
# ----------------------------
sample_rate = 32000
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=sample_rate, n_mels=64, n_fft=1024
)

# ----------------------------
# Test-Time Augmentation (TTA)
# ----------------------------
def apply_tta(spec, tta_idx):
    """Apply TTA to a spectrogram (numpy array)."""
    if tta_idx == 0:
        return spec
    elif tta_idx == 1:
        return np.flip(spec, axis=1)  # horizontal flip (time shift)
    elif tta_idx == 2:
        return np.flip(spec, axis=0)  # vertical flip (frequency shift)
    else:
        return spec

# ----------------------------
# Sliding Window Inference Function (5-second chunks)
# ----------------------------
import librosa

def predict_on_spectrogram(audio_path, models, cfg, species_ids):
    """
    Process a test audio file by splitting it into non-overlapping 5-second chunks.
    For each chunk:
      - Compute Mel-spectrogram.
      - Apply TTA (tta_idx 0, 1, 2) and average binary predictions to get pb.
      - If pb > threshold, apply TTA with the multiclass model to get bird probabilities (scaled by pb).
      - Otherwise, assign zeros for bird predictions.
      - For non-bird species, assign uniform probability = (1 - pb) / nNonBird.
      - Construct a final prediction vector of length nTotal (206) following the order in species_ids['submission'].
      - Generate row_id as "H02_20230420_074000_5", "H02_20230420_074000_10", etc.
    Return a list of (row_id, final_pred) tuples.
    """
    # Load the audio using librosa: force a sample rate of 32000 and convert to mono.
    waveform_np, sr = librosa.load(audio_path, sr=sample_rate, mono=True)
    # Convert the numpy array to a torch tensor and add a channel dimension (shape becomes [1, total_samples])
    waveform = torch.tensor(waveform_np).unsqueeze(0)
    
    # Set chunk duration to 5 seconds
    chunk_duration = 5.0
    chunk_size = int(chunk_duration * sample_rate)
    total_samples = waveform.shape[1]
    num_chunks = total_samples // chunk_size
    
    results = []
    base_id = os.path.basename(audio_path).split('.')[0]
    
    for i in range(num_chunks):
        start = i * chunk_size
        chunk = waveform[:, start:start+chunk_size]
        spec = mel_transform(chunk)  # shape: (1, n_mels, time)
        spec = spec.unsqueeze(0).to(device)  # add batch dimension
        
        # Apply TTA for binary classifier
        tta_bin_preds = []
        for tta_idx in [0, 1, 2]:
            spec_tta = apply_tta(spec.cpu().numpy()[0], tta_idx)
            spec_tta = torch.tensor(spec_tta).unsqueeze(0).to(device)
            with torch.no_grad():
                out_bin = models['binary'](spec_tta)
                prob_bin = torch.softmax(out_bin, dim=1)
            tta_bin_preds.append(prob_bin.cpu().numpy()[0, 1])
        pb = np.mean(tta_bin_preds)
        nonbird_prob = 1 - pb
        
        binary_threshold = 0.5
        if pb > binary_threshold:
            tta_multi_preds = []
            for tta_idx in [0, 1, 2]:
                spec_tta = apply_tta(spec.cpu().numpy()[0], tta_idx)
                spec_tta = torch.tensor(spec_tta).unsqueeze(0).to(device)
                with torch.no_grad():
                    out_multi = models['multiclass'](spec_tta)
                    prob_multi = torch.softmax(out_multi, dim=1)
                tta_multi_preds.append(prob_multi.cpu().numpy()[0])
            avg_bird_probs = np.mean(tta_multi_preds, axis=0)
            bird_probs = pb * avg_bird_probs
        else:
            bird_probs = np.zeros(len(species_ids['bird']))
        
        nonbird_probs = np.full((len(species_ids['nonbird']),), nonbird_prob / len(species_ids['nonbird']))
        
        final_pred = np.zeros(nTotal)
        for j, sp in enumerate(species_ids['submission']):
            if sp in species_ids['bird']:
                idx = species_ids['bird'].index(sp)
                final_pred[j] = bird_probs[idx]
            else:
                final_pred[j] = nonbird_probs[species_ids['nonbird'].index(sp)]
        
        # Generate row_id: simply concatenate the base filename with "_" and (i*5 + 5)
        row_id = base_id + f'_{i*5 + 5}'
        results.append((row_id, final_pred))
    
    return results

# ----------------------------
# Inference and Submission Generation
# ----------------------------
def run_inference(cfg, models, species_ids):
    test_files = list(Path(cfg.test_soundscapes).glob('*.ogg'))
    if cfg.debug:
        print(f"Debug mode enabled: using only {cfg.debug_count} files")
        test_files = test_files[:cfg.debug_count]
    print(f"Found {len(test_files)} test soundscapes")
    
    all_row_ids = []
    all_predictions = []
    for audio_path in tqdm(test_files, desc="Processing test files"):
        rows = predict_on_spectrogram(str(audio_path), models, cfg, species_ids)
        for row_id, pred in rows:
            all_row_ids.append(row_id)
            all_predictions.append(pred)
    return all_row_ids, all_predictions

def create_submission(row_ids, predictions, species_ids, cfg):
    print("Creating submission dataframe...")
    submission_dict = {'row_id': row_ids}
    for i, sp in enumerate(species_ids['submission']):
        submission_dict[sp] = [pred[i] for pred in predictions]
    submission_df = pd.DataFrame(submission_dict)
    
    # Ensure we match sample submission columns
    sample_sub = pd.read_csv(cfg.submission_csv)
    ordered_columns = ['row_id'] + list(sample_sub.columns[1:])
    submission_df = submission_df[ordered_columns]
    
    return submission_df

# ----------------------------
# Main Inference and Submission Generation
# ----------------------------
# Build species_ids dictionary using taxonomy and sample submission.
bird_species = sorted(taxonomy_df[taxonomy_df['class_name'] == 'Aves']['primary_label'].astype(str).unique())
sample_sub = pd.read_csv(cfg.submission_csv)
submission_species = list(sample_sub.columns[1:])  # skip row_id column
nonbird_species = [sp for sp in submission_species if sp not in bird_species]

species_ids = {
    'bird': bird_species,
    'nonbird': nonbird_species,
    'submission': submission_species
}

models = {
    'binary': model_binary,
    'multiclass': model_multiclass
}

all_row_ids, all_predictions = run_inference(cfg, models, species_ids)
submission_df = create_submission(all_row_ids, all_predictions, species_ids, cfg)
submission_df.to_csv("submission.csv", index=False)
print("Submission saved to submission.csv")

Using device: cpu
Number of bird species (trained): 146
Total species in submission: 206
Number of non-bird species: 60
Models loaded.


  model_binary.load_state_dict(torch.load("/kaggle/input/bi-lstm-densenet/model_binary_state_dict.pth", map_location=device))
  model_multiclass.load_state_dict(torch.load("/kaggle/input/bi-lstm-densenet/model_multiclass_state_dict.pth", map_location=device))


Found 0 test soundscapes


Processing test files: 0it [00:00, ?it/s]


Creating submission dataframe...
Submission saved to submission.csv


In [2]:
#Verifying format
submission_df.head()

Unnamed: 0,row_id,1139490,1192948,1194042,126247,1346504,134933,135045,1462711,1462737,...,yebfly1,yebsee1,yecspi2,yectyr1,yehbla2,yehcar1,yelori1,yeofly1,yercac1,ywcpar
