In [None]:
# Copy a previously generated submission.csv from the attached input dataset
# into the Kaggle working directory.
# NOTE(review): a later cell also writes "submission.csv" via results.to_csv();
# if the notebook's working directory is /kaggle/working that write replaces
# this copy -- confirm which of the two files is meant to be submitted.
!cp /kaggle/input/try1-birdclef-local/submission.csv /kaggle/working/

In [None]:
import os
import sys

import numpy as np
import pandas as pd
import torch
import torchaudio
import torchaudio.transforms as AT

sys.path.append("/kaggle/input/birdclef2025-utils")

In [None]:
def get_results(predictions, class_map, threshold=0.5, clip_duration=5.0):
    """
    Turn frame-level logits into a table of detected sound events.

    For every prediction and every class in ``class_map``, runs of consecutive
    frames whose sigmoid probability is strictly above ``threshold`` are merged
    into a single event.

    Args:
        predictions (list): Prediction dictionaries with keys 'filename' and
            'logits'. 'logits' is array-like with shape
            (time_frames, num_classes).
        class_map (dict): Maps class index (column of 'logits') to class name.
        threshold (float): Probability a frame must exceed to count as active.
        clip_duration (float): Duration in seconds that the frames of one
            prediction are assumed to span; used to convert frame indices
            into times.

    Returns:
        pd.DataFrame: One row per event with columns 'filename', 'class',
        'start_time', 'end_time' (both in seconds) and 'probability' (the
        highest frame probability inside the event). The columns are present
        even when no event was found.
    """
    columns = ['filename', 'class', 'start_time', 'end_time', 'probability']
    results = []

    for pred in predictions:
        filename = pred['filename']
        logits = np.asarray(pred['logits'])  # Shape: (time_frames, num_classes)

        n_frames = logits.shape[0]
        if n_frames == 0:
            # Nothing to detect, and the frame duration would divide by zero.
            continue

        # Numerically stable sigmoid: exp(-log(1 + exp(-x))). The naive
        # 1 / (1 + exp(-x)) overflows for large negative logits.
        probs = np.exp(-np.logaddexp(0, -logits))

        # Seconds covered by one frame of this prediction.
        frame_duration = clip_duration / n_frames

        for class_idx, class_name in class_map.items():
            class_probs = probs[:, class_idx]
            events = (class_probs > threshold).astype(int)

            # Padding with zeros on both sides guarantees every event has a
            # rising edge (+1) and a falling edge (-1), so starts and ends
            # pair up one-to-one and end > start always holds.
            edges = np.diff(events, prepend=0, append=0)
            starts = np.flatnonzero(edges == 1)
            ends = np.flatnonzero(edges == -1)

            for start, end in zip(starts, ends):
                results.append({
                    'filename': filename,
                    'class': class_name,
                    'start_time': start * frame_duration,
                    'end_time': end * frame_duration,
                    'probability': np.max(class_probs[start:end]),
                })

    return pd.DataFrame(results, columns=columns)

In [None]:
def _list_soundscape_ids(audio_dir):
    """Return the sorted file names (without extension) of all .ogg files in audio_dir."""
    # os.path.splitext strips only the final extension, so a name that itself
    # contains dots is not truncated at the first one.
    return [
        os.path.splitext(name)[0]
        for name in sorted(os.listdir(audio_dir))
        if name.endswith('.ogg')
    ]


test_audio_dir = '../input/birdclef-2025/test_soundscapes/'
file_list = _list_soundscape_ids(test_audio_dir)
class_labels = sorted(os.listdir('../input/birdclef-2025/train_audio/'))

# Debug mode: when the test folder holds no .ogg files, fall back to a small
# slice (debug_num files starting at index debug_st_num) of the train
# soundscapes so the rest of the notebook still has audio to run on.
debug = len(file_list) == 0
debug_st_num = 5 if debug else 0
debug_num = 8 if debug else 0
if debug:
    test_audio_dir = '../input/birdclef-2025/train_soundscapes/'
    file_list = _list_soundscape_ids(test_audio_dir)[debug_st_num:debug_st_num + debug_num]

print('Debug mode:', debug)
print('Number of test soundscapes:', len(file_list))

In [None]:
# Call get_results and write the returned frame to submission.csv (no index column).
# NOTE(review): the get_results defined earlier in this notebook takes
# (predictions, class_map), but this call passes four positional arguments,
# which raises TypeError against that definition. Presumably a different
# get_results (e.g. from the birdclef2025-utils path added to sys.path) is
# intended -- confirm and import it explicitly.
results = get_results(test_audio_dir,file_list,debug,debug_num)
results.to_csv("submission.csv", index=False) 

## Visualize

In [None]:
if debug:
    # Plotting dependencies are only needed for the debug visualisation.
    import numpy as np
    import matplotlib.pyplot as plt

    # Mel-spectrogram settings used for the debug plots.
    sample_rate = 32000
    n_fft = 1024
    win_length = 1024
    hop_length = 512
    f_min = 20
    f_max = 16000
    n_mels = 128

    # Power (power=2.0) mel spectrogram transform shared by the debug helpers.
    mel_spectrogram = AT.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        center=True,
        f_min=f_min,
        f_max=f_max,
        pad_mode="reflect",
        power=2.0,
        norm='slaney',
        n_mels=n_mels,
        mel_scale="htk",
    )
    
    def audio_to_mel_debug(filepath=None):
        """
        Load an audio file and return its mel spectrogram in decibels.

        Args:
            filepath: Path of the audio file to load with torchaudio.

        Returns:
            torch.Tensor: 10 * log10 of the mel power spectrogram produced by
            the shared ``mel_spectrogram`` transform.
        """
        waveform, file_sample_rate = torchaudio.load(filepath, backend="soundfile")

        # The mel transform is configured for `sample_rate`; bring files
        # recorded at another rate to it so the mel bins stay meaningful.
        if file_sample_rate != sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, orig_freq=file_sample_rate, new_freq=sample_rate
            )

        # Peak-normalise, leaving an all-zero (silent) signal untouched
        # instead of dividing by zero and filling it with NaN.
        peak = torch.max(torch.abs(waveform))
        if peak > 0:
            waveform = waveform / peak

        melspec = mel_spectrogram(waveform)
        # Clamp before the log so empty bins become -100 dB rather than -inf.
        melspec = 10 * torch.log10(torch.clamp(melspec, min=1e-10))
        return melspec
    
    def plot_results(results, file_name):
        """
        Plot one soundscape's mel spectrogram above a heatmap of its predictions.

        Args:
            results (pd.DataFrame): Frame with a 'row_id' column followed by
                the columns to plot (labelled "species" on the heatmap).
            file_name (str): Soundscape file name without the ".ogg"
                extension; rows whose 'row_id' contains it are plotted.
        """
        n_segments = 12   # rows of `results` shown per soundscape
        segment_sec = 5   # seconds between x-axis tick labels

        path = os.path.join(test_audio_dir, file_name + ".ogg")
        specgram = audio_to_mel_debug(path)

        # regex=False: match the file name literally, not as a pattern.
        is_this_file = results["row_id"].str.contains(file_name, regex=False)
        scores = results[is_this_file].iloc[:n_segments, 1:].values.T

        fig, axes = plt.subplots(2, 1, figsize=(10, 8))

        axes[0].set_title(file_name)
        im = axes[0].imshow(specgram[0], origin="lower", aspect="auto")
        axes[0].set_ylabel("mel bin")
        axes[0].set_xlabel("frame")
        fig.colorbar(im, ax=axes[0])

        heatmap = axes[1].pcolor(
            scores, edgecolors='k', linewidths=0.1, vmin=0, vmax=1, cmap='Blues'
        )
        fig.colorbar(heatmap, ax=axes[1])
        axes[1].set_xticks(np.arange(0, n_segments, 1))
        axes[1].set_xticklabels(np.arange(0, n_segments * segment_sec, segment_sec))
        axes[1].set_ylabel("species")
        axes[1].set_xlabel("sec")

        fig.tight_layout()
        # plt.show() renders with the notebook's inline backend; Figure.show()
        # only works with GUI backends and warns otherwise.
        plt.show()