In [1]:
! python -m pip install --no-index --find-links=../input/openvino-wheels -r ../input/openvino-wheels/requirements.txt


Looking in links: ../input/openvino-wheels
Processing /kaggle/input/openvino-wheels/openvino_dev-2024.6.0-17404-py3-none-any.whl (from openvino-dev[onnx]==2024.6.0->-r ../input/openvino-wheels/requirements.txt (line 1))
Processing /kaggle/input/openvino-wheels/networkx-3.1-py3-none-any.whl (from openvino-dev==2024.6.0->openvino-dev[onnx]==2024.6.0->-r ../input/openvino-wheels/requirements.txt (line 1))
Processing /kaggle/input/openvino-wheels/openvino_telemetry-2025.1.0-py3-none-any.whl (from openvino-dev==2024.6.0->openvino-dev[onnx]==2024.6.0->-r ../input/openvino-wheels/requirements.txt (line 1))
Processing /kaggle/input/openvino-wheels/openvino-2024.6.0-17404-cp311-cp311-manylinux2014_x86_64.whl (from openvino-dev==2024.6.0->openvino-dev[onnx]==2024.6.0->-r ../input/openvino-wheels/requirements.txt (line 1))
Processing /kaggle/input/openvino-wheels/fastjsonschema-2.17.1-py3-none-any.whl (from openvino-dev[onnx]==2024.6.0->-r ../input/openvino-wheels/requirements.txt (line 1))

In [2]:
import os
import gc
import warnings
import logging
import time
import math
import cv2
from pathlib import Path
import joblib
from typing import Union
import numpy as np
import pandas as pd
import librosa
import soundfile as sf
from soundfile import SoundFile 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
import timm
import functools 
from tqdm.auto import tqdm
from glob import glob
import torchaudio
import random
import itertools
from typing import Union

import pickle
import torchaudio
import torchaudio.transforms as AT
from contextlib import contextmanager
import concurrent.futures


from openvino.tools import mo # 用于模型转换
import openvino as ov
from openvino.runtime import Core # 用于模型加载和推理


warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)


# Registry of OpenVINO IR (.xml) files, keyed by backbone name.
# Each value is the list of model files that load_openvino_model() compiles
# for that backbone; a list with several entries is used as an ensemble.
model_dicts = {
    "efficientnet_b0":[
                       '/kaggle/input/generate-effinet-openvino/model_0/efficientnet_b0.xml' ,
                        "/kaggle/input/generate-effinet-openvino/model_1/efficientnet_b0.xml",
                        "/kaggle/input/generate-effinet-openvino/model_2/efficientnet_b0.xml" ],
     "seresnext26t_32x4d":[
         "/kaggle/input/generate-seresnext-openvino/openvino_models/seresnext26t_32x4d.xml"],
     
     'eca_nfnet_l0':["/kaggle/input/generate-eca-nfnet-openvino/model_0/eca_nfnet_l0_0.xml",
                     "/kaggle/input/generate-eca-nfnet-openvino/model_1/eca_nfnet_l0_1.xml",
                     "/kaggle/input/generate-eca-nfnet-openvino/model_2/eca_nfnet_l0_2.xml"],
     'convnextv2_nano.fcmae_ft_in22k_in1k':[
         "/kaggle/input/generate-convnextv2-openvino/model_0/convnextv2_nano.fcmae_ft_in22k_in1k.xml",
         "/kaggle/input/generate-convnextv2-openvino/model_1/convnextv2_nano.fcmae_ft_in22k_in1k.xml"]
 }

def load_openvino_model(xml_paths, device="CPU"):
    """
    Read every OpenVINO IR file in `xml_paths` and compile it for `device`.

    Parameters
    ----------
    xml_paths : iterable of str
        Paths to OpenVINO IR ``.xml`` files.
    device : str, default "CPU"
        Device name passed to ``Core.compile_model``.

    Returns
    -------
    list
        One compiled model per input path, in the same order.
    """
    runtime = Core()  # a single Core instance is shared by all models
    compiled = []
    for ir_file in xml_paths:
        network = runtime.read_model(model=ir_file)
        compiled.append(runtime.compile_model(model=network, device_name=device))
        print(f"OpenVINO 模型 '{Path(ir_file).stem}' 已编译到设备: {device}")
    return compiled

     
     
     

# First model: efficientnet_b0


In [3]:
def apply_power_to_low_ranked_cols(
    p: np.ndarray,
    top_k: int = 30,
    exponent: Union[int, float] = 2,
    inplace: bool = True
) -> np.ndarray:
    """
    Raise every column outside the `top_k` strongest ones to `exponent`.

    Columns are ranked by their column-wise maximum (descending). The first
    `top_k` columns of that ranking are left untouched; all remaining columns
    are replaced by ``column ** exponent``.

    Parameters
    ----------
    p : np.ndarray
        2-D array indexed as ``[row, column]``.
    top_k : int
        Number of highest-ranked columns to keep unchanged.
    exponent : int or float
        Power applied to the remaining columns.
    inplace : bool
        When True the input array is modified and returned; otherwise a copy is.

    Returns
    -------
    np.ndarray
        The transformed array.
    """
    target = p if inplace else p.copy()

    column_peaks = target.max(axis=0)
    ranking = np.argsort(-column_peaks)   # strongest column first
    low_ranked = ranking[top_k:]

    target[:, low_ranked] = target[:, low_ranked] ** exponent
    return target
    
class CFG:
    # Configuration for the first (efficientnet_b0) inference pipeline:
    # input paths, audio / mel-spectrogram parameters and inference options.
 
    test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
    submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
    taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'

    # Audio parameters
    FS = 32000  # sample rate (Hz) passed to librosa.load and melspectrogram
    WINDOW_SIZE = 5  # window length in seconds (FS * WINDOW_SIZE samples per segment)
    
    # Mel spectrogram parameters
    N_FFT = 1024
    HOP_LENGTH = 512
    N_MELS = 128
    FMIN = 50
    FMAX = 16000
    TARGET_SHAPE = (256, 256)  # spectrograms are resized to this shape before inference

  
    in_channels = 1
    device = 'cpu'  
    
    # Inference parameters
    batch_size = 32
    use_tta = False
    # Whether to use test-time augmentation (TTA), i.e. augmenting each test
    # sample to improve model performance. In the visible code this flag is
    # only printed by main().
    tta_count = 2   
    # Number of TTA passes: when use_tta is True, how many augmentations are
    # applied to each test sample.
    threshold = 0.5

    debug =  False
    # True  False
    debug_count = 3  # number of files processed when debug is True

# Instantiate the configuration and load the class list from the taxonomy file.
cfg = CFG()
print(f"Using device: {cfg.device}")
print(f"Loading taxonomy data...")
taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
# The order of species_ids defines the order of the per-class prediction columns.
species_ids = taxonomy_df['primary_label'].tolist()
num_classes = len(species_ids)
print(f"Number of classes: {num_classes}")

def audio2melspec(audio_data, cfg):
    """
    Turn a waveform into a min-max normalised log-mel spectrogram.

    NaN samples are replaced by the mean of the non-NaN samples before the
    spectrogram is computed. The power spectrogram is converted to dB relative
    to its maximum and then rescaled to the [0, 1] range.

    Parameters
    ----------
    audio_data : np.ndarray
        Waveform samples.
    cfg : object
        Provides FS, N_FFT, HOP_LENGTH, N_MELS, FMIN and FMAX.

    Returns
    -------
    np.ndarray
        Normalised mel spectrogram.
    """
    nan_mask = np.isnan(audio_data)
    if nan_mask.any():
        fill_value = np.nanmean(audio_data)
        audio_data = np.nan_to_num(audio_data, nan=fill_value)

    mel_kwargs = dict(
        y=audio_data,
        sr=cfg.FS,
        n_fft=cfg.N_FFT,
        hop_length=cfg.HOP_LENGTH,
        n_mels=cfg.N_MELS,
        fmin=cfg.FMIN,
        fmax=cfg.FMAX,
        power=2.0,
        pad_mode="reflect",
        norm='slaney',
        htk=True,
        center=True,
    )
    power_spec = librosa.feature.melspectrogram(**mel_kwargs)

    spec_db = librosa.power_to_db(power_spec, ref=np.max)

    # Min-max scaling; the epsilon avoids division by zero on a flat spectrogram.
    lowest = spec_db.min()
    highest = spec_db.max()
    return (spec_db - lowest) / (highest - lowest + 1e-8)

def process_audio_segment(audio_data, cfg):
    """
    Convert one audio segment into a mel spectrogram of shape cfg.TARGET_SHAPE.

    Segments shorter than ``cfg.FS * cfg.WINDOW_SIZE`` samples are right-padded
    with zeros before the spectrogram is computed.

    Parameters
    ----------
    audio_data : np.ndarray
        1-D waveform for a single window.
    cfg : object
        Provides FS, WINDOW_SIZE, TARGET_SHAPE and the mel parameters read by
        audio2melspec().

    Returns
    -------
    np.ndarray
        Spectrogram whose ``.shape`` equals ``cfg.TARGET_SHAPE``.
    """
    required_samples = cfg.FS * cfg.WINDOW_SIZE
    if len(audio_data) < required_samples:
        audio_data = np.pad(audio_data,
                            (0, required_samples - len(audio_data)),
                            mode='constant')

    mel_spec = audio2melspec(audio_data, cfg)

    # Resize if needed
    if mel_spec.shape != cfg.TARGET_SHAPE:
        # TARGET_SHAPE is compared against ndarray.shape, i.e. (rows, cols),
        # whereas cv2.resize expects dsize as (width, height) = (cols, rows).
        # Swap explicitly so a non-square target does not come out transposed
        # (the result is unchanged for the square default).
        target_rows, target_cols = cfg.TARGET_SHAPE
        mel_spec = cv2.resize(mel_spec, (target_cols, target_rows),
                              interpolation=cv2.INTER_LINEAR)

    return mel_spec
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def predict_on_spectrogram(audio_path, models, cfg, species_ids):
    """
    Predict species scores for every full window of one audio file.

    The file is loaded at ``cfg.FS`` and cut into consecutive, non-overlapping
    windows of ``cfg.WINDOW_SIZE`` seconds; a trailing partial window is
    dropped. Each window is converted to a mel spectrogram, run through every
    model in `models`, passed through a sigmoid and averaged across models.
    The per-file matrix is finally post-processed with
    apply_power_to_low_ranked_cols().

    Parameters
    ----------
    audio_path : str
        Path to the audio file.
    models : list
        Compiled OpenVINO models; each is called with the input array and its
        result is indexed with ``'output_0'``.
    cfg : object
        Provides FS, WINDOW_SIZE and the spectrogram settings.
    species_ids : list
        Class labels; only the length is used here (for the empty result).

    Returns
    -------
    (list[str], np.ndarray)
        Row ids ``"<file stem>_<end second>"`` and the matching score matrix
        with one row per window. For a file shorter than one window an empty
        list and a ``(0, len(species_ids))`` array are returned.
    """
    predictions = []
    row_ids = []
    soundscape_id = Path(audio_path).stem   # file name without extension

    print(f"predict_on_spectrogram:Processing {soundscape_id}")
    audio_data, _ = librosa.load(audio_path, sr=cfg.FS)

    segment_len = cfg.FS * cfg.WINDOW_SIZE
    total_segments = len(audio_data) // segment_len

    # Cut the soundscape into consecutive windows of WINDOW_SIZE seconds.
    for segment_idx in range(total_segments):
        start_sample = segment_idx * segment_len
        segment_audio = audio_data[start_sample:start_sample + segment_len]

        end_time_sec = (segment_idx + 1) * cfg.WINDOW_SIZE
        row_ids.append(f"{soundscape_id}_{end_time_sec}")

        mel_spec = process_audio_segment(segment_audio, cfg)
        # Add batch and channel axes.
        ov_input = mel_spec[np.newaxis, np.newaxis, :, :].astype(np.float32)

        # The original single-model branch referenced the undefined name
        # `model`; iterating over `models` covers both cases.
        segment_preds = [
            sigmoid(model(ov_input)['output_0']).squeeze() for model in models
        ]
        if len(segment_preds) == 1:
            final_preds = segment_preds[0]
        else:
            final_preds = np.mean(segment_preds, axis=0)

        predictions.append(final_preds)

    if not predictions:
        # np.vstack([]) raises; return an empty, correctly shaped matrix instead.
        return row_ids, np.empty((0, len(species_ids)), dtype=np.float32)

    predictions = np.vstack(predictions)
    # First smoothing step: damp the columns outside the 30 strongest classes.
    predictions = apply_power_to_low_ranked_cols(predictions, top_k=30, exponent=2)
    return row_ids, predictions

def run_inference(cfg, models, species_ids):
    """
    Predict on every ``*.ogg`` file in ``cfg.test_soundscapes``.

    When ``cfg.debug`` is set, only the first ``cfg.debug_count`` files are used.

    Returns
    -------
    (list, list)
        All row ids and, in the same order, the prediction vectors as lists.
    """
    soundscapes = list(Path(cfg.test_soundscapes).glob('*.ogg'))

    if cfg.debug:
        print(f"run_inference:Debug mode enabled, using only {cfg.debug_count} files")
        soundscapes = soundscapes[:cfg.debug_count]

    print(f"run_inference:Found {len(soundscapes)} test soundscapes")

    collected_ids, collected_preds = [], []
    for ogg_file in tqdm(soundscapes):
        file_ids, file_preds = predict_on_spectrogram(str(ogg_file), models, cfg, species_ids)
        collected_ids.extend(file_ids)
        collected_preds.extend(file_preds.tolist())

    # Row ids and their matching prediction vectors.
    return collected_ids, collected_preds

def create_submission(row_ids, predictions, species_ids, cfg):
    """
    Assemble the submission table.

    One column per species is built from `predictions` (column i holds element
    i of every prediction vector). Columns that appear in the sample submission
    but not in `species_ids` are filled with 0.0, and the result is ordered
    like the sample submission.

    Parameters
    ----------
    row_ids : list
        Row identifiers.
    predictions : sequence
        One indexable prediction vector per row id.
    species_ids : list
        Species labels, aligned with the positions in each prediction vector.
    cfg : object
        Provides ``submission_csv``, the path to the sample submission.

    Returns
    -------
    pd.DataFrame
        Frame with ``row_id`` as the first column followed by species columns.
    """
    print("create_submission:Creating submission dataframe...")

    columns = {'row_id': row_ids}
    columns.update(
        (species, [vector[pos] for vector in predictions])
        for pos, species in enumerate(species_ids)
    )
    frame = pd.DataFrame(columns).set_index('row_id')

    template = pd.read_csv(cfg.submission_csv, index_col='row_id')

    absent = set(template.columns) - set(frame.columns)
    if absent:
        print(f"create_submission:Warning: Missing {len(absent)} species columns in submission")
        for name in absent:
            frame[name] = 0.0

    # Match the column order of the sample submission, then restore row_id.
    return frame[template.columns].reset_index()
def main():
    """
    Run the efficientnet_b0 pipeline end to end and write ``submission0.csv``.

    Steps: compile the models, predict on all test soundscapes, build the
    submission table, save it, then apply temporal smoothing per soundscape
    and save the file again.
    """
    start_time = time.time()
    print("main:Starting BirdCLEF-2025 inference...")
    print(f"TTA enabled: {cfg.use_tta} (variations: {cfg.tta_count if cfg.use_tta else 0})")

    models = load_openvino_model(xml_paths=model_dicts["efficientnet_b0"])

    if not models:
        print("main:No models found! Please check model paths.")
        return

    print(f"Model usage: {'Single model' if len(models) == 1 else f'Ensemble of {len(models)} models'}")

    row_ids, predictions = run_inference(cfg, models, species_ids)

    submission_df = create_submission(row_ids, predictions, species_ids, cfg)

    submission_path = 'submission0.csv'
    submission_df.to_csv(submission_path, index=False)

    # Temporal smoothing: blend every row with its neighbours inside the same
    # soundscape (row_id is "<soundscape>_<end second>").
    sub = pd.read_csv(submission_path)
    cols = sub.columns[1:]
    groups = sub['row_id'].str.rsplit('_', n=1).str[0]
    groups = groups.values
    for group in np.unique(groups):
        mask = groups == group
        group_preds = sub.loc[mask, cols].values
        if group_preds.shape[0] < 2:
            # A single-row soundscape has no neighbour to blend with; the
            # original code raised IndexError here.
            continue
        smoothed = group_preds.copy()
        # Interior rows: 10% previous, 80% current, 10% next.
        smoothed[1:-1] = (group_preds[:-2] * 0.1) + (group_preds[1:-1] * 0.8) + (group_preds[2:] * 0.1)
        # Boundary rows only have one neighbour.
        smoothed[0] = (group_preds[0] * 0.8) + (group_preds[1] * 0.2)
        smoothed[-1] = (group_preds[-1] * 0.8) + (group_preds[-2] * 0.2)
        # Write back with .loc on the parent frame instead of assigning into a
        # slice (avoids pandas chained-assignment pitfalls).
        sub.loc[mask, cols] = smoothed
    sub.to_csv(submission_path, index=False)

    end_time = time.time()
    print(f"Inference completed in {(end_time - start_time)/60:.2f} minutes")
if __name__ == "__main__":
    main()


Using device: cpu
Loading taxonomy data...
Number of classes: 206
main:Starting BirdCLEF-2025 inference...
TTA enabled: False (variations: 0)
OpenVINO 模型 'efficientnet_b0' 已编译到设备: CPU
OpenVINO 模型 'efficientnet_b0' 已编译到设备: CPU
OpenVINO 模型 'efficientnet_b0' 已编译到设备: CPU
Model usage: Ensemble of 3 models
run_inference:Found 0 test soundscapes


0it [00:00, ?it/s]

create_submission:Creating submission dataframe...
Inference completed in 0.03 minutes


In [4]:
# df = pd.read_csv("/kaggle/working/submission0.csv")
# df.head()

# Second model: seresnext26t_32x4d

In [5]:
class CFG:
    # Configuration for the second (seresnext26t_32x4d) pipeline. This
    # definition replaces the CFG class defined earlier in the notebook.
    
    seed = 42
    print_freq = 100
    num_workers = 4

    stage = 'train_bce'

    train_datadir = '/kaggle/input/birdclef-2025/test_audio'
    train_csv = '/kaggle/input/birdclef-2025/train.csv'
    test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
    submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
    taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'
                  
 
    pretrained = False
    in_channels = 1

    
    SR = 32000  # sample rate used by load_sample() and gen_melspec()
    target_duration = 5  # seconds covered by each prediction row
    train_duration = 10  # seconds of audio fed to the model per row (target plus context)
    infer_duration  = 5  # seconds of the model output kept when pooling predictions


    # Mel-spectrogram parameters (read by gen_melspec()).
    sample_rate=32000
    hop_length=417
    n_mels=256
    f_min=20
    f_max=16000
    n_fft=2048
    normal=80  # selects the dB normalisation branch: 80 -> (spec + 80) / 80, 255 -> spec / 255

    device = 'cpu'

# Rebinds the module-level `cfg` to the new configuration.
cfg = CFG()

In [6]:
# Reload the class list from the taxonomy file using the current cfg.
print(f"Using device: {cfg.device}")
print(f"Loading taxonomy data...")
taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
# The order of species_ids defines the order of the per-class prediction columns.
species_ids = taxonomy_df['primary_label'].tolist()
num_classes = len(species_ids)
print(f"Number of classes: {num_classes}")

Using device: cpu
Loading taxonomy data...
Number of classes: 206


In [7]:
def set_seed(seed=42):
    """
    Seed every random number generator used in the notebook.

    Covers Python's `random`, the PYTHONHASHSEED environment variable, NumPy,
    and PyTorch (CPU and CUDA), and switches cuDNN to deterministic mode.
    """
    # Python-level sources of randomness
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (CPU, current CUDA device, all CUDA devices)
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # cuDNN: prefer reproducibility over auto-tuned kernels
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(cfg.seed)


In [8]:


class BirdCLEFModel(nn.Module):
    """
    timm backbone + attention head that maps raw audio to per-class scores.

    `cfg` is accessed with subscript syntax (``cfg['n_mels']``), so it must be
    a mapping-like object; the keys read by this class are 'n_mels',
    'model_name', 'in_channels', 'SR', 'hop_length', 'f_min', 'f_max',
    'n_fft', 'device', 'normal', 'infer_duration' and 'duration_train'.

    NOTE(review): the class relies on `AttBlockV2` (defined in a later cell of
    this notebook) and on `image_delta` (not defined in the visible source);
    both must exist before the relevant code paths run.
    """

    def __init__(self, cfg):
        """Build the backbone, the attention head and the audio transforms."""
        super().__init__()
        self.cfg = cfg
        
        # The number of output classes is the number of rows in the taxonomy file.
        taxonomy_df = pd.read_csv('/kaggle/input/birdclef-2025/taxonomy.csv')
        self.num_classes = len(taxonomy_df)

        # Batch norm with one feature per mel bin (applied in extract_feature).
        self.bn0 = nn.BatchNorm2d(cfg['n_mels'])
        
        self.backbone = timm.create_model(
            cfg['model_name'],
            pretrained=False,
            in_chans=cfg['in_channels'],
            drop_rate=0.0,
            drop_path_rate=0.0,
        )

        # Keep every child module of the backbone except the last two.
        layers = list(self.backbone.children())[:-2]
        self.encoder = nn.Sequential(*layers)
        
        # The feature width is read from a model-family-specific attribute.
        if "efficientnet" in self.cfg['model_name']:
            backbone_out = self.backbone.classifier.in_features
        elif "eca" in self.cfg['model_name']:
            backbone_out = self.backbone.head.fc.in_features
        elif "res" in self.cfg['model_name']:
            backbone_out = self.backbone.fc.in_features
        else:
            backbone_out = self.backbone.num_features
            
        
        self.fc1 = nn.Linear(backbone_out, backbone_out, bias=True)
        self.att_block = AttBlockV2(backbone_out, self.num_classes, activation="sigmoid")

        self.melspec_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.cfg['SR'],
            hop_length=self.cfg['hop_length'],
            n_mels=self.cfg['n_mels'],
            f_min=self.cfg['f_min'],
            f_max=self.cfg['f_max'],
            n_fft=self.cfg['n_fft'],
            pad_mode="constant",
            norm="slaney",
            onesided=True,
            mel_scale="htk",
        )
        if self.cfg['device'] == "cuda":
            self.melspec_transform = self.melspec_transform.cuda()
        else:
            self.melspec_transform = self.melspec_transform.cpu()

        self.db_transform = torchaudio.transforms.AmplitudeToDB(
            stype="power", top_db=80
        )


    def extract_feature(self,x):
        """
        Encode a spectrogram batch into a per-frame feature sequence.

        Returns the feature tensor and `frames_num`, the size of dimension 2
        after the initial permute.
        """
        # assumes x is 4-D (batch, channels, mel, time) — TODO confirm against caller
        x = x.permute((0, 1, 3, 2))
        frames_num = x.shape[2]
        
        # Swap dims 1 and 3 so bn0 normalises over what is now dim 1, then swap back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        
        # if self.training:
        #    x = self.spec_augmenter(x)
        
        x = x.transpose(2, 3)
        # (batch_size, channels, freq, frames)
        x = self.encoder(x)
        
        # (batch_size, channels, frames)
        x = torch.mean(x, dim=2)
        
        # channel smoothing
        # Sum of max-pool and avg-pool with kernel 3 / stride 1 / padding 1
        # along the last dimension; the length is preserved.
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2
        
        x = F.dropout(x, p=0.5, training=self.training)
        # fc1 acts on the last dimension, so move channels there and back.
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        return x, frames_num
        
    @torch.cuda.amp.autocast(enabled=False)
    def transform_to_spec(self, audio):
        """
        Convert audio to a normalised dB mel spectrogram (autocast disabled).

        Raises NotImplementedError when cfg['normal'] is neither 80 nor 255.
        """

        audio = audio.float()
        
        spec = self.melspec_transform(audio)
        spec = self.db_transform(spec)

        if self.cfg['normal'] == 80:
            spec = (spec + 80) / 80
        elif self.cfg['normal'] == 255:
            spec = spec / 255
        else:
            raise NotImplementedError
                
        if self.cfg['in_channels'] == 3:
            # NOTE(review): image_delta is not defined in the visible source.
            spec = image_delta(spec)
        
        return spec

    def forward(self, x):
        """
        Return clip-level logits, computed as torch.logit(clipwise_output).

        `logit`, `segmentwise_logit` and `segmentwise_output` are computed
        below but are not part of the return value.
        """

        # The spectrogram front end runs without gradient tracking.
        with torch.no_grad():
            x = self.transform_to_spec(x)
        
        x, frames_num = self.extract_feature(x)
        
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
        segmentwise_logit = self.att_block.cla(x).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        return torch.logit(clipwise_output)

    def infer(self, x, tta_delta=2):
        """
        Predict from the central part of the feature sequence, blended with
        two windows shifted by `tta_delta` frames.

        The central window spans the fraction
        cfg['infer_duration'] / cfg['duration_train'] of the time axis. The
        result is 0.5 * centre + 0.25 * shifted-left + 0.25 * shifted-right.
        """
        with torch.no_grad():
            x = self.transform_to_spec(x)
        x,_ = self.extract_feature(x)
        # time_att is passed to attention_infer, which does not use it.
        time_att = torch.tanh(self.att_block.att(x))
        feat_time = x.size(-1)
        start = (
            feat_time / 2 - feat_time * (self.cfg['infer_duration'] / self.cfg['duration_train']) / 2
        )
        end = start + feat_time * (self.cfg['infer_duration'] / self.cfg['duration_train'])
        start = int(start)
        end = int(end)
        pred = self.attention_infer(start,end,x,time_att)

        # Window shifted towards the start; the lower bound is clamped at 0.
        start_minus = max(0, start-tta_delta)
        end_minus=end-tta_delta
        pred_minus = self.attention_infer(start_minus,end_minus,x,time_att)

        # Window shifted towards the end; the upper bound is clamped at feat_time.
        start_plus = start+tta_delta
        end_plus=min(feat_time, end+tta_delta)
        pred_plus = self.attention_infer(start_plus,end_plus,x,time_att)

        pred = 0.5*pred + 0.25*pred_minus + 0.25*pred_plus
        return pred
        
    def attention_infer(self,start,end,x,time_att):
        """
        Max over frames start:end of the sigmoid of the classification head.

        `time_att` is accepted but not used.
        """
        feat = x[:, :, start:end]
        framewise_pred = torch.sigmoid(self.att_block.cla(feat))
        framewise_pred_max = framewise_pred.max(dim=2)[0]
        return framewise_pred_max

In [9]:
def load_sample(path, cfg):
    """
    Cut an audio file into one context-extended clip per full target window.

    For every complete window of ``cfg.target_duration`` seconds, a clip of
    ``cfg.train_duration`` seconds centred on that window is extracted. The
    audio is tiled three times so that the context of the first and last
    windows can be sliced without bounds handling; the samples that would
    wrap around the file boundary are then overwritten with zeros.

    Parameters
    ----------
    path : str
        Audio file readable by soundfile.
    cfg : object
        Provides SR, target_duration and train_duration.

    Returns
    -------
    list of np.ndarray
        One float32 clip per full window (empty when the file is shorter than
        one window).

    Notes
    -----
    The sample rate reported by the file is ignored.
    # NOTE(review): presumably every input is mono at cfg.SR — confirm.
    """
    audio, _ = sf.read(path, dtype="float32")

    window_len = cfg.SR * cfg.target_duration
    # End time (seconds) of every complete window; a trailing partial window is dropped.
    seconds = [
        int(k * window_len / cfg.SR)
        for k in range(1, len(audio) // window_len + 1)
    ]

    original_len = len(audio)
    audio = np.concatenate([audio, audio, audio])

    half_context = (cfg.train_duration - cfg.target_duration) / 2
    pad = int(cfg.SR * (cfg.train_duration - cfg.target_duration) / 2)
    last = len(seconds) - 1

    audios = []
    for i, end_seconds in enumerate(seconds):
        start_seconds = int(end_seconds - cfg.target_duration)

        # Offsetting by original_len indexes into the middle copy of the audio.
        start_index = int(cfg.SR * (start_seconds - half_context)) + original_len
        end_index = int(cfg.SR * (end_seconds + half_context)) + original_len
        y = audio[start_index:end_index].astype(np.float32)

        # Zero the context that wrapped around the file boundary. The two
        # checks are independent so a file with a single window has both
        # sides cleared, and pad == 0 is skipped because y[-0:] would select
        # the whole clip.
        if pad > 0:
            if i == 0:
                y[:pad] = 0
            if i == last:
                y[-pad:] = 0
        audios.append(y)

    return audios
def gen_melspec(x):
    """
    Compute a normalised dB mel spectrogram for a NumPy waveform.

    Uses the module-level `cfg` for all spectrogram parameters. Raises
    NotImplementedError when ``cfg.normal`` is neither 80 nor 255.

    Returns
    -------
    torch.Tensor
        The normalised spectrogram.
    """
    waveform = torch.from_numpy(x.astype(np.float32))

    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=cfg.SR,
        hop_length=cfg.hop_length,
        n_mels=cfg.n_mels,
        f_min=cfg.f_min,
        f_max=cfg.f_max,
        n_fft=cfg.n_fft,
        pad_mode="constant",
        norm="slaney",
        onesided=True,
        mel_scale="htk",
    ).cpu()
    to_db = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

    spec = to_db(to_mel(waveform))

    # Normalisation scheme selected by cfg.normal.
    if cfg.normal == 80:
        return (spec + 80) / 80
    if cfg.normal == 255:
        return spec / 255
    raise NotImplementedError
    
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``."""
    return 1 / (1 + np.exp(-x))

In [10]:
def predict_on_spectrogram(audio_path, compiled_model, cfg, species_ids):
    """
    Predict species scores for every window of one audio file.

    Each clip returned by load_sample() is converted to a mel spectrogram and
    run through `compiled_model`. The tensor read from the model's output
    index 1 is max-pooled along axis 1 over a central window and two windows
    shifted by two frames, and the three results are blended
    0.5 / 0.25 / 0.25.

    Parameters
    ----------
    audio_path : str or Path
        Audio file to process.
    compiled_model : openvino compiled model
        A dedicated infer request is created from it for this call.
    cfg : object
        Provides target_duration, infer_duration, train_duration and the
        settings read by load_sample().
    species_ids : list
        Class labels; only the length is used here (for the empty result).

    Returns
    -------
    (list[str], np.ndarray)
        Row ids ``"<file stem>_<end second>"`` and the stacked predictions,
        one row per clip. When the file yields no clip, an empty list and a
        ``(0, len(species_ids))`` array are returned.
    """
    audio_path = str(audio_path)
    predictions = []
    row_ids = []
    soundscape_id = Path(audio_path).stem

    print(f"Processing {soundscape_id}")
    audio_data = load_sample(audio_path, cfg)

    if len(audio_data) == 0:
        # np.stack([]) raises ValueError; return an empty, well-shaped result.
        return row_ids, np.empty((0, len(species_ids)), dtype=np.float32)

    model_input_name = compiled_model.input(0).get_any_name()
    model_output_name = compiled_model.output(1).get_any_name()
    infer_request = compiled_model.create_infer_request()

    # Loop-invariant settings for the pooling windows.
    infer_duration_ratio = cfg.infer_duration / cfg.train_duration
    tta_delta_feat = 2

    for segment_idx, audio_input in enumerate(audio_data):

        end_time_sec = (segment_idx + 1) * cfg.target_duration
        row_ids.append(f"{soundscape_id}_{end_time_sec}")

        x = gen_melspec(audio_input).numpy()
        # Prepend two singleton axes before handing the array to OpenVINO.
        ov_input_audio = x[np.newaxis, np.newaxis, :].astype(np.float32)

        infer_request.set_tensor(model_input_name, ov.Tensor(ov_input_audio))
        infer_request.infer()
        segmentwise_probabilities = infer_request.get_tensor(model_output_name).data

        # Axis 1 is the one that gets windowed and max-pooled below.
        feat_time = segmentwise_probabilities.shape[1]

        start_center = int(feat_time / 2 - feat_time * infer_duration_ratio / 2)
        end_center = int(start_center + feat_time * infer_duration_ratio)

        # Shifted windows, clamped to the valid index range.
        start_minus = max(0, start_center - tta_delta_feat)
        end_minus = end_center - tta_delta_feat
        start_plus = start_center + tta_delta_feat
        end_plus = min(feat_time, end_center + tta_delta_feat)

        pred_center = np.max(segmentwise_probabilities[:, start_center:end_center, :], axis=1)
        pred_minus = np.max(segmentwise_probabilities[:, start_minus:end_minus, :], axis=1)
        pred_plus = np.max(segmentwise_probabilities[:, start_plus:end_plus, :], axis=1)

        final_preds = 0.5 * pred_center + 0.25 * pred_minus + 0.25 * pred_plus

        predictions.append(final_preds.squeeze())

    predictions = np.stack(predictions, axis=0)

    return row_ids, predictions

In [11]:
def run_inference(cfg, models, species_ids):
    """
    Predict on all test soundscapes using a pool of four worker threads.

    If ``cfg.test_soundscapes`` contains no ``*.ogg`` file, the first three
    (sorted) files of the train_soundscapes directory are used instead.
    Only ``models[0]`` is used.

    Returns
    -------
    (list, list)
        All row ids and, in the same order, one prediction vector per row.
    """
    test_files = list(Path(cfg.test_soundscapes).glob('*.ogg'))
    if not test_files:
        fallback_pattern = str(Path('/kaggle/input/birdclef-2025/train_soundscapes') / '*.ogg')
        test_files = sorted(glob(fallback_pattern))[:3]

    print(f"Found {len(test_files)} test soundscapes")

    worker = functools.partial(
        predict_on_spectrogram,
        compiled_model=models[0],
        cfg=cfg,
        species_ids=species_ids,
    )

    # executor.map preserves the order of test_files in its results.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        per_file = list(pool.map(worker, test_files))

    all_row_ids = list(itertools.chain.from_iterable(ids for ids, _ in per_file))
    all_predictions = list(itertools.chain.from_iterable(preds for _, preds in per_file))

    return all_row_ids, all_predictions

def create_submission(row_ids, predictions, species_ids, cfg):
    """
    Build the submission table from row ids and prediction vectors.

    Column i of the result holds element i of every prediction vector and is
    named after ``species_ids[i]``. Species present in the sample submission
    but absent from `species_ids` are added as 0.0, and the columns are
    reordered to match the sample submission.

    Returns
    -------
    pd.DataFrame
        ``row_id`` followed by the species columns of the sample submission.
    """
    print("Creating submission dataframe...")

    per_species = {
        label: [scores[idx] for scores in predictions]
        for idx, label in enumerate(species_ids)
    }
    table = pd.DataFrame({'row_id': row_ids, **per_species})
    table = table.set_index('row_id')

    reference = pd.read_csv(cfg.submission_csv, index_col='row_id')

    not_predicted = set(reference.columns) - set(table.columns)
    if not_predicted:
        print(f"Warning: Missing {len(not_predicted)} species columns in submission")
        for label in not_predicted:
            table[label] = 0.0

    ordered = table[reference.columns]
    return ordered.reset_index()


def smooth_submission(submission_path):
    """
    Smooth a submission CSV in place along time, per soundscape.

    Rows are grouped by the part of ``row_id`` before the last underscore.
    Inside a group with more than one row, interior rows become
    0.1 * previous + 0.8 * current + 0.1 * next, and the first / last rows
    become 0.8 * own + 0.2 * only neighbour. Single-row groups are unchanged.

    :param submission_path: Path to the submission CSV file (overwritten).
    """
    print("Smoothing submission predictions...")
    sub = pd.read_csv(submission_path)
    cols = sub.columns[1:]
    # Soundscape name = row_id with the trailing "_<seconds>" removed.
    groups = sub['row_id'].str.rsplit('_', n=1).str[0].values

    for group in np.unique(groups):
        idx = np.where(groups == group)[0]
        raw = sub.iloc[idx].copy()[cols].values
        smoothed = raw.copy()

        if raw.shape[0] > 1:
            # Edges first, then all interior rows at once.
            smoothed[0] = (raw[0] * 0.8) + (raw[1] * 0.2)
            smoothed[-1] = (raw[-1] * 0.8) + (raw[-2] * 0.2)
            smoothed[1:-1] = (raw[:-2] * 0.1) + (raw[1:-1] * 0.8) + (raw[2:] * 0.1)

        # Write the group's values back (all columns after row_id).
        sub.iloc[idx, 1:] = smoothed

    sub.to_csv(submission_path, index=False)
    print(f"Smoothed submission saved to {submission_path}")

In [12]:
def main():
    """
    Run the seresnext26t_32x4d pipeline and write ``submission1.csv``.

    Compiles the model, predicts on the soundscapes, saves the raw submission
    and then smooths the saved file in place.
    """
    started = time.time()
    print("Starting BirdCLEF-2025 inference...")

    models = load_openvino_model(xml_paths=model_dicts["seresnext26t_32x4d"])
    if not models:
        print("No models found! Please check model paths.")
        return

    if len(models) == 1:
        usage = 'Single model'
    else:
        usage = f'Ensemble of {len(models)} models'
    print(f"Model usage: {usage}")

    row_ids, predictions = run_inference(cfg, models, species_ids)
    submission_path = 'submission1.csv'
    create_submission(row_ids, predictions, species_ids, cfg).to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")

    # Temporal smoothing rewrites the file that was just saved.
    smooth_submission(submission_path)

    elapsed_minutes = (time.time() - started) / 60
    print(f"Inference completed in {elapsed_minutes:.2f} minutes")

In [13]:
# Run the seresnext26t_32x4d pipeline; `main` here is the most recent definition.
if __name__ == "__main__":
    main()

Starting BirdCLEF-2025 inference...
OpenVINO 模型 'seresnext26t_32x4d' 已编译到设备: CPU
Model usage: Single model
Found 3 test soundscapes
Processing H02_20230420_074000
Processing H02_20230420_112000
Processing H02_20230420_154500
Creating submission dataframe...
Submission saved to submission1.csv
Smoothing submission predictions...
Smoothed submission saved to submission1.csv
Inference completed in 0.14 minutes


In [14]:
# df = pd.read_csv("/kaggle/working/submission1.csv")
# df.head()

# Third model: eca_nfnet_l0


In [15]:
def apply_power_to_low_ranked_cols(
    p: np.ndarray,
    top_k: int = 30,
    exponent: Union[int, float] = 2,
    inplace: bool = True
) -> np.ndarray:
    """
    Damp (or boost) every class column that is not among the `top_k` strongest.

    Columns are ordered by their maximum over all rows, largest first. The
    leading `top_k` columns of that ordering are kept as they are; every other
    column is replaced by its element-wise power ``column ** exponent``.

    Parameters
    ----------
    p : np.ndarray
        2-D array of shape (n_chunks, n_classes); ``p[i, j]`` is the score of
        class j in chunk i. For a 60 s soundscape cut into non-overlapping
        5 s windows, n_chunks is 12.
    top_k : int, default=30
        How many of the highest-ranked columns are left unchanged.
    exponent : int or float, default=2
        Power applied to the remaining columns (2 squares, 0.5 is the square
        root, 3 cubes).
    inplace : bool, default=True
        True modifies `p` itself; False works on a copy.

    Returns
    -------
    np.ndarray
        The transformed array: `p` itself when ``inplace=True``, otherwise a
        new array.
    """
    result = p if inplace else p.copy()

    # Rank columns by their peak value, strongest first.
    peaks = result.max(axis=0)
    order = np.argsort(-peaks)

    # Everything after the first top_k columns gets the power transform.
    weak_cols = order[top_k:]
    result[:, weak_cols] = result[:, weak_cols] ** exponent
    return result

In [16]:
# Collect the test soundscape names (sorted) for the eca_nfnet_l0 pipeline.
test_audio_dir = '../input/birdclef-2025/test_soundscapes/'
file_list = [f for f in sorted(os.listdir(test_audio_dir))]
# Keep only .ogg files and reduce each name to the text before its first '.'.
file_list = [file.split('.')[0] for file in file_list if file.endswith('.ogg')]
debug = False

# In debug mode only the first three files are processed.
if debug == True:
    file_list = file_list[:3]
    
print('Debug mode:', debug)
print('Number of test soundscapes:', len(file_list))

Debug mode: False
Number of test soundscapes: 0


In [17]:
# Audio windowing constants for the eca_nfnet_l0 pipeline.
wav_sec = 5
sample_rate = 32000
min_segment = sample_rate*wav_sec  # samples per window

# Class labels are the sorted entry names of the train_audio directory.
class_labels = sorted(os.listdir('../input/birdclef-2025/train_audio/'))

# Mel-spectrogram parameters.
n_fft=1024
win_length=1024
hop_length=512
f_min=50
f_max=16000
n_mels=128

# Shared transform instance used by audio_to_mel().
mel_spectrogram = AT.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    center=True,
    f_min=f_min,
    f_max=f_max,
    pad_mode="reflect",
    power=2.0,
    norm='slaney',
    n_mels=n_mels,
    mel_scale="htk",
    # normalized=True
)

def normalize_std(spec, eps=1e-6):
    """
    Standardise a tensor with its global mean and standard deviation.

    Returns ``(spec - mean) / (std + eps)``; when the standard deviation is
    exactly zero the centred tensor ``spec - mean`` is returned instead.
    """
    mean, std = torch.mean(spec), torch.std(spec)
    centred_only = spec - mean
    standardised = (spec - mean) / (std + eps)
    return torch.where(std == 0, centred_only, standardised)

def audio_to_mel(filepath=None):
    """
    Load an audio file and return standardised log-mel spectrograms for
    twelve consecutive 5-second windows.

    Only the first channel is used. Window boundaries are computed from the
    sample rate reported by the file, while the module-level `mel_spectrogram`
    transform was built with the module-level `sample_rate`.

    Returns
    -------
    torch.Tensor
        The twelve spectrograms stacked along the first dimension.
    """
    waveform, file_sr = torchaudio.load(filepath, backend="soundfile")
    # Keep the first channel only, as a (1, n_samples) tensor.
    mono = waveform[0, :].reshape(1, waveform.shape[1])

    window = file_sr * 5
    mels = []
    for k in range(12):
        piece = mono[:, k * window:k * window + window]
        log_mel = torch.log(mel_spectrogram(piece) + 1e-6)
        mels.append(torch.unsqueeze(normalize_std(log_mel), dim=0))
    return torch.vstack(mels)

In [18]:
def init_layer(layer):
    """Xavier-uniform initialise `layer.weight` and zero the bias if there is one."""
    nn.init.xavier_uniform_(layer.weight)
    bias = getattr(layer, "bias", None)
    if bias is None:
        return
    bias.data.fill_(0.)


def init_bn(bn):
    """Reset a batch-norm layer to the identity affine transform (weight 1, bias 0)."""
    for tensor, value in ((bn.bias, 0.), (bn.weight, 1.0)):
        tensor.data.fill_(value)


def init_weights(model):
    """Initialise a single module according to its class name.

    The scheme is picked by substring match on the class name:

    * ``Conv2d``    - Xavier-uniform weights with gain sqrt(2), zero bias.
    * ``BatchNorm`` - weights drawn from N(1, 0.02), zero bias.
    * ``GRU``       - orthogonal init for every parameter with more than one dimension.
    * ``Linear``    - weights drawn from N(0, 0.01), zero bias.

    Any other module is left untouched. Conv2d and Linear layers built without
    a bias are supported; the bias step is skipped for them.

    Args:
        model: the module to initialise in place.
    """
    classname = model.__class__.__name__
    if "Conv2d" in classname:
        nn.init.xavier_uniform_(model.weight, gain=np.sqrt(2))
        if model.bias is not None:
            model.bias.data.fill_(0)
    elif "BatchNorm" in classname:
        model.weight.data.normal_(1.0, 0.02)
        model.bias.data.fill_(0)
    elif "GRU" in classname:
        for weight in model.parameters():
            # Only matrices are orthogonalised; 1-D bias vectors are left alone.
            if weight.dim() > 1:
                nn.init.orthogonal_(weight.data)
    elif "Linear" in classname:
        model.weight.data.normal_(0, 0.01)
        if model.bias is not None:
            model.bias.data.zero_()


def interpolate(x, ratio):
    """Upsample along the time axis by repeating every step ``ratio`` times.

    Args:
        x: 3-D tensor unpacked as (batch_size, time_steps, classes_num).
        ratio: number of consecutive copies made of each time step.

    Returns:
        Tensor of shape (batch_size, time_steps * ratio, classes_num).
    """
    batch_size, time_steps, classes_num = x.shape
    tiled = x.unsqueeze(2).repeat(1, 1, ratio, 1)
    return tiled.reshape(batch_size, time_steps * ratio, classes_num)


def pad_framewise_output(framewise_output, frames_num):
    """Bilinearly resize a frame-wise output to ``frames_num`` steps along dim 1.

    The tensor gets a temporary channel axis, is resized with
    ``align_corners=True`` to ``(frames_num, framewise_output.size(2))``, and
    the temporary axis is removed again.
    """
    target_size = (frames_num, framewise_output.size(2))
    as_image = framewise_output.unsqueeze(1)
    resized = F.interpolate(
        as_image, size=target_size, mode="bilinear", align_corners=True)
    return resized.squeeze(1)


class AttBlockV2(nn.Module):
    """Attention-pooling head built from two parallel kernel-size-1 Conv1d layers.

    ``att`` produces attention scores and ``cla`` produces per-step class
    values; ``forward`` combines them into an attention-weighted sum.

    Args:
        in_features: number of input channels of both convolutions.
        out_features: number of output channels (classes).
        activation: ``"linear"`` (identity) or ``"sigmoid"``, applied to the
            ``cla`` branch.
    """

    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        self.init_weights()

    def init_weights(self):
        """Initialise both convolutions with the module-level ``init_layer``."""
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        """Return ``(pooled, norm_att, cla)`` for input ``x``.

        ``norm_att`` is a softmax over the last dimension of ``tanh(att(x))``;
        ``cla`` is the activated class branch; ``pooled`` is
        ``sum(norm_att * cla)`` over dimension 2.
        """
        # assumes x is (batch, in_features, steps), as nn.Conv1d expects — TODO confirm
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation to ``x``.

        Raises:
            ValueError: if ``self.activation`` is neither ``'linear'`` nor
                ``'sigmoid'``. Previously ``None`` was returned silently and
                the failure surfaced later as an opaque TypeError in ``forward``.
        """
        if self.activation == 'linear':
            return x
        if self.activation == 'sigmoid':
            return torch.sigmoid(x)
        raise ValueError(
            f"Unsupported activation {self.activation!r}; expected 'linear' or 'sigmoid'")


class TimmSED(nn.Module):
    """Sound-event-detection model: timm backbone + linear layer + attention head.

    Args:
        base_model_name: architecture name passed to ``timm.create_model``.
        pretrained: forwarded to ``timm.create_model``.
        num_classes: output size of the attention head.
        in_channels: forwarded to timm as ``in_chans``.
        n_mels: feature count of the ``bn0`` batch-norm layer.
    """

    def __init__(self, base_model_name: str, pretrained=False, num_classes=24, in_channels=1, n_mels=24):
        super().__init__()

        # bn0 is registered and initialised but is not applied in forward().
        self.bn0 = nn.BatchNorm2d(n_mels)

        # The encoder is the backbone minus its last two child modules.
        base_model = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        self.encoder = nn.Sequential(*list(base_model.children())[:-2])

        feature_dim = base_model.num_features

        self.fc1 = nn.Linear(feature_dim, feature_dim, bias=True)
        self.att_block2 = AttBlockV2(
            feature_dim, num_classes, activation="sigmoid")

        self.init_weight()

    def init_weight(self):
        """Initialise ``bn0`` and ``fc1`` with the module-level helpers."""
        init_bn(self.bn0)
        init_layer(self.fc1)

    def forward(self, input_data):
        """Return ``{'logit': ...}``: attention-weighted sum of the un-activated class branch."""
        # Triplicate the input along dim 1 (done on the transposed view, then transposed back).
        swapped = input_data.transpose(2, 3)
        tripled = torch.cat((swapped, swapped, swapped), 1)
        features = self.encoder(tripled.transpose(2, 3))

        # Collapse dim 2 of the feature map, then smooth along the remaining axis.
        pooled = torch.mean(features, dim=2)
        smoothed = (
            F.max_pool1d(pooled, kernel_size=3, stride=1, padding=1)
            + F.avg_pool1d(pooled, kernel_size=3, stride=1, padding=1)
        )

        # fc1 acts on the last dimension, so move channels there and back.
        hidden = F.relu_(self.fc1(smoothed.transpose(1, 2))).transpose(1, 2)

        # Only the attention weights of the head are used; the logit re-applies
        # the head's raw `cla` convolution (no sigmoid).
        _, norm_att, _ = self.att_block2(hidden)
        logit = torch.sum(norm_att * self.att_block2.cla(hidden), dim=2)

        return {
            'logit': logit,
        }

In [19]:
# Settings of the eca_nfnet_l0 ensemble; the compiled OpenVINO models are
# looked up in model_dicts under the same architecture name.
base_model_name = 'eca_nfnet_l0'
pretrained = False
in_channels = 3
models = load_openvino_model(xml_paths=model_dicts[base_model_name])



OpenVINO 模型 'eca_nfnet_l0_0' 已编译到设备: CPU
OpenVINO 模型 'eca_nfnet_l0_1' 已编译到设备: CPU
OpenVINO 模型 'eca_nfnet_l0_2' 已编译到设备: CPU


In [20]:
def prediction(afile):
    """Predict per-species scores for the 12 five-second chunks of one soundscape.

    Every model in the module-level ``models`` list is run on the file's mel
    spectrograms. Each model's sigmoid output is passed through
    ``apply_power_to_low_ranked_cols`` (top_k=30, exponent=2) and the results
    are averaged.

    Args:
        afile: soundscape file name without the ``.ogg`` extension.

    Returns:
        dict with a ``'row_id'`` list (``"<afile>_<chunk end second>"``) and
        one list of 12 scores per entry of ``class_labels``.
    """
    path = test_audio_dir + afile + '.ogg'
    ov_input = audio_to_mel(path).numpy().astype(np.float32)

    outputs = None
    for model in models:
        # A fresh infer request is created for every model on every call.
        infer_request = model.create_infer_request()
        probs = sigmoid(infer_request.infer(ov_input)['output0'])
        p = apply_power_to_low_ranked_cols(probs, top_k=30, exponent=2)
        if outputs is None:
            outputs = p
        else:
            outputs += p

    outputs /= len(models)

    # class_labels is expected to be available as a module-level global.
    local_pred_results = {
        'row_id': [afile + '_' + str((i + 1) * 5) for i in range(12)]
    }
    for bird_no, bird in enumerate(class_labels):
        local_pred_results[bird] = [outputs[i, bird_no] for i in range(12)]
    gc.collect()

    return local_pred_results




            

In [21]:
all_prediction_results = []
start = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # executor.map yields the return value of each prediction() call
    all_prediction_results.extend(executor.map(prediction, file_list))
end_t = time.time()

print(f"推理完成，耗时 {end_t - start:.2f} 秒")

# Concatenate the per-file dicts into one column-oriented dict
# (class_labels must be accessible here).
final_pred_dataframe_data = {key: [] for key in ['row_id', *class_labels]}
for single_file_results in all_prediction_results:
    for key, values in final_pred_dataframe_data.items():
        values.extend(single_file_results[key])
print("finish")

推理完成，耗时 0.00 秒
finish


In [22]:
# Build the submission frame with row_id first, followed by one column per class label.
results = pd.DataFrame(
    final_pred_dataframe_data,
    columns=['row_id', *class_labels],
)
display(results.head())

Unnamed: 0,row_id,1139490,1192948,1194042,126247,1346504,134933,135045,1462711,1462737,...,yebfly1,yebsee1,yecspi2,yectyr1,yehbla2,yehcar1,yelori1,yeofly1,yercac1,ywcpar


In [23]:
results.to_csv("submission2.csv", index=False)

# Temporal smoothing: within each soundscape (row_id without its trailing
# "_<seconds>" part) every row is blended with its neighbours, 0.1 / 0.8 / 0.1
# in the interior and 0.9 / 0.1 at both ends.
sub = pd.read_csv('submission2.csv')
cols = sub.columns[1:]
groups = sub['row_id'].str.rsplit('_', n=1).str[0].values
for group in np.unique(groups):
    group_mask = groups == group
    predictions = sub.loc[group_mask, cols].values
    if predictions.shape[0] < 2:
        # A single-row soundscape has no neighbour to blend with.
        continue
    new_predictions = predictions.copy()
    new_predictions[1:-1] = (predictions[:-2] * 0.1) + (predictions[1:-1] * 0.8) + (predictions[2:] * 0.1)
    new_predictions[0] = (predictions[0] * 0.9) + (predictions[1] * 0.1)
    new_predictions[-1] = (predictions[-1] * 0.9) + (predictions[-2] * 0.1)
    # Write back through .loc on the original frame rather than assigning
    # into a filtered copy.
    sub.loc[group_mask, cols] = new_predictions
sub.to_csv("submission2.csv", index=False)


if debug:
    display(results)

# Fourth model: ConvNeXtV2


In [24]:
def apply_power_to_low_ranked_cols(
    p: np.ndarray,
    top_k: int = 30,
    exponent: Union[int, float] = 2,
    inplace: bool = True
) -> np.ndarray:
    """Raise every column outside the ``top_k`` strongest ones to ``exponent``.

    Columns are ranked by their maximum value in descending order. The first
    ``top_k`` columns are left untouched; all others are replaced by their
    element-wise power.

    Args:
        p: 2-D score array (rows x columns).
        top_k: number of highest-peaking columns to keep unchanged.
        exponent: power applied to the remaining columns.
        inplace: when True ``p`` itself is modified, otherwise a copy is.

    Returns:
        The modified array (``p`` itself when ``inplace`` is True).
    """
    scores = p if inplace else p.copy()
    column_peaks = scores.max(axis=0)
    ranked_cols = np.argsort(-column_peaks)
    tail_cols = ranked_cols[top_k:]
    scores[:, tail_cols] = scores[:, tail_cols] ** exponent
    return scores
    

class CFG:
    """Paths and parameters for the ConvNeXtV2 inference pipeline."""

    test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
    submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
    taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'

    # ------------------------------------------- #
    # [IMPORTANT]
    # * Melspectrogram & Audio Params
    # ------------------------------------------- #
    FS = 32000            # sample rate used when loading audio
    WINDOW_SIZE = 5       # segment length in seconds
    N_FFT = 2048
    HOP_LENGTH = 512
    N_MELS = 512
    FMIN = 20
    FMAX = 16000
    TARGET_SHAPE = (256, 256)  # compared against the spectrogram's array shape

    # ------------------------------------------- #
    # * Model def
    # ------------------------------------------- #
    # Architecture name read by BirdCLEFModel (cfg.model_name); it was missing,
    # so building that model raised AttributeError. The value mirrors the
    # model_dicts key loaded in main().
    model_name = 'convnextv2_nano.fcmae_ft_in22k_in1k'
    use_specific_folds = True
    folds = [0,1]
    in_channels = 1
    device = 'cpu'

    # Inference parameters
    batch_size = 16
    use_tta = False
    tta_count = 3
    threshold = 0.5

    # util
    debug = False
    debug_count = 3

# Shared configuration instance used by the functions below.
cfg = CFG()

print(f"Using device: {cfg.device}")
print("Loading taxonomy data...")

# The taxonomy's primary_label column defines the model's class list and order.
taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
species_ids = taxonomy_df['primary_label'].tolist()
num_classes = len(species_ids)
print(f"Number of classes: {num_classes}")



class GeM(nn.Module):
    """Generalised-mean pooling over the last two dimensions with a learnable exponent.

    Args:
        p: initial value of the learnable exponent.
        eps: lower clamp applied to the input before exponentiation.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x):
        """Pool ``x`` using the module's current ``p`` and ``eps``."""
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Clamp at ``eps``, raise to ``p``, average the full window, take the ``1/p`` root."""
        full_window = (x.size(-2), x.size(-1))
        powered = x.clamp(min=eps).pow(p)
        return F.avg_pool2d(powered, full_window).pow(1. / p)

    def __repr__(self):
        p_value = self.p.data.tolist()[0]
        return f"{self.__class__.__name__}(p={p_value:.4f}, eps={self.eps})"
class BirdCLEFModel(nn.Module):
    """timm backbone with its classifier removed, followed by a fresh linear head.

    Args:
        cfg: configuration object; ``model_name`` and ``in_channels`` are read.
        num_classes: output size of the linear classifier.
    """

    def __init__(self, cfg, num_classes):
        super().__init__()
        self.cfg = cfg
        arch = cfg.model_name

        self.backbone = timm.create_model(
            arch,
            pretrained=False,
            in_chans=cfg.in_channels,
            drop_rate=0.0,
            drop_path_rate=0.0,
        )

        # Strip the backbone's own classifier and remember its input width.
        if 'efficientnet' in arch:
            backbone_out = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
        elif 'resnet' in arch:
            backbone_out = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
        else:
            backbone_out = self.backbone.get_classifier().in_features
            self.backbone.reset_classifier(0, '')

        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.feat_dim = backbone_out
        self.classifier = nn.Linear(backbone_out, num_classes)

    def forward(self, x):
        """Return the classifier logits for input batch ``x``."""
        features = self.backbone(x)

        # Some backbones return a dict; the tensor is then stored under 'features'.
        if isinstance(features, dict):
            features = features['features']

        # A 4-D output is still a spatial map: pool it and flatten per sample.
        if features.dim() == 4:
            pooled = self.pooling(features)
            features = pooled.view(pooled.size(0), -1)

        return self.classifier(features)
def audio2melspec(audio_data, cfg):
    """Convert a waveform to a min-max scaled dB mel spectrogram.

    NaN samples are first replaced by the mean of the non-NaN samples. The
    power mel spectrogram is converted to dB relative to its maximum and then
    rescaled to the [0, 1] range of its own minimum and maximum.

    Args:
        audio_data: 1-D waveform array.
        cfg: configuration providing FS, N_FFT, HOP_LENGTH, N_MELS, FMIN, FMAX.

    Returns:
        The normalised mel spectrogram array.
    """
    if np.isnan(audio_data).any():
        fill_value = np.nanmean(audio_data)
        audio_data = np.nan_to_num(audio_data, nan=fill_value)

    mel_kwargs = dict(
        sr=cfg.FS,
        n_fft=cfg.N_FFT,
        hop_length=cfg.HOP_LENGTH,
        n_mels=cfg.N_MELS,
        fmin=cfg.FMIN,
        fmax=cfg.FMAX,
        power=2.0,
        pad_mode="reflect",
        norm='slaney',
        htk=True,
        center=True,
    )
    mel_spec = librosa.feature.melspectrogram(y=audio_data, **mel_kwargs)

    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    db_min = mel_spec_db.min()
    db_max = mel_spec_db.max()
    # The small constant guards against division by zero for a flat spectrogram.
    return (mel_spec_db - db_min) / (db_max - db_min + 1e-8)

def process_audio_segment(audio_data, cfg):
    """Turn one audio segment into a mel spectrogram of shape ``cfg.TARGET_SHAPE``.

    Segments shorter than ``cfg.FS * cfg.WINDOW_SIZE`` samples are zero-padded
    at the end before conversion.

    Args:
        audio_data: 1-D waveform of the segment.
        cfg: configuration providing FS, WINDOW_SIZE, TARGET_SHAPE and the
            mel parameters used by ``audio2melspec``.

    Returns:
        The mel spectrogram, resized to ``cfg.TARGET_SHAPE`` when its shape differs.
    """
    required_len = cfg.FS * cfg.WINDOW_SIZE
    if len(audio_data) < required_len:
        audio_data = np.pad(audio_data,
                            (0, required_len - len(audio_data)),
                            mode='constant')

    mel_spec = audio2melspec(audio_data, cfg)

    # Resize if needed
    if mel_spec.shape != cfg.TARGET_SHAPE:
        # TARGET_SHAPE is compared against ndarray.shape, i.e. (rows, cols),
        # whereas cv2.resize expects dsize as (width, height) = (cols, rows).
        # The two orders only coincide for square targets, so swap explicitly.
        target_rows, target_cols = cfg.TARGET_SHAPE
        mel_spec = cv2.resize(mel_spec, (target_cols, target_rows),
                              interpolation=cv2.INTER_LINEAR)

    return mel_spec
    

def predict_on_spectrogram(audio_path, models, cfg, species_ids):
    """Predict species scores for each full 5-second segment of one audio file.

    Every segment is converted to a mel spectrogram and run through all
    ``models``; the sigmoid outputs are averaged across models. The stacked
    per-file predictions are then passed through
    ``apply_power_to_low_ranked_cols`` (top_k=30, exponent=2).

    Errors while loading or predicting are printed and processing of the file
    stops. Only segments that were fully predicted are returned, so
    ``row_ids`` and ``predictions`` always have the same length.

    Args:
        audio_path: path of the soundscape file.
        models: compiled models, each callable on the input array and
            returning a mapping with an ``'output_0'`` entry.
        cfg: configuration providing FS and WINDOW_SIZE.
        species_ids: class list; its length sets the width of the empty result.

    Returns:
        ``(row_ids, predictions)``: the ``"<stem>_<end second>"`` ids and a 2-D
        array with one row per id. ``predictions`` has shape
        ``(0, len(species_ids))`` when no segment could be predicted.
    """
    predictions = []
    row_ids = []
    soundscape_id = Path(audio_path).stem
    segment_len = cfg.FS * cfg.WINDOW_SIZE

    try:
        print(f"Processing {soundscape_id}")
        audio_data, _ = librosa.load(audio_path, sr=cfg.FS)

        # Only complete windows are scored; a trailing partial window is ignored.
        total_segments = len(audio_data) // segment_len

        for segment_idx in range(total_segments):
            start_sample = segment_idx * segment_len
            segment_audio = audio_data[start_sample:start_sample + segment_len]

            mel_spec = process_audio_segment(segment_audio, cfg)
            # Add batch and channel axes in front of the 2-D spectrogram.
            ov_input = mel_spec[np.newaxis, np.newaxis, :, :].astype(np.float32)

            segment_preds = [
                sigmoid(model(ov_input)['output_0']).squeeze()
                for model in models
            ]
            final_preds = np.mean(segment_preds, axis=0)

            # Record the id only after the prediction succeeded, so a failure
            # mid-file cannot leave an id without a matching row.
            end_time_sec = (segment_idx + 1) * cfg.WINDOW_SIZE
            predictions.append(final_preds)
            row_ids.append(f"{soundscape_id}_{end_time_sec}")

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")

    if not predictions:
        # np.vstack([]) raises; return an explicitly empty result instead.
        return row_ids, np.empty((0, len(species_ids)), dtype=np.float32)

    predictions = np.vstack(predictions)
    predictions = apply_power_to_low_ranked_cols(predictions, top_k=30, exponent=2)
    return row_ids, predictions


def apply_tta(spec, tta_idx):
    """Return the test-time-augmented view of ``spec`` selected by ``tta_idx``.

    * 1 - flipped along axis 1
    * 2 - flipped along axis 0
    * anything else (including 0) - the spectrogram itself, unchanged
    """
    if tta_idx == 1:
        return np.flip(spec, axis=1)
    if tta_idx == 2:
        return np.flip(spec, axis=0)
    return spec

def run_inference(cfg, models, species_ids):
    """Predict every ``*.ogg`` file under ``cfg.test_soundscapes``.

    In debug mode only the first ``cfg.debug_count`` files are processed.

    Returns:
        ``(all_row_ids, all_predictions)``: two parallel lists with one entry
        per predicted segment, accumulated over all files.
    """
    soundscape_dir = Path(cfg.test_soundscapes)
    test_files = list(soundscape_dir.glob('*.ogg'))

    if cfg.debug:
        print(f"Debug mode enabled, using only {cfg.debug_count} files")
        test_files = test_files[:cfg.debug_count]

    print(f"Found {len(test_files)} test soundscapes")

    all_row_ids, all_predictions = [], []
    for audio_path in tqdm(test_files):
        file_row_ids, file_predictions = predict_on_spectrogram(
            str(audio_path), models, cfg, species_ids)
        all_row_ids.extend(file_row_ids)
        # Extending with the 2-D array appends its rows one by one.
        all_predictions.extend(file_predictions)

    return all_row_ids, all_predictions

def create_submission(row_ids, predictions, species_ids, cfg):
    """Assemble the submission frame in the column order of the sample submission.

    Args:
        row_ids: one id per predicted segment.
        predictions: one score vector per segment, indexed like ``species_ids``.
        species_ids: class names matching the positions in each score vector.
        cfg: configuration providing ``submission_csv`` (sample submission path).

    Returns:
        DataFrame with ``row_id`` followed by the sample submission's columns;
        species absent from ``species_ids`` are filled with 0.0.
    """
    print("Creating submission dataframe...")

    column_data = {'row_id': row_ids}
    for position, species in enumerate(species_ids):
        column_data[species] = [scores[position] for scores in predictions]

    submission_df = pd.DataFrame(column_data).set_index('row_id')

    sample_sub = pd.read_csv(cfg.submission_csv, index_col='row_id')

    missing_cols = set(sample_sub.columns) - set(submission_df.columns)
    if missing_cols:
        print(f"Warning: Missing {len(missing_cols)} species columns in submission")
        for col in missing_cols:
            submission_df[col] = 0.0

    # Reorder to the official column order, then restore row_id as a column.
    return submission_df[sample_sub.columns].reset_index()
def _smooth_within_soundscape(sub, centre_w=0.8, side_w=0.1, edge_w=0.8, edge_side_w=0.2):
    """Blend each row's scores with its neighbours inside the same soundscape.

    Rows are grouped by ``row_id`` without its trailing ``_<seconds>`` part.
    Interior rows become ``side_w * prev + centre_w * row + side_w * next``;
    the first and last rows become ``edge_w * row + edge_side_w * neighbour``.
    Groups with fewer than two rows are left unchanged. ``sub`` is modified in
    place and returned.
    """
    cols = sub.columns[1:]
    groups = sub['row_id'].str.rsplit('_', n=1).str[0].values
    for group in np.unique(groups):
        group_mask = groups == group
        scores = sub.loc[group_mask, cols].values
        if scores.shape[0] < 2:
            # No neighbour to blend with; indexing scores[1] would fail.
            continue
        smoothed = scores.copy()
        smoothed[1:-1] = (scores[:-2] * side_w) + (scores[1:-1] * centre_w) + (scores[2:] * side_w)
        smoothed[0] = (scores[0] * edge_w) + (scores[1] * edge_side_w)
        smoothed[-1] = (scores[-1] * edge_w) + (scores[-2] * edge_side_w)
        # Assign through .loc on the original frame, not into a filtered copy.
        sub.loc[group_mask, cols] = smoothed
    return sub


def main():
    """Run the ConvNeXtV2 ensemble on the test soundscapes and write submission3.csv.

    Steps: load the OpenVINO models, predict every soundscape, build the
    submission frame, write it, then re-read it, apply temporal smoothing and
    overwrite the file. Returns early when no model could be loaded.
    """
    start_time = time.time()
    print("main:Starting BirdCLEF-2025 inference...")
    print(f"TTA enabled: {cfg.use_tta} (variations: {cfg.tta_count if cfg.use_tta else 0})")

    models = load_openvino_model(xml_paths=model_dicts['convnextv2_nano.fcmae_ft_in22k_in1k'])

    if not models:
        print("main:No models found! Please check model paths.")
        return

    usage = 'Single model' if len(models) == 1 else f'Ensemble of {len(models)} models'
    print(f"Model usage: {usage}")

    row_ids, predictions = run_inference(cfg, models, species_ids)

    submission_df = create_submission(row_ids, predictions, species_ids, cfg)

    submission_path = 'submission3.csv'
    submission_df.to_csv(submission_path, index=False)

    # Temporal smoothing, applied to the file as written to disk.
    sub = _smooth_within_soundscape(pd.read_csv(submission_path))
    sub.to_csv(submission_path, index=False)

    end_time = time.time()
    print(f"Inference completed in {(end_time - start_time)/60:.2f} minutes")

if __name__ == "__main__":
    main()

Using device: cpu
Loading taxonomy data...
Number of classes: 206
main:Starting BirdCLEF-2025 inference...
TTA enabled: False (variations: 0)
OpenVINO 模型 'convnextv2_nano.fcmae_ft_in22k_in1k' 已编译到设备: CPU
OpenVINO 模型 'convnextv2_nano.fcmae_ft_in22k_in1k' 已编译到设备: CPU
Model usage: Ensemble of 2 models
Found 0 test soundscapes


0it [00:00, ?it/s]

Creating submission dataframe...
Inference completed in 0.03 minutes


# 融合

In [25]:
# ------------------------------------------- #
# [IMPORTANT]
# * Blending Weight
# ------------------------------------------- #
# Weights of submission0..submission3 in the final blend.
sub_w = [0.2, 0.4, 0.3, 0.1]
list_TARGETs = sorted(os.listdir('/kaggle/input/birdclef-2025/train_audio/'))

# Per-submission column names: "<species> <submission index>".
list_targets_0, list_targets_1, list_targets_2, list_targets_3 = (
    [f'{TARGET} {k}' for TARGET in list_TARGETs] for k in range(4)
)

# Load each submission and tag its species columns with the submission index
# so that the frames can be merged side by side.
df0 = pd.read_csv("/kaggle/working/submission0.csv").rename(
    columns=dict(zip(list_TARGETs, list_targets_0)))
df1 = pd.read_csv("/kaggle/working/submission1.csv").rename(
    columns=dict(zip(list_TARGETs, list_targets_1)))
df2 = pd.read_csv("/kaggle/working/submission2.csv").rename(
    columns=dict(zip(list_TARGETs, list_targets_2)))
df3 = pd.read_csv("/kaggle/working/submission3.csv").rename(
    columns=dict(zip(list_TARGETs, list_targets_3)))

# Inner joins keep only the row_ids present in all four submissions.
dfs_merged1 = df0.merge(df1, on='row_id', how='inner')
dfs_merged2 = df3.merge(df2, on='row_id', how='inner')
dfs = dfs_merged1.merge(dfs_merged2, on='row_id', how='inner')

# Weighted sum per species, stored under the plain species name.
for TARGET, col0, col1, col2, col3 in zip(
        list_TARGETs, list_targets_0, list_targets_1, list_targets_2, list_targets_3):
    dfs[TARGET] = dfs[col0]*sub_w[0] + sub_w[1]*dfs[col1] + sub_w[2]*dfs[col2] + sub_w[3]*dfs[col3]

# Drop the tagged helper columns, leaving row_id plus the blended species columns.
dfs = dfs.drop(columns=[*list_targets_0, *list_targets_1, *list_targets_2, *list_targets_3])

dfs.to_csv("submission.csv", index=False)
print("finish")

finish
