In [1]:
import glob
import os
import random
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import librosa
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torchdata
from torchaudio.transforms import AmplitudeToDB, MelSpectrogram
from tqdm.auto import tqdm
import glob
import concurrent.futures
import shutil
import albumentations as A
import torchaudio
warnings.filterwarnings("ignore")

In [7]:
# The sample submission supplies the output column layout. The first column is
# skipped when building the list of prediction targets (presumably it holds
# the row id -- confirm against the CSV).
sub = pd.read_csv("../input/birdclef-2024/sample_submission.csv")
target_columns_ = sub.columns.tolist()   # every column, first one included
target_columns = target_columns_[1:]     # prediction targets only


In [17]:
test_path = "/home/simon/Code/kaggle_competion_list/Birdclef/birdclef-2024/input/birdclef-2024/test_soundscapes/"
files = glob.glob(f'{test_path}*')

# Number of windows scored per soundscape: 48 normally, but only 2 when the
# test folder holds exactly one entry.
TOTAL_SECONDS_CHUNKS = 2 if len(files) == 1 else 48

# End offset of every window in seconds: 5, 10, ..., TOTAL_SECONDS_CHUNKS * 5.
seconds = list(range(5, 5 * (TOTAL_SECONDS_CHUNKS + 1), 5))

In [19]:
# Project root; later cells build `{ROOT}/input/...` and `{ROOT}/working/...`
# paths from it. The default is the original local checkout; set the
# BIRDCLEF_ROOT environment variable to run the notebook from another location
# without editing this cell. The default keeps its trailing slash so that paths
# built from it are unchanged.
ROOT = os.environ.get(
    "BIRDCLEF_ROOT",
    "/home/simon/Code/kaggle_competion_list/Birdclef/birdclef-2024/",
)

In [20]:
test_path = f"{ROOT}/input/birdclef-2024/test_soundscapes/"

files = glob.glob(f'{test_path}*')
# When the test folder holds exactly one entry, fabricate stand-in soundscapes
# by copying a single training recording under several soundscape ids and point
# `test_path` at the working directory instead.
if len(files) == 1:
    placeholder_src = f'{ROOT}/input/birdclef-2024/train_audio/redspu1/XC312771.ogg'
    # The original cell copied id 1446779 twice (same destination), so the
    # eleven unique ids below produce exactly the same set of files.
    placeholder_ids = (
        "1446779", "1442779", "1446379", "1146779", "1426779", "1441779",
        "1446179", "1446719", "1446771", "1446789", "1448779",
    )
    # Make sure the destination exists so the copies cannot fail on a fresh checkout.
    os.makedirs(f"{ROOT}/working", exist_ok=True)
    for soundscape_id in placeholder_ids:
        shutil.copy(placeholder_src, f'{ROOT}/working/soundscape_{soundscape_id}.ogg')
    test_path = f"{ROOT}/working/"

print(test_path)

/home/simon/Code/kaggle_competion_list/Birdclef/birdclef-2024//working/


In [21]:
# Keyword arguments unpacked into torchaudio's MelSpectrogram by TestDataset.
mel_spec_params = dict(
    sample_rate=32000,
    n_mels=128,
    f_min=20,
    f_max=16000,
    n_fft=2048,
    hop_length=512,
    normalized=True,
    center=True,
    pad_mode="constant",
    norm="slaney",
    onesided=True,
    mel_scale="slaney",
)
# Passed as `top_db` to AmplitudeToDB by TestDataset.
top_db = 80

In [22]:
def normalize_melspec(X, eps=1e-6):
    """Standardise every item of a batch, then min-max rescale it to [0, 1].

    Statistics are reduced over dims 1 and 2, so dim 0 indexes independent
    items. An item whose standardised value range is not larger than `eps`
    (e.g. a constant input) is returned as all zeros.

    Args:
        X: tensor reduced over dims 1 and 2 (a 3-D batch in this notebook).
        eps: added to the standard deviation before dividing, and used as the
            minimum value range for an item to be rescaled.

    Returns:
        A tensor with the same shape and dtype as the standardised input.
    """
    centered = X - X.mean((1, 2), keepdim=True)
    Xstd = centered / (X.std((1, 2), keepdim=True) + eps)

    # Per-item extrema: reduce the last dim, then the (new) last dim.
    lowest = Xstd.min(-1)[0].min(-1)[0]
    highest = Xstd.max(-1)[0].max(-1)[0]
    span = highest - lowest
    rescalable = span > eps * torch.ones_like(span)

    V = torch.zeros_like(Xstd)
    if not rescalable.sum():
        # Nothing has a usable range; every item stays zero.
        return V

    low_sel = lowest[rescalable, None, None]
    high_sel = highest[rescalable, None, None]
    # Clamp into [low, high] before scaling, exactly as the two-step min/max did.
    clipped = torch.max(torch.min(Xstd[rescalable], high_sel), low_sel)
    V[rescalable] = (clipped - low_sel) / (high_sel - low_sel)
    return V

In [23]:
# Inference-time image pipeline applied to each spectrogram image:
# resize to 256x256, then albumentations' Normalize with its default arguments.
transforms_val = A.Compose(
    [
        A.Resize(height=256, width=256),
        A.Normalize(),
    ]
)

In [24]:
class TestDataset(torchdata.Dataset):
    """Serve 5-second windows of one audio clip as normalised mel-spectrogram images.

    Each row of `df` describes one window: `row_id` is returned unchanged and
    `seconds` is the window's END offset in seconds (the window starts 5 s
    earlier). Every item is converted to a dB-scaled mel spectrogram, rescaled
    to 0..255, replicated to three channels and passed through `transforms_val`.

    Args:
        df: frame with `row_id` and `seconds` columns, one row per window.
        clip: waveform tensor indexed as (channel, sample), sampled at
            `mel_spec_params["sample_rate"]`. The 3-channel `expand` in
            `__getitem__` only works for a single audio channel.
    """

    # Length of every window in seconds.
    WINDOW_SECONDS = 5

    def __init__(self, 
                 df: pd.DataFrame, 
                 clip: torch.Tensor,
                ):
        
        self.df = df
        self.clip = clip
        # Slice with the same rate the mel transform is configured for rather
        # than repeating the literal 32000.
        self.sample_rate = mel_spec_params["sample_rate"]
        self.mel_transform = torchaudio.transforms.MelSpectrogram(**mel_spec_params)
        self.db_transform = torchaudio.transforms.AmplitudeToDB(stype='power', top_db=top_db)
        self.transform = transforms_val

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        """Return `{"row_id": ..., "wave": float32 array of shape (3, H, W)}` for window `idx`."""

        # Dataset indices are positional, so use iloc; `.loc` would break as
        # soon as `df` carries a non-default index.
        sample = self.df.iloc[idx]
        row_id = sample.row_id

        end_seconds = int(sample.seconds)
        start_seconds = end_seconds - self.WINDOW_SECONDS
        
        wave = self.clip[:, self.sample_rate * start_seconds : self.sample_rate * end_seconds]

        # A window reaching past the end of the clip comes back short (or
        # empty). Zero-pad it to the full window so the spectrogram keeps the
        # same time scale as every other window instead of being stretched by
        # the resize step.
        missing = self.sample_rate * self.WINDOW_SECONDS - wave.shape[-1]
        if missing > 0:
            wave = F.pad(wave, (0, missing))
        
        mel_spectrogram = normalize_melspec(self.db_transform(self.mel_transform(wave)))
        mel_spectrogram = mel_spectrogram * 255
        # (1, n_mels, frames) -> (n_mels, frames, 3): channel-last image for albumentations.
        mel_spectrogram = mel_spectrogram.expand(3, -1, -1).permute(1, 2, 0).numpy()
        
        res = self.transform(image=mel_spectrogram)
        spec = res['image'].astype(np.float32)
        # Back to channel-first for the model.
        spec = spec.transpose(2, 0, 1)
        
        return {
            "row_id": row_id,
            "wave": spec,
        }

In [25]:
def prediction_for_clip(audio_path):
    """Run the global `model` over every scored window of one audio file.

    The file is loaded, resampled to 32 kHz and cut into the windows listed in
    the global `seconds`. Each window gets the row id
    `<file stem>_<end second>`.

    Args:
        audio_path: path to the audio file (str or path-like).

    Returns:
        dict mapping each row id to `{target column: sigmoid probability}`,
        using the global `target_columns` for the label order.
    """
    # Accept pathlib.Path as well as str; the old string splitting needed a str.
    audio_path = os.fspath(audio_path)

    wav, org_sr = torchaudio.load(audio_path, normalize=True)
    clip = torchaudio.functional.resample(wav, orig_freq=org_sr, new_freq=32000)

    # File name without directory and extension.
    name_ = Path(audio_path).stem
    row_ids = [f"{name_}_{second}" for second in seconds]

    test_df = pd.DataFrame({
        "row_id": row_ids,
        "seconds": seconds,
    })

    dataset = TestDataset(
        df=test_df,
        clip=clip,
    )

    loader = torchdata.DataLoader(
        dataset,
        batch_size=4,
        # os.cpu_count() may return None, which DataLoader rejects.
        num_workers=os.cpu_count() or 0,
        drop_last=False,
        shuffle=False,
        pin_memory=True
    )

    prediction_dict = {}

    for inputs in loader:
        batch_row_ids = inputs["row_id"]

        with torch.no_grad():
            probabilities = model(inputs["wave"]).sigmoid().detach().numpy()

        # Row ids are unique per clip, so each one is written exactly once;
        # no per-row list of predictions is needed.
        for row_id, row_probs in zip(batch_row_ids, probabilities):
            prediction_dict[str(row_id)] = {
                column: row_probs[position]
                for position, column in enumerate(target_columns)
            }

    return prediction_dict