In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

import pandas as pd
import numpy as np
import librosa
import seaborn as sns
import os
import json
import IPython.display as ipd
import soundfile as sf
import torch
import h5py
import onnxruntime as ort
import openvino as ov
import re

from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt
from itertools import chain
from os.path import join as pjoin
from torchaudio.transforms import AmplitudeToDB, MelSpectrogram
from copy import deepcopy
from pprint import pprint
from sklearn.model_selection import StratifiedGroupKFold

from code_base.utils.onnx_utils import ONNXEnsemble, convert_to_onnx
from code_base.models import WaveCNNAttenClasifier
from code_base.datasets import WaveDataset, WaveAllFileDataset
from code_base.utils.swa import avarage_weights, delete_prefix_from_chkp
from code_base.inefernce import BirdsInference
from code_base.utils import load_json, compose_submission_dataframe, groupby_np_array, stack_and_max_by_samples, write_json
from code_base.utils.metrics import score_numpy
%matplotlib inline


# Export Models

In [None]:
!ls -lt ../logdirs/ | head -20

In [None]:
# EXP_NAME = "eca_nfnet_l0_Exp_noamp_64bs_5sec_mixupP05_RandomFiltering_SqrtBalancing_Radamlr1e3_CosBatchLR1e6_Epoch50_BackGroundSoundScapeORESC50P05_SpecAugV1_FocalBCELoss_LSF1005_5Folds_ScoredPrevCompsAndXCsnipet28032025_FromV2Y2025Last_PseudoF2PT05MT01P04I1OOF"
# POSTFIX = ""
# conf_path = glob(f"../logdirs/{EXP_NAME}/code/*train_configs*.py")
# assert len(conf_path) == 1
# conf_path = conf_path[0]
# !cat {conf_path}

In [None]:
# Network class instantiated for every checkpoint listed in MODELS.
MODEL_CLASS = WaveCNNAttenClasifier
# Value used for both "train_period" and "infer_period" in each head_config below.
TRAIN_PERIOD = 5

In [None]:
def _make_model_entry(backbone, exp_name):
    """Build one MODELS entry.

    The experiments in this notebook share every setting except the backbone
    name and the experiment (log directory) name, so only those two vary.
    Each call returns fresh dict objects, so entries never share state.
    """
    return {
        "model_config": dict(
            backbone=backbone,
            mel_spec_paramms=dict(
                sample_rate=32000,
                n_mels=128,
                f_min=20,
                n_fft=2048,
                hop_length=512,
                normalized=True,
            ),
            head_config=dict(
                p=0.5,
                num_class=206,
                train_period=TRAIN_PERIOD,
                infer_period=TRAIN_PERIOD,
                output_type="clipwise_pred_long",
            ),
            exportable=True,
            fixed_amplitude_to_db=True,
        ),
        "exp_name": exp_name,
        "fold": [0, 1, 2, 3, 4],
        "chkp_name": "last.ckpt",
        # Applied to checkpoint file names to extract key=value pairs.
        "swa_checkpoint_regex": r'(?P<key>\w+)=(?P<value>[\d.]+)(?=\.ckpt|$)',
        # Sort key for the parsed checkpoints: highest valid_roc_auc first.
        "swa_sort_rule": lambda x: -float(x["valid_roc_auc"]),
        "delete_prefix": "model.",
        "n_swa_models": 1,
        "model_output_key": None,
    }


MODELS = [
    _make_model_entry(
        backbone="tf_efficientnetv2_s_in21k",
        exp_name="tf_efficientnetv2_s_in21k_Exp_noamp_64bs_5sec_BasicAug_EqualBalancing_AdamW1e4_CosBatchLR1e6_Epoch50_FocalBCELoss_LSF1005_FromPrebs1_PseudoF2PT05MT01P04I2_AddRareBirdsNoLeak",
    ),
    _make_model_entry(
        backbone="eca_nfnet_l0",
        exp_name="eca_nfnet_l0_Exp_noamp_64bs_5sec_BasicAug_SqrtBalancing_Radamlr1e3_CosBatchLR1e6_Epoch50_FocalBCELoss_LSF1005_FromXCV2Best_PseudoF2PT05MT01P04I3_MinorOverSampleV1",
    ),
]

# Settings read by the inference class and by the soundscape datasets below.
INFERENCE_CONFIG = dict(
    # Inference Class
    use_sigmoid=False,
    # Data
    test_data_root="../data/birdclef_2025/train_soundscapes/*.ogg",
    label_map_data_path="../data/bird2int_2025.json",
    scored_birds_path="../data/sb_2025.json",
    lookback=None,
    lookahead=None,
    segment_len=5,
    step=None,
    late_normalize=True,
    # Forwarded unchanged to BirdsInference(model_output_key=...).
    model_output_key=None,
)

In [None]:
def _load_single_state_dict(chkp_path, delete_prefix=None):
    """Read the ``"state_dict"`` entry of one checkpoint file onto the CPU.

    When ``delete_prefix`` is given, the loaded dict is passed through
    ``delete_prefix_from_chkp`` before being returned.
    """
    print("vanilla model")
    print("Loading", chkp_path)
    state_dict = torch.load(chkp_path, map_location="cpu")["state_dict"]
    if delete_prefix is not None:
        state_dict = delete_prefix_from_chkp(state_dict, delete_prefix)
    return state_dict


def _select_checkpoints(model_chkp_root, model_chkp_regex, swa_sort_rule, n_swa_to_take):
    """Pick the checkpoint files in ``model_chkp_root`` to load.

    Every file name matched by ``model_chkp_regex`` is parsed into a dict of
    the captured ``key -> value`` pairs plus a ``"name"`` entry; the dicts are
    sorted with ``swa_sort_rule`` and the first ``n_swa_to_take`` are returned.

    Raises:
        ValueError: if no regex was supplied.
        FileNotFoundError: if the selection ends up empty.
    """
    if model_chkp_regex is None:
        raise ValueError(
            "Either model_chkp_basename or model_chkp_regex must be provided"
        )

    checkpoints = []
    for el in os.listdir(model_chkp_root):
        matches = re.findall(model_chkp_regex, el)
        if not matches:
            # File name carries no parsable key=value pair (e.g. "last.ckpt").
            continue
        parsed_dict = dict(matches)
        parsed_dict["name"] = el
        checkpoints.append(parsed_dict)
    print("SWA checkpoints")
    pprint(checkpoints)
    checkpoints = sorted(checkpoints, key=swa_sort_rule)[:n_swa_to_take]
    print("SWA sorted checkpoints")
    pprint(checkpoints)

    if not checkpoints:
        # Fail with context instead of an IndexError on checkpoints[0].
        raise FileNotFoundError(
            f"No checkpoint selected in {model_chkp_root!r} "
            f"(regex={model_chkp_regex!r}, n_swa_to_take={n_swa_to_take})"
        )
    return checkpoints


def create_model_and_upload_chkp(
    model_class,
    model_config,
    model_device,
    model_chkp_root,
    model_chkp_basename=None,
    model_chkp_regex=None,
    delete_prefix=None,
    swa_sort_rule=None,
    n_swa_to_take=3,
    prune_checkpoint_func=None
):
    """Instantiate ``model_class`` and load checkpoint weights into it.

    Args:
        model_class: callable invoked as ``model_class(**model_config, device=model_device)``.
        model_config: keyword arguments for ``model_class``.
        model_device: value passed as the ``device`` keyword of ``model_class``.
        model_chkp_root: directory containing the checkpoint files.
        model_chkp_basename: file name of a single checkpoint to load. When
            ``None``, checkpoints are discovered with ``model_chkp_regex``.
        model_chkp_regex: pattern applied to every file name in
            ``model_chkp_root``; it must capture ``(key, value)`` pairs.
        delete_prefix: optional key prefix stripped from the state dict.
        swa_sort_rule: sort key applied to the parsed checkpoint dicts.
        n_swa_to_take: number of best-ranked checkpoints to keep. If more than
            one remains, their weights are combined with ``avarage_weights``.
        prune_checkpoint_func: optional callable applied to the final state
            dict before it is loaded.

    Returns:
        The model in eval mode. Weights are loaded with ``strict=False``; the
        missing and extra keys are printed rather than raised.

    Raises:
        ValueError: if neither ``model_chkp_basename`` nor ``model_chkp_regex`` is given.
        FileNotFoundError: if the regex selects no checkpoint.
    """
    if model_chkp_basename is None:
        checkpoints = _select_checkpoints(
            model_chkp_root, model_chkp_regex, swa_sort_rule, n_swa_to_take
        )
        if len(checkpoints) > 1:
            state_dicts = [
                torch.load(os.path.join(model_chkp_root, el["name"]), map_location="cpu")["state_dict"]
                for el in checkpoints
            ]
            t_chkp = avarage_weights(
                nn_weights=state_dicts,
                delete_prefix=delete_prefix
            )
        else:
            t_chkp = _load_single_state_dict(
                os.path.join(model_chkp_root, checkpoints[0]["name"]),
                delete_prefix,
            )
    else:
        t_chkp = _load_single_state_dict(
            os.path.join(model_chkp_root, model_chkp_basename),
            delete_prefix,
        )

    if prune_checkpoint_func is not None:
        t_chkp = prune_checkpoint_func(t_chkp)
    t_model = model_class(**model_config, device=model_device)

    # Report key mismatches explicitly because strict=False would hide them.
    model_keys = set(t_model.state_dict().keys())
    chkp_keys = set(t_chkp)
    print("Missing keys: ", model_keys - chkp_keys)
    print("Extra keys: ", chkp_keys - model_keys)
    t_model.load_state_dict(t_chkp, strict=False)
    t_model.eval()
    return t_model

In [None]:
# Build the flat list of networks: one per fold for experiments that define
# folds, otherwise a single network from the experiment's own checkpoint dir.
model = []
for model_config in MODELS:
    exp_logdir = f"../logdirs/{model_config['exp_name']}"
    if model_config["fold"] is None:
        chkp_roots = [f"{exp_logdir}/checkpoints"]
    else:
        chkp_roots = [f"{exp_logdir}/fold_{m_i}/checkpoints" for m_i in model_config["fold"]]

    model.extend([
        create_model_and_upload_chkp(
            model_class=MODEL_CLASS,
            model_config=model_config['model_config'],
            model_device="cuda",
            model_chkp_root=chkp_root,
            # The explicit checkpoint name is only used when no SWA regex is configured.
            model_chkp_basename=model_config["chkp_name"] if model_config["swa_checkpoint_regex"] is None else None,
            model_chkp_regex=model_config.get("swa_checkpoint_regex"),
            swa_sort_rule=model_config.get("swa_sort_rule"),
            n_swa_to_take=model_config.get("n_swa_models", 3),
            delete_prefix=model_config.get("delete_prefix"),
            prune_checkpoint_func=model_config.get("prune_checkpoint_func"),
        )
        for chkp_root in chkp_roots
    ])

In [None]:
len(model)

# Prepare Data

In [None]:
# Mapping loaded from the configured label-map JSON (presumably species name -> class index).
bird2id = load_json(INFERENCE_CONFIG["label_map_data_path"])

# Every file matching the configured glob pattern.
test_au_pathes = glob(INFERENCE_CONFIG["test_data_root"])

# NOTE(review): librosa 0.10 renamed get_duration's `filename` argument to
# `path` and deprecated the old name; switch once the minimum librosa version
# for this project is confirmed.
durations_s = [librosa.get_duration(filename=au_path) for au_path in tqdm(test_au_pathes)]

test_df = pd.DataFrame({"filename": test_au_pathes, "duration_s": durations_s})

In [None]:
# Fold assignment per soundscape from the previous pseudo-labelling iteration.
# A row_id is "<sample_id>_<suffix>"; dropping the last "_"-separated token
# recovers the sample id.
# NOTE(review): this reads from ../data/birdclef_2025_pseudo/ while later cells
# read the identically named file from ../data/pseudo/ — confirm both exist.
previou_itter_folds = (
    pd.read_csv(
        "../data/birdclef_2025_pseudo/eca_124__eca_117__eca_112__eca_118__eca_113__ebs_123__eca_120__eca_121/v0_0_filteredProb05_grouped_cv_split5.csv"
    )
    .assign(sample_id=lambda folds: folds["row_id"].apply(lambda x: "_".join(x.split("_")[:-1])))
    .loc[:, ["sample_id", "fold_id"]]
    .drop_duplicates()
)

# Attach fold_id to each file via a temporary sample_id key (file name without
# extension); files unseen in the previous iteration get NaN.
test_df = (
    test_df
    .assign(sample_id=test_df["filename"].apply(lambda x: os.path.splitext(os.path.basename(x))[0]))
    .merge(previou_itter_folds, on="sample_id", how="left")
    .drop(columns=["sample_id"])
)

In [None]:
test_df

In [None]:
# Keyword arguments shared by every WaveAllFileDataset built below; the
# windowing options are taken from INFERENCE_CONFIG.
ds_config_test = dict(
    root="",
    label_str2int_mapping_path=INFERENCE_CONFIG["label_map_data_path"],
    n_cores=8,
    use_audio_cache=True,
    test_mode=True,
    segment_len=INFERENCE_CONFIG["segment_len"],
    lookback=INFERENCE_CONFIG["lookback"],
    lookahead=INFERENCE_CONFIG["lookahead"],
    sample_id=None,
    late_normalize=INFERENCE_CONFIG["late_normalize"],
    step=INFERENCE_CONFIG["step"],
    validate_sr=32_000,
    verbose=False,
)

# Keyword arguments for every DataLoader: fixed order, no dropped batches.
loader_config = dict(
    batch_size=8,
    drop_last=False,
    shuffle=False,
    num_workers=0,
)

In [None]:
# Files without a fold assignment from the previous iteration.
ds_test = WaveAllFileDataset(
    df=test_df.loc[test_df["fold_id"].isna()].reset_index(drop=True),
    **ds_config_test,
)
loader_test = torch.utils.data.DataLoader(ds_test, **loader_config)

In [None]:
# One dataset / loader per previous-iteration fold: position i in each list
# holds the files whose fold_id equals i.
# The index is reset so each per-fold frame is 0-based, exactly like the frame
# given to ds_test; without it the rows keep their labels from test_df.
ds_test_oofs = [
    WaveAllFileDataset(
        df=test_df[test_df["fold_id"] == i].reset_index(drop=True),
        **ds_config_test,
    )
    for i in range(int(test_df["fold_id"].max()) + 1)
]
loader_test_oofs = [
    torch.utils.data.DataLoader(ds, **loader_config) for ds in ds_test_oofs
]

# Inference Class

In [None]:
# use_sigmoid and model_output_key are taken verbatim from INFERENCE_CONFIG.
inference_kwargs = {key: INFERENCE_CONFIG[key] for key in ("use_sigmoid", "model_output_key")}
inference_class = BirdsInference(device="cuda", verbose_tqdm=True, **inference_kwargs)

# Prediction

## Predict NOT OOF Part

In [None]:
# Run the whole list of networks over the files that have no fold assignment.
test_preds, test_dfidx, test_end = inference_class.predict_test_loader(
    data_loader=loader_test,
    nn_models=model,
)

# File names come from the dataset's own frame so they line up with test_dfidx.
not_oof_dataset = loader_test.dataset
test_pred_df = compose_submission_dataframe(
    probs=test_preds,
    dfidxs=test_dfidx,
    end_seconds=test_end,
    filenames=not_oof_dataset.df[not_oof_dataset.name_col].copy(),
    bird2id=bird2id,
)

In [None]:
# Histogram of the per-row maximum over classes, followed by global statistics.
fig, ax = plt.subplots()
ax.set_title("Most 'Probable' class probability distribution")
ax.hist(test_preds.max(axis=1), bins=30)
plt.show()

print(
    "Max Prob: ", test_preds.max(),
    "Min Prob: ", test_preds.min(),
    "Median Prob: ", np.median(test_preds)
)

In [None]:
test_pred_df["sample_id"] = test_pred_df["row_id"].apply(lambda x: "_".join(x.split("_")[:-1]))

## Predict OOF Part

In [None]:
# Predict each fold's files with a single network and collect one prediction
# frame per fold.
# NOTE(review): zip() stops at the shorter sequence. `model` holds one network
# per (experiment, fold) entry of MODELS, while `loader_test_oofs` has one
# loader per fold id, so with more than one experiment in MODELS only the first
# experiment's fold networks are used here — confirm this is intended.
all_folds_test_pred_df = []
for one_model, one_loader in zip(model, loader_test_oofs):
    fold_test_preds, fold_test_dfidx, fold_test_end = inference_class.predict_test_loader(
        data_loader=one_loader,
        nn_models=[one_model],
    )
    fold_dataset = one_loader.dataset
    all_folds_test_pred_df.append(
        compose_submission_dataframe(
            probs=fold_test_preds,
            dfidxs=fold_test_dfidx,
            end_seconds=fold_test_end,
            filenames=fold_dataset.df[fold_dataset.name_col].copy(),
            bird2id=bird2id,
        )
    )

In [None]:
all_folds_test_pred_df = pd.concat(all_folds_test_pred_df).reset_index(drop=True)

In [None]:
# Same diagnostics as for the non-fold part; column 0 is skipped (row_id).
oof_probs = all_folds_test_pred_df.iloc[:, 1:].values

fig, ax = plt.subplots()
ax.set_title("Most 'Probable' class probability distribution")
ax.hist(oof_probs.max(axis=1), bins=30)
plt.show()

print(
    "Max Prob: ", oof_probs.max(),
    "Min Prob: ", oof_probs.min(),
    "Median Prob: ", np.median(oof_probs)
)

## Merge

In [None]:
# Re-create the sample id (file name without extension) on the file table.
test_df["sample_id"] = test_df["filename"].apply(lambda x: os.path.splitext(os.path.basename(x))[0])

# Stack the non-fold and per-fold predictions into one table.
concat_test_pred_df = pd.concat([test_pred_df, all_folds_test_pred_df]).reset_index(drop=True)

# Every file must be covered by the predictions, and nothing else.
predicted_sample_ids = set(
    concat_test_pred_df["row_id"].apply(lambda x: "_".join(x.split("_")[:-1]))
)
assert predicted_sample_ids == set(test_df["sample_id"])

# The saved table must not carry a sample_id column (one may have come in
# through test_pred_df).
concat_test_pred_df = concat_test_pred_df.drop(columns=["sample_id"], errors="ignore")

In [None]:
# Diagnostics for the merged table; column 0 is skipped (row_id).
concat_probs = concat_test_pred_df.iloc[:, 1:].values

fig, ax = plt.subplots()
ax.set_title("Most 'Probable' class probability distribution")
ax.hist(concat_probs.max(axis=1), bins=30)
plt.show()

print(
    "Max Prob: ", concat_probs.max(),
    "Min Prob: ", concat_probs.min(),
    "Median Prob: ", np.median(concat_probs)
)

## Save

In [None]:
EXP_NAME + POSTFIX

In [None]:
# Write the merged (non-fold + per-fold) predictions to a new CSV; the assert
# refuses to overwrite an existing file.
# NOTE(review): in the visible notebook EXP_NAME and POSTFIX are only assigned
# inside a commented-out cell near the top, so they must be defined by hand
# before this cell (and the one above) can run on a fresh kernel.
# Earlier variant, which saved test_pred_df instead of the merged table:
# save_path = f"../data/pseudo/{EXP_NAME + POSTFIX}/v0_oof.csv"
# assert not os.path.exists(save_path)
# os.makedirs(os.path.dirname(save_path), exist_ok=True)
# test_pred_df.to_csv(save_path, index=False)

save_path = f"../data/pseudo/{EXP_NAME + POSTFIX}/v0_oof.csv"
assert not os.path.exists(save_path)
os.makedirs(os.path.dirname(save_path), exist_ok=True)
concat_test_pred_df.to_csv(save_path, index=False)

# Load Pseudo DF

In [None]:
test_pred_df = pd.read_csv(
    save_path
)

# Prepare Ready2Use DF (2025)

In [None]:
!ls -lt ../data/pseudo/ | head

In [None]:
# Load the two saved OOF prediction tables that are averaged further below.
# NOTE(review): the frame named *_ebs is also read from a directory whose name
# starts with "eca_nfnet_l0" (only the "From..." part of the path differs) —
# confirm this is the intended experiment.
test_pred_df_eca = pd.read_csv(
    "../data/pseudo/eca_nfnet_l0_Exp_noamp_64bs_5sec_mixupP05_RandomFiltering_SqrtBalancing_Radamlr1e3_CosBatchLR1e6_Epoch50_BackGroundSoundScapeORESC50P05_SpecAugV1_FocalBCELoss_LSF1005_5Folds_ScoredPrevCompsAndXCsnipet28032025_FromV2Y2025Last_PseudoF2PT05MT01P04I1OOF/v0_oof.csv"
)
test_pred_df_ebs = pd.read_csv(
    "../data/pseudo/eca_nfnet_l0_Exp_noamp_64bs_5sec_mixupP05_RandomFiltering_SqrtBalancing_Radamlr1e3_CosBatchLR1e6_Epoch50_BackGroundSoundScapeORESC50P05_SpecAugV1_FocalBCELoss_LSF1005_5Folds_ScoredPrevCompsAndXCsnipet28032025_FromPreca4_PseudoF2PT05MT01P04I1OOF/v0_oof.csv"
)

In [None]:
assert (test_pred_df_eca["row_id"] == test_pred_df_ebs["row_id"]).all()

In [None]:
test_pred_df_eca

In [None]:
test_pred_df = test_pred_df_eca.copy()

In [None]:
# Ensemble = element-wise mean of the two tables; column 0 (row_id) is kept
# from the first table. Row alignment is asserted in an earlier cell.
test_pred_df = test_pred_df_eca.copy()

eca_probs = test_pred_df_eca.iloc[:, 1:].values
ebs_probs = test_pred_df_ebs.iloc[:, 1:].values
test_pred_df.iloc[:, 1:] = (eca_probs + ebs_probs) / 2

In [None]:
CLASSES = test_pred_df.columns[1:].to_list()

In [None]:
# Select the probability columns by name rather than by position: once
# "primary_label" / "primary_label_prob" have been appended, a positional
# iloc[:, 1:] slice would include them and break a re-run of this cell.
class_probs = test_pred_df[CLASSES].values

# Most probable class per row and its probability.
primary_label = [CLASSES[argmax_idx] for argmax_idx in np.argmax(class_probs, axis=1)]
primary_label_prob = np.max(class_probs, axis=1)
test_pred_df["primary_label"] = primary_label
test_pred_df["primary_label_prob"] = primary_label_prob

In [None]:
test_pred_df.drop(columns=CLASSES)

In [None]:
# Persist the averaged predictions; the assert refuses to overwrite an existing file.
save_path = "../data/pseudo/ensem_26052025_fromVer1OOF/v0.csv"
assert not os.path.exists(save_path)
os.makedirs(os.path.dirname(save_path), exist_ok=True)
test_pred_df.to_csv(save_path, index=False)

# Split Pseudo DF (2025)

In [None]:
# Fold assignment per sample from the previous iteration. A row_id is
# "<sample_id>_<suffix>", so the last "_"-separated token is dropped.
previou_itter_folds = (
    pd.read_csv(
        "../data/pseudo/eca_124__eca_117__eca_112__eca_118__eca_113__ebs_123__eca_120__eca_121/v0_0_filteredProb05_grouped_cv_split5.csv"
    )
    .assign(sample_id=lambda folds: folds["row_id"].apply(lambda x: "_".join(x.split("_")[:-1])))
    .loc[:, ["sample_id", "fold_id"]]
    .drop_duplicates()
)

# Reload the saved ensemble table and derive the same sample_id key on it.
test_pred_df = pd.read_csv(
    "../data/pseudo/ensem_26052025_fromVer1OOF/v0.csv"
).assign(sample_id=lambda preds: preds["row_id"].apply(lambda x: "_".join(x.split("_")[:-1])))

In [None]:
# Keep only rows whose top-class probability is strictly above 0.5; the index
# is reset so positional indices from the CV split line up with the frame.
test_pred_df_selected = test_pred_df[test_pred_df["primary_label_prob"] > 0.5].reset_index(drop=True)

In [None]:
# test_pred_df_selected = test_pred_df_selected.merge(
#     previou_itter_folds, on="sample_id", how="left"
# ).rename(columns={"fold_id": "group_col"})
# test_pred_df_selected.loc[~test_pred_df_selected["group_col"].isna(), "group_col"] = test_pred_df_selected.loc[~test_pred_df_selected["group_col"].isna(), "group_col"].astype(str) 
# test_pred_df_selected.loc[test_pred_df_selected["group_col"].isna(), "group_col"] = test_pred_df_selected.loc[test_pred_df_selected["group_col"].isna(), "sample_id"] 

In [None]:
# test_pred_df_selected = test_pred_df_selected.merge(
#     previou_itter_folds, on="sample_id", how="left"
# )

In [None]:
# test_pred_df_selected_for_split = test_pred_df_selected[test_pred_df_selected["fold_id"].isna()]

In [None]:
# 5-fold split stratified on the primary label; grouping by sample_id keeps
# all rows of one sample in the same fold.
splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
cv_split = list(
    splitter.split(
        X=test_pred_df_selected,
        y=test_pred_df_selected["primary_label"],
        groups=test_pred_df_selected["sample_id"],
    )
)

In [None]:
# cv_split = list(StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42).split(
#     test_pred_df_selected_for_split,
#     test_pred_df_selected_for_split["primary_label"],
#     # test_pred_df_selected["group_col"]
#     test_pred_df_selected_for_split["sample_id"]
# ))

In [None]:
# Leak check: no sample may appear on both sides of any split.
for train_fold, val_fold in cv_split:
    train_sample_ids = set(test_pred_df_selected["sample_id"].iloc[train_fold])
    val_sample_ids = set(test_pred_df_selected["sample_id"].iloc[val_fold])
    assert train_sample_ids.isdisjoint(val_sample_ids)

In [None]:
np.save(
    "../data/pseudo/ensem_26052025_fromVer1OOF/grouped_filteredProb05_cv_split5.npy",
    np.array(cv_split, dtype=object)
)

In [None]:
# test_pred_df_selected.drop(columns=["sample_id", "group_col"]).to_csv(
#     "../data/pseudo/ensem_25052025_fromVer1OOF/v0_0_filteredProb05_respectPrev.csv",
#     index=False
# )

test_pred_df_selected.drop(columns=["sample_id"]).to_csv(
    "../data/pseudo/ensem_26052025_fromVer1OOF/v0_0_filteredProb05.csv",
    index=False
)

In [None]:
# Start from an all-missing fold column (appended last, hence position -1),
# then stamp every validation slice with its fold number.
test_pred_df_selected["fold_id"] = None
for fold_id, (train_fold, val_fold) in enumerate(cv_split):
    # A row must not already belong to another fold's validation slice.
    already_assigned = test_pred_df_selected.iloc[val_fold, -1].notna()
    assert not already_assigned.any()
    test_pred_df_selected.iloc[val_fold, -1] = fold_id
# Every row must have received a fold.
assert test_pred_df_selected["fold_id"].notna().all()

In [None]:
# test_pred_df_selected_for_split["fold_id"] = None
# for fold_id, (train_fold, val_fold) in enumerate(cv_split):
#     assert test_pred_df_selected_for_split.iloc[val_fold, -1].isna().all()
#     test_pred_df_selected_for_split.iloc[val_fold, -1] = fold_id
# assert not test_pred_df_selected_for_split["fold_id"].isna().any()

In [None]:
# test_pred_df_selected.loc[test_pred_df_selected["fold_id"].isna(), "fold_id"] = test_pred_df_selected_for_split["fold_id"]

In [None]:
# Number of rows still lacking a fold. The *_for_split frame this cell used to
# inspect exists only in commented-out cells, so the check now targets the
# frame that actually received fold_id above.
test_pred_df_selected["fold_id"].isna().sum()

In [None]:
test_pred_df_selected["fold_id"].value_counts()

In [None]:
# Rebuild [train_idx, val_idx] pairs from the fold_id column: fold f validates
# on rows with fold_id == f and trains on all other rows.
cv_split = [
    [
        np.where(test_pred_df_selected["fold_id"] != f_id)[0],
        np.where(test_pred_df_selected["fold_id"] == f_id)[0],
    ]
    for f_id in range(5)
]

# Leak check: no sample may appear on both sides of any split.
for train_fold, val_fold in cv_split:
    train_sample_ids = set(test_pred_df_selected["sample_id"].iloc[train_fold])
    val_sample_ids = set(test_pred_df_selected["sample_id"].iloc[val_fold])
    assert train_sample_ids.isdisjoint(val_sample_ids)

In [None]:
# Store the split rebuilt from fold_id as an object array of index arrays.
# NOTE(review): fold_id was assigned from the StratifiedGroupKFold split above
# and the merge with the previous iteration's folds is commented out, so this
# "respectPrev" file appears to hold the same partition as
# grouped_filteredProb05_cv_split5.npy — confirm.
np.save(
    "../data/pseudo/ensem_26052025_fromVer1OOF/grouped_filteredProb05_cv_split5_respectPrev.npy",
    np.array(cv_split, dtype=object)
)

In [None]:
# test_pred_df_selected.drop(columns=["sample_id"]).to_csv(
#     "../data/pseudo/ensem_26052025_fromVer1OOF/v0_0_filteredProb05_grouped_cv_split5_respectPrev.csv",
#     index=False
# )

test_pred_df_selected.drop(columns=["sample_id"]).to_csv(
    "../data/pseudo/ensem_26052025_fromVer1OOF/v0_0_filteredProb05_grouped_cv_split5.csv",
    index=False
)

In [None]:
# Load both saved variants of the current split plus the previous iteration's
# split, to compare their fold distributions in the cells below.
# NOTE(review): in the visible notebook the "..._respectPrev.csv" file is only
# written by commented-out code, so it must already exist on disk for this
# cell to run.
ver1_df = pd.read_csv(
    "../data/pseudo/ensem_26052025_fromVer1OOF/v0_0_filteredProb05_grouped_cv_split5_respectPrev.csv"
)
ver2_df = pd.read_csv(
    "../data/pseudo/ensem_26052025_fromVer1OOF/v0_0_filteredProb05_grouped_cv_split5.csv"
)
prev_ver = pd.read_csv(
    "../data/pseudo/eca_124__eca_117__eca_112__eca_118__eca_113__ebs_123__eca_120__eca_121/v0_0_filteredProb05_grouped_cv_split5.csv"
)

In [None]:
assert (ver1_df["row_id"] == ver2_df["row_id"]).all()

In [None]:
ver1_df["fold_id"].value_counts(dropna=False)

In [None]:
ver2_df["fold_id"].value_counts(dropna=False)

In [None]:
prev_ver["fold_id"].value_counts()