In [None]:
import pandas as pd
import numpy as np
import librosa
import seaborn as sns
import os
import json
import IPython.display as ipd
import soundfile as sf
import math
import torch
import h5py
import re
from scipy.io import wavfile

from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt
from itertools import chain
from os.path import join as pjoin
from shutil import copyfile
from copy import deepcopy
from itertools import chain
from sklearn.model_selection import train_test_split
from joblib import delayed

from code_base.utils import write_json, load_json
from code_base.utils.main_utils import ProgressParallel
from code_base.utils.audio_utils import get_audio_metadata

from code_base.utils import parallel_librosa_load
%matplotlib inline

In [None]:
def check_url_structure(input_df, with_xc=True):
    """Check that every URL's last path segment matches the filename stem.

    Args:
        input_df: DataFrame with string columns "url" and "filename".
        with_xc: When truthy, the first two characters of the filename stem
            are skipped before the comparison; when falsy, the whole stem
            is compared.

    Returns:
        The result of `.all()` over the row-wise equality, i.e. true only
        when every row matches.
    """
    # Number of leading stem characters that are ignored in the comparison.
    skipped_chars = 2 if with_xc else 0

    def _stem_without_prefix(path):
        stem = os.path.splitext(path.split("/")[-1])[0]
        return stem[skipped_chars:]

    url_tail = input_df["url"].apply(lambda url: url.split("/")[-1])
    file_tail = input_df["filename"].apply(_stem_without_prefix)
    return (url_tail == file_tail).all()

def get_recording_id(fname):
    """Parse the numeric recording id out of a recording filename.

    The basename (without extension) must start with one of the known
    collection prefixes "XC", "CSA" or "iNat", followed by digits.

    Args:
        fname: File name or path, e.g. "species/XC123.ogg".

    Returns:
        The integer that follows the prefix. The prefix itself is not part
        of the result, so "XC1" and "iNat1" both yield 1.

    Raises:
        RuntimeError: If the basename starts with none of the known prefixes.
        ValueError: If the text after the prefix is not a valid integer.
    """
    fname_pp = os.path.splitext(os.path.basename(fname))[0]
    for dataset_id in ["XC", "CSA", "iNat"]:
        if fname_pp.startswith(dataset_id):
            # Strip only the leading prefix. `str.replace` would also remove
            # later occurrences and silently accept stems such as "XC12XC3".
            return int(fname_pp[len(dataset_id):])
    raise RuntimeError(f"{fname} does not follow convention")

# 2025 Data

In [None]:
# Load the 2025 taxonomy, the sample submission and the extended 2025 training metadata.
# NOTE(review): the `eval` converters execute whatever expression is stored in the
# CSV cells; this is only acceptable for trusted files (ast.literal_eval would be safer).
eBird_Taxonomy_v2025 = pd.read_csv("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/taxonomy.csv")
sample_submission = pd.read_csv("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/sample_submission.csv")
train_metadata = pd.read_csv("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/train_extendedv1.csv", converters={"secondary_labels":eval, "all_labels": eval})

In [None]:
# Preview the filenames of the rows whose collection is "XC".
train_metadata.loc[train_metadata["collection"] == "XC", "filename"]

In [None]:
# Collect every sample-submission column name except the first one into a set.
scored_birds = set(sample_submission.columns[1:].tolist())

In [None]:
# Sanity check: the submission labels must be exactly the taxonomy's primary labels.
assert scored_birds == set(eBird_Taxonomy_v2025["primary_label"])

In [None]:
# For the "XC" rows, check that the URL tail equals the filename stem without its two-character prefix.
check_url_structure(train_metadata[train_metadata["collection"] == "XC"])

In [None]:
# Numeric recording id parsed from each filename (raises if a name has no known prefix).
train_metadata["id"] = train_metadata["filename"].apply(get_recording_id)

In [None]:
# Tag every row with its source dataset.
train_metadata["dataset"] = "comp_2025"

# 2024 Data

In [None]:
# Load the 2024 training metadata.
# NOTE(review): this rebinds `train_metadata`, so the 2025 frame prepared in the
# previous cells is no longer reachable under this name — confirm that is intended.
# NOTE(review): `eval` converters execute arbitrary expressions from the CSV; trusted input only.
train_metadata = pd.read_csv("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2024/train_metadata.csv", converters={"secondary_labels": eval, "all_labels": eval})

In [None]:
# Check that every 2024 URL tail matches its filename stem (two-character prefix skipped).
check_url_structure(train_metadata)

In [None]:
# Numeric recording id parsed from each filename, plus the source-dataset tag.
train_metadata["id"] = train_metadata["filename"].apply(get_recording_id)
train_metadata["dataset"] = "comp_2024"

# 2023 Data

In [None]:
# Load the 2023 training metadata.
# NOTE(review): `eval` converters execute arbitrary expressions from the CSV; trusted input only.
train_metadata_2023 = pd.read_csv("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2023/train_metadata.csv", converters={"secondary_labels": eval, "all_labels": eval})

In [None]:
# Check that every 2023 URL tail matches its filename stem (two-character prefix skipped).
check_url_structure(train_metadata_2023)

In [None]:
# Numeric recording id parsed from each filename, plus the source-dataset tag.
train_metadata_2023["id"] = train_metadata_2023["filename"].apply(get_recording_id)
train_metadata_2023["dataset"] = "comp_2023"

In [None]:
# Show the 2023 rows whose id is absent from the metadata gathered so far.
train_metadata_2023[~train_metadata_2023["id"].isin(train_metadata["id"])]

In [None]:
# Append only the 2023 recordings whose id has not been seen yet; rows are renumbered from 0.
train_metadata = pd.concat(
    [
        train_metadata,
        train_metadata_2023.loc[~train_metadata_2023["id"].isin(train_metadata["id"])],
    ],
    ignore_index=True,
)

In [None]:
# Row/column count after merging the 2023 data.
train_metadata.shape

# 2022 Data

In [None]:
# Load the 2022 training metadata.
# NOTE(review): `eval` converters execute arbitrary expressions from the CSV; trusted input only.
train_metadata_2022 = pd.read_csv("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2022/train_metadata.csv", converters={"secondary_labels": eval, "all_labels": eval})

In [None]:
# Check that every 2022 URL tail matches its filename stem (two-character prefix skipped).
check_url_structure(train_metadata_2022)

In [None]:
# Numeric recording id parsed from each filename, plus the source-dataset tag.
train_metadata_2022["id"] = train_metadata_2022["filename"].apply(get_recording_id)
train_metadata_2022["dataset"] = "comp_2022"

In [None]:
# Show the 2022 rows whose id is absent from the metadata gathered so far.
train_metadata_2022[~train_metadata_2022["id"].isin(train_metadata["id"])]

In [None]:
# Append only the 2022 recordings whose id has not been seen yet; rows are renumbered from 0.
train_metadata = pd.concat(
    [
        train_metadata,
        train_metadata_2022.loc[~train_metadata_2022["id"].isin(train_metadata["id"])],
    ],
    ignore_index=True,
)

In [None]:
# Row/column count after merging the 2022 data.
train_metadata.shape

# 2021 Data

In [None]:
# Load the 2021 training metadata.
# NOTE(review): `eval` converters execute arbitrary expressions from the CSV; trusted input only.
train_metadata_2021 = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2021/train_metadata.csv",
    converters={"secondary_labels": eval, "all_labels": eval},
)
# Rewrite the secondary label "rocpig1" to "rocpig"; all other labels are kept as they are.
train_metadata_2021["secondary_labels"] = train_metadata_2021["secondary_labels"].apply(
    lambda labels: [("rocpig" if label == "rocpig1" else label) for label in labels]
)

In [None]:
# Display the 2021 metadata frame.
train_metadata_2021

In [None]:
# Create filename: prefix the bare file name with its primary-label folder.
train_metadata_2021["filename"] = train_metadata_2021.apply(lambda x: pjoin(x["primary_label"], x["filename"]), axis=1)

In [None]:
# Check that every 2021 URL tail matches its filename stem (two-character prefix skipped).
check_url_structure(train_metadata_2021)

In [None]:
# Numeric recording id parsed from each filename, plus the source-dataset tag.
train_metadata_2021["id"] = train_metadata_2021["filename"].apply(get_recording_id)
train_metadata_2021["dataset"] = "comp_2021"

In [None]:
# Show the 2021 rows whose id is absent from the metadata gathered so far.
train_metadata_2021[~train_metadata_2021["id"].isin(train_metadata["id"])]

In [None]:
# Append only the 2021 recordings whose id has not been seen yet; rows are renumbered from 0.
train_metadata = pd.concat(
    [
        train_metadata,
        train_metadata_2021.loc[~train_metadata_2021["id"].isin(train_metadata["id"])],
    ],
    ignore_index=True,
)

In [None]:
# Row/column count after merging the 2021 data.
train_metadata.shape

# 2020 Data

In [None]:
# Load the 2020 competition metadata and the extended 2020 metadata.
# NOTE(review): `eval` converters execute arbitrary expressions from the CSV; trusted input only.
train_2020 = pd.read_csv("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2020/train.csv", converters={"secondary_labels": eval})

train_xc_a_m_2020 = pd.read_csv("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2020_add_data/train_extended.csv", converters={"secondary_labels": eval})

In [None]:
# The number of .mp3 files found under A-M and N-Z must equal the number of extended-metadata rows.
sum(
    len(glob(pattern))
    for pattern in (
        "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2020_add_data/A-M/*/*.mp3",
        "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2020_add_data/N-Z/*/*.mp3",
    )
) == train_xc_a_m_2020.shape[0]

In [None]:
# Check both 2020 frames. A cell only displays its last expression, so the two
# results are returned together as a tuple instead of dropping the first one.
check_url_structure(train_2020), check_url_structure(train_xc_a_m_2020)

In [None]:
# Tag each extended-2020 row with the sub-archive whose directory listing contains its ebird_code.
train_xc_a_m_2020["dataset"] = None
train_xc_a_m_2020.loc[
    train_xc_a_m_2020["ebird_code"].isin(os.listdir("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2020_add_data/A-M/")),
    "dataset"
] = "a_m_2020"
# Applied second, so a code listed under both A-M and N-Z ends up tagged "n_z_2020".
train_xc_a_m_2020.loc[
    train_xc_a_m_2020["ebird_code"].isin(os.listdir("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2020_add_data/N-Z/")),
    "dataset"
] = "n_z_2020"
# Number of rows that matched neither listing.
train_xc_a_m_2020["dataset"].isna().sum()

In [None]:
# Tag the competition rows, stack the extended rows underneath, and keep the
# first row per URL (competition rows come first, so they win on duplicates).
train_2020["dataset"] = "comp_2020"
train_2020 = (
    pd.concat([train_2020, train_xc_a_m_2020])
    .drop_duplicates("url")
    .reset_index(drop=True)
)

In [None]:
# Rename columns to the convention used by the other years.
train_2020 = train_2020.rename(columns={"sci_name": "scientific_name", "species": "common_name"})

# Lookups between ebird_code and the original `primary_label` values
# (first occurrence per code). Must be built before `primary_label` is overwritten below.
ebird2name = (
    train_2020.drop_duplicates("ebird_code")
    .set_index("ebird_code")["primary_label"]
    .to_dict()
)
name2ebird = {name: code for code, name in ebird2name.items()}

# Translate secondary labels to ebird codes; labels without a mapping are dropped.
train_2020["secondary_labels"] = train_2020["secondary_labels"].apply(
    lambda labels: [name2ebird[label] for label in labels if label in name2ebird]
)

# From here on `primary_label` holds the ebird code.
train_2020["primary_label"] = train_2020["ebird_code"]

# Prefix the filename with its label folder and switch ".mp3" to ".ogg".
train_2020["filename"] = train_2020.apply(
    lambda row: pjoin(row["primary_label"], row["filename"]).replace(".mp3", ".ogg"),
    axis=1,
)

# Numeric recording id parsed from the filename.
train_2020["id"] = train_2020["filename"].apply(get_recording_id)

In [None]:
# Show the 2020 rows whose id is absent from the metadata gathered so far.
train_2020[~train_2020["id"].isin(train_metadata["id"])]

In [None]:
# Append only the 2020 recordings whose id has not been seen yet; rows are renumbered from 0.
train_metadata = pd.concat(
    [
        train_metadata,
        train_2020.loc[~train_2020["id"].isin(train_metadata["id"])],
    ],
    ignore_index=True,
)

# Prune Columns and Compute Stats

In [None]:
# Load the existing main training table; it is used below to compare columns and exclude known ids.
# NOTE(review): `eval` converters execute arbitrary expressions from the CSV; trusted input only.
main_train_df = pd.read_csv("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/train_and_prev_comps_extendedv1_pruneSL_XConly2025_snipet28032025_hdf5.csv", converters={"secondary_labels":eval, "all_labels": eval})

In [None]:
# Columns present in the merged metadata but missing from the main training table.
print(f"New columns: {set(train_metadata.columns) - set(main_train_df.columns)}")

In [None]:
# Drop the listed columns from the merged metadata.
train_metadata = train_metadata.drop(columns=[
    'time', 'channels', 'recordist', 'ebird_code', 
    'background', 'elevation', 'title', 'volume', 
    'xc_id', 'number_of_notes', 'description', 
    'country', 'length', 'speed', 'location', 
    'bitrate_of_mp3', 'pitch', 'sampling_rate', 
    'duration', 'bird_seen', 'file_type', 'playback_used', 'date'
])

In [None]:
# Audio root directory for every value the "dataset" column can take.
DATASET2ROOT = {
    "comp_2021": "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2021/train_short_audio/",
    "comp_2022": "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2022/train_audio/",
    "comp_2024": "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2024/train_audio/",
    "comp_2023": "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2023/train_audio/",
    "a_m_2020": "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2020_add_data/A-M/",
    "n_z_2020": "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2020_add_data/N-Z/",
    "comp_2020": "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2020/train_audio/",
}


def get_add_data_fname(row):
    """Resolve the on-disk audio path for one metadata row.

    Args:
        row: Mapping with a "dataset" key (one of the DATASET2ROOT keys) and
            a "filename" key (path relative to that dataset's root).

    Returns:
        The joined path if it exists on disk; otherwise the same path with
        every ".ogg" replaced by ".mp3" (returned without an existence check).

    Raises:
        KeyError: If row["dataset"] is not a key of DATASET2ROOT.
    """
    candidate = os.path.join(DATASET2ROOT[row["dataset"]], row["filename"])
    return candidate if os.path.exists(candidate) else candidate.replace(".ogg", ".mp3")

In [None]:
# Keep only recordings whose id is not already present in the main training table.
train_metadata = train_metadata[~train_metadata["id"].isin(main_train_df["id"])].reset_index(drop=True)

In [None]:
# Copy every selected recording into the shared pretrain folder, keeping only
# its last two path components (parent folder + file name) as the relative path.
for filename in tqdm(train_metadata.apply(get_add_data_fname, axis=1).to_list()):
    shorten_filename = "/".join(filename.split("/")[-2:])
    destination_filename = os.path.join(
        "/gpfs/helios/home/volodymyr1/src/bird_clef_2025/data/pretrain_audio_from_prev_comps",
        shorten_filename
    )
    # exist_ok=True replaces the separate exists() check, which could race with
    # another process creating the same directory in between.
    os.makedirs(os.path.dirname(destination_filename), exist_ok=True)
    copyfile(filename, destination_filename)

In [None]:
# The 2020 filenames were rewritten to ".ogg" earlier, but the 2020 audio files
# are ".mp3"; restore the real extension. "n_z_2020" is included as well: the
# N-Z archive is globbed as *.mp3 just like A-M, so leaving it out would keep
# filenames that do not exist on disk.
train_metadata.loc[
    train_metadata["dataset"].isin(["a_m_2020", "n_z_2020", "comp_2020"]), "filename"
] = train_metadata.loc[
    train_metadata["dataset"].isin(["a_m_2020", "n_z_2020", "comp_2020"]), "filename"
].apply(lambda x: x.replace(".ogg", ".mp3"))

In [None]:
# Default every row to the "train_audio" root ...
train_metadata["data_root_id"] = "train_audio"

# ... then point every row that is not from comp_2025 to the copied pretrain folder.
train_metadata.loc[train_metadata["dataset"] != "comp_2025", "data_root_id"] = "pretrain_audio_from_prev_comps"

# Rows per data root.
train_metadata["data_root_id"].value_counts()

In [None]:
def get_audio_metadata_with_diff_dtypes(input):
    """Read audio metadata, retrying with an ".mp3" path when the first read fails.

    Args:
        input: Path of the audio file.

    Returns:
        The result of `get_audio_metadata(input)` when its "sample_rate" entry
        is not None; otherwise the result of `get_audio_metadata` for the same
        path with every ".ogg" replaced by ".mp3".
    """
    first_try = get_audio_metadata(input)
    if first_try["sample_rate"] is not None:
        return first_try
    # A missing sample rate is treated as a failed read; retry with the .mp3 variant.
    return get_audio_metadata(input.replace(".ogg", ".mp3"))

# One metadata record per row of train_metadata, in row order, collected into a frame.
add_audio_meta = pd.DataFrame(
    [
        get_audio_metadata_with_diff_dtypes(
            os.path.join(
                "/gpfs/helios/home/volodymyr1/src/bird_clef_2025/data/pretrain_audio_from_prev_comps",
                relative_path,
            )
        )
        for relative_path in train_metadata["filename"]
    ]
)

In [None]:
# Attach the audio metadata columns side by side with the existing metadata.
train_metadata = pd.concat([train_metadata, add_audio_meta], axis=1)

# Rows without a sample rate are counted as failed reads ...
print("Failed samples:", train_metadata["sample_rate"].isna().sum())

# ... and removed.
train_metadata = train_metadata[~train_metadata["sample_rate"].isna()].reset_index(drop=True)

In [None]:
# Mark every remaining row as belonging to the "XC" collection.
train_metadata["collection"] = "XC"

In [None]:
# Number of recordings per primary label.
train_metadata["primary_label"].value_counts()

In [None]:
# Row/column count before filtering on available feature files.
train_metadata.shape

In [None]:
# Keep only rows whose .hdf5 feature file exists. The path is relative, so the
# result depends on the notebook's working directory.
train_metadata = train_metadata[
    train_metadata["filename"].apply(
        lambda relative_path: os.path.exists(
            os.path.join(
                "../data/pretrain_features_from_prev_comps/",
                os.path.splitext(relative_path)[0] + ".hdf5",
            )
        )
    )
].reset_index(drop=True)

In [None]:
# Persist the pretraining metadata (without the index column).
train_metadata.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/pretraintrain_prev_comps_extendedv1.csv", index=False
)

## Shortened pretraining set without under-represented classes

In [None]:
# Reload the pretraining metadata saved above.
# NOTE(review): `eval` converters execute arbitrary expressions from the CSV; trusted input only.
train_metadata = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/pretraintrain_prev_comps_extendedv1.csv", 
    converters={"secondary_labels":eval, "all_labels": eval}
)

In [None]:
# all_labels = primary label followed by the secondary labels, per row.
train_metadata["all_labels"] = train_metadata.apply(lambda row: [row["primary_label"]] + row["secondary_labels"], axis=1)

In [None]:
# Count how often each label occurs across all rows (primary and secondary together).
pl_vc = pd.Series(list(chain.from_iterable(train_metadata["all_labels"]))).value_counts()
# Keep only labels that occur more than 10 times.
selected_species = set(pl_vc[pl_vc > 10].index)

print(f"Selecting {len(selected_species)} out of {pl_vc.shape[0]}")

In [None]:
# Keep rows whose primary label is one of the selected species.
train_metadata_shorten = train_metadata[train_metadata["primary_label"].isin(selected_species)].reset_index(drop=True)

print(f"Selecting rows {train_metadata_shorten.shape[0]} out of {train_metadata.shape[0]}")

In [None]:
# Remove labels that are not among the selected species from both label-list columns.
train_metadata_shorten["secondary_labels"] = train_metadata_shorten["secondary_labels"].apply(lambda x: [el for el in x if el in selected_species]) 
train_metadata_shorten["all_labels"] = train_metadata_shorten["all_labels"].apply(lambda x: [el for el in x if el in selected_species]) 

In [None]:
# all_labels must cover exactly the union of primary and secondary labels ...
assert set(chain(*train_metadata_shorten["all_labels"])) == (set(chain(*train_metadata_shorten["secondary_labels"])) | set(train_metadata_shorten["primary_label"]))
# ... and no secondary label may fall outside the selected species.
assert set(list(chain(*train_metadata_shorten["secondary_labels"]))) <= selected_species

In [None]:
# Persist the shortened pretraining metadata (without the index column).
train_metadata_shorten.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/pretraintrain_prev_comps_nosmall10sp_extendedv1.csv", index=False
)

# Prune Additional labels

In [None]:
# Load the combined "train and previous competitions" table; this rebinds `train_metadata`.
# NOTE(review): the `eval` converter executes arbitrary expressions from the CSV; trusted input only.
train_metadata = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/train_and_prev_comps_extendedv1.csv",
    converters={"secondary_labels":eval}
)

In [None]:
# Load the label collection that secondary labels are checked against below.
sb_2025 = load_json(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/sb_2025.json"
)

In [None]:
# Non-empty secondary labels that are not part of sb_2025 (before pruning).
set(filter(None, chain.from_iterable(train_metadata["secondary_labels"]))) - set(sb_2025)

In [None]:
# Keep only secondary labels that are part of sb_2025. The allowed labels are
# turned into a frozenset once (bound as a default argument), so each membership
# test is a hash lookup instead of a scan over sb_2025.
train_metadata["secondary_labels"] = train_metadata["secondary_labels"].apply(
    lambda x, allowed=frozenset(sb_2025): [el for el in x if el in allowed]
)

In [None]:
# Same check after pruning: non-empty secondary labels that are not part of sb_2025.
set(filter(None, chain.from_iterable(train_metadata["secondary_labels"]))) - set(sb_2025)

In [None]:
# Persist the table with pruned secondary labels (without the index column).
train_metadata.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/train_and_prev_comps_extendedv1_pruneSL.csv", index=False
)