In [38]:
import json
import os
import IPython.display as ipd

from glob import glob
from os.path import splitext, join as pjoin
from itertools import chain
from shutil import copyfile
from tqdm import tqdm
from copy import deepcopy
import xenocanto
import pandas as pd
import numpy as np
import librosa
import soundfile as SF

from code_base.utils.audio_utils import get_audio_metadata
from code_base.utils import load_json, write_json

# Register tqdm with pandas so that `progress_apply` (used further down) is available.
tqdm.pandas()

# Load Taxonomies

In [2]:
# Competition taxonomy. Both name columns are lowercased because the matching
# against xeno-canto names further down is done on lowercased strings.
eBird_Taxonomy_v2021 = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/taxonomy.csv"
)
for name_column in ("scientific_name", "common_name"):
    eBird_Taxonomy_v2021[name_column] = eBird_Taxonomy_v2021[name_column].str.lower()

undersampled_df = pd.read_csv("undersampled_classes_birdclef_2025.csv")

In [3]:
# Prepare load JSON for 2025

# write_json(
#     "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_2025/download_species.json",
#     eBird_Taxonomy_v2021["scientific_name"].str.lower().to_list(),
# )

In [4]:
# Extend the taxonomy with the species list stored alongside the previous year's data.
# It was checked that there are no conflicts between the two sources.

taxonomy_2024 = (
    pd.read_csv(
        "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2024/eBird_Taxonomy_v2021.csv"
    )
    .rename(columns={
        "SPECIES_CODE": "primary_label",
        "PRIMARY_COM_NAME": "common_name",
        "SCI_NAME": "scientific_name"
    })
    [["primary_label", "common_name", "scientific_name"]]
    .assign(
        scientific_name=lambda frame: frame["scientific_name"].str.lower(),
        common_name=lambda frame: frame["common_name"].str.lower(),
    )
)

# drop_duplicates keeps the first occurrence, so for a primary_label present in both
# sources the row from the already-loaded taxonomy wins.
eBird_Taxonomy_v2021 = pd.concat(
    [eBird_Taxonomy_v2021, taxonomy_2024]
).drop_duplicates("primary_label")

In [5]:
# (rows, columns) of the merged taxonomy.
eBird_Taxonomy_v2021.shape

(16813, 5)

In [None]:
# Prepare load JSON for all years and more

# write_json(
#     "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_all_years_and_more/download_species.json",
#     eBird_Taxonomy_v2021["scientific_name"].str.lower().to_list(),
# )

# Check Loading Process

In [None]:
# Every .mp3 found anywhere below the audio directory (recursive glob).
downloaded_files = glob("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_all_years_and_more/dataset/audio/**/*.mp3", recursive=True)

In [None]:
# True only when no two downloaded files share the same basename
len({os.path.basename(el) for el in downloaded_files}) == len(downloaded_files)

In [None]:
# How many times each file basename occurs across all folders.
id_vc = pd.Series([os.path.basename(el) for el in downloaded_files]).value_counts()

In [None]:
# Basenames that occur more than once.
duplicated_ids = set(id_vc[id_vc > 1].index)

In [None]:
# Display the duplicated basenames.
duplicated_ids

In [None]:
# Total number of downloaded files.
len(downloaded_files)

In [None]:
# Full paths of every file whose basename is duplicated.
[el for el in downloaded_files if os.path.basename(el) in duplicated_ids]

In [None]:
# Spot check: is the file 877358.mp3 among the downloaded basenames?
"877358.mp3" in set([os.path.basename(el) for el in downloaded_files])

In [None]:
# Check loaded files distribution:
# number of files per parent folder (second-to-last path component).

pd.Series([el.split("/")[-2] for el in downloaded_files]).value_counts()

# Process metadata

In [6]:
# Existing training metadata. Its ids are used below to skip recordings that are already
# present, and its column list defines the layout of the new xeno-canto frame.
train_metadata = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/pretraintrain_prev_comps_extendedv1.csv"
)

  train_metadata = pd.read_csv(


In [None]:
# Compose initial XC DF

# NOTE: glob is called without recursive=True, so `**` behaves like `*` and matches
# exactly one directory level (metadata/<folder>/<file>.json).
all_meta_files = glob("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_all_years_and_more/dataset/metadata/**/*.json")

# Keys kept from every recording record. The cells below use them as follows:
#   en -> common_name, lat/lng -> latitude/longitude, rec -> author, lic -> license,
#   gen + " " + sp -> scientific_name, also -> secondary_labels.
NEEDED_COLUMNS = [
    "id",
    "type",
    "lat",
    "lng",
    "en",
    "gen",
    "sp",
    "rec",
    "time",
    "url",
    "also",
    "file-name",
    "lic"
]
print(f"Jsons found = {len(all_meta_files)}")

def process_dict(
    input,
    folder_name,
    needed_cols=NEEDED_COLUMNS
):
    """Reduce one recording record to the requested keys and tag it with its folder.

    Args:
        input: Mapping describing a single recording (one element of the
            ``recordings`` list of a metadata JSON).
        folder_name: Name of the metadata folder the record was read from; stored
            under the ``"foldername"`` key of the result.
        needed_cols: Keys to keep from ``input``. Defaults to ``NEEDED_COLUMNS``.

    Returns:
        A new dict holding the kept keys plus ``"foldername"``. The dict passed in
        is not modified.
    """
    # Filter on the `needed_cols` argument. The previous body always filtered on the
    # global NEEDED_COLUMNS, so a caller-supplied column list was silently ignored.
    record = {key: value for key, value in input.items() if key in needed_cols}
    record["foldername"] = folder_name
    return record

# Gather the per-recording dicts from every metadata JSON and build the DataFrame
# once at the end. Concatenating inside the loop copies the whole accumulated frame
# on every iteration, which makes the cell quadratic in the number of recordings.
xc_records = []

for meta_file_path in tqdm(all_meta_files):
    # The context manager closes each file handle as soon as it has been parsed.
    with open(meta_file_path) as meta_fp:
        meta_file = json.load(meta_fp)['recordings']
    # The parent folder name is stored with every record of this file.
    foldname = meta_file_path.split("/")[-2]
    xc_records.extend(process_dict(bird_info, foldname) for bird_info in meta_file)

add_xeno_canto_df = pd.DataFrame(xc_records)

Jsons found = 15980


 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 13998/15980 [1:10:08<17:31,  1.89it/s]

In [17]:
# Attach the on-disk path of every recording as the `filename` column

add_xeno_canto_df["id"] = add_xeno_canto_df["id"].astype(int)

# Metadata side: drop every row whose id occurs more than once (no copy is kept)
add_xeno_canto_df_id_vc = add_xeno_canto_df["id"].value_counts()
repeated_meta_ids = add_xeno_canto_df_id_vc[add_xeno_canto_df_id_vc > 1].index
add_xeno_canto_df = add_xeno_canto_df[
    ~add_xeno_canto_df["id"].isin(repeated_meta_ids)
].reset_index(drop=True)

downloaded_files = glob(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_all_years_and_more/dataset/audio/**/*.mp3",
    recursive=True,
)

# File side: a basename found in more than one folder is ambiguous, drop all its copies
downloaded_basenames = [os.path.basename(path) for path in downloaded_files]
id_vc = pd.Series(downloaded_basenames).value_counts()
duplicated_ids = set(id_vc[id_vc > 1].index)

print(f"Downloaded files before filtering: {len(downloaded_files)}")

downloaded_files = [
    path
    for path, basename in zip(downloaded_files, downloaded_basenames)
    if basename not in duplicated_ids
]

print(f"Downloaded files after filtering: {len(downloaded_files)}")

# The numeric file stem is the recording id
xcid2filename = {}
for path in downloaded_files:
    file_stem, _extension = os.path.splitext(os.path.basename(path))
    xcid2filename[int(file_stem)] = path

# Ids without a downloaded file end up as NaN
add_xeno_canto_df["filename"] = add_xeno_canto_df["id"].map(xcid2filename)

print("Not Loaded Paritition:", add_xeno_canto_df["filename"].isna().sum() / add_xeno_canto_df.shape[0])

Downloaded files before filtering: 873215
Downloaded files after filtering: 873201
Not Loaded Paritition: 0.007171929153285404


In [18]:
# Rename the xeno-canto keys to descriptive column names
# (presumably the names used in train_metadata -- confirm against its columns)
# and create the `scientific_name` column.
add_xeno_canto_df = add_xeno_canto_df.rename(columns={
    "en":"common_name",
    "lat": "latitude",
    "lng": "longitude",
    "rec": "author",
    "lic": "license"
})

# scientific name = "<gen> <sp>", joined with a single space
add_xeno_canto_df["scientific_name"] = add_xeno_canto_df["gen"] + " " + add_xeno_canto_df["sp"]

In [19]:
DROP_PRIMARY_CN_SPECIES = ["yetgre1"]

# 1. Find the rows whose lowercased common / scientific name occurs in the taxonomy
# 2. Verify that both names agree on the primary label wherever both matched
# 3. Resolve primary_label: common-name match first, scientific-name match as fallback

common_name_intersection = set(eBird_Taxonomy_v2021["common_name"]) & set(add_xeno_canto_df["common_name"].str.lower())
scientific_name_intersection = set(eBird_Taxonomy_v2021["scientific_name"]) & set(add_xeno_canto_df["scientific_name"].str.lower())

add_xeno_canto_df["primary_label_cn"] = None
add_xeno_canto_df["primary_label_sn"] = None

# (source name column, names known to the taxonomy, column receiving the label)
label_lookups = (
    ("common_name", common_name_intersection, "primary_label_cn"),
    ("scientific_name", scientific_name_intersection, "primary_label_sn"),
)
for name_column, known_names, label_column in label_lookups:
    name_to_label = eBird_Taxonomy_v2021.set_index(name_column)["primary_label"].to_dict()
    lowered_names = add_xeno_canto_df[name_column].str.lower()
    is_known = lowered_names.isin(known_names)
    add_xeno_canto_df.loc[is_known, label_column] = lowered_names[is_known].map(name_to_label)

# Drop faulty species
is_faulty = add_xeno_canto_df["primary_label_cn"].isin(DROP_PRIMARY_CN_SPECIES)
add_xeno_canto_df = add_xeno_canto_df[~is_faulty].reset_index(drop=True)

# Where both lookups produced a label, the two labels must be identical
both_matched = (~add_xeno_canto_df["primary_label_cn"].isna()) & (~add_xeno_canto_df["primary_label_sn"].isna())
assert (
    add_xeno_canto_df.loc[both_matched, "primary_label_cn"]
    != add_xeno_canto_df.loc[both_matched, "primary_label_sn"]
).sum() == 0

# Common-name label first; rows still unlabeled take the scientific-name label
add_xeno_canto_df["primary_label"] = add_xeno_canto_df["primary_label_cn"]
still_unlabeled = add_xeno_canto_df["primary_label"].isna()
add_xeno_canto_df.loc[still_unlabeled, "primary_label"] = add_xeno_canto_df.loc[still_unlabeled, "primary_label_sn"]

In [20]:
# Share of rows for which a primary_label could be resolved.
print("Appropriate Labels Paritition:", (~add_xeno_canto_df["primary_label"].isna()).sum() / add_xeno_canto_df.shape[0])

Appropriate Labels Paritition: 0.9686217094326196


In [21]:
# add_xeno_canto_df.loc[
#     (add_xeno_canto_df["primary_label_cn"].isna()) & (add_xeno_canto_df["primary_label_sn"].isna()),
#     "common_name"
# ].drop_duplicates().to_list()

In [22]:
# add_xeno_canto_df.loc[
#     (add_xeno_canto_df["primary_label_cn"].isna()) & (add_xeno_canto_df["primary_label_sn"].isna()),
#     "scientific_name"
# ].drop_duplicates().to_list()

In [23]:
# Apply Filters

xc_new_ids = set(add_xeno_canto_df["id"].astype(int)) - set(train_metadata["id"].astype(int))

# A row survives only when all three conditions hold:
#   - its id is not already present in train_metadata
#   - a primary_label was matched
#   - the audio file was found on disk
is_new_recording = add_xeno_canto_df["id"].astype(int).isin(xc_new_ids)
has_primary_label = ~add_xeno_canto_df["primary_label"].isna()
has_audio_file = ~add_xeno_canto_df["filename"].isna()

add_xeno_canto_df = add_xeno_canto_df[
    is_new_recording & has_primary_label & has_audio_file
].reset_index(drop=True)

## Check remaining classes

In [24]:
# Number of remaining recordings per primary_label.
add_xeno_canto_df["primary_label"].value_counts()

primary_label
gretit1    8496
eurbla     7871
comcha     7517
comchi1    6860
eurrob1    6739
           ... 
bishao1       1
ceywop1       1
pirwar2       1
coopet        1
sgdpet1       1
Name: count, Length: 9840, dtype: int64

In [25]:
# Continue processing

In [26]:
# Map `also` (the secondary species of a recording) to primary labels.
# The entries are matched, lowercased, against the taxonomy's scientific names;
# species that are not present in the target taxonomy are ignored.

also_names = {el.lower() for el in chain.from_iterable(add_xeno_canto_df["also"].tolist())}
taxonomy_scientific_names = set(eBird_Taxonomy_v2021["scientific_name"])

print("Number of matched secondary_labels:", len(also_names & taxonomy_scientific_names))
print("Number of not matched secondary_labels:", len(also_names - taxonomy_scientific_names))

# Despite its name, this dict maps scientific_name -> primary_label
pl2sn = eBird_Taxonomy_v2021.set_index("scientific_name")["primary_label"].to_dict()


def _also_to_primary_labels(also_entries):
    """Translate the entries of one `also` list, skipping names missing from `pl2sn`."""
    lowered_entries = (el.lower() for el in also_entries)
    return [pl2sn[name] for name in lowered_entries if name in pl2sn]


add_xeno_canto_df["secondary_labels"] = add_xeno_canto_df["also"].apply(_also_to_primary_labels)

# Double Check
mapped_labels = {el.lower() for el in chain.from_iterable(add_xeno_canto_df["secondary_labels"].tolist())}
print("Number of matched secondary_labels:", len(mapped_labels))

Number of matched secondary_labels: 6824
Number of not matched secondary_labels: 461
Number of matched secondary_labels: 6824


In [27]:
# Retrieve other columns

# Prefix the scheme; presumably the stored url is protocol-relative ("//...") -- confirm.
add_xeno_canto_df["url"] = "https:" + add_xeno_canto_df["url"]
# Comma-separated `type` string -> list of whitespace-stripped entries.
add_xeno_canto_df["type"] = add_xeno_canto_df["type"].apply(lambda x: [el.strip() for el in x.split(",")])
# No rating is taken from the metadata; the column is filled with None.
add_xeno_canto_df["rating"] = None
add_xeno_canto_df["collection"] = "XC"

In [29]:
# Enrich with audio meta

# One get_audio_metadata result per file, expanded into columns.
# Presumably each result is a dict that includes "sample_rate" -- confirm in code_base.
add_xeno_canto_audio_meta = pd.DataFrame(add_xeno_canto_df["filename"].apply(get_audio_metadata).to_list())

# Column-wise concat aligns on the index; both frames are expected to share the
# same 0..n-1 index (add_xeno_canto_df was reset after the last filter).
add_xeno_canto_df = pd.concat([
    add_xeno_canto_df, add_xeno_canto_audio_meta
], axis=1)

# Drop rows for which no sample_rate was obtained.
add_xeno_canto_df = add_xeno_canto_df[~add_xeno_canto_df["sample_rate"].isna()].reset_index(drop=True)

[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 6553

In [30]:
# Add dataset specific columns

# Also used below as the name of the folder the audio is organised under.
SAVE_DATASET_NAME = "add_pretrain_audio_from_xeno_canto_03042025"

add_xeno_canto_df["dataset"] = "xeno_canto_03042025_onlytaxonomy"
# data_root_id is later used to look up the feature root directory of a row.
add_xeno_canto_df["data_root_id"] = SAVE_DATASET_NAME

In [31]:
# Shrink columns

# Keep exactly the columns of train_metadata, in the same order.
# A column missing from add_xeno_canto_df raises KeyError here.
add_xeno_canto_df = add_xeno_canto_df[train_metadata.columns]
# After the selection above this holds by construction; kept as a guard.
assert set(train_metadata.columns) == set(add_xeno_canto_df.columns)

In [32]:
# Compute the destination path <root>/<SAVE_DATASET_NAME>/<primary_label>/<id>.mp3
# for every selected recording. The copy itself is left disabled (commented out below).

copy_root = os.path.join(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/", SAVE_DATASET_NAME
)
source_rows = zip(
    add_xeno_canto_df["filename"].to_list(),
    add_xeno_canto_df["primary_label"].to_list(),
    add_xeno_canto_df["id"].to_list(),
)

new_filenames = []
for or_fname, pl_label, sample_id in tqdm(source_rows):
    source_basename = os.path.basename(or_fname)
    # The numeric file stem must be the recording id of the row
    assert int(os.path.splitext(source_basename)[0]) == sample_id
    new_fname = os.path.join(copy_root, pl_label, source_basename)
    # os.makedirs(os.path.dirname(new_fname), exist_ok=True)
    # copyfile(or_fname, new_fname)
    new_filenames.append(new_fname)

720624it [00:03, 190301.36it/s]


In [39]:
# Double check that all fnames exist.
# At this point `filename` still holds the path of the downloaded file;
# it is replaced by the relative name in a later cell.

assert add_xeno_canto_df["filename"].progress_apply(os.path.exists).all()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 720624/720624 [04:06<00:00, 2919.83it/s]


True

In [40]:
# Reduce every destination path to its last two components: primary_label/filename.mp3

add_xeno_canto_df["filename"] = [
    "/".join(destination.rsplit("/", 2)[-2:]) for destination in new_filenames
]

In [42]:
# Check unused copied files for pruning them.
# WARNING: this cell permanently deletes files from disk.

# All .mp3 files currently stored under the dataset folder, reduced to
# "<parent folder>/<file name>" so they can be compared with the `filename` column.
all_saved_files = glob(
    os.path.join("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/", SAVE_DATASET_NAME, "**/*.mp3"), recursive=True
)
all_saved_files = ["/".join(el.split("/")[-2:]) for el in all_saved_files]

# Every file referenced by the DataFrame must already exist on disk.
assert set(all_saved_files) >= set(add_xeno_canto_df["filename"])

# Stored files that no row references.
files_to_delete = list(
    set(all_saved_files) - set(add_xeno_canto_df["filename"])
)
print(f"{len(files_to_delete)} files will be deleted")

for fname in tqdm(files_to_delete):
    mp3_fname = os.path.join(
        "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/", SAVE_DATASET_NAME, fname
    )
    # Matching feature file: every "audio" substring of the path becomes "features"
    # and the extension becomes .hdf5. NOTE(review): str.replace is not limited to the
    # dataset folder name; a label or file name containing "audio" would be rewritten too.
    hdf5_fname = mp3_fname.replace("audio", "features").replace(".mp3", ".hdf5")

    if os.path.exists(mp3_fname):
        os.remove(mp3_fname)
    else:
        print(mp3_fname, "does not exist")
    if os.path.exists(hdf5_fname):
        os.remove(hdf5_fname)
    else:
        print(hdf5_fname, "does not exist")

739002

In [58]:
# No id may occur both in the existing train metadata and in the new XC rows.
assert not set(train_metadata["id"].apply(int)) & set(add_xeno_canto_df["id"])

In [59]:
# Fraction of the merged dataset that will consist of the new XC rows.
print("Extended dataset will contain next Partition of XC Add Data: ", add_xeno_canto_df.shape[0] / (train_metadata.shape[0] + add_xeno_canto_df.shape[0]))

Extended dataset will contain next Partition of XC Add Data:  0.8679853437497742


In [60]:
# Persist the new XC rows on their own (without the existing train metadata).
add_xeno_canto_df.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_all_years_and_more/dataset/pretrain_allyearstaxonomy_snipet03042025.csv",
    index=False
)

In [61]:
# Occurrences per filename; a count above 1 would reveal duplicated files.
add_xeno_canto_df["filename"].value_counts()

filename
pyghap1/513805.mp3    1
blackc1/966117.mp3    1
blackc1/971764.mp3    1
blackc1/971760.mp3    1
blackc1/971754.mp3    1
                     ..
linwoo4/87231.mp3     1
linwoo4/86605.mp3     1
linwoo4/66436.mp3     1
linwoo4/66426.mp3     1
ceghor1/68882.mp3     1
Name: count, Length: 720624, dtype: int64

In [62]:
# Merge for training:
# append the new XC rows below the existing train metadata and renumber the index.
# NOTE: train_metadata is overwritten, so re-running this cell appends the rows again.

train_metadata = pd.concat([
    train_metadata,
    add_xeno_canto_df
], axis=0).reset_index(drop=True)

  train_metadata = pd.concat([


In [64]:
# Occurrences per filename in the merged frame; a count above 1 would reveal duplicates.
train_metadata["filename"].value_counts()

filename
asbfly/XC134896.ogg    1
parcro2/601371.mp3     1
parcro2/607411.mp3     1
parcro2/607393.mp3     1
parcro2/607211.mp3     1
                      ..
marwar3/811425.mp3     1
marwar3/811420.mp3     1
marwar3/811417.mp3     1
marwar3/811414.mp3     1
ceghor1/68882.mp3      1
Name: count, Length: 830226, dtype: int64

In [65]:
# Persist the merged metadata (existing train rows + new XC rows).
train_metadata.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_all_years_and_more/dataset/train_and_prev_comps_extendedv1_pruneSL_XCallyearstaxonomy_snipet03042025.csv", index=False
)

In [66]:
# List the dataset directory in long format, newest entries first.
!ls /gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_all_years_and_more/dataset/ -lt

total 487808
-rw-r--r--     1 volodymyr1 BetterMedicine 265056059 Apr  5 18:44 train_and_prev_comps_extendedv1_pruneSL_XCallyearstaxonomy_snipet03042025.csv
-rw-r--r--     1 volodymyr1 BetterMedicine 232196087 Apr  5 18:41 pretrain_allyearstaxonomy_snipet03042025.csv
drwxr-sr-x 10390 volodymyr1 BetterMedicine   1048576 Mar 30 18:17 audio
drwxr-sr-x 15991 volodymyr1 BetterMedicine   1048576 Mar 30 05:38 metadata


In [67]:
# Number of rows per data root.
train_metadata["data_root_id"].value_counts()

data_root_id
add_pretrain_audio_from_xeno_canto_03042025    720624
pretrain_audio_from_prev_comps                 109602
Name: count, dtype: int64

## Prune files that failed h5py conversion

In [79]:
# data_root_id -> directory that holds the converted .hdf5 feature files
DATA_ROOT_ID_MAPPING = {
    "pretrain_audio_from_prev_comps": "/gpfs/helios/home/volodymyr1/src/bird_clef_2025/data/pretrain_features_from_prev_comps",
    "add_pretrain_audio_from_xeno_canto_03042025": "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/add_pretrain_features_from_xeno_canto_03042025/"
}


def get_h5py_fname(input_fname, data_root_id):
    """Return the .hdf5 feature path that corresponds to an audio file.

    Args:
        input_fname: Audio file name relative to its data root.
        data_root_id: Key of ``DATA_ROOT_ID_MAPPING`` selecting the feature root.

    Returns:
        ``<feature root>/<input_fname>`` with its extension replaced by ``.hdf5``.

    Raises:
        KeyError: if ``data_root_id`` is not a key of ``DATA_ROOT_ID_MAPPING``.
    """
    audio_like_path = os.path.join(DATA_ROOT_ID_MAPPING[data_root_id], input_fname)
    path_without_ext, _extension = os.path.splitext(audio_like_path)
    return f"{path_without_ext}.hdf5"

# Boolean Series, one entry per row: True when the expected .hdf5 file exists on disk.
hdf5_exsists_mask = train_metadata.progress_apply(lambda row: get_h5py_fname(row["filename"], row["data_root_id"]), axis=1).progress_apply(os.path.exists)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 830226/830226 [00:10<00:00, 79393.71it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 830226/830226 [11:08<00:00, 1241.23it/s]


In [82]:
# Number of rows without a feature file.
print(f"Not Converted Files: {(~hdf5_exsists_mask).sum()}")

Not Converted Files: 183


In [84]:
# Keep only rows whose feature file exists.
# NOTE: the index is reset, so hdf5_exsists_mask no longer aligns with the frame afterwards.
train_metadata = train_metadata[hdf5_exsists_mask].reset_index(drop=True)

In [85]:
# (rows, columns) after pruning.
train_metadata.shape

(830043, 21)

In [89]:
# Persist the metadata restricted to rows that have a feature file.
train_metadata.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_all_years_and_more/dataset/train_and_prev_comps_extendedv1_pruneSL_XCallyearstaxonomy_snipet03042025_hdf5.csv",
    index=False
)

## Shorten pretrain without undersampled classes

In [97]:
# Turn string-encoded label lists into real lists; non-string values pass through unchanged.
# NOTE(review): eval() executes whatever text the column contains. ast.literal_eval
# would parse list literals without running code -- consider switching.
train_metadata["secondary_labels"] = train_metadata["secondary_labels"].apply(lambda x: eval(x) if isinstance(x, str) else x) 

In [98]:
# all_labels = [primary_label] followed by the secondary labels of the row.
# Requires secondary_labels to be a list in every row (list concatenation).
train_metadata["all_labels"] = train_metadata.apply(lambda row: [row["primary_label"]] + row["secondary_labels"], axis=1)

In [105]:
# Species qualify when they are the primary label of at least this many recordings
MIN_RECORDINGS_PER_SPECIES = 10

pl_vc = train_metadata["primary_label"].value_counts()
frequent_enough = pl_vc >= MIN_RECORDINGS_PER_SPECIES
selected_species = set(pl_vc[frequent_enough].index)

print(f"Selecting {len(selected_species)} out of {pl_vc.shape[0]}")

Selecting 7489 out of 9858


In [106]:
# Keep only rows whose primary label is one of the selected species.
train_metadata_shorten = train_metadata[train_metadata["primary_label"].isin(selected_species)].reset_index(drop=True)

print(f"Selecting rows {train_metadata_shorten.shape[0]} out of {train_metadata.shape[0]}")

Selecting rows 819032 out of 830043


In [107]:
def _keep_selected_species(labels):
    """Return the entries of `labels` that belong to `selected_species`, order preserved."""
    return [el for el in labels if el in selected_species]


# Remove de-selected species from both label-list columns
for label_column in ("secondary_labels", "all_labels"):
    train_metadata_shorten[label_column] = train_metadata_shorten[label_column].apply(_keep_selected_species)

In [108]:
# Recordings per primary label after dropping the rare species.
train_metadata_shorten["primary_label"].value_counts()

primary_label
gretit1    8493
eurbla     7869
comcha     7515
comchi1    6859
eurrob1    6736
           ... 
grrspa1      10
whhbab2      10
babcuc4      10
pewnig1      10
ochpew1      10
Name: count, Length: 7489, dtype: int64

In [109]:
# all_labels must cover exactly the union of the secondary and primary labels ...
assert set(chain(*train_metadata_shorten["all_labels"])) == (set(chain(*train_metadata_shorten["secondary_labels"])) | set(train_metadata_shorten["primary_label"]))
# ... and no secondary label may fall outside the selected species.
assert set(list(chain(*train_metadata_shorten["secondary_labels"]))) <= selected_species

In [110]:
# Persist the metadata without the species that have fewer than 10 recordings.
train_metadata_shorten.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_all_years_and_more/dataset/train_and_prev_comps_extendedv1_pruneSL_XCallyearstaxonomy_snipet03042025_hdf5_nosmall10sp.csv", index=False
)

## Remove 2025 files

In [123]:
# Metadata of the 2025 training set; the list-valued columns are parsed while reading.
# NOTE(review): the converters use eval(), which executes the cell text of the CSV;
# ast.literal_eval would parse list literals without running code.
train_metadata_2025 = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/train_and_prev_comps_extendedv1_pruneSL_XConly2025_snipet28032025_hdf5.csv",
    converters={"secondary_labels":eval, "all_labels": eval}
)
# Only rows of the "XC" collection are considered.
train_metadata_2025 = train_metadata_2025[train_metadata_2025["collection"] == "XC"].reset_index(drop=True)

In [124]:
# Primary labels present in the 2025 XC rows.
birds_2025 = set(train_metadata_2025["primary_label"])

# Primary labels present in the shortened pretrain metadata.
pretrain_birds = set(train_metadata_shorten["primary_label"])

# Pretrain species that do not occur as a primary label in the 2025 rows.
pruned_pretrain_birds = pretrain_birds - birds_2025

print(f"Selecting {len(pruned_pretrain_birds)} out of {len(pretrain_birds)}")

Selecting 7338 out of 7489


In [125]:
# Drop every row whose primary label is a 2025 species.
# Secondary labels are not filtered here.
train_metadata_shorten_pruned = train_metadata_shorten[
    train_metadata_shorten["primary_label"].isin(pruned_pretrain_birds)
].reset_index(drop=True)

print(f"Selecting rows {train_metadata_shorten_pruned.shape[0]} out of {train_metadata_shorten.shape[0]}")

Selecting rows 791706 out of 819032


In [133]:
# Persist the frame from which the 2025 species were removed.
# This cell used to write `train_metadata_shorten`, which made the "_no2025" file a
# copy of the "_nosmall10sp" export that still contained the 2025 species.
train_metadata_shorten_pruned.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_all_years_and_more/dataset/train_and_prev_comps_extendedv1_pruneSL_XCallyearstaxonomy_snipet03042025_hdf5_nosmall10sp_no2025.csv", index=False
)

In [126]:
# Ids present both in the pruned pretrain frame and in the 2025 XC rows. Pruning was
# done by primary_label, so a recording labeled differently in the two frames survives it.
intersecting_ids = list(set(train_metadata_shorten_pruned["id"].astype(int)) & set(train_metadata_2025["id"].astype(int)))

In [131]:
# The shared ids as they appear in the 2025 metadata.
train_metadata_2025[train_metadata_2025["id"].isin(intersecting_ids)]

Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,...,author,license,sample_rate,duration_s,num_channels,bit_depth,encoding,id,dataset,data_root_id
8871,linwoo1,[],['call'],linwoo1/XC350215.ogg,XC,4.0,https://xeno-canto.org/350215,19.6799,-104.3881,Dryocopus lineatus,...,Manuel Grosselet,cc-by-nc-sa 4.0,32000.0,28.656312,1.0,0.0,VORBIS,350215,comp_2025,train_audio
16215,thbeup1,[],['call'],thbeup1/XC355995.ogg,XC,2.0,https://xeno-canto.org/355995,-3.8983,-80.087,Euphonia laniirostris,...,Vanessa Luzuriaga,cc-by-nc-sa 4.0,32000.0,1.33225,1.0,0.0,VORBIS,355995,comp_2025,train_audio
21228,butsal1,[],['song'],butsal1/XC441484.ogg,XC,0.0,https://www.xeno-canto.org/441484,-11.5833,-72.95,Saltator maximus,...,Rosendo Fraga,Creative Commons Attribution-NonCommercial-Sha...,32000.0,36.723,1.0,0.0,VORBIS,441484,comp_2021,add_train_audio_from_prev_comps
21230,creoro1,[],"['call', 'song']",creoro1/XC452809.ogg,XC,4.0,https://www.xeno-canto.org/452809,-0.4383,-76.2791,Psarocolius decumanus,...,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,32000.0,50.639469,1.0,0.0,VORBIS,452809,comp_2021,add_train_audio_from_prev_comps
21274,gretin1,[],['song'],gretin1/XC497201.ogg,XC,4.0,https://www.xeno-canto.org/497201,-11.0907,-69.1371,Tinamus major,...,Jacob Wijpkema,Creative Commons Attribution-NonCommercial-Sha...,32000.0,75.973,1.0,0.0,VORBIS,497201,comp_2021,add_train_audio_from_prev_comps
21275,laufal1,[],"['call', 'song']",laufal1/XC117979.ogg,XC,5.0,https://www.xeno-canto.org/117979,9.5775,-75.46111,Herpetotheres cachinnans,...,Mauricio Álvarez Rebolledo (Colección de Sonid...,Creative Commons Attribution-NonCommercial-Sha...,32000.0,268.422344,1.0,0.0,VORBIS,117979,comp_2021,add_train_audio_from_prev_comps
21276,laufal1,[],['call'],laufal1/XC117981.ogg,XC,5.0,https://www.xeno-canto.org/117981,9.5775,-75.46111,Herpetotheres cachinnans,...,Fernando Forero (Colección de Sonidos Ambienta...,Creative Commons Attribution-NonCommercial-Sha...,32000.0,74.724375,1.0,0.0,VORBIS,117981,comp_2021,add_train_audio_from_prev_comps
21277,laufal1,[],['adult'],laufal1/XC597587.ogg,XC,5.0,https://www.xeno-canto.org/597587,2.7648,-75.2333,Herpetotheres cachinnans,...,Erik Camilo Gaitán López,Creative Commons Attribution-NonCommercial-Sha...,32000.0,27.853,1.0,0.0,VORBIS,597587,comp_2021,add_train_audio_from_prev_comps
21279,linwoo1,[],['call'],linwoo1/XC522517.ogg,XC,0.0,https://www.xeno-canto.org/522517,17.539,-89.1111,Dryocopus lineatus,...,Paul Driver,Creative Commons Attribution-NonCommercial-Sha...,32000.0,80.288438,1.0,0.0,VORBIS,522517,comp_2021,add_train_audio_from_prev_comps
21280,linwoo1,[],['call'],linwoo1/XC522518.ogg,XC,0.0,https://www.xeno-canto.org/522518,17.539,-89.1111,Dryocopus lineatus,...,Paul Driver,Creative Commons Attribution-NonCommercial-Sha...,32000.0,16.810906,1.0,0.0,VORBIS,522518,comp_2021,add_train_audio_from_prev_comps


In [132]:
# The same ids as they appear in the pruned pretrain metadata (compare the primary labels).
train_metadata_shorten_pruned[train_metadata_shorten_pruned["id"].isin(intersecting_ids)]

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,...,id,dataset,data_root_id,sample_rate,duration_s,num_channels,bit_depth,encoding,collection,all_labels
237368,bratin1,[],[song],-11.0907,-69.1371,Crypturellus strigulosus,Brazilian Tinamou,Jacob Wijpkema,//creativecommons.org/licenses/by-nc-sa/4.0/,,...,497201,xeno_canto_03042025_onlytaxonomy,add_pretrain_audio_from_xeno_canto_03042025,48000.0,76.299708,2.0,0.0,UNKNOWN,XC,[bratin1]
264815,grbani,[],[song],10.1479,-68.9014,Crotophaga sulcirostris,Groove-billed Ani,Miguel Angel Torres,//creativecommons.org/licenses/by-nc-sa/4.0/,,...,306861,xeno_canto_03042025_onlytaxonomy,add_pretrain_audio_from_xeno_canto_03042025,44100.0,80.1539,1.0,0.0,UNKNOWN,XC,[grbani]
274990,grasal3,[],[song],-11.5833,-72.95,Saltator coerulescens,Blue-grey Saltator,Rosendo Fraga,//creativecommons.org/licenses/by-nc-sa/4.0/,,...,441484,xeno_canto_03042025_onlytaxonomy,add_pretrain_audio_from_xeno_canto_03042025,16000.0,37.97925,1.0,0.0,UNKNOWN,XC,[grasal3]
411585,grcfly1,[],[call],10.4347,-84.7095,Myiozetetes granadensis,Grey-capped Flycatcher,GABRIEL LEITE,//creativecommons.org/licenses/by-nc-sa/4.0/,,...,489481,xeno_canto_03042025_onlytaxonomy,add_pretrain_audio_from_xeno_canto_03042025,48000.0,8.660917,1.0,0.0,UNKNOWN,XC,[grcfly1]
520775,coffal1,[],[],2.7648,-75.2333,Micrastur semitorquatus,Collared Forest Falcon,Erik Camilo Gaitán López,//creativecommons.org/licenses/by-nc-sa/4.0/,,...,597587,xeno_canto_03042025_onlytaxonomy,add_pretrain_audio_from_xeno_canto_03042025,48000.0,27.829083,2.0,0.0,UNKNOWN,XC,[coffal1]
520928,coffal1,[],[call],9.5775,-75.46111,Micrastur semitorquatus,Collared Forest Falcon,Fernando Forero (Colección de Sonidos Ambienta...,//creativecommons.org/licenses/by-nc-sa/3.0/,,...,117981,xeno_canto_03042025_onlytaxonomy,add_pretrain_audio_from_xeno_canto_03042025,44100.0,74.72,2.0,0.0,UNKNOWN,XC,[coffal1]
520930,coffal1,[],"[call, song]",9.5775,-75.46111,Micrastur semitorquatus,Collared Forest Falcon,Mauricio Álvarez Rebolledo (Colección de Sonid...,//creativecommons.org/licenses/by-nc-sa/3.0/,,...,117979,xeno_canto_03042025_onlytaxonomy,add_pretrain_audio_from_xeno_canto_03042025,44100.0,268.413333,2.0,0.0,UNKNOWN,XC,[coffal1]
521244,scrbla1,[],[call],-3.8983,-80.087,Dives warczewiczi,Scrub Blackbird,Vanessa Luzuriaga,//creativecommons.org/licenses/by-nc-sa/4.0/,,...,355995,xeno_canto_03042025_onlytaxonomy,add_pretrain_audio_from_xeno_canto_03042025,44100.0,1.447166,1.0,0.0,UNKNOWN,XC,[scrbla1]
608344,ruboro1,[],"[call, song]",-0.4383,-76.2791,Psarocolius angustifrons,Russet-backed Oropendola,Lars Lachmann,//creativecommons.org/licenses/by-nc-sa/4.0/,,...,452809,xeno_canto_03042025_onlytaxonomy,add_pretrain_audio_from_xeno_canto_03042025,44100.0,50.616757,1.0,0.0,UNKNOWN,XC,[ruboro1]
739614,pabwoo1,[],[call],17.539,-89.1111,Campephilus guatemalensis,Pale-billed Woodpecker,Paul Driver,//creativecommons.org/licenses/by-nc-sa/4.0/,,...,522521,xeno_canto_03042025_onlytaxonomy,add_pretrain_audio_from_xeno_canto_03042025,44100.0,137.208526,2.0,0.0,UNKNOWN,XC,[pabwoo1]
