In [None]:
import json
import os
import IPython.display as ipd

from glob import glob
from os.path import splitext, join as pjoin
from itertools import chain
from shutil import copyfile
from tqdm import tqdm
from copy import deepcopy
import xenocanto
import pandas as pd
import numpy as np
import librosa
import soundfile as SF

from code_base.utils.audio_utils import get_audio_metadata
from code_base.utils import load_json, write_json

# Load Taxonomies

In [None]:
# Taxonomy table; the cells below use its `primary_label`, `common_name`
# and `scientific_name` columns.
eBird_Taxonomy_v2021 = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/taxonomy.csv"
)

# Lower-case both name columns so that later name matching is case-insensitive.
for name_col in ("scientific_name", "common_name"):
    eBird_Taxonomy_v2021[name_col] = eBird_Taxonomy_v2021[name_col].str.lower()

# Read relative to the working directory; used further down to see how many
# extra Xeno-Canto rows each listed class receives.
undersampled_df = pd.read_csv("undersampled_classes_birdclef_2025.csv")

In [None]:
# Prepare load JSON for 2025

# write_json(
#     "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_2025/download_species.json",
#     eBird_Taxonomy_v2021["scientific_name"].str.lower().to_list(),
# )

In [None]:
# Enrich the taxonomy with the taxonomy of the previous year.
# It was checked that there are no conflicts.

TAXONOMY_2024_RENAMES = {
    "SPECIES_CODE": "primary_label",
    "PRIMARY_COM_NAME": "common_name",
    "SCI_NAME": "scientific_name",
}

taxonomy_2024_raw = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2024/eBird_Taxonomy_v2021.csv"
)
# Switch to the column names used by the current taxonomy and keep only those three.
taxonomy_2024 = taxonomy_2024_raw.rename(columns=TAXONOMY_2024_RENAMES)[
    ["primary_label", "common_name", "scientific_name"]
]

for name_col in ("scientific_name", "common_name"):
    taxonomy_2024[name_col] = taxonomy_2024[name_col].str.lower()

# Current rows are listed first, so on a repeated `primary_label` the current
# row is the one that `drop_duplicates` keeps.
eBird_Taxonomy_v2021 = pd.concat(
    [eBird_Taxonomy_v2021, taxonomy_2024]
).drop_duplicates("primary_label")

In [None]:
eBird_Taxonomy_v2021.shape

In [None]:
eBird_Taxonomy_v2021["scientific_name"]

In [None]:
# Prepare load JSON for all years and more

write_json(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_all_years_and_more/download_species.json",
    eBird_Taxonomy_v2021["scientific_name"].str.lower().to_list(),
)

# Check Loading Process

In [None]:
downloaded_files = glob("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_2025/dataset/audio/**/*.mp3", recursive=True)

In [None]:
# Validate that all IDs are unique
len(set([os.path.basename(el) for el in downloaded_files])) == len(downloaded_files)

In [None]:
"877358.mp3" in set([os.path.basename(el) for el in downloaded_files])

In [None]:
# Check loaded files distribution

pd.Series([el.split("/")[-2] for el in downloaded_files]).value_counts()

# Process metadata

In [None]:
train_metadata = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/train_and_prev_comps_extendedv1_pruneSL.csv"
)

In [None]:
# Compose initial XC DF

all_meta_files = glob("/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_2025/dataset/metadata/**/*.json")

NEEDED_COLUMNS = [
    "id",
    "type",
    "lat",
    "lng",
    "en",
    "gen",
    "sp",
    "rec",
    "time",
    "url",
    "also",
    "file-name",
    "lic"
]
print(f"Jsons found = {len(all_meta_files)}")

def process_dict(
    input,
    folder_name,
    needed_cols=NEEDED_COLUMNS
):
    """Reduce one raw recording record to the wanted fields and tag its folder.

    Args:
        input: Mapping holding the fields of a single recording.
        folder_name: Value stored under the ``"foldername"`` key of the result.
        needed_cols: Keys of ``input`` to keep. Defaults to ``NEEDED_COLUMNS``.

    Returns:
        A new dict with the kept fields plus ``"foldername"``; the mapping
        passed in is left untouched.
    """
    # Filter by the argument rather than the module-level constant, so that a
    # caller-supplied column list actually takes effect.
    record = {k: v for k, v in input.items() if k in needed_cols}
    record["foldername"] = folder_name
    return record

# Gather all recordings first and build the frame once. Concatenating inside
# the loop copies every previously loaded row again on each iteration.
xc_records = []

for meta_file_path in tqdm(all_meta_files):
    # The name of the directory that holds the JSON is kept as `foldername`.
    foldname = meta_file_path.split("/")[-2]
    # `with` closes the file handle as soon as the JSON is parsed.
    with open(meta_file_path) as meta_fp:
        meta_file = json.load(meta_fp)['recordings']
    xc_records.extend(process_dict(bird_info, foldname) for bird_info in meta_file)

# One row per recording, default RangeIndex.
add_xeno_canto_df = pd.DataFrame(xc_records)

In [None]:
# Attach the path of the downloaded audio file (if any) to every metadata row

add_xeno_canto_df["id"] = add_xeno_canto_df["id"].astype(int)

downloaded_files = glob(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_2025/dataset/audio/**/*.mp3",
    recursive=True,
)

# The file stem is parsed as the integer recording id: <id>.mp3 -> id
xcid2filename = {}
for audio_path in downloaded_files:
    file_stem = os.path.splitext(os.path.basename(audio_path))[0]
    xcid2filename[int(file_stem)] = audio_path

# Ids without a downloaded file end up with a missing `filename`.
add_xeno_canto_df["filename"] = add_xeno_canto_df["id"].map(xcid2filename)

not_loaded_mask = add_xeno_canto_df["filename"].isna()
print("Not Loaded Paritition:", not_loaded_mask.sum() / add_xeno_canto_df.shape[0])

In [None]:
# Map columns to casual names and create `scientific_name` column
add_xeno_canto_df = add_xeno_canto_df.rename(columns={
    "en":"common_name",
    "lat": "latitude",
    "lng": "longitude",
    "rec": "author",
    "lic": "license"
})

add_xeno_canto_df["scientific_name"] = add_xeno_canto_df["gen"] + " " + add_xeno_canto_df["sp"]

In [None]:
# 1. Find which rows can be matched to the taxonomy by lower-cased
#    `common_name` and by lower-cased `scientific_name`
# 2. Validate that the two matches never disagree about the primary label
# 3. Map to primary_label (common-name match first, scientific-name match as fallback)

xc_common_lower = add_xeno_canto_df["common_name"].str.lower()
xc_scientific_lower = add_xeno_canto_df["scientific_name"].str.lower()

common_name_intersection = set(eBird_Taxonomy_v2021["common_name"]) & set(xc_common_lower)
scientific_name_intersection = set(eBird_Taxonomy_v2021["scientific_name"]) & set(xc_scientific_lower)

common_name_to_label = eBird_Taxonomy_v2021.set_index("common_name")["primary_label"].to_dict()
scientific_name_to_label = eBird_Taxonomy_v2021.set_index("scientific_name")["primary_label"].to_dict()

# Rows without a match keep None in the respective column.
add_xeno_canto_df["primary_label_cn"] = None
add_xeno_canto_df["primary_label_sn"] = None

matched_by_common = xc_common_lower.isin(common_name_intersection)
add_xeno_canto_df.loc[matched_by_common, "primary_label_cn"] = (
    xc_common_lower[matched_by_common].map(common_name_to_label)
)

matched_by_scientific = xc_scientific_lower.isin(scientific_name_intersection)
add_xeno_canto_df.loc[matched_by_scientific, "primary_label_sn"] = (
    xc_scientific_lower[matched_by_scientific].map(scientific_name_to_label)
)

# Where both lookups produced a label, the labels have to be identical.
matched_both_ways = (
    ~add_xeno_canto_df["primary_label_cn"].isna()
) & (
    ~add_xeno_canto_df["primary_label_sn"].isna()
)
label_conflicts = (
    add_xeno_canto_df.loc[matched_both_ways, "primary_label_cn"]
    != add_xeno_canto_df.loc[matched_both_ways, "primary_label_sn"]
)
assert label_conflicts.sum() == 0

add_xeno_canto_df["primary_label"] = add_xeno_canto_df["primary_label_cn"]
no_common_match = add_xeno_canto_df["primary_label"].isna()
add_xeno_canto_df.loc[no_common_match, "primary_label"] = add_xeno_canto_df.loc[no_common_match, "primary_label_sn"]

In [None]:
# Share of rows whose `primary_label` is missing, i.e. rows that matched the
# taxonomy neither by common name nor by scientific name. This mirrors the
# "Not Loaded" share printed earlier, which also counts the missing rows.
print("Not Appropriate Labels Paritition:", add_xeno_canto_df["primary_label"].isna().sum() / add_xeno_canto_df.shape[0])

In [None]:
# add_xeno_canto_df.loc[
#     (add_xeno_canto_df["primary_label_cn"].isna()) & (add_xeno_canto_df["primary_label_sn"].isna()),
#     "common_name"
# ].drop_duplicates().to_list()

In [None]:
# add_xeno_canto_df.loc[
#     (add_xeno_canto_df["primary_label_cn"].isna()) & (add_xeno_canto_df["primary_label_sn"].isna()),
#     "scientific_name"
# ].drop_duplicates().to_list()

In [None]:
# Apply Filters

# Train rows counted as Xeno-Canto: `collection` is either missing or "XC".
train_is_xc = (train_metadata["collection"].isna()) | (train_metadata["collection"] == "XC")
train_xc_ids = set(train_metadata.loc[train_is_xc, "id"].astype(int))

xc_new_ids = set(add_xeno_canto_df["id"].astype(int)) - train_xc_ids

rows_to_keep = (
    # Remove already existing files in train
    add_xeno_canto_df["id"].astype(int).isin(xc_new_ids)
    # Remove not matched primary_label
    & ~add_xeno_canto_df["primary_label"].isna()
    # Remove not existing filenames
    & ~add_xeno_canto_df["filename"].isna()
)
add_xeno_canto_df = add_xeno_canto_df[rows_to_keep].reset_index(drop=True)

## Check Left Classes

In [None]:
add_xeno_canto_df["primary_label"].value_counts()

In [None]:
# Number of new Xeno-Canto rows per scientific name, restricted to the
# classes listed in `undersampled_df`.
xc_rows_of_undersampled = add_xeno_canto_df["primary_label"].isin(undersampled_df["primary_label"])
xc_counts_per_species = (
    add_xeno_canto_df.loc[xc_rows_of_undersampled, "scientific_name"]
    .value_counts()
    .reset_index()
    .rename(columns={"count":"add_count_from_XC"})
)

# Undersampled classes that received at least one new row, with their counts.
undersampled_with_new_rows = undersampled_df[
    undersampled_df["primary_label"].isin(add_xeno_canto_df["primary_label"])
]
undersampled_with_new_rows.merge(xc_counts_per_species, on="scientific_name")

In [None]:
# Continue processing

In [None]:
# Map `also`, which is secondary_labels, to primary_label.
# The entries are matched against the lower-cased taxonomy `scientific_name`.
# Ignore species that are not presented in target taxonomy

also_names_lower = {el.lower() for el in chain.from_iterable(add_xeno_canto_df["also"].tolist())}
taxonomy_scientific_names = set(eBird_Taxonomy_v2021["scientific_name"])

print("Number of matched secondary_labels:", len(also_names_lower & taxonomy_scientific_names))
print("Number of not matched secondary_labels:", len(also_names_lower - taxonomy_scientific_names))

# Despite its name this dict goes from scientific_name to primary_label.
pl2sn = eBird_Taxonomy_v2021.set_index("scientific_name")["primary_label"].to_dict()


def _also_to_primary_labels(also_names):
    """Translate the names found in the taxonomy, silently dropping the rest."""
    lowered_names = (el.lower() for el in also_names)
    return [pl2sn[name] for name in lowered_names if name in pl2sn]


add_xeno_canto_df["secondary_labels"] = add_xeno_canto_df["also"].apply(_also_to_primary_labels)

# Double Check
mapped_labels_lower = {
    el.lower() for el in chain.from_iterable(add_xeno_canto_df["secondary_labels"].tolist())
}
print("Number of matched secondary_labels:", len(mapped_labels_lower))

In [None]:
# Retrieve other columns

add_xeno_canto_df["url"] = "https:" + add_xeno_canto_df["url"]
add_xeno_canto_df["type"] = add_xeno_canto_df["type"].apply(lambda x: [el.strip() for el in x.split(",")])
add_xeno_canto_df["rating"] = None
add_xeno_canto_df["collection"] = "XC"

In [None]:
# Enrich with audio meta

# `get_audio_metadata` is a project helper called once per audio path.
# NOTE(review): presumably it returns one dict-like record per file that
# includes `sample_rate` (the filter below relies on that column) - confirm.
audio_meta_records = [
    get_audio_metadata(audio_path) for audio_path in add_xeno_canto_df["filename"]
]
add_xeno_canto_audio_meta = pd.DataFrame(audio_meta_records)

# Column-wise join; rows are aligned on the index of both frames.
add_xeno_canto_df = pd.concat(
    [add_xeno_canto_df, add_xeno_canto_audio_meta],
    axis=1,
)

# Drop the rows for which no sample rate is available.
has_sample_rate = ~add_xeno_canto_df["sample_rate"].isna()
add_xeno_canto_df = add_xeno_canto_df[has_sample_rate].reset_index(drop=True)

In [None]:
# Add dataset specific columns

SAVE_DATASET_NAME = "add_train_audio_from_xeno_canto_28032025"

add_xeno_canto_df["dataset"] = "xeno_canto_only_2025"
add_xeno_canto_df["data_root_id"] = SAVE_DATASET_NAME

In [None]:
# Shrink columns

# Keep exactly the columns of `train_metadata`, in the same order. The
# selection itself raises a KeyError if one of them is missing from
# `add_xeno_canto_df`.
add_xeno_canto_df = add_xeno_canto_df[train_metadata.columns]
# NOTE(review): after the selection above both frames carry the same column
# labels by construction, so this assert cannot fail; it only documents intent.
assert set(train_metadata.columns) == set(add_xeno_canto_df.columns)

In [None]:
# Copy only needed audio files into ../data/<SAVE_DATASET_NAME>/<primary_label>/

new_filenames = []
rows_to_copy = zip(
    add_xeno_canto_df["filename"].to_list(),
    add_xeno_canto_df["primary_label"].to_list(),
    add_xeno_canto_df["id"].to_list(),
)
# `zip` has no length, so the row count is handed to tqdm explicitly;
# otherwise the bar can show neither a percentage nor an ETA.
for or_fname, pl_label, sample_id in tqdm(rows_to_copy, total=len(add_xeno_canto_df)):
    # Sanity check: the numeric file stem has to be the recording id of the row.
    assert int(os.path.splitext(os.path.basename(or_fname))[0]) == sample_id
    new_fname = os.path.join(
        "../data", SAVE_DATASET_NAME, pl_label, os.path.basename(or_fname)
    )
    # `exist_ok=True` replaces the separate exists-check and also tolerates
    # the directory being created in between.
    os.makedirs(os.path.dirname(new_fname), exist_ok=True)
    copyfile(or_fname, new_fname)
    new_filenames.append(new_fname)

In [None]:
# Preserve only primary_label/filename.mp3 in filename

add_xeno_canto_df["filename"] = [
    "/".join(el.split("/")[-2:]) for el in new_filenames
]

In [None]:
assert not set(train_metadata.loc[train_metadata["collection"] == "XC", "id"].apply(int)) & set(add_xeno_canto_df["id"])

In [None]:
print("Extended dataset will contain next Partition of XC Add Data: ", add_xeno_canto_df.shape[0] / (train_metadata.shape[0] + add_xeno_canto_df.shape[0]))

In [None]:
add_xeno_canto_df.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_2025/dataset/train_only_2025_snipet28032025.csv",
    index=False
)

In [None]:
# Merge for training

train_metadata = pd.concat([
    train_metadata,
    add_xeno_canto_df
], axis=0).reset_index(drop=True)

In [None]:
train_metadata["filename"].value_counts()

In [None]:
train_metadata.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/train_and_prev_comps_extendedv1_pruneSL_XConly2025_snipet28032025.csv", index=False
)

In [None]:
train_metadata["data_root_id"].value_counts()

## Prune samples that failed h5py conversion

In [None]:
add_xeno_canto_df = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_2025/dataset/train_only_2025_snipet28032025.csv",
)
train_metadata = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/train_and_prev_comps_extendedv1_pruneSL.csv"
)

In [None]:
def _hdf5_feature_exists(relative_audio_path):
    """Tell whether the .hdf5 counterpart of an audio file exists on disk."""
    feature_path = os.path.join(
        "../data/add_train_features_from_xeno_canto_28032025",
        relative_audio_path.replace(".mp3", ".hdf5"),
    )
    return os.path.exists(feature_path)


# Keep only the rows whose feature file is present.
feature_file_present = add_xeno_canto_df["filename"].apply(_hdf5_feature_exists)
add_xeno_canto_df = add_xeno_canto_df[feature_file_present].reset_index(drop=True)

In [None]:
add_xeno_canto_df.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/xeno_canto/data_2025/dataset/train_only_2025_snipet28032025_hdf5.csv",
    index=False
)

In [None]:
# Merge for training

train_metadata = pd.concat([
    train_metadata,
    add_xeno_canto_df
], axis=0).reset_index(drop=True)

train_metadata.to_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/train_and_prev_comps_extendedv1_pruneSL_XConly2025_snipet28032025_hdf5.csv", index=False
)

# Prepare big Xeno-Canto DF

In [None]:
dataset_root="/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes"

In [None]:
# Entries directly under audio/ are counted as species, the mp3 files inside them as samples.
species_dirs = glob(os.path.join(dataset_root, 'audio/*'))
sample_files = glob(os.path.join(dataset_root, 'audio/*/*.mp3'))
print(f"Downloaded\nspecies = {len(species_dirs)}\nsamples = {len(sample_files)}")

# Close the JSON file explicitly instead of leaving the handle to the garbage collector.
with open(os.path.join(dataset_root, 'all_xeno_canto_species.json')) as species_fp:
    requested_species = json.load(species_fp)

# Scale to percent first and round afterwards: rounding the ratio and then
# multiplying by 100 brings float noise back (e.g. 56.699999999999996).
# NOTE(review): the ratio counts species folders, although the message says "samples".
ready_percent = round(len(species_dirs) / len(requested_species) * 100, 1)
print(f"Ready samples {ready_percent}%")

In [None]:
all_meta_files = glob(os.path.join(dataset_root, "metadata/**/*.json"))

NEEDED_COLUMNS = [
    "id",
    "type",
    "lat",
    "lng",
    "en",
    "gen",
    "sp",
    "rec",
    "time",
    "url",
    "also",
    "file-name",
    "lic"
]
print(f"Jsons found = {len(all_meta_files)}")

def process_dict(
    input,
    folder_name,
    needed_cols=NEEDED_COLUMNS
):
    """Reduce one raw recording record to the wanted fields and tag its folder.

    This definition replaces the identical one from the earlier cell when the
    notebook is run top to bottom.

    Args:
        input: Mapping holding the fields of a single recording.
        folder_name: Value stored under the ``"foldername"`` key of the result.
        needed_cols: Keys of ``input`` to keep. Defaults to ``NEEDED_COLUMNS``.

    Returns:
        A new dict with the kept fields plus ``"foldername"``; the mapping
        passed in is left untouched.
    """
    # Filter by the argument rather than the module-level constant, so that a
    # caller-supplied column list actually takes effect.
    record = {k: v for k, v in input.items() if k in needed_cols}
    record["foldername"] = folder_name
    return record

# Gather all recordings first and build the frame once. Concatenating inside
# the loop copies every previously loaded row again on each iteration.
xc_records = []

for meta_file_path in tqdm(all_meta_files):
    # The name of the directory that holds the JSON is kept as `foldername`.
    foldname = meta_file_path.split("/")[-2]
    # `with` closes the file handle as soon as the JSON is parsed.
    with open(meta_file_path) as meta_fp:
        meta_file = json.load(meta_fp)['recordings']
    xc_records.extend(process_dict(bird_info, foldname) for bird_info in meta_file)

# One row per recording, default RangeIndex.
add_xeno_canto_df = pd.DataFrame(xc_records)
    
add_xeno_canto_df = add_xeno_canto_df.rename(columns={
    "en":"common_name",
    "lat": "latitude",
    "lng": "longitude",
    "rec": "author",
    "lic": "license"
})

add_xeno_canto_df["scientific_name"] = add_xeno_canto_df["gen"] + " " + add_xeno_canto_df["sp"]
add_xeno_canto_df["primary_label"] = add_xeno_canto_df["common_name"]
add_xeno_canto_df["url"] = "https:" + add_xeno_canto_df["url"]
add_xeno_canto_df["type"] = add_xeno_canto_df["type"].apply(lambda x: [el.strip() for el in x.split(",")])
add_xeno_canto_df["rating"] = None

add_xeno_canto_df["secondary_labels"] = add_xeno_canto_df["also"]
add_xeno_canto_df["filename"] = add_xeno_canto_df["primary_label"] + "/" + add_xeno_canto_df["id"].astype(str) + ".mp3"

In [None]:
add_xeno_canto_df.shape

In [None]:
# add_xeno_canto_df.to_csv("dataset/raw_metadata_07_05_2022.csv", index=False)
# add_xeno_canto_df = pd.read_csv("dataset/raw_metadata_07_05_2022.csv", converters={"secondary_labels": eval})

In [None]:
# Audio files present on disk; the file stem is parsed as the recording id.
all_loaded_samples = glob(os.path.join(dataset_root, "audio/*/*.mp3"))
all_loaded_samples_id = []
for sample_path in all_loaded_samples:
    file_stem = os.path.splitext(os.path.basename(sample_path))[0]
    all_loaded_samples_id.append(int(file_stem))

add_xeno_canto_df["id"] = add_xeno_canto_df["id"].astype(int)

# One row per id, and only ids whose audio file was found on disk.
loaded_id_set = set(all_loaded_samples_id)
add_xeno_canto_df = (
    add_xeno_canto_df
    .drop_duplicates("id")
    .loc[lambda frame: frame["id"].isin(loaded_id_set)]
    .reset_index(drop=True)
)

In [None]:
# Load both metadata tables; `secondary_labels` and `all_labels` are stored in
# the CSV as text and are turned back into Python objects by the converter.
# NOTE(review): `eval` executes whatever expression the cell contains. That is
# only acceptable for CSV files produced by this project; for files of any
# other origin `ast.literal_eval` is the safe replacement.
train_metadata = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended.csv", converters={"secondary_labels": eval, "all_labels": eval})
train_metadata_prev_comps = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_nodupls_meta_prev_comps_extended.csv", 
    converters={"secondary_labels": eval, "all_labels": eval}
)

In [None]:
# Columns shared by both competition tables that are needed below.
COMP_DATA_COLUMNS = ["primary_label", "scientific_name", "common_name", "url", "secondary_labels"]

comp_data = pd.concat(
    [frame[COMP_DATA_COLUMNS] for frame in (train_metadata, train_metadata_prev_comps)]
).reset_index(drop=True)


def _id_from_url(url):
    """Parse the last path segment of the url as the integer id."""
    return int(url.rsplit("/", 1)[-1])


comp_data["id"] = comp_data["url"].apply(_id_from_url)

In [None]:
# Distinct non-empty values of every label source in the Xeno-Canto frame.
EMPTY_LABEL = {""}

xeno_canto_all_unique_sec_labels = set(chain.from_iterable(add_xeno_canto_df["secondary_labels"])) - EMPTY_LABEL
xeno_canto_all_unique_sn_labels = set(add_xeno_canto_df["scientific_name"]) - EMPTY_LABEL
xeno_canto_all_unique_cn_labels = set(add_xeno_canto_df["common_name"]) - EMPTY_LABEL

# Every name that occurs either as a secondary label or as a scientific name.
xeno_canto_all_unique_sec_and_sn_labels = xeno_canto_all_unique_sec_labels.union(xeno_canto_all_unique_sn_labels)

In [None]:
# Xeno Canto secondary labels are taken from `scientific_name`
assert not (xeno_canto_all_unique_sec_labels & xeno_canto_all_unique_cn_labels)

In [None]:
# primary_label -> set of recording ids in the competition data
comp_data_pl_2_id = {
    label: set(ids) for label, ids in comp_data.groupby("primary_label")["id"]
}
# scientific_name -> set of recording ids in the Xeno-Canto frame
add_xeno_canto_df_sn_2_id = {
    name: set(ids) for name, ids in add_xeno_canto_df.groupby("scientific_name")["id"]
}

In [None]:
def get_common_name(input):
    """Return the first element of a group and report ambiguous groups.

    Args:
        input: pandas Series holding the values of one group.

    Returns:
        ``input.iloc[0]``. If the group holds more than one distinct value a
        message is printed; no exception is raised in that case.
    """
    distinct_values = set(input)
    if len(distinct_values) > 1:
        # Deliberately only a message: the group is still resolved to its first value.
        print(f"There should be one2one mapping between `primary_label` and `common_name`. Failed example: {distinct_values}")
    return input.iloc[0]

In [None]:
pl2sc = comp_data.groupby("primary_label")["scientific_name"].apply(get_common_name).to_dict()
# pl2sc = {v:k.lower() for k,v in pl2sc.items()}
sc2pl = {v:k for k,v in pl2sc.items()}

In [None]:
# For every competition scientific name that Xeno-Canto knows neither as a
# scientific name nor as a secondary label: the share of the class' ids that
# is found under each Xeno-Canto scientific name, largest share first.
lost_classes = {}
unmatched_scientific_names = set(comp_data["scientific_name"]) - xeno_canto_all_unique_sec_and_sn_labels
for lost_name in unmatched_scientific_names:
    lost_label = sc2pl[lost_name]
    lost_label_ids = comp_data_pl_2_id[lost_label]
    id_overlap_share = {
        xc_name: len(xc_ids & lost_label_ids) / len(lost_label_ids)
        for xc_name, xc_ids in add_xeno_canto_df_sn_2_id.items()
    }
    lost_classes[lost_label] = pd.Series(id_overlap_share).sort_values(ascending=False)

In [None]:
top_lost_classes = {k:v.reset_index().iloc[0].to_dict() for k,v in lost_classes.items()}

In [None]:
top_lost_classes = {k:v for k,v in top_lost_classes.items() if v[0] > 0}

In [None]:
top_lost_classes

In [None]:
sc2pl_xeno_canto = deepcopy(sc2pl)
for k,v in top_lost_classes.items():
    # if v[0] > 0:
    sc2pl_xeno_canto.pop(pl2sc[k])
    sc2pl_xeno_canto[v["index"]] = k

In [None]:
# All keys unique
assert len(set(sc2pl_xeno_canto.keys())) == len(sc2pl_xeno_canto)
# All values unique
assert len(set(sc2pl_xeno_canto.values())) == len(sc2pl_xeno_canto)
# # All keys are present in Xeno Canto
# assert not set(sc2pl_xeno_canto.keys()) - xeno_canto_all_unique_sec_and_sn_labels

In [None]:
def _to_primary_label(scientific_name):
    """Look the name up in `sc2pl_xeno_canto`; unknown names are returned as they are."""
    return sc2pl_xeno_canto.get(scientific_name, scientific_name)


def _relabel_secondary(names):
    """Translate every secondary name and drop the entries that end up empty."""
    translated = (_to_primary_label(name) for name in names)
    return [label for label in translated if label != ""]


add_xeno_canto_df["primary_label"] = add_xeno_canto_df["scientific_name"].apply(_to_primary_label)
add_xeno_canto_df["secondary_labels"] = add_xeno_canto_df["secondary_labels"].apply(_relabel_secondary)

In [None]:
# Create all_labels
add_xeno_canto_df['all_labels'] = add_xeno_canto_df.apply(lambda x: [x["primary_label"]] + x["secondary_labels"], axis=1)

In [None]:
add_xeno_canto_df["dataset"] = "xc_2024_classes"

In [None]:
set(train_metadata.columns) - set(add_xeno_canto_df.columns)

In [None]:
add_xeno_canto_df["filename"] = add_xeno_canto_df["filename"].apply(lambda x: x.replace(" ", ""))

In [None]:
# Call `read_length_and_sr` once per row on the .hdf5 counterpart of the audio
# file (same relative path under train_features, ".mp3" replaced by ".hdf5").
# The following cell treats each result as either None or a pair whose first
# item is stored as `au_len` and whose second item is stored as `sample_rate`.
# NOTE(review): `ProgressParallel`, `delayed` and `read_length_and_sr` are not
# imported or defined in any visible cell, so this fails with a NameError on a
# fresh kernel. Presumably `delayed` is joblib's and the other two are project
# helpers - add the imports to the first cell once the modules are confirmed.
# NOTE(review): the directory below repeats the value of `dataset_root`.
xeno_canto_lengts_and_srs =  ProgressParallel(n_jobs=32, total=len(add_xeno_canto_df))(
    delayed(read_length_and_sr)(path) for path in add_xeno_canto_df["filename"].apply(
        lambda x: os.path.join(
            "/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_features", x.replace(".mp3", ".hdf5")
        )
    )
)

In [None]:
# A result is either None (the row is dropped) or a pair: (length, sample rate).
read_succeeded = [el is not None for el in xeno_canto_lengts_and_srs]
sample_rates = [el[1] if el is not None else None for el in xeno_canto_lengts_and_srs]
audio_lengths = [el[0] if el is not None else None for el in xeno_canto_lengts_and_srs]

add_xeno_canto_df = add_xeno_canto_df.assign(
    is_valid=read_succeeded,
    sample_rate=sample_rates,
    au_len=audio_lengths,
)
# `is_valid` is only a temporary helper column for the row filter.
add_xeno_canto_df = add_xeno_canto_df[add_xeno_canto_df["is_valid"]].drop(columns=["is_valid"])
add_xeno_canto_df["duration_s"] = add_xeno_canto_df["au_len"].div(add_xeno_canto_df["sample_rate"])

In [None]:
set(train_metadata.columns) - set(add_xeno_canto_df.columns)

In [None]:
add_xeno_canto_df.to_csv("/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_extended.csv", index=False)

In [None]:
add_xeno_canto_df = add_xeno_canto_df[~add_xeno_canto_df["id"].isin(comp_data["id"])].reset_index(drop=True)

In [None]:
add_xeno_canto_df.shape

In [None]:
add_xeno_canto_df.to_csv("/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_nodupl_extended.csv", index=False)

In [None]:
# Reload the three metadata tables from CSV; `secondary_labels` and
# `all_labels` are stored as text and are turned back into Python objects by
# the converter.
# NOTE(review): the Xeno-Canto file read here (`..._noduplV3_extended.csv`) is
# not written by any visible cell of this notebook.
# NOTE(review): `eval` executes whatever expression the cell contains. That is
# only acceptable for CSV files produced by this project; for files of any
# other origin `ast.literal_eval` is the safe replacement.
add_xeno_canto_df = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_noduplV3_extended.csv", 
    converters={"secondary_labels": eval, "all_labels": eval}
)
train_metadata = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended_noduplv1.csv", 
    converters={"secondary_labels": eval, "all_labels": eval}
)
train_metadata_prev_comp = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV3_scored_meta_prev_comps_extended.csv", 
    converters={"secondary_labels": eval, "all_labels": eval}
)

In [None]:
all_classes = set(chain(*train_metadata["all_labels"]))
len(all_classes)

In [None]:
def _only_known_classes(labels):
    """Keep, in order, the labels that are members of `all_classes`."""
    return [bird for bird in labels if bird in all_classes]


# Xeno-Canto rows: only the label lists are trimmed, no row is dropped.
for label_col in ("all_labels", "secondary_labels"):
    add_xeno_canto_df[label_col] = add_xeno_canto_df[label_col].apply(_only_known_classes)

# Previous competitions: rows with an unknown primary label are dropped first,
# then the label lists are trimmed in the same way.
prev_comp_primary_known = train_metadata_prev_comp["primary_label"].isin(all_classes)
train_metadata_prev_comp = train_metadata_prev_comp[prev_comp_primary_known].reset_index(drop=True)
for label_col in ("all_labels", "secondary_labels"):
    train_metadata_prev_comp[label_col] = train_metadata_prev_comp[label_col].apply(_only_known_classes)

In [None]:
add_xeno_canto_df.to_csv(
    "/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_noduplV3_extended_2024SecLabels.csv",
    index=False
)
train_metadata_prev_comp.to_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV3_scored_meta_prev_comps_extended_2024SecLabels.csv",
    index=False
)