In [1]:
import json
import os
import IPython.display as ipd

from glob import glob
from os.path import splitext, join as pjoin
from itertools import chain
from shutil import copyfile
from tqdm import tqdm
from copy import deepcopy
import xenocanto
import pandas as pd
import numpy as np
import librosa
import soundfile as sf


# from code_base.utils.constants import SAMPLE_RATE
from code_base.utils import load_json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from code_base.utils.main_utils import ProgressParallel
from joblib import delayed
import h5py

def read_length_and_sr(file_path: str):
    """Read the audio length and sample rate stored in an HDF5 feature file.

    Args:
        file_path: Path to an HDF5 file that holds an ``"au"`` dataset and an
            ``"sr"`` dataset.

    Returns:
        A tuple ``(au_length, sr)`` where ``au_length`` is the first dimension of
        the ``"au"`` dataset and ``sr`` is the ``"sr"`` dataset converted to
        ``int``, or ``None`` when the file cannot be read.
    """
    try:
        with h5py.File(file_path, "r") as data_file:
            au_length = data_file["au"].shape[0]
            sr = int(np.array(data_file["sr"]))
        return au_length, sr
    except Exception:
        # Best-effort read: unreadable or malformed files are reported as None.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long parallel run can still be interrupted.
        return None

In [None]:
# ipd.Audio(np.random.choice(glob("/home/vova/data/main/public/vctk_mic1/wav48_silence_trimmed/p260/*")))

In [None]:
# Competition metadata (read from a birdclef_2024 path) and the IOC 12.2 -> eBird 2021 mapping table.
df = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended_noduplv1.csv")
# NOTE: this path is relative, so it resolves against the notebook's working directory.
mapping = pd.read_csv("chirp/chirp/taxonomy/data/mappings/ioc_12_2_to_ebird2021.csv")
# master_ioc_list_v12_2 = pd.read_excel("master_ioc_list_v12.2.xlsx")

In [None]:
# mapping_2024 = mapping[mapping["ebird2021"].isin(df["primary_label"])].reset_index(drop=True)
# mapping_2024.to_csv("chirp/chirp/taxonomy/data/mappings/ioc_12_2_to_ebird2021_only2024.csv", index=False)

In [None]:
# Keep only mapping rows whose `ebird2021` code occurs among `df.primary_label`.
# NOTE(review): `df` is read from a birdclef_2024 path although this variable is named
# `mapping_2023` — confirm which year is intended.
mapping_2023 = mapping[mapping["ebird2021"].isin(set(df.primary_label))]

In [None]:
# Every .mp3 found at any depth below the downloaded audio folder (recursive glob).
downloaded_files = glob("/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/audio/**/*.mp3", recursive=True)

In [None]:
# Number of downloaded files per parent folder name (second-to-last path component).
pd.Series([el.split("/")[-2] for el in downloaded_files]).value_counts()

In [None]:
# Sum of the per-folder counts, i.e. the total number of downloaded files.
pd.Series([el.split("/")[-2] for el in downloaded_files]).value_counts().sum()

# Process metadata

In [None]:
# Load the 2022 competition metadata, the processed 2021 metadata and the scored-bird list.
# NOTE: `eval` as a converter executes whatever expression is stored in the CSV cell;
# only use it on trusted, locally produced files.
train_df = pd.read_csv("/home/vova/data/exps/bird_clef_2022/train_metadata_extended.csv", converters={"all_labels": eval})
train_metadata_2021 = pd.read_csv("/home/vova/data/exps/bird_clef_2022/add_data/comp_2021/train_metadata_processed.csv")
# The original line ended with a stray `parallel_librosa_load` token, which made the cell a
# syntax error. The context manager also closes the file handle deterministically.
with open("/home/vova/data/exps/bird_clef_2022/scored_birds.json") as scored_birds_file:
    scored_birds = json.load(scored_birds_file)

In [None]:
# Stack the (label, common name, scientific name) triples of both metadata frames
# under a fresh 0..n-1 index.
name_columns = ["primary_label", "common_name", "scientific_name"]
names_df = pd.concat(
    [train_df[name_columns], train_metadata_2021[name_columns]]
).reset_index(drop=True)

In [None]:
def get_common_name(input):
    """Collapse a group of names to its single value.

    Args:
        input: Group of values (used with ``groupby(...).apply``); must support
            iteration and ``.iloc``.

    Returns:
        The first element of ``input``.

    Raises:
        RuntimeError: If ``input`` holds more than one distinct value.
    """
    distinct_names = set(input)
    if len(distinct_names) <= 1:
        return input.iloc[0]
    raise RuntimeError("There should be one2one mapping between `primary_label` and `common_name`")

In [None]:
# Common-name spellings to rewrite (key -> replacement) before building the lookups.
cn_correction = {
    "Band-rumped Storm-Petrel": "Band-rumped Storm Petrel",
    "Erckel's Francolin": "Erckel's Spurfowl",
    "Hawaiian Goose": "Nene",
}

# primary_label -> common_name (one name per label is enforced by get_common_name),
# with the corrections above applied, plus the inverse lookup.
label_to_common_name = names_df.groupby("primary_label")["common_name"].apply(get_common_name).to_dict()
pl2cn = {label: cn_correction.get(name, name) for label, name in label_to_common_name.items()}
cn2pl = {name: label for label, name in pl2cn.items()}

In [None]:
# primary_label -> scientific_name (one name per label is enforced by get_common_name)
# and the inverse lookup.
pl2sc = (
    names_df
    .groupby("primary_label")["scientific_name"]
    .apply(get_common_name)
    .to_dict()
)
sc2pl = dict(zip(pl2sc.values(), pl2sc.keys()))

In [None]:
# Keys of a recording entry that are carried over into the metadata frame.
NEEDED_COLUMNS = [
    "id", "type", "lat", "lng", "en", "gen", "sp",
    "rec", "time", "url", "also", "file-name", "lic",
]

# NOTE: `recursive=True` is not passed, so `**` matches a single directory level here.
all_meta_files = glob("dataset/metadata/**/*.json")
print(f"Jsons found = {len(all_meta_files)}")

In [None]:
def process_dict(
    input,
    folder_name,
    needed_cols=NEEDED_COLUMNS
):
    """Reduce one recording entry to the needed keys and tag it with its folder.

    Args:
        input: Mapping describing a single recording.
        folder_name: Value stored under the ``"foldername"`` key of the result.
        needed_cols: Keys to keep from ``input``; defaults to ``NEEDED_COLUMNS``.

    Returns:
        A new dict holding the kept keys plus ``"foldername"``; ``input`` itself
        is not modified.
    """
    # Honour the `needed_cols` argument; it used to be ignored in favour of the
    # global NEEDED_COLUMNS, so passing a custom list had no effect.
    filtered = {key: value for key, value in input.items() if key in needed_cols}
    filtered["foldername"] = folder_name
    return filtered

# Collect every recording first and build the frame once. Concatenating onto a
# growing DataFrame inside the loop re-copies all previous rows on every iteration.
xeno_canto_records = []

for meta_file_path in tqdm(all_meta_files):
    # Close each JSON file as soon as it has been parsed.
    with open(meta_file_path) as meta_fp:
        meta_file = json.load(meta_fp)['recordings']
    # The parent directory name of the JSON identifies the folder of its recordings.
    foldname = meta_file_path.split("/")[-2]
    xeno_canto_records.extend(process_dict(bird_info, foldname) for bird_info in meta_file)

add_xeno_canto_df = pd.DataFrame(xeno_canto_records)

In [None]:
# Distribution of licence values in the competition metadata.
train_df.license.value_counts()

In [None]:
# Distribution of licence values (`lic` column) in the collected Xeno-Canto metadata.
add_xeno_canto_df.lic.value_counts()

In [None]:
# Rename the Xeno-Canto field names to the column names used by the training metadata.
add_xeno_canto_df = add_xeno_canto_df.rename(
    columns=dict(
        en="common_name",
        lat="latitude",
        lng="longitude",
        rec="author",
        lic="license",
    )
)

# Derived columns; the order matters because `filename` is built from `primary_label`.
add_xeno_canto_df["scientific_name"] = add_xeno_canto_df["gen"] + " " + add_xeno_canto_df["sp"]
add_xeno_canto_df["primary_label"] = add_xeno_canto_df["common_name"].map(cn2pl)
add_xeno_canto_df["url"] = "https:" + add_xeno_canto_df["url"]
add_xeno_canto_df["type"] = add_xeno_canto_df["type"].apply(
    lambda raw_types: [part.strip() for part in raw_types.split(",")]
)
add_xeno_canto_df["rating"] = None

# Names listed under `also` that have no entry in `sc2pl` are dropped from the
# secondary labels; the remaining ones are translated to primary labels.
UNKNOWN_BIRDS = set(chain.from_iterable(add_xeno_canto_df["also"])) - set(sc2pl)
add_xeno_canto_df["secondary_labels"] = add_xeno_canto_df["also"].apply(
    lambda also_names: [sc2pl[name] for name in also_names if name not in UNKNOWN_BIRDS]
)
add_xeno_canto_df["filename"] = (
    add_xeno_canto_df["primary_label"] + "/XC" + add_xeno_canto_df["id"].astype(str) + ".mp3"
)

In [None]:
# os.makedirs("/home/vova/data/exps/bird_clef_2022/xeno_canto/xc_scored_add")

In [None]:
# Source path of every recording: dataset/audio/<foldername>/<id>.mp3
pathes = [
    os.path.join("dataset/audio/", folder, recording_id + ".mp3")
    for folder, recording_id in zip(add_xeno_canto_df["foldername"], add_xeno_canto_df["id"])
]
# Destination path of every recording: <xc_scored_add>/<filename>
pathes_to_save = [
    os.path.join("/home/vova/data/exps/bird_clef_2022/xeno_canto/xc_scored_add", relative_name)
    for relative_name in add_xeno_canto_df["filename"]
]

In [None]:
# for old_path, new_path in tqdm(zip(pathes, pathes_to_save)):
#     if not os.path.exists(os.path.dirname(new_path)):
#         os.makedirs(os.path.dirname(new_path))
#     copyfile(old_path, new_path)

In [None]:
# Root folder of the copied Xeno-Canto recordings.
add_xeno_canto_root = "/home/vova/data/exps/bird_clef_2022/xeno_canto/xc_scored_add"
# NOTE(review): `parallel_librosa_load` is neither defined nor imported in the visible part
# of this notebook — confirm where it comes from, otherwise a fresh kernel fails here.
# Later cells index each result as el[0] (used with len()) and el[1] (stored as `sr`).
librosa_result = parallel_librosa_load(pathes_to_save, sr=None)

In [None]:
# Create all_labels
add_xeno_canto_df['all_labels'] = add_xeno_canto_df.apply(lambda x: [x["primary_label"]] + x["secondary_labels"], axis=1)
# Create is_scored_in_all_labels
add_xeno_canto_df["is_scored_in_all_labels"] = add_xeno_canto_df['all_labels'].apply(lambda x: any(el in scored_birds for el in x))
# Create duration_s and sr
add_xeno_canto_df["sr"] = [el[1] for el in librosa_result]
add_xeno_canto_df["duration_s"] = [len(el[0]) / el[1] for el in librosa_result]
# Create contains_secondary
add_xeno_canto_df['contains_secondary'] = add_xeno_canto_df['secondary_labels'].apply(len) > 0

In [None]:
# Preview the first five rows after the derived columns were added.
add_xeno_canto_df.head(5)

In [None]:
# Columns written to train_metadata.csv, in output order.
export_columns = [
    "primary_label", "secondary_labels", "type",
    "latitude", "longitude", "scientific_name",
    "common_name", "author", "rating",
    "time", "url", "filename",
    "license",
    "all_labels", "is_scored_in_all_labels", "sr",
    "duration_s", "contains_secondary",
]
export_frame = add_xeno_canto_df[export_columns].reset_index(drop=True)
export_frame.to_csv(
    "/home/vova/data/exps/bird_clef_2022/xeno_canto/xc_scored_add/train_metadata.csv",
    index=False,
)

In [None]:
# Restrict the frame to the exported columns. `.copy()` makes the result an
# independent frame, so the `["id"] = ...` assignment in the next cell does not
# write into a slice of the previous frame (SettingWithCopyWarning).
add_xeno_canto_df = add_xeno_canto_df[[
    "primary_label", "secondary_labels", "type",
    "latitude", "longitude", "scientific_name",
    "common_name", "author", "rating",
    "time", "url", "filename",
    "license",
    "all_labels", "is_scored_in_all_labels", "sr",
    "duration_s", "contains_secondary"
]].copy()

add_xeno_canto_df

In [None]:
# The numeric recording id is the last path component of the recording URL.
add_xeno_canto_df["id"] = add_xeno_canto_df["url"].apply(lambda url: url.split("/")[-1]).astype(int)

# Ids present in the additional Xeno-Canto data but absent from the competition metadata.
competition_ids = set(train_df["url"].apply(lambda url: url.split("/")[-1]).astype(int))
new_id_samples = set(add_xeno_canto_df["id"]) - competition_ids

In [None]:
# Write only the rows whose id is new (not in the competition metadata), without the helper `id` column.
add_xeno_canto_df[add_xeno_canto_df["id"].isin(new_id_samples)].drop(columns="id").reset_index(drop=True).to_csv(
    "/home/vova/data/exps/bird_clef_2022/xeno_canto/xc_scored_add/train_metadata_nodupl.csv", index=False
)

In [None]:
# Per-label count in the additional data minus the per-label count of scored birds in the
# competition data, summed. Labels present on only one side become NaN in the subtraction
# and are skipped by `.sum()`.
(add_xeno_canto_df.primary_label.value_counts() - train_df[train_df.primary_label.isin(scored_birds)].primary_label.value_counts()).sum()

In [None]:
# List the contents of the folder the additional data was written to.
os.listdir("/home/vova/data/exps/bird_clef_2022/xeno_canto/xc_scored_add/")

In [None]:
# Keep only the new recordings in memory and drop the helper `id` column.
# NOTE: this rebinds `add_xeno_canto_df` and removes `id`, so re-running the cell raises on the missing column.
add_xeno_canto_df = add_xeno_canto_df[add_xeno_canto_df["id"].isin(new_id_samples)].drop(columns="id").reset_index(drop=True)

In [None]:
# Licence distribution of the remaining (new) recordings.
add_xeno_canto_df.license.value_counts()

In [None]:
# Source: files previously placed under xc_scored_add; destination: the merged training audio folder.
pathes = [
    os.path.join("/home/vova/data/exps/bird_clef_2022/xeno_canto/xc_scored_add", relative_name)
    for relative_name in add_xeno_canto_df["filename"]
]
pathes_to_save = [
    os.path.join("/home/vova/data/exps/bird_clef_2022/train_audio_merged_adddata_v2_mauparfix", relative_name)
    for relative_name in add_xeno_canto_df["filename"]
]

In [None]:
# Copy every new recording into the merged training folder.
# `total` gives tqdm a length (a bare zip has none, so no progress bar or ETA was shown).
for old_path, new_path in tqdm(zip(pathes, pathes_to_save), total=len(pathes)):
    # Create the per-label destination folder on demand; copyfile does not create
    # missing parent directories and would fail for a folder that does not exist yet.
    os.makedirs(os.path.dirname(new_path), exist_ok=True)
    copyfile(old_path, new_path)

In [None]:
# Display the scored-bird list loaded from scored_birds.json.
scored_birds

In [None]:
# First three competition rows with primary label "akiapo".
train_df[train_df["primary_label"] == "akiapo"].head(3)

In [None]:
# Listen to one "akiapo" .ogg file from the merged folder.
ipd.Audio("/home/vova/data/exps/bird_clef_2022/train_audio_merged_adddata_v2_mauparfix/akiapo/XC122399.ogg")

In [None]:
# Reload the de-duplicated additional metadata written earlier.
# NOTE: no converters are passed, so list-valued columns come back as their string form.
add_train_df = pd.read_csv("/home/vova/data/exps/bird_clef_2022/xeno_canto/xc_scored_add/train_metadata_nodupl.csv")

In [None]:
# Per-label count of additional recordings flagged as containing secondary labels.
add_train_df.loc[add_train_df.contains_secondary, "primary_label"].value_counts()

In [None]:
# First three additional rows with primary label "akiapo".
add_train_df[add_train_df["primary_label"] == "akiapo"].head(3)

In [None]:
# Listen to one "akiapo" .mp3 file from the merged folder.
ipd.Audio("/home/vova/data/exps/bird_clef_2022/train_audio_merged_adddata_v2_mauparfix/akiapo/XC713870.mp3")

# Prepare big Xeno-Canto DF

In [3]:
# Root folder of the downloaded Xeno-Canto data; the cells below read audio/, metadata/ and
# all_xeno_canto_species.json from it.
dataset_root="/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes"

In [4]:
# Scan the download folder once and reuse the results in both messages.
species_dirs = glob(os.path.join(dataset_root, 'audio/*'))
sample_files = glob(os.path.join(dataset_root, 'audio/*/*.mp3'))
# Close the species list file after reading instead of leaking the handle.
with open(os.path.join(dataset_root, 'all_xeno_canto_species.json')) as species_file:
    expected_species = json.load(species_file)

print(f"Downloaded\nspecies = {len(species_dirs)}\nsamples = {len(sample_files)}")
# Scale to percent before rounding: `round(x, 3) * 100` can print float artefacts
# such as 56.99999999999999. The ratio is downloaded species folders / expected species.
print(f"Ready samples {round(len(species_dirs) / len(expected_species) * 100, 1)}%")

Downloaded
species = 182
samples = 48632
Ready samples 100.0%


In [5]:
# Keys of a recording entry that are carried over into the metadata frame.
NEEDED_COLUMNS = [
    "id", "type", "lat", "lng", "en", "gen", "sp",
    "rec", "time", "url", "also", "file-name", "lic",
]

# NOTE: `recursive=True` is not passed, so `**` matches a single directory level here.
all_meta_files = glob(os.path.join(dataset_root, "metadata/**/*.json"))
print(f"Jsons found = {len(all_meta_files)}")

def process_dict(
    input,
    folder_name,
    needed_cols=NEEDED_COLUMNS
):
    """Reduce one recording entry to the needed keys and tag it with its folder.

    Args:
        input: Mapping describing a single recording.
        folder_name: Value stored under the ``"foldername"`` key of the result.
        needed_cols: Keys to keep from ``input``; defaults to ``NEEDED_COLUMNS``.

    Returns:
        A new dict holding the kept keys plus ``"foldername"``; ``input`` itself
        is not modified.
    """
    # Honour the `needed_cols` argument; it used to be ignored in favour of the
    # global NEEDED_COLUMNS, so passing a custom list had no effect.
    filtered = {key: value for key, value in input.items() if key in needed_cols}
    filtered["foldername"] = folder_name
    return filtered

# Collect every recording first and build the frame once. Concatenating onto a
# growing DataFrame inside the loop re-copies all previous rows on every iteration.
xeno_canto_records = []

for meta_file_path in tqdm(all_meta_files):
    # Close each JSON file as soon as it has been parsed.
    with open(meta_file_path) as meta_fp:
        meta_file = json.load(meta_fp)['recordings']
    # The parent directory name of the JSON identifies the folder of its recordings.
    foldname = meta_file_path.split("/")[-2]
    xeno_canto_records.extend(process_dict(bird_info, foldname) for bird_info in meta_file)

add_xeno_canto_df = pd.DataFrame(xeno_canto_records)
    
# Rename the Xeno-Canto field names to the column names used by the training metadata.
add_xeno_canto_df = add_xeno_canto_df.rename(
    columns=dict(
        en="common_name",
        lat="latitude",
        lng="longitude",
        rec="author",
        lic="license",
    )
)

# Derived columns; the order matters because `filename` is built from `primary_label`.
add_xeno_canto_df["scientific_name"] = add_xeno_canto_df["gen"] + " " + add_xeno_canto_df["sp"]
# At this stage the primary label is the plain common name.
add_xeno_canto_df["primary_label"] = add_xeno_canto_df["common_name"]
add_xeno_canto_df["url"] = "https:" + add_xeno_canto_df["url"]
add_xeno_canto_df["type"] = add_xeno_canto_df["type"].apply(
    lambda raw_types: [part.strip() for part in raw_types.split(",")]
)
add_xeno_canto_df["rating"] = None

# Secondary labels start as the raw `also` entries.
add_xeno_canto_df["secondary_labels"] = add_xeno_canto_df["also"]
add_xeno_canto_df["filename"] = (
    add_xeno_canto_df["primary_label"] + "/" + add_xeno_canto_df["id"].astype(str) + ".mp3"
)

Jsons found = 231


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 231/231 [00:07<00:00, 31.79it/s]


In [6]:
# (rows, columns) of the raw Xeno-Canto metadata frame.
add_xeno_canto_df.shape

(48647, 19)

In [7]:
# add_xeno_canto_df.to_csv("dataset/raw_metadata_07_05_2022.csv", index=False)
# add_xeno_canto_df = pd.read_csv("dataset/raw_metadata_07_05_2022.csv", converters={"secondary_labels": eval})

In [8]:
# Ids of the audio files that actually exist on disk (file stem parsed as an integer).
all_loaded_samples = glob(os.path.join(dataset_root, "audio/*/*.mp3"))
all_loaded_samples_id = []
for sample_path in all_loaded_samples:
    stem, _extension = os.path.splitext(os.path.basename(sample_path))
    all_loaded_samples_id.append(int(stem))

# One row per recording id, restricted to recordings that were downloaded.
downloaded_ids = set(all_loaded_samples_id)
add_xeno_canto_df["id"] = add_xeno_canto_df["id"].astype(int)
add_xeno_canto_df = (
    add_xeno_canto_df
    .drop_duplicates("id")
    .reset_index(drop=True)
)
add_xeno_canto_df = add_xeno_canto_df[add_xeno_canto_df["id"].isin(downloaded_ids)].reset_index(drop=True)

In [9]:
# Competition metadata and metadata of previous competitions; the list-valued columns are
# parsed back from their string form.
# NOTE: `eval` executes whatever expression is stored in the CSV cell — only use on trusted files.
train_metadata = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended.csv", converters={"secondary_labels": eval, "all_labels": eval})
train_metadata_prev_comps = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_nodupls_meta_prev_comps_extended.csv", 
    converters={"secondary_labels": eval, "all_labels": eval}
)

  train_metadata_prev_comps = pd.read_csv(


In [10]:
# Stack the shared columns of the current and previous competition metadata.
shared_columns = ["primary_label", "scientific_name", "common_name", "url", "secondary_labels"]
comp_data = pd.concat(
    [train_metadata[shared_columns], train_metadata_prev_comps[shared_columns]]
).reset_index(drop=True)
# The numeric recording id is the last path component of the URL.
comp_data["id"] = comp_data["url"].apply(lambda url: int(url.rsplit("/", 1)[-1]))

In [11]:
# Distinct non-empty names found in the secondary labels, scientific names and common names.
xeno_canto_all_unique_sec_labels = {
    name for name in chain.from_iterable(add_xeno_canto_df["secondary_labels"]) if name != ""
}
xeno_canto_all_unique_sn_labels = {
    name for name in add_xeno_canto_df["scientific_name"] if name != ""
}
xeno_canto_all_unique_cn_labels = {
    name for name in add_xeno_canto_df["common_name"] if name != ""
}
xeno_canto_all_unique_sec_and_sn_labels = xeno_canto_all_unique_sec_labels.union(
    xeno_canto_all_unique_sn_labels
)

In [12]:
# Sanity check: no secondary label coincides with a common name, which supports the
# reading that Xeno-Canto secondary labels are scientific names.
assert xeno_canto_all_unique_sec_labels.isdisjoint(xeno_canto_all_unique_cn_labels)

In [13]:
# primary_label -> set of recording ids in the competition data
comp_data_pl_2_id = (
    comp_data
    .groupby("primary_label")["id"]
    .apply(set)
    .to_dict()
)
# scientific_name -> set of recording ids in the Xeno-Canto data
add_xeno_canto_df_sn_2_id = (
    add_xeno_canto_df
    .groupby("scientific_name")["id"]
    .apply(set)
    .to_dict()
)

In [14]:
def get_common_name(input):
    """Return the first value of a group, reporting groups with several distinct values.

    Args:
        input: Group of values (used with ``groupby(...).apply``); must support
            iteration and ``.iloc``.

    Returns:
        The first element of ``input``.
    """
    has_conflict = len(set(input)) > 1
    if has_conflict:
        # Lenient on purpose: the conflict is only printed, not raised.
        print(f"There should be one2one mapping between `primary_label` and `common_name`. Failed example: {set(input)}")
    return input.iloc[0]

In [15]:
# primary_label -> scientific_name (first value per label) and the inverse lookup.
pl2sc = (
    comp_data
    .groupby("primary_label")["scientific_name"]
    .apply(get_common_name)
    .to_dict()
)
sc2pl = dict(zip(pl2sc.values(), pl2sc.keys()))

There should be one2one mapping between `primary_label` and `common_name`. Failed example: {'Leuconotopicus villosus', 'Dryobates villosus'}


In [16]:
# For every competition scientific name that does not occur in Xeno-Canto, measure which
# share of that label's competition recording ids appears under each Xeno-Canto
# scientific name, sorted from the largest share down.
lost_classes = {}
missing_scientific_names = set(comp_data["scientific_name"]) - xeno_canto_all_unique_sec_and_sn_labels
for missing_name in missing_scientific_names:
    label = sc2pl[missing_name]
    label_ids = comp_data_pl_2_id[label]
    id_overlap_share = {
        xc_name: len(xc_ids & label_ids) / len(label_ids)
        for xc_name, xc_ids in add_xeno_canto_df_sn_2_id.items()
    }
    lost_classes[label] = pd.Series(id_overlap_share).sort_values(ascending=False)

In [17]:
# Best match per lost label as a dict: "index" holds the Xeno-Canto scientific name and
# key 0 holds the overlap share (reset_index names the value column 0).
top_lost_classes = {k:v.reset_index().iloc[0].to_dict() for k,v in lost_classes.items()}

In [18]:
# Drop labels whose best match shares no recording id at all (overlap share stored under key 0).
top_lost_classes = {k:v for k,v in top_lost_classes.items() if v[0] > 0}

In [19]:
# Display the remaining label -> best Xeno-Canto match pairs.
top_lost_classes

{'spodov': {'index': 'Spilopelia chinensis', 0: 1.0},
 'yebbul3': {'index': 'Acritillas indica', 0: 1.0},
 'laudov1': {'index': 'Spilopelia senegalensis', 0: 0.9911504424778761}}

In [20]:
# Xeno-Canto flavoured copy of the scientific-name lookup: for each matched lost label,
# replace the competition scientific name by the Xeno-Canto one.
sc2pl_xeno_canto = deepcopy(sc2pl)
for label, best_match in top_lost_classes.items():
    del sc2pl_xeno_canto[pl2sc[label]]
    sc2pl_xeno_canto[best_match["index"]] = label

In [21]:
# All keys unique
# NOTE: dict keys are unique by construction, so this check cannot fail.
assert len(set(sc2pl_xeno_canto.keys())) == len(sc2pl_xeno_canto)
# All values unique, i.e. no two scientific names map to the same primary label
assert len(set(sc2pl_xeno_canto.values())) == len(sc2pl_xeno_canto)
# # All keys are present in Xeno Canto
# assert not set(sc2pl_xeno_canto.keys()) - xeno_canto_all_unique_sec_and_sn_labels

In [22]:
# Translate scientific names to competition labels where a mapping exists; names
# without a mapping are kept as they are.
add_xeno_canto_df["primary_label"] = add_xeno_canto_df["scientific_name"].apply(
    lambda scientific_name: sc2pl_xeno_canto.get(scientific_name, scientific_name)
)
# Same translation for every secondary label, then drop empty strings.
add_xeno_canto_df["secondary_labels"] = add_xeno_canto_df["secondary_labels"].apply(
    lambda scientific_names: [
        label
        for label in (sc2pl_xeno_canto.get(name, name) for name in scientific_names)
        if label != ""
    ]
)

In [23]:
# all_labels = primary label followed by the secondary labels of the same row
add_xeno_canto_df['all_labels'] = add_xeno_canto_df.apply(lambda x: [x["primary_label"]] + x["secondary_labels"], axis=1)

In [24]:
# Tag every row with a constant dataset name.
add_xeno_canto_df["dataset"] = "xc_2024_classes"

In [25]:
# Columns the competition metadata has that the Xeno-Canto frame still lacks.
set(train_metadata.columns) - set(add_xeno_canto_df.columns)

{'duration_s'}

In [26]:
# Remove all spaces from the relative file names.
# Presumably the files on disk are stored without spaces — TODO confirm.
add_xeno_canto_df["filename"] = add_xeno_canto_df["filename"].apply(lambda x: x.replace(" ", ""))

In [27]:
# Path of the .hdf5 feature file belonging to each row (same relative name, other extension).
train_features_root = "/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_features"
feature_paths = [
    os.path.join(train_features_root, relative_name.replace(".mp3", ".hdf5"))
    for relative_name in add_xeno_canto_df["filename"]
]
# Read (length, sample rate) of every feature file in parallel; unreadable files yield None.
xeno_canto_lengts_and_srs = ProgressParallel(n_jobs=32, total=len(add_xeno_canto_df))(
    delayed(read_length_and_sr)(path) for path in feature_paths
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48632/48632 [00:04<00:00, 10760.68it/s]


In [28]:
# Split the per-file read results into three parallel columns; None marks a failed read.
valid_flags, sample_rates, audio_lengths = [], [], []
for el in xeno_canto_lengts_and_srs:
    valid_flags.append(el is not None)
    sample_rates.append(None if el is None else el[1])
    audio_lengths.append(None if el is None else el[0])

add_xeno_canto_df["is_valid"] = valid_flags
add_xeno_canto_df["sample_rate"] = sample_rates
add_xeno_canto_df["au_len"] = audio_lengths
# Keep only rows whose feature file could be read, then drop the helper flag.
add_xeno_canto_df = add_xeno_canto_df[add_xeno_canto_df["is_valid"]]
add_xeno_canto_df = add_xeno_canto_df.drop(columns=["is_valid"])
# Duration = stored length / sample rate.
add_xeno_canto_df["duration_s"] = add_xeno_canto_df["au_len"] / add_xeno_canto_df["sample_rate"]
In [29]:
# Re-check: columns the competition metadata has that the Xeno-Canto frame still lacks.
set(train_metadata.columns) - set(add_xeno_canto_df.columns)

set()

In [30]:
# Save the full extended metadata; rows shared with the competition data are still included here.
add_xeno_canto_df.to_csv("/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_extended.csv", index=False)

In [31]:
# Remove recordings whose id already occurs in the competition data.
add_xeno_canto_df = add_xeno_canto_df[~add_xeno_canto_df["id"].isin(comp_data["id"])].reset_index(drop=True)

In [32]:
# (rows, columns) after removing ids shared with the competition data.
add_xeno_canto_df.shape

(21750, 24)

In [33]:
# Save the de-duplicated extended metadata.
add_xeno_canto_df.to_csv("/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_nodupl_extended.csv", index=False)

In [24]:
# Reload the "V3" de-duplicated metadata files; the list-valued columns are parsed back from
# their string form.
# NOTE(review): the `noduplV3` / `noduplv1` files are not written anywhere in the visible
# notebook — presumably produced elsewhere; confirm their provenance.
# NOTE: `eval` executes whatever expression is stored in the CSV cell — only use on trusted files.
add_xeno_canto_df = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_noduplV3_extended.csv", 
    converters={"secondary_labels": eval, "all_labels": eval}
)
train_metadata = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended_noduplv1.csv", 
    converters={"secondary_labels": eval, "all_labels": eval}
)
# NOTE: this name (`..._prev_comp`) differs from the earlier `train_metadata_prev_comps`.
train_metadata_prev_comp = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV3_scored_meta_prev_comps_extended.csv", 
    converters={"secondary_labels": eval, "all_labels": eval}
)

In [25]:
# Every label that occurs in any `all_labels` list of the competition metadata.
all_classes = set(chain(*train_metadata["all_labels"]))
# Number of distinct labels.
len(all_classes)

188

In [26]:
# Keeps, in order, only the labels that belong to the competition label set.
keep_known_classes = lambda labels: [bird for bird in labels if bird in all_classes]

add_xeno_canto_df["all_labels"] = add_xeno_canto_df["all_labels"].apply(keep_known_classes)
add_xeno_canto_df["secondary_labels"] = add_xeno_canto_df["secondary_labels"].apply(keep_known_classes)

# Previous-competition rows are kept only when their primary label is a known class.
primary_is_known = train_metadata_prev_comp["primary_label"].isin(all_classes)
train_metadata_prev_comp = train_metadata_prev_comp[primary_is_known].reset_index(drop=True)
train_metadata_prev_comp["all_labels"] = train_metadata_prev_comp["all_labels"].apply(keep_known_classes)
train_metadata_prev_comp["secondary_labels"] = train_metadata_prev_comp["secondary_labels"].apply(keep_known_classes)

In [27]:
# Save both frames after their label lists were restricted to the competition classes.
add_xeno_canto_df.to_csv(
    "/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_noduplV3_extended_2024SecLabels.csv",
    index=False
)
train_metadata_prev_comp.to_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV3_scored_meta_prev_comps_extended_2024SecLabels.csv",
    index=False
)