In [3]:
import ast
import json
import math
import os
from copy import deepcopy
from glob import glob
from itertools import chain
from os.path import join as pjoin
from shutil import copyfile

import h5py
import IPython.display as ipd
import librosa
import numpy as np
import pandas as pd
import seaborn as sns
import soundfile as sf
from matplotlib import pyplot as plt
from tqdm import tqdm

from code_base.utils import write_json, load_json

  from .autonotebook import tqdm as notebook_tqdm


# 2025 + Prev Comps

In [4]:
# Load the combined training metadata.
# `secondary_labels` is parsed from its string form into a Python object.
# ast.literal_eval is used instead of eval: it accepts only Python literals,
# so a crafted metadata cell cannot execute code while the CSV is being read.
train_metadata = pd.read_csv(
    "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/train_and_prev_comps_extendedv1.csv",
    converters={"secondary_labels": ast.literal_eval},
)

In [5]:
# Collect every label that appears as a primary or as a secondary label,
# then drop empty labels (entries whose len() is 0).
primary_birds = set(train_metadata["primary_label"])
secondary_birds = set(chain.from_iterable(train_metadata["secondary_labels"]))
all_birds = [label for label in primary_birds | secondary_birds if len(label)]

In [6]:
# Bare expression: renders the full label list as the cell output.
all_birds

['476538',
 'linwoo1',
 'rubsee1',
 'tbsfin1',
 'rumfly1',
 'rtlhum',
 'ragmac1',
 '21038',
 'brtpar1',
 '1192948',
 'cocwoo1',
 '65373',
 'rutpuf1',
 '963335',
 'saffin',
 'wbwwre1',
 '66893',
 'gycwor1',
 'greani1',
 'butsal1',
 'yebela1',
 'recwoo1',
 'grnkin',
 'olipic1',
 'smbani',
 'socfly1',
 'bugtan',
 'cotfly1',
 'whwswa1',
 '65336',
 '555142',
 'littin1',
 '24272',
 'creoro1',
 'bobfly1',
 'eardov1',
 'royfly1',
 'secfly1',
 '1139490',
 'orcpar',
 'roahaw',
 'orcwar',
 '66531',
 'rewbla',
 '22976',
 '22333',
 'rutjac1',
 'grasal4',
 'cocher1',
 'whbant1',
 '41970',
 'yebcha',
 '1462711',
 'cargra1',
 'sahpar1',
 'spepar1',
 '66016',
 'cregua1',
 'grekis',
 'blhpar1',
 'strfly1',
 'thbeup1',
 'yehcar1',
 'colara1',
 'rufmot1',
 'plctan1',
 'trsowl',
 '528041',
 'rugdov',
 'tropar',
 'srwswa1',
 '52884',
 '42113',
 'bkcdon',
 'anhing',
 'bicwre1',
 'crcwoo1',
 '47067',
 'purgal2',
 'ruther1',
 'thlsch3',
 'grbhaw1',
 'bucmot3',
 'piwtyr1',
 '24322',
 '126247',
 '715170',
 'crbt

In [7]:
# Map every label to an integer class index.
# Indices are assigned over the *sorted* labels. `all_birds` is derived from a
# set, and the iteration order of a set of strings changes between interpreter
# runs (hash randomization), so enumerating it directly would produce a
# different label -> index mapping each time the notebook is re-run.
# assumes all labels are strings (mutually comparable) — TODO confirm
bird2int = {
    bird: idx for idx, bird in enumerate(sorted(all_birds))
}

In [8]:
# Number of entries in the label -> index mapping (shown as the cell output).
len(bird2int)

220

In [9]:
# Persist the label -> index mapping via the project's write_json helper.
bird2int_path = "/gpfs/space/projects/BetterMedicine/volodymyr1/exps/bird_clef_2025/birdclef_2025/bird2int_2025_and_prev_comps.json"
write_json(bird2int_path, bird2int)

# 2024 Scored Data DF

In [None]:
# Load the three metadata tables used in this section.
# The label columns are parsed from their string form with ast.literal_eval
# rather than eval: it accepts only Python literals, so a crafted metadata
# cell cannot execute code while the CSVs are being read.
label_converters = {
    "secondary_labels": ast.literal_eval,
    "all_labels": ast.literal_eval,
}
train_metadata = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended.csv",
    converters=label_converters,
)
train_metadata_prev_comp = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV2_scored_meta_prev_comps_extended.csv",
    converters=label_converters,
)
train_metadata_xeno_canto = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_nodupl_extended.csv",
    converters=label_converters,
)

In [None]:
# Union of the labels found in the `all_labels` column of each metadata table.
birds_2024 = set(chain.from_iterable(train_metadata["all_labels"]))
birds_prev_comp = set(chain.from_iterable(train_metadata_prev_comp["all_labels"]))
birds_xeno_canto = set(chain.from_iterable(train_metadata_xeno_canto["all_labels"]))
all_birds = list(birds_2024 | birds_prev_comp | birds_xeno_canto)

In [None]:
# Number of distinct labels collected above (shown as the cell output).
len(all_birds)

In [None]:
# Map every label to an integer class index.
# Indices are assigned over the *sorted* labels. `all_birds` is derived from a
# set, and the iteration order of a set of strings changes between interpreter
# runs (hash randomization), so enumerating it directly would produce a
# different label -> index mapping each time the notebook is re-run.
# assumes all labels are strings (mutually comparable) — TODO confirm
bird2int = {
    bird: idx for idx, bird in enumerate(sorted(all_birds))
}

In [None]:
# Persist the label -> index mapping via the project's write_json helper.
# NOTE(review): "scoed" in the filename looks like a typo for "scored"; it is
# kept as-is because other code may load this exact path — confirm before renaming.
bird2int_path = "/home/vova/data/exps/birdclef_2024/class_mappings/bird2int_2024_scoed_add_data.json"
write_json(bird2int_path, bird2int)

# 2024 + 2024 Scored from XC + Prev Comps

In [None]:
# Load the three metadata tables used in this section.
# The label columns are parsed from their string form with ast.literal_eval
# rather than eval: it accepts only Python literals, so a crafted metadata
# cell cannot execute code while the CSVs are being read.
label_converters = {
    "secondary_labels": ast.literal_eval,
    "all_labels": ast.literal_eval,
}
train_metadata = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended_noduplv1.csv",
    converters=label_converters,
)
train_metadata_prev_comp = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV2_meta_prev_comps_extended_pruned.csv",
    converters=label_converters,
)
train_metadata_xeno_canto = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_noduplV3_extended.csv",
    converters=label_converters,
)

In [None]:
# Union of the labels in the main and previous-competition tables.
# The Xeno-Canto table is deliberately not part of this union.
birds_2024 = set(chain.from_iterable(train_metadata["all_labels"]))
birds_prev_comp = set(chain.from_iterable(train_metadata_prev_comp["all_labels"]))
all_birds = list(birds_2024 | birds_prev_comp)
# Last expression: the label count is shown as the cell output.
len(all_birds)

In [None]:
# Restrict the Xeno-Canto label lists to labels that are present in `all_birds`;
# the order of the surviving labels within each row is preserved.
# Membership is tested against a set built once, instead of scanning the
# `all_birds` list for every single label of every row.
known_birds = set(all_birds)
for label_col in ("all_labels", "secondary_labels"):
    train_metadata_xeno_canto[label_col] = train_metadata_xeno_canto[label_col].apply(
        lambda labels: [bird for bird in labels if bird in known_birds]
    )

In [None]:
# Write the label-filtered Xeno-Canto metadata to a new CSV (row index omitted).
xeno_canto_filtered_path = "/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_noduplV3_extended_2024PrevCompSecLabels.csv"
train_metadata_xeno_canto.to_csv(xeno_canto_filtered_path, index=False)

In [None]:
# Map every label to an integer class index.
# Indices are assigned over the *sorted* labels. `all_birds` is derived from a
# set, and the iteration order of a set of strings changes between interpreter
# runs (hash randomization), so enumerating it directly would produce a
# different label -> index mapping each time the notebook is re-run.
# assumes all labels are strings (mutually comparable) — TODO confirm
bird2int = {
    bird: idx for idx, bird in enumerate(sorted(all_birds))
}

In [None]:
# Persist the label -> index mapping via the project's write_json helper.
bird2int_path = "/home/vova/data/exps/birdclef_2024/class_mappings/bird2int_2024_PrevComp.json"
write_json(bird2int_path, bird2int)

# Check

In [None]:
# Load the four metadata tables whose labels are compared against the saved
# class mapping below.
# `all_labels` is parsed from its string form with ast.literal_eval rather than
# eval: it accepts only Python literals, so a crafted metadata cell cannot
# execute code while the CSVs are being read.
label_converters = {"all_labels": ast.literal_eval}
train_metadata = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended_noduplv2.csv",
    converters=label_converters,
)
add_data_prev_comp = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV3_scored_meta_prev_comps_extended.csv",
    converters=label_converters,
)
add_data_xc = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_2024_classes/train_metadata_noduplV4_extended_2024SecLabels.csv",
    converters=label_converters,
)
add_data_prev_comp_all = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV3_meta_prev_comps_extended.csv",
    converters=label_converters,
)

In [None]:
# (rows, columns) of the full previous-competition table, shown as the cell output.
add_data_prev_comp_all.shape

In [None]:
# Union of the labels from all four tables. Kept as a set (not a list) because
# it is compared for set equality with the keys of the saved mapping.
birds_2024 = set(chain.from_iterable(train_metadata["all_labels"]))
birds_prev_comp_scored = set(chain.from_iterable(add_data_prev_comp["all_labels"]))
birds_xc = set(chain.from_iterable(add_data_xc["all_labels"]))
birds_prev_comp_all = set(chain.from_iterable(add_data_prev_comp_all["all_labels"]))
all_birds = birds_2024 | birds_prev_comp_scored | birds_xc | birds_prev_comp_all

In [None]:
# Read back the previously saved label -> index mapping.
saved_mapping_path = "/home/vova/data/exps/birdclef_2024/class_mappings/bird2int_2024_PrevComp.json"
json_all_birds = load_json(saved_mapping_path)

In [None]:
# True when the saved mapping covers exactly the labels found in the tables.
json_bird_names = set(json_all_birds.keys())
all_birds == json_bird_names

In [None]:
# Display the `primary_label` column of the full previous-competition table.
add_data_prev_comp_all["primary_label"]

# 2024 + 2024 Scored from XC + Prev Comps + XC (India + VC > 50)

In [None]:
# Load the three metadata tables used in this section.
# `all_labels` is parsed from its string form with ast.literal_eval rather than
# eval: it accepts only Python literals, so a crafted metadata cell cannot
# execute code while the CSVs are being read.
label_converters = {"all_labels": ast.literal_eval}
merged_train_metadata = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2024/merged_train_metadata_extended_noduplv2.csv",
    converters=label_converters,
)
prev_comp_data = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV3_meta_prev_comps_extended.csv",
    converters=label_converters,
)
xc_add_data = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/xeno_canto/dataset_NO2024_classes/train_metadata_noduplV5_extended_india_vc50.csv",
    converters=label_converters,
)

In [None]:
# Union of the labels found in the `all_labels` column of each table (a set).
birds_merged = set(chain.from_iterable(merged_train_metadata["all_labels"]))
birds_prev_comp = set(chain.from_iterable(prev_comp_data["all_labels"]))
birds_xc_add = set(chain.from_iterable(xc_add_data["all_labels"]))
all_birds = birds_merged | birds_prev_comp | birds_xc_add

# Last expression: the label count is shown as the cell output.
len(all_birds)

In [None]:
# Map every label to an integer class index.
# Indices are assigned over the *sorted* labels. `all_birds` is a set here, and
# the iteration order of a set of strings changes between interpreter runs
# (hash randomization), so enumerating it directly would produce a different
# label -> index mapping each time the notebook is re-run.
# assumes all labels are strings (mutually comparable) — TODO confirm
bird2int = {
    bird: idx for idx, bird in enumerate(sorted(all_birds))
}

In [None]:
# Persist the label -> index mapping via the project's write_json helper.
bird2int_path = "/home/vova/data/exps/birdclef_2024/class_mappings/bird2int_2024_PrevComp_XCIndiaVC50.json"
write_json(bird2int_path, bird2int)