In [None]:
import pandas as pd
import numpy as np
import librosa
import seaborn as sns
import os
import json
import IPython.display as ipd
import soundfile as sf
import math
import torch
import h5py
import re
from scipy.io import wavfile

from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt
from itertools import chain
from os.path import join as pjoin
from shutil import copyfile
from copy import deepcopy
from itertools import chain
from sklearn.model_selection import train_test_split
from joblib import delayed

from code_base.utils import write_json, load_json
from code_base.utils.main_utils import ProgressParallel

from code_base.utils import parallel_librosa_load
%matplotlib inline

In [None]:
def check_url_structure(input_df, with_xc=True):
    """Tell whether each row's URL tail equals its filename stem.

    Args:
        input_df: frame with string columns "url" and "filename".
        with_xc: when True, the first two characters of the filename stem
            are skipped before comparing; when False the whole stem is used.

    Returns:
        The result of `.all()` over the row-wise equality.
    """
    # Number of leading stem characters to ignore before comparing.
    stem_start = 2 if with_xc else 0

    def _url_tail(url):
        # Last "/"-separated segment of the URL.
        return url.split("/")[-1]

    def _file_stem(path):
        # Last path segment, extension removed, optional prefix skipped.
        return os.path.splitext(path.split("/")[-1])[0][stem_start:]

    url_ids = input_df["url"].apply(_url_tail)
    file_ids = input_df["filename"].apply(_file_stem)
    return (url_ids == file_ids).all()

In [None]:
def read_length_and_sr(file_path: str):
    """Read the audio length and sample rate stored in an HDF5 file.

    Args:
        file_path: path of a file holding an "au" dataset and an "sr" dataset.

    Returns:
        Tuple `(length of the first axis of "au", "sr" converted to int)`.
    """
    with h5py.File(file_path, "r") as h5_file:
        # Only the shape is inspected here.
        n_samples = h5_file["au"].shape[0]
        sample_rate = int(np.array(h5_file["sr"]))
    return n_samples, sample_rate

# 2024 Data

In [None]:
# 2024 competition tables: taxonomy, submission template, extended train metadata.
eBird_Taxonomy_v2021 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2024/eBird_Taxonomy_v2021.csv"
)
sample_submission = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2024/sample_submission.csv"
)
# NOTE(review): `eval` executes whatever text these columns contain; keep it
# only for CSVs from a trusted source.
train_metadata = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended.csv",
    converters={"secondary_labels": eval, "all_labels": eval},
)

In [None]:
# Every submission column except the first one is treated as a scored label.
scored_birds = set(sample_submission.columns[1:])

In [None]:
# Sanity check: URL tail vs. filename stem minus its first two characters
# (with_xc keeps its default of True).
check_url_structure(train_metadata)

In [None]:
# Rows per URL; a count above 1 means several rows share one URL.
train_metadata["url"].value_counts()

# 2023 Data

In [None]:
# eBird_Taxonomy_v2021 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2023/eBird_Taxonomy_v2021.csv")
# 2023 competition tables.
sample_submission_2023 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2023/sample_submission.csv"
)
# NOTE(review): `eval` executes the column text; only safe for trusted CSVs.
train_metadata_2023 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2023/train_metadata.csv",
    converters={"secondary_labels": eval},
)

In [None]:
# Source tag for every 2023 row; "comp_2023" is one of the dataset_mapping keys
# used later to locate the feature files.
train_metadata_2023["dataset"] = "comp_2023"

In [None]:
# Rows per URL; a count above 1 means several rows share one URL.
train_metadata_2023["url"].value_counts()

In [None]:
# Sanity check: URL tail vs. filename stem minus its first two characters.
check_url_structure(train_metadata_2023)

In [None]:
# Full label list per recording: primary label first, then the secondary ones.
train_metadata_2023["all_labels"] = train_metadata_2023.apply(
    lambda rec: [rec["primary_label"]] + rec["secondary_labels"],
    axis=1,
)
# Filenames of recordings that carry at least one label from `scored_birds`.
scored_filenames_2023 = train_metadata_2023.loc[
    train_metadata_2023["all_labels"].apply(lambda labels: not scored_birds.isdisjoint(labels)),
    "filename",
].tolist()
# Both counts compare filenames against the 2024 metadata filenames.
print(f"New scored recordings: {len(set(scored_filenames_2023).difference(train_metadata.filename))}")
print(f"New recording: {len(set(train_metadata_2023.filename).difference(train_metadata.filename))}")

# 2022 Data

In [None]:
# List the CSV files of the 2022 competition folder.
# NOTE(review): this looks under .../BirdCLEF_2023/birdclef_2022/, while the
# cell that loads the 2022 CSVs reads .../birdclef_2024/birdclef_2022/ —
# confirm which directory is intended.
glob("/home/vova/data/exps/BirdCLEF_2023/birdclef_2022/*.csv")

In [None]:
# eBird_Taxonomy_v2021_2022 = pd.read_csv("/home/vova/data/exps/BirdCLEF_2023/birdclef_2022/eBird_Taxonomy_v2021.csv")
# 2022 competition tables; the train metadata is the "_fixed" CSV.
sample_submission_2022 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2022/sample_submission.csv"
)
# NOTE(review): `eval` executes the column text; only safe for trusted CSVs.
train_metadata_2022 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2022/train_metadata_fixed.csv",
    converters={"secondary_labels": eval},
)
test_2022 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2022/test.csv"
)

In [None]:
# Source tag for every 2022 row; "comp_2022" is one of the dataset_mapping keys
# used later to locate the feature files.
train_metadata_2022["dataset"] = "comp_2022"

In [None]:
# Rows per URL; a count above 1 means several rows share one URL.
train_metadata_2022["url"].value_counts()

In [None]:
# train_metadata_2022[
#     train_metadata_2022.url.isin([
#         "https://www.xeno-canto.org/294370",
#         "https://www.xeno-canto.org/501149"
#     ])
# ]

In [None]:
# train_metadata_2022.loc[5748, "secondary_labels"] = ["mallar3"]
# train_metadata_2022 = train_metadata_2022.drop(index=[8441])

# train_metadata_2022.loc[1518, "secondary_labels"] = ["gadwal"]
# train_metadata_2022 = train_metadata_2022.drop(index=[5208])

# train_metadata_2022 = train_metadata_2022.reset_index(drop=True)
# train_metadata_2022.to_csv(
#     "/home/vova/data/exps/birdclef_2024/birdclef_2022/train_metadata_fixed.csv",
#     index=False
# )

In [None]:
# train_metadata_2022 = train_metadata_2022.drop_duplicates("url", keep=False).reset_index(drop=True)

In [None]:
# Same per-URL count as earlier in this section, repeated after the
# (now commented-out) fix-up cells.
train_metadata_2022["url"].value_counts()

In [None]:
# Sanity check: URL tail vs. filename stem minus its first two characters.
check_url_structure(train_metadata_2022)

In [None]:
# Full label list per recording: primary label first, then the secondary ones.
train_metadata_2022["all_labels"] = train_metadata_2022.apply(lambda row: [row["primary_label"]] + row["secondary_labels"], axis=1)
# Filenames of 2022 recordings that carry at least one label from `scored_birds`.
scored_filenames_2022 = train_metadata_2022.loc[
    train_metadata_2022["all_labels"].apply(lambda x: len(set(x) & scored_birds) > 0),
    "filename"
].tolist()
# Filenames have to be compared with filenames. The previous version subtracted
# sets of URLs, which can never match a filename, so both counts were simply
# the total number of (scored) 2022 files.
print(f"New scored recordings: {len(set(scored_filenames_2022) - (set(train_metadata.filename) | set(train_metadata_2023.filename)))}")
print(f"New recording: {len(set(train_metadata_2022.filename) - (set(train_metadata.filename) | set(train_metadata_2023.filename)))}")

In [None]:
# URLs present in the 2022 metadata but not in the 2024 metadata.
print(f"New recording: {len(set(train_metadata_2022.url).difference(train_metadata.url))}")

# 2021 Data

In [None]:
# List everything directly inside the 2021 competition folder.
glob("/home/vova/data/exps/birdclef_2024/birdclef_2021/*")

In [None]:
# 2021 competition tables.
sample_submission_2021 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2021/sample_submission.csv"
)
# NOTE(review): `eval` executes the column text; only safe for trusted CSVs.
train_metadata_2021 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2021/train_metadata.csv",
    converters={"secondary_labels": eval},
)
test_2021 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2021/test.csv"
)
train_soundscape_labels_2021 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdclef_2021/train_soundscape_labels.csv"
)

# Rewrite the secondary label "rocpig1" as "rocpig"; other labels stay as they are.
train_metadata_2021.secondary_labels = train_metadata_2021.secondary_labels.apply(
    lambda labels: [("rocpig" if label == "rocpig1" else label) for label in labels]
)

In [None]:
# Create filename as "<primary_label>/<file name>".
# os.path.basename makes the cell idempotent: re-running it no longer stacks
# the species directory a second time. For bare file names the result is the
# same as before.
# NOTE(review): assumes the CSV "filename" column holds bare file names — confirm.
train_metadata_2021["filename"] = train_metadata_2021.apply(
    lambda x: pjoin(x["primary_label"], os.path.basename(x["filename"])),
    axis=1,
)

In [None]:
# Source tag for every 2021 row; "comp_2021" is one of the dataset_mapping keys
# used later to locate the feature files.
train_metadata_2021["dataset"] = "comp_2021"

In [None]:
# Rows per URL; a count above 1 means several rows share one URL.
train_metadata_2021["url"].value_counts()

In [None]:
# Full label list per recording: primary label first, then the secondary ones.
train_metadata_2021["all_labels"] = train_metadata_2021.apply(lambda row: [row["primary_label"]] + row["secondary_labels"], axis=1)
# Filenames of 2021 recordings that carry at least one label from `scored_birds`.
scored_filenames_2021 = train_metadata_2021.loc[
    train_metadata_2021["all_labels"].apply(lambda x: len(set(x) & scored_birds) > 0),
    "filename"
].tolist()
# All operands are filename sets. The previous version mixed in the 2023 URL
# set, so files already present in the 2023 data were never subtracted.
print(f"New scored recordings: {len(set(scored_filenames_2021) - (set(train_metadata.filename) | set(train_metadata_2022.filename) | set(train_metadata_2023.filename)))}")
print(f"New recording: {len(set(train_metadata_2021.filename) - (set(train_metadata.filename) | set(train_metadata_2022.filename) | set(train_metadata_2023.filename)))}")

In [None]:
# URLs present in the 2021 metadata but in none of the 2024/2022/2023 metadata.
print(f"New recording: {len(set(train_metadata_2021.url).difference(train_metadata.url, train_metadata_2022.url, train_metadata_2023.url))}")

# 2020 Data

In [None]:
# List everything directly inside the birdsong_recognition folder.
glob("/home/vova/data/exps/birdclef_2024/birdsong_recognition/*")

In [None]:
# List everything directly inside the extended A-M folder.
glob("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/*")

In [None]:
# List everything directly inside the extended N-Z folder.
glob("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/*")

In [None]:
# Tables from the birdsong_recognition folder.
example_test_audio_metadata_2020 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdsong_recognition/example_test_audio_metadata.csv"
)
example_test_audio_summary_2020 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdsong_recognition/example_test_audio_summary.csv"
)
sample_submission_2020 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdsong_recognition/sample_submission.csv"
)
test_2020 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdsong_recognition/test.csv"
)
# NOTE(review): `eval` executes the column text; only safe for trusted CSVs.
train_2020 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/birdsong_recognition/train.csv",
    converters={"secondary_labels": eval},
)

# Only the A-M copy of train_extended.csv is read; the N-Z read is disabled.
# NOTE(review): the mp3-count check in a later cell suggests this CSV covers
# both the A-M and N-Z folders — confirm.
train_xc_a_m_2020 = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/train_extended.csv",
    converters={"secondary_labels": eval},
)
# train_xc_n_z_2020 = pd.read_csv("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/train_extended.csv", converters={"secondary_labels": eval})

# train_2020["dataset"] = "comp_2020"
# train_xc_a_m_2020["dataset"] = "a_m_2020"
# train_xc_n_z_2020["dataset"] = "n_z_2020"
# train_2020 = pd.concat([
#     train_2020, train_xc_a_m_2020, train_xc_n_z_2020
# ]).reset_index(drop=True)
# train_2020 = train_2020.drop_duplicates("url").reset_index(drop=True)

In [None]:
# True when the extended metadata has exactly one row per mp3 found under the
# A-M and N-Z folders combined.
train_xc_a_m_2020.shape[0] == sum(
    len(glob(pattern))
    for pattern in (
        "/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/A-M/*/*.mp3",
        "/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/N-Z/*/*.mp3",
    )
)

In [None]:
# Rows per URL; a count above 1 means several rows share one URL.
train_2020["url"].value_counts()

In [None]:
# Sanity check: URL tail vs. filename stem minus its first two characters.
check_url_structure(train_2020)

In [None]:
# Rows per URL; a count above 1 means several rows share one URL.
train_xc_a_m_2020["url"].value_counts()

In [None]:
# Sanity check: URL tail vs. filename stem minus its first two characters.
check_url_structure(train_xc_a_m_2020)

In [None]:
# Tag each extended row by the folder that contains a directory named after its
# ebird_code. N-Z is applied second, so it wins if a code is listed in both.
train_xc_a_m_2020["dataset"] = None
for species_root, dataset_tag in (
    ("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/A-M/", "a_m_2020"),
    ("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/N-Z/", "n_z_2020"),
):
    train_xc_a_m_2020.loc[
        train_xc_a_m_2020["ebird_code"].isin(os.listdir(species_root)),
        "dataset",
    ] = dataset_tag
# Number of rows left without a tag.
train_xc_a_m_2020["dataset"].isna().sum()

In [None]:
# Tag the competition rows, append the extended rows, then keep the first row
# per URL. Competition rows come first in the concat, so they win on ties.
train_2020["dataset"] = "comp_2020"
train_2020 = (
    pd.concat([train_2020, train_xc_a_m_2020])
    .drop_duplicates("url")
    .reset_index(drop=True)
)

In [None]:
# Display the merged 2020 frame.
train_2020

In [None]:
# Rename to other years convention
train_2020 = train_2020.rename(columns={"sci_name": "scientific_name", "species": "common_name"})
# Transform secondary_labels: map each entry through the
# primary_label -> ebird_code table built from the first row per ebird_code.
ebird2name = train_2020.drop_duplicates("ebird_code")[["ebird_code", "primary_label"]].set_index("ebird_code")["primary_label"].to_dict()
name2ebird = {v:k for k,v in ebird2name.items()}
# Secondary labels with no entry in name2ebird are dropped.
train_2020['secondary_labels'] = train_2020['secondary_labels'].apply(lambda x: [name2ebird[el] for el in x if el in name2ebird])
# ebird_code to primary_label
train_2020["primary_label"] = train_2020["ebird_code"]
# Create all_labels
train_2020['all_labels'] = train_2020.apply(lambda x: [x["primary_label"]] + x["secondary_labels"], axis=1)
# Create filename as "<primary_label>/<file name>" with ".mp3" rewritten to ".ogg".
# os.path.basename keeps the line idempotent: re-running the cell no longer
# stacks the species directory a second time. For bare file names the result
# is the same as before.
# NOTE(review): assumes the CSV "filename" column holds bare file names — confirm.
train_2020["filename"] = train_2020.apply(
    lambda x: pjoin(x["primary_label"], os.path.basename(x["filename"])).replace(".mp3", ".ogg"),
    axis=1,
)

In [None]:
# Filenames of 2020 recordings that carry at least one label from `scored_birds`.
scored_filenames_2020 = train_2020.loc[
    train_2020["all_labels"].apply(lambda labels: not scored_birds.isdisjoint(labels)),
    "filename",
].tolist()
# Filenames not seen in the 2024, 2022, 2021 or 2023 metadata.
print(f"New scored recordings: {len(set(scored_filenames_2020).difference(train_metadata.filename, train_metadata_2022.filename, train_metadata_2021.filename, train_metadata_2023.filename))}")
print(f"New recording: {len(set(train_2020.filename).difference(train_metadata.filename, train_metadata_2022.filename, train_metadata_2021.filename, train_metadata_2023.filename))}")

In [None]:
# URLs present in the 2020 data but in none of the 2024/2022/2021/2023 metadata.
# The previous version subtracted the 2023 *filenames* from this URL set, so
# URLs already covered by the 2023 data were still counted as new.
print(f"New recording: {len(set(train_2020.url) - (set(train_metadata.url) | set(train_metadata_2022.url) | set(train_metadata_2021.url) | set(train_metadata_2023.url)))}")

# Compose Pretrain DataFrame

In [None]:
# train_metadata_2022["old_filename"] = train_metadata_2022["filename"]
# train_metadata_2022["filename"] = train_metadata_2022["filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2022/train_audio/", x))

In [None]:
# train_metadata_2021["old_filename"] = train_metadata_2021["filename"]
# train_metadata_2021["filename"] = train_metadata_2021["filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2021/train_short_audio/", x))

In [None]:
# train_2020["old_filename"] = train_2020["filename"]

# train_2020.loc[train_2020["2020_source"] == "comp" ,"filename"] = (
#     train_2020.loc[train_2020["2020_source"] == "comp" ,"filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2020/train_audio/", x).replace(".ogg", ".mp3"))
# )
# train_2020.loc[train_2020["2020_source"] == "a_m" ,"filename"] = (
#     train_2020.loc[train_2020["2020_source"] == "a_m" ,"filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2020_xc_a_m/A-M/", x).replace(".ogg", ".mp3"))
# )
# train_2020.loc[train_2020["2020_source"] == "n_z" ,"filename"] = (
#     train_2020.loc[train_2020["2020_source"] == "n_z" ,"filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2020_xc_n_z/N-Z/", x).replace(".ogg", ".mp3"))
# )

In [None]:
# Columns shared by all four previous-year frames, in the order they appear in
# the 2023 metadata. Building the list from a set intersection gave a
# hash-dependent order, so the column layout of the saved CSVs could change
# from one kernel session to the next.
columns2take = [
    column
    for column in train_metadata_2023.columns
    if column in train_metadata_2022.columns
    and column in train_metadata_2021.columns
    and column in train_2020.columns
]
columns2take

In [None]:
# full_add_df_with_duplicates = pd.concat([train_metadata_2023, train_metadata_2022, train_metadata_2021, train_2020]).reset_index(drop=True)
# full_add_df_with_duplicates.to_csv("/home/vova/data/exps/birdclef_2024/dfs/full_meta_prev_comps.csv", index=False)
# full_add_df_with_duplicates.shape

In [None]:
# Stack the previous-year frames, newest priority first (2023, 2022, 2021, 2020).
# Each frame contributes only rows whose URL is absent from the frames before it.
# Repeated URLs inside a single frame are not removed here.
full_add_df_no_duplicates = pd.concat(
    [
        train_metadata_2023[columns2take],
        train_metadata_2022.loc[
            ~train_metadata_2022["url"].isin(train_metadata_2023["url"]),
            columns2take,
        ],
        train_metadata_2021.loc[
            ~train_metadata_2021["url"].isin(
                set(train_metadata_2023["url"]).union(train_metadata_2022["url"])
            ),
            columns2take,
        ],
        train_2020.loc[
            ~train_2020["url"].isin(
                set(train_metadata_2023["url"]).union(
                    train_metadata_2022["url"], train_metadata_2021["url"]
                )
            ),
            columns2take,
        ],
    ],
    ignore_index=True,
)

In [None]:
# Rows per URL in the composed frame; a count above 1 means a URL is repeated.
full_add_df_no_duplicates["url"].value_counts()

In [None]:
# Columns of the 2024 metadata that the composed frame does not have.
set(train_metadata.columns) - set(full_add_df_no_duplicates.columns)

In [None]:
# Rows per source tag.
full_add_df_no_duplicates["dataset"].value_counts()

In [None]:
# Source tag -> feature directory, relative to the data root that the
# path-building cells prepend.
dataset_mapping = dict(
    comp_2021="birdclef_2021/train_features/",
    comp_2023="birdclef_2023/train_features/",
    comp_2022="birdclef_2022/train_features/",
    comp_2020="birdsong_recognition/train_features/",
    a_m_2020="xeno_canto_bird_recordings_extended_a_m/train_features/",
    n_z_2020="xeno_canto_bird_recordings_extended_n_z/train_features/",
)

In [None]:
# Read (length, sample rate) for every row in parallel. Each row's path is
# root + dataset_mapping[dataset] + filename with ".ogg" swapped for ".hdf5".
train_audio_lengts_and_srs = ProgressParallel(
    n_jobs=32,
    total=len(full_add_df_no_duplicates),
)(
    delayed(read_length_and_sr)(h5_path)
    for h5_path in full_add_df_no_duplicates.apply(
        lambda rec: os.path.join(
            "/home/vova/data/exps/birdclef_2024/",
            dataset_mapping[rec["dataset"]],
            rec["filename"].replace(".ogg", ".hdf5"),
        ),
        axis=1,
    )
)

In [None]:
# Each result is a (length, sample rate) pair, as returned by read_length_and_sr.
full_add_df_no_duplicates["sample_rate"] = [rate for _, rate in train_audio_lengts_and_srs]
full_add_df_no_duplicates["au_len"] = [length for length, _ in train_audio_lengts_and_srs]
# Duration in seconds = number of samples / samples per second.
full_add_df_no_duplicates["duration_s"] = full_add_df_no_duplicates["au_len"].div(
    full_add_df_no_duplicates["sample_rate"]
)

In [None]:
# Columns of the 2024 metadata that the composed frame still does not have.
set(train_metadata.columns) - set(full_add_df_no_duplicates.columns)

In [None]:
# Columns of the composed frame that the 2024 metadata does not have.
set(full_add_df_no_duplicates.columns) - set(train_metadata.columns)

In [None]:
# Save the composed previous-years frame without the index column.
full_add_df_no_duplicates.to_csv("/home/vova/data/exps/birdclef_2024/dfs/full_nodupls_meta_prev_comps_extended.csv", index=False)

In [None]:
# V2: additionally drop every row whose URL already appears in the 2024 metadata.
full_add_df_no_duplicates_v2 = full_add_df_no_duplicates.loc[
    ~full_add_df_no_duplicates["url"].isin(train_metadata["url"])
].reset_index(drop=True)

In [None]:
# Save the V2 frame without the index column; the "Prune files" section reads this file back.
full_add_df_no_duplicates_v2.to_csv("/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV2_meta_prev_comps_extended.csv", index=False)

In [None]:
# Every distinct label found in the 2024 "all_labels" lists.
train_metadata_unique_labels = set(chain.from_iterable(train_metadata["all_labels"]))
len(train_metadata_unique_labels)

In [None]:
# Keep the V2 rows that share at least one label with the 2024 metadata.
# Note the filter uses train_metadata_unique_labels, not scored_birds.
full_add_df_no_duplicates_v2_scored = full_add_df_no_duplicates_v2.loc[
    full_add_df_no_duplicates_v2["all_labels"].apply(
        lambda labels: not train_metadata_unique_labels.isdisjoint(labels)
    )
].reset_index(drop=True)

In [None]:
# Save the label-filtered V2 frame without the index column.
full_add_df_no_duplicates_v2_scored.to_csv("/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV2_scored_meta_prev_comps_extended.csv", index=False)

In [None]:
# 2024 metadata followed by the label-filtered previous-year rows, on a fresh 0..n-1 index.
train_metadata_with_prev_comp = pd.concat(
    [train_metadata, full_add_df_no_duplicates_v2_scored],
    ignore_index=True,
)

In [None]:
# Rows per URL in the combined frame; a count above 1 means a URL is repeated.
train_metadata_with_prev_comp["url"].value_counts()

In [None]:
# Save the combined 2024 + previous-years frame without the index column.
train_metadata_with_prev_comp.to_csv("/home/vova/data/exps/birdclef_2024/dfs/train_2024_with_prev_extended.csv", index=False)

# Prune files

In [None]:
# Reload the V2 CSV written earlier (rows whose URL is not in the 2024 metadata).
# NOTE(review): this rebinds `full_add_df_no_duplicates` to the V2 subset, so
# the name no longer refers to the frame composed above. No converters are
# passed, so list columns such as "all_labels" come back as plain strings.
full_add_df_no_duplicates = pd.read_csv(
    "/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV2_meta_prev_comps_extended.csv"
)

In [None]:
# Source tag -> feature directory, relative to the data root. Repeats the
# mapping from the "Compose Pretrain DataFrame" section so this section can
# run without it.
dataset_mapping = dict(
    comp_2021="birdclef_2021/train_features/",
    comp_2023="birdclef_2023/train_features/",
    comp_2022="birdclef_2022/train_features/",
    comp_2020="birdsong_recognition/train_features/",
    a_m_2020="xeno_canto_bird_recordings_extended_a_m/train_features/",
    n_z_2020="xeno_canto_bird_recordings_extended_n_z/train_features/",
)

In [None]:
# Absolute feature path per row: root + dataset_mapping[dataset] + filename
# with ".ogg" swapped for ".hdf5".
full_add_df_no_duplicates["h5py_filename"] = full_add_df_no_duplicates.apply(
    lambda rec: os.path.join(
        "/home/vova/data/exps/birdclef_2024/",
        dataset_mapping[rec["dataset"]],
        rec["filename"].replace(".ogg", ".hdf5"),
    ),
    axis=1,
)

In [None]:
# (rows, columns) of the reloaded frame.
full_add_df_no_duplicates.shape

In [None]:
# Every metadata row must point to an existing feature file. This is asserted
# rather than only displayed, because the cells below delete files and a
# Run All must stop here if the paths are wrong.
h5py_files_exist = full_add_df_no_duplicates["h5py_filename"].apply(os.path.exists)
assert h5py_files_exist.all(), (
    f"{(~h5py_files_exist).sum()} metadata rows point to missing .hdf5 files; "
    "do not run the pruning cells until the paths are fixed."
)
h5py_files_exist.all()

In [None]:
# Paths to keep; any .hdf5 found on disk that is not in this set is selected for deletion below.
picked_files = set(full_add_df_no_duplicates["h5py_filename"])

In [None]:
# Show one generated path for a visual check.
full_add_df_no_duplicates["h5py_filename"].iloc[0]

In [None]:
# Every .hdf5 file under the six dataset folders, searched recursively.
# The "**" patterns cover each whole dataset folder, not only its
# train_features/ subfolder.
all_h5py_files = [
    h5_path
    for pattern in (
        "/home/vova/data/exps/birdclef_2024/birdclef_2023/**/*.hdf5",
        "/home/vova/data/exps/birdclef_2024/birdclef_2022/**/*.hdf5",
        "/home/vova/data/exps/birdclef_2024/birdclef_2021/**/*.hdf5",
        "/home/vova/data/exps/birdclef_2024/birdsong_recognition/**/*.hdf5",
        "/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/**/*.hdf5",
        "/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/**/*.hdf5",
    )
    for h5_path in glob(pattern, recursive=True)
]
len(all_h5py_files)

In [None]:
# Files found on disk that no metadata row refers to.
to_del = [h5_path for h5_path in all_h5py_files if h5_path not in picked_files]
len(to_del)

In [None]:
# Destructive step: permanently removes every file listed in `to_del`.
# Safety guard: every kept path must be among the files found on disk. If the
# generated paths do not match the globbed ones (wrong root, wrong extension,
# missing files), `to_del` would cover the whole feature store, so stop here.
unmatched_picked_files = picked_files.difference(all_h5py_files)
assert not unmatched_picked_files, (
    f"{len(unmatched_picked_files)} kept paths were not found on disk; "
    "refusing to delete anything."
)
for el in tqdm(to_del):
    os.remove(el)