In [1]:
import pandas as pd
import numpy as np
import librosa
import seaborn as sns
import os
import json
import IPython.display as ipd
import soundfile as sf
import math
import torch
import h5py
import re
from scipy.io import wavfile

from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt
from itertools import chain
from os.path import join as pjoin
from shutil import copyfile
from copy import deepcopy
from itertools import chain
from sklearn.model_selection import train_test_split
from joblib import delayed

from code_base.utils import write_json, load_json
from code_base.utils.main_utils import ProgressParallel

from code_base.utils import parallel_librosa_load
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def check_url_structure(input_df, with_xc=True):
    if with_xc:
        return (
            input_df["url"].apply(lambda x: x.split("/")[-1]) == 
            input_df["filename"].apply(lambda x: os.path.splitext(x.split("/")[-1])[0][2:])
        ).all()
    else:
        return (
            input_df["url"].apply(lambda x: x.split("/")[-1]) == 
            input_df["filename"].apply(lambda x: os.path.splitext(x.split("/")[-1])[0])
        ).all()

In [3]:
def read_length_and_sr(file_path: str):
    with h5py.File(file_path, "r") as data_file:
        au_length = data_file["au"].shape[0]
        sr = int(np.array(data_file["sr"]))
    return au_length, sr

# 2024 Data

In [4]:
eBird_Taxonomy_v2021 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/eBird_Taxonomy_v2021.csv")
sample_submission = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/sample_submission.csv")
train_metadata = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2024/train_metadata_extended.csv", converters={"secondary_labels": eval, "all_labels": eval})

In [5]:
scored_birds = set(sample_submission.columns[1:].tolist())

In [6]:
check_url_structure(train_metadata)

True

In [7]:
train_metadata.url.value_counts()

url
https://www.xeno-canto.org/134896    1
https://www.xeno-canto.org/191729    1
https://www.xeno-canto.org/265870    1
https://www.xeno-canto.org/265869    1
https://www.xeno-canto.org/265868    1
                                    ..
https://www.xeno-canto.org/359359    1
https://www.xeno-canto.org/358708    1
https://www.xeno-canto.org/356320    1
https://www.xeno-canto.org/351356    1
https://xeno-canto.org/858550        1
Name: count, Length: 24458, dtype: int64

# 2023 Data

In [8]:
# eBird_Taxonomy_v2021 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2023/eBird_Taxonomy_v2021.csv")
sample_submission_2023 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2023/sample_submission.csv")
train_metadata_2023 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2023/train_metadata.csv", converters={"secondary_labels": eval})

In [9]:
train_metadata_2023["dataset"] = "comp_2023"

In [10]:
train_metadata_2023.url.value_counts()

url
https://www.xeno-canto.org/128013    1
https://www.xeno-canto.org/636130    1
https://www.xeno-canto.org/610310    1
https://www.xeno-canto.org/610311    1
https://www.xeno-canto.org/611448    1
                                    ..
https://www.xeno-canto.org/397735    1
https://www.xeno-canto.org/126187    1
https://www.xeno-canto.org/126188    1
https://www.xeno-canto.org/206031    1
https://xeno-canto.org/753190        1
Name: count, Length: 16941, dtype: int64

In [11]:
check_url_structure(train_metadata_2023)

True

In [12]:
train_metadata_2023["all_labels"] = train_metadata_2023.apply(lambda row: [row["primary_label"]] + row["secondary_labels"], axis=1)
scored_filenames_2023 = train_metadata_2023.loc[
    train_metadata_2023["all_labels"].apply(lambda x: len(set(x) & scored_birds) > 0), 
    "filename"
].tolist()
print(f"New scored recordings: {len(set(scored_filenames_2023) - set(train_metadata.filename))}")
print(f"New recording: {len(set(train_metadata_2023.filename) - set(train_metadata.filename))}")

New scored recordings: 1334
New recording: 13929


# 2022 Data

In [13]:
glob("/home/vova/data/exps/BirdCLEF_2023/birdclef_2022/*.csv")

[]

In [14]:
# eBird_Taxonomy_v2021_2022 = pd.read_csv("/home/vova/data/exps/BirdCLEF_2023/birdclef_2022/eBird_Taxonomy_v2021.csv")
sample_submission_2022 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2022/sample_submission.csv")
train_metadata_2022 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2022/train_metadata_fixed.csv", converters={"secondary_labels": eval})
test_2022 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2022/test.csv")

In [15]:
train_metadata_2022["dataset"] = "comp_2022"

In [16]:
train_metadata_2022.url.value_counts()

url
https://www.xeno-canto.org/125458    1
https://www.xeno-canto.org/654864    1
https://www.xeno-canto.org/629467    1
https://www.xeno-canto.org/630434    1
https://www.xeno-canto.org/636553    1
                                    ..
https://www.xeno-canto.org/522535    1
https://www.xeno-canto.org/522536    1
https://www.xeno-canto.org/522537    1
https://www.xeno-canto.org/522538    1
https://www.xeno-canto.org/666195    1
Name: count, Length: 14850, dtype: int64

In [17]:
# train_metadata_2022[
#     train_metadata_2022.url.isin([
#         "https://www.xeno-canto.org/294370",
#         "https://www.xeno-canto.org/501149"
#     ])
# ]

In [18]:
# train_metadata_2022.loc[5748, "secondary_labels"] = ["mallar3"]
# train_metadata_2022 = train_metadata_2022.drop(index=[8441])

# train_metadata_2022.loc[1518, "secondary_labels"] = ["gadwal"]
# train_metadata_2022 = train_metadata_2022.drop(index=[5208])

# train_metadata_2022 = train_metadata_2022.reset_index(drop=True)
# train_metadata_2022.to_csv(
#     "/home/vova/data/exps/birdclef_2024/birdclef_2022/train_metadata_fixed.csv",
#     index=False
# )

In [19]:
# train_metadata_2022 = train_metadata_2022.drop_duplicates("url", keep=False).reset_index(drop=True)

In [20]:
train_metadata_2022.url.value_counts()

url
https://www.xeno-canto.org/125458    1
https://www.xeno-canto.org/654864    1
https://www.xeno-canto.org/629467    1
https://www.xeno-canto.org/630434    1
https://www.xeno-canto.org/636553    1
                                    ..
https://www.xeno-canto.org/522535    1
https://www.xeno-canto.org/522536    1
https://www.xeno-canto.org/522537    1
https://www.xeno-canto.org/522538    1
https://www.xeno-canto.org/666195    1
Name: count, Length: 14850, dtype: int64

In [21]:
check_url_structure(train_metadata_2022)

True

In [22]:
train_metadata_2022["all_labels"] = train_metadata_2022.apply(lambda row: [row["primary_label"]] + row["secondary_labels"], axis=1)
scored_filenames_2022 = train_metadata_2022.loc[
    train_metadata_2022["all_labels"].apply(lambda x: len(set(x) & scored_birds) > 0), 
    "filename"
].tolist()
print(f"New scored recordings: {len(set(scored_filenames_2022) - (set(train_metadata.url) | set(train_metadata_2023.url)))}")
print(f"New recording: {len(set(train_metadata_2022.filename) - (set(train_metadata.url) | set(train_metadata_2023.url)))}")

New scored recordings: 2707
New recording: 14850


In [23]:
print(f"New recording: {len(set(train_metadata_2022.url) - set(train_metadata.url))}")

New recording: 13104


# 2021 Data

In [24]:
glob("/home/vova/data/exps/birdclef_2024/birdclef_2021/*")

['/home/vova/data/exps/birdclef_2024/birdclef_2021/train_metadata.csv',
 '/home/vova/data/exps/birdclef_2024/birdclef_2021/train_features',
 '/home/vova/data/exps/birdclef_2024/birdclef_2021/train_short_audio',
 '/home/vova/data/exps/birdclef_2024/birdclef_2021/train_soundscape_labels.csv',
 '/home/vova/data/exps/birdclef_2024/birdclef_2021/train_soundscapes',
 '/home/vova/data/exps/birdclef_2024/birdclef_2021/test_soundscapes',
 '/home/vova/data/exps/birdclef_2024/birdclef_2021/sample_submission.csv',
 '/home/vova/data/exps/birdclef_2024/birdclef_2021/test.csv']

In [25]:
sample_submission_2021 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2021/sample_submission.csv")
train_metadata_2021 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2021/train_metadata.csv", converters={"secondary_labels": eval})
test_2021 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2021/test.csv")
train_soundscape_labels_2021 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdclef_2021/train_soundscape_labels.csv")

train_metadata_2021.secondary_labels = train_metadata_2021.secondary_labels.apply(lambda x: ["rocpig" if el == "rocpig1" else el for el in x])

In [26]:
# Create filename
train_metadata_2021["filename"] = train_metadata_2021.apply(lambda x: pjoin(x["primary_label"], x["filename"]), axis=1)

In [27]:
train_metadata_2021["dataset"] = "comp_2021"

In [28]:
train_metadata_2021.url.value_counts()

url
https://www.xeno-canto.org/109605    1
https://www.xeno-canto.org/195784    1
https://www.xeno-canto.org/288979    1
https://www.xeno-canto.org/309486    1
https://www.xeno-canto.org/323999    1
                                    ..
https://www.xeno-canto.org/382235    1
https://www.xeno-canto.org/388583    1
https://www.xeno-canto.org/388709    1
https://www.xeno-canto.org/389236    1
https://www.xeno-canto.org/615888    1
Name: count, Length: 62874, dtype: int64

In [29]:
train_metadata_2021["all_labels"] = train_metadata_2021.apply(lambda row: [row["primary_label"]] + row["secondary_labels"], axis=1)
scored_filenames_2021 = train_metadata_2021.loc[
    train_metadata_2021["all_labels"].apply(lambda x: len(set(x) & scored_birds) > 0), 
    "filename"
].tolist()
print(f"New scored recordings: {len(set(scored_filenames_2021) - (set(train_metadata.filename) | set(train_metadata_2022.filename) | set(train_metadata_2023.url)))}")
print(f"New recording: {len(set(train_metadata_2021.filename) - (set(train_metadata.filename) | set(train_metadata_2022.filename) | set(train_metadata_2023.url)))}")

New scored recordings: 1206
New recording: 56240


In [30]:
print(f"New recording: {len(set(train_metadata_2021.url) - (set(train_metadata.url) | set(train_metadata_2022.url) | set(train_metadata_2023.url)))}")

New recording: 56106


# 2020 Data

In [31]:
glob("/home/vova/data/exps/birdclef_2024/birdsong_recognition/*")

['/home/vova/data/exps/birdclef_2024/birdsong_recognition/example_test_audio_summary.csv',
 '/home/vova/data/exps/birdclef_2024/birdsong_recognition/train_features',
 '/home/vova/data/exps/birdclef_2024/birdsong_recognition/example_test_audio_metadata.csv',
 '/home/vova/data/exps/birdclef_2024/birdsong_recognition/train.csv',
 '/home/vova/data/exps/birdclef_2024/birdsong_recognition/train_audio',
 '/home/vova/data/exps/birdclef_2024/birdsong_recognition/example_test_audio',
 '/home/vova/data/exps/birdclef_2024/birdsong_recognition/sample_submission.csv',
 '/home/vova/data/exps/birdclef_2024/birdsong_recognition/test.csv']

In [32]:
glob("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/*")

['/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/train_features',
 '/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/train_extended.csv',
 '/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/A-M']

In [33]:
glob("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/*")

['/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/train_features',
 '/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/train_extended.csv',
 '/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/N-Z']

In [34]:
example_test_audio_metadata_2020 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdsong_recognition/example_test_audio_metadata.csv")
example_test_audio_summary_2020 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdsong_recognition/example_test_audio_summary.csv")
sample_submission_2020 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdsong_recognition/sample_submission.csv")
test_2020 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdsong_recognition/test.csv")
train_2020 = pd.read_csv("/home/vova/data/exps/birdclef_2024/birdsong_recognition/train.csv", converters={"secondary_labels": eval})

train_xc_a_m_2020 = pd.read_csv("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/train_extended.csv", converters={"secondary_labels": eval})
# train_xc_n_z_2020 = pd.read_csv("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/train_extended.csv", converters={"secondary_labels": eval})

# train_2020["dataset"] = "comp_2020"
# train_xc_a_m_2020["dataset"] = "a_m_2020"
# train_xc_n_z_2020["dataset"] = "n_z_2020"
# train_2020 = pd.concat([
#     train_2020, train_xc_a_m_2020, train_xc_n_z_2020
# ]).reset_index(drop=True)
# train_2020 = train_2020.drop_duplicates("url").reset_index(drop=True)

In [35]:
(
    len(glob("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/A-M/*/*.mp3")) + len(glob("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/N-Z/*/*.mp3"))
) == train_xc_a_m_2020.shape[0]

True

In [36]:
train_2020["url"].value_counts()

url
https://www.xeno-canto.org/134874    1
https://www.xeno-canto.org/420847    1
https://www.xeno-canto.org/420730    1
https://www.xeno-canto.org/419729    1
https://www.xeno-canto.org/419727    1
                                    ..
https://www.xeno-canto.org/141826    1
https://www.xeno-canto.org/134879    1
https://www.xeno-canto.org/118417    1
https://www.xeno-canto.org/115431    1
https://www.xeno-canto.org/55761     1
Name: count, Length: 21375, dtype: int64

In [37]:
check_url_structure(train_2020)

True

In [38]:
train_xc_a_m_2020["url"].value_counts()

url
https://www.xeno-canto.org/554809    1
https://www.xeno-canto.org/468372    1
https://www.xeno-canto.org/388424    1
https://www.xeno-canto.org/395413    1
https://www.xeno-canto.org/406253    1
                                    ..
https://www.xeno-canto.org/528212    1
https://www.xeno-canto.org/528213    1
https://www.xeno-canto.org/528214    1
https://www.xeno-canto.org/531294    1
https://www.xeno-canto.org/565140    1
Name: count, Length: 23784, dtype: int64

In [39]:
check_url_structure(train_xc_a_m_2020)

True

In [40]:
train_xc_a_m_2020["dataset"] = None
train_xc_a_m_2020.loc[
    train_xc_a_m_2020["ebird_code"].isin(os.listdir("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_a_m/A-M/")),
    "dataset"
] = "a_m_2020"
train_xc_a_m_2020.loc[
    train_xc_a_m_2020["ebird_code"].isin(os.listdir("/home/vova/data/exps/birdclef_2024/xeno_canto_bird_recordings_extended_n_z/N-Z/")),
    "dataset"
] = "n_z_2020"
train_xc_a_m_2020["dataset"].isna().sum()

0

In [41]:
train_2020["dataset"] = "comp_2020"
train_2020 = pd.concat([
    train_2020, train_xc_a_m_2020
])
train_2020 = train_2020.drop_duplicates("url").reset_index(drop=True)

In [42]:
train_2020

Unnamed: 0,rating,playback_used,ebird_code,channels,date,pitch,duration,filename,speed,species,...,url,country,author,primary_label,longitude,length,time,recordist,license,dataset
0,3.5,no,aldfly,1 (mono),2013-05-25,Not specified,25,XC134874.mp3,Not specified,Alder Flycatcher,...,https://www.xeno-canto.org/134874,United States,Jonathon Jongsma,Empidonax alnorum_Alder Flycatcher,-92.962,Not specified,8:00,Jonathon Jongsma,Creative Commons Attribution-ShareAlike 3.0,comp_2020
1,4.0,no,aldfly,2 (stereo),2013-05-27,both,36,XC135454.mp3,both,Alder Flycatcher,...,https://www.xeno-canto.org/135454,United States,Mike Nelson,Empidonax alnorum_Alder Flycatcher,-82.1106,0-3(s),08:30,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...,comp_2020
2,4.0,no,aldfly,2 (stereo),2013-05-27,both,39,XC135455.mp3,both,Alder Flycatcher,...,https://www.xeno-canto.org/135455,United States,Mike Nelson,Empidonax alnorum_Alder Flycatcher,-82.1106,0-3(s),08:30,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...,comp_2020
3,3.5,no,aldfly,2 (stereo),2013-05-27,both,33,XC135456.mp3,both,Alder Flycatcher,...,https://www.xeno-canto.org/135456,United States,Mike Nelson,Empidonax alnorum_Alder Flycatcher,-82.1106,0-3(s),08:30,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...,comp_2020
4,4.0,no,aldfly,2 (stereo),2013-05-27,both,36,XC135457.mp3,level,Alder Flycatcher,...,https://www.xeno-canto.org/135457,United States,Mike Nelson,Empidonax alnorum_Alder Flycatcher,-82.1106,0-3(s),08:30,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...,comp_2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45154,4.5,no,yetvir,2 (stereo),2020-05-16,,63,XC558915.mp3,,Yellow-throated Vireo,...,https://www.xeno-canto.org/558915,United States,William Whitehead,Vireo flavifrons_Yellow-throated Vireo,-74.2495,,07:00,William Whitehead,Creative Commons Attribution-NonCommercial-Sha...,n_z_2020
45155,4.0,no,yetvir,1 (mono),2020-07-27,,41,XC579823.mp3,,Yellow-throated Vireo,...,https://www.xeno-canto.org/579823,United States,Bobby Wilcox,Vireo flavifrons_Yellow-throated Vireo,-91.0756,,12:00,Bobby Wilcox,Creative Commons Attribution-NonCommercial-Sha...,n_z_2020
45156,4.0,no,yetvir,2 (stereo),2020-07-10,,25,XC574737.mp3,,Yellow-throated Vireo,...,https://www.xeno-canto.org/574737,Canada,Jon Ruddy,Vireo flavifrons_Yellow-throated Vireo,-76.3558,,10:30,Jon Ruddy,Creative Commons Attribution-NonCommercial-Sha...,n_z_2020
45157,3.5,no,yetvir,1 (mono),2020-07-03,,33,XC573213.mp3,,Yellow-throated Vireo,...,https://www.xeno-canto.org/573213,United States,Annette McClellan,Vireo flavifrons_Yellow-throated Vireo,-87.8015,,06:00,Annette McClellan,Creative Commons Attribution-NonCommercial-Sha...,n_z_2020


In [43]:
# Rename to other years convention
train_2020 = train_2020.rename(columns={"sci_name": "scientific_name", "species": "common_name"})
# Transform secondary_labels
ebird2name = train_2020.drop_duplicates("ebird_code")[["ebird_code", "primary_label"]].set_index("ebird_code")["primary_label"].to_dict()
name2ebird = {v:k for k,v in ebird2name.items()}
train_2020['secondary_labels'] = train_2020['secondary_labels'].apply(lambda x: [name2ebird[el] for el in x if el in name2ebird])
# ebird_code to primary_label
train_2020["primary_label"] = train_2020["ebird_code"]
# Create all_labels
train_2020['all_labels'] = train_2020.apply(lambda x: [x["primary_label"]] + x["secondary_labels"], axis=1)
# Create filename
train_2020["filename"] = train_2020.apply(lambda x: pjoin(x["primary_label"], x["filename"]).replace(".mp3", ".ogg"), axis=1)

In [44]:
scored_filenames_2020 = train_2020.loc[
    train_2020["all_labels"].apply(lambda x: len(set(x) & scored_birds) > 0), 
    "filename"
].tolist()
print(f"New scored recordings: {len(set(scored_filenames_2020) - (set(train_metadata.filename) | set(train_metadata_2022.filename) | set(train_metadata_2021.filename) | set(train_metadata_2023.filename)))}")
print(f"New recording: {len(set(train_2020.filename) - (set(train_metadata.filename) | set(train_metadata_2022.filename) | set(train_metadata_2021.filename) | set(train_metadata_2023.filename)))}")

New scored recordings: 735
New recording: 12077


In [45]:
print(f"New recording: {len(set(train_2020.url) - (set(train_metadata.url) | set(train_metadata_2022.url) | set(train_metadata_2021.url) | set(train_metadata_2023.filename)))}")

New recording: 12123


# Compose Pretrain DataFrame

In [46]:
# train_metadata_2022["old_filename"] = train_metadata_2022["filename"]
# train_metadata_2022["filename"] = train_metadata_2022["filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2022/train_audio/", x))

In [47]:
# train_metadata_2021["old_filename"] = train_metadata_2021["filename"]
# train_metadata_2021["filename"] = train_metadata_2021["filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2021/train_short_audio/", x))

In [48]:
# train_2020["old_filename"] = train_2020["filename"]

# train_2020.loc[train_2020["2020_source"] == "comp" ,"filename"] = (
#     train_2020.loc[train_2020["2020_source"] == "comp" ,"filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2020/train_audio/", x).replace(".ogg", ".mp3"))
# )
# train_2020.loc[train_2020["2020_source"] == "a_m" ,"filename"] = (
#     train_2020.loc[train_2020["2020_source"] == "a_m" ,"filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2020_xc_a_m/A-M/", x).replace(".ogg", ".mp3"))
# )
# train_2020.loc[train_2020["2020_source"] == "n_z" ,"filename"] = (
#     train_2020.loc[train_2020["2020_source"] == "n_z" ,"filename"].apply(lambda x: os.path.join("/home/vova/data/exps/BirdCLEF_2023/birdclef_2020_xc_n_z/N-Z/", x).replace(".ogg", ".mp3"))
# )

In [50]:
columns2take = list(
    set(train_metadata_2023.columns) & 
    set(train_metadata_2022.columns) & 
    set(train_metadata_2021.columns) & 
    set(train_2020.columns)
)
columns2take

['primary_label',
 'longitude',
 'url',
 'all_labels',
 'type',
 'scientific_name',
 'secondary_labels',
 'latitude',
 'rating',
 'filename',
 'dataset',
 'common_name',
 'license',
 'author']

In [51]:
# full_add_df_with_duplicates = pd.concat([train_metadata_2023, train_metadata_2022, train_metadata_2021, train_2020]).reset_index(drop=True)
# full_add_df_with_duplicates.to_csv("/home/vova/data/exps/birdclef_2024/dfs/full_meta_prev_comps.csv", index=False)
# full_add_df_with_duplicates.shape

In [52]:
full_add_df_no_duplicates = pd.concat([
    train_metadata_2023[columns2take],
    train_metadata_2022.loc[~train_metadata_2022["url"].isin(train_metadata_2023["url"]), columns2take],
    train_metadata_2021.loc[~train_metadata_2021["url"].isin(set(train_metadata_2023["url"]) | set(train_metadata_2022["url"])), columns2take],
    train_2020.loc[~train_2020["url"].isin(set(train_metadata_2023["url"]) | set(train_metadata_2022["url"]) | set(train_metadata_2021["url"])), columns2take]
]).reset_index(drop=True)

In [53]:
full_add_df_no_duplicates["url"].value_counts()

url
https://www.xeno-canto.org/128013    1
https://www.xeno-canto.org/531446    1
https://www.xeno-canto.org/249088    1
https://www.xeno-canto.org/249087    1
https://www.xeno-canto.org/249086    1
                                    ..
https://www.xeno-canto.org/240029    1
https://www.xeno-canto.org/237876    1
https://www.xeno-canto.org/237871    1
https://www.xeno-canto.org/237849    1
https://www.xeno-canto.org/177143    1
Name: count, Length: 100158, dtype: int64

In [54]:
set(train_metadata.columns) - set(full_add_df_no_duplicates.columns)

{'duration_s'}

In [55]:
full_add_df_no_duplicates["dataset"].value_counts()

dataset
comp_2021    56575
comp_2023    16941
comp_2022    14367
comp_2020     5507
a_m_2020      4080
n_z_2020      2688
Name: count, dtype: int64

In [56]:
dataset_mapping = {
    "comp_2021":"birdclef_2021/train_features/",
    "comp_2023":"birdclef_2023/train_features/",
    "comp_2022":"birdclef_2022/train_features/",
    "comp_2020":"birdsong_recognition/train_features/",
    "a_m_2020": "xeno_canto_bird_recordings_extended_a_m/train_features/",
    "n_z_2020": "xeno_canto_bird_recordings_extended_n_z/train_features/",
}

In [57]:
train_audio_lengts_and_srs =  ProgressParallel(n_jobs=32, total=len(full_add_df_no_duplicates))(
    delayed(read_length_and_sr)(path) for path in full_add_df_no_duplicates.apply(
        lambda row: os.path.join(
            "/home/vova/data/exps/birdclef_2024/", dataset_mapping[row["dataset"]], row["filename"].replace(".ogg", ".hdf5")
        ), 
        axis=1
    )
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100158/100158 [00:08<00:00, 11723.82it/s]


In [58]:
full_add_df_no_duplicates["sample_rate"] = [el[1] for el in train_audio_lengts_and_srs]
full_add_df_no_duplicates["au_len"] = [el[0] for el in train_audio_lengts_and_srs]
full_add_df_no_duplicates["duration_s"] = full_add_df_no_duplicates["au_len"] / full_add_df_no_duplicates["sample_rate"]

In [59]:
set(train_metadata.columns) - set(full_add_df_no_duplicates.columns)

set()

In [60]:
set(full_add_df_no_duplicates.columns) - set(train_metadata.columns)

{'au_len', 'dataset', 'sample_rate'}

In [61]:
full_add_df_no_duplicates.to_csv("/home/vova/data/exps/birdclef_2024/dfs/full_nodupls_meta_prev_comps_extended.csv", index=False)

In [62]:
full_add_df_no_duplicates_v2 = full_add_df_no_duplicates[~full_add_df_no_duplicates["url"].isin(train_metadata["url"])].reset_index(drop=True)

In [63]:
full_add_df_no_duplicates_v2.to_csv("/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV2_meta_prev_comps_extended.csv", index=False)

In [64]:
train_metadata_unique_labels = set(chain(*train_metadata["all_labels"].to_list()))
len(train_metadata_unique_labels)

188

In [65]:
full_add_df_no_duplicates_v2_scored = full_add_df_no_duplicates_v2[
    full_add_df_no_duplicates_v2["all_labels"].apply(lambda x: len(set(x) & train_metadata_unique_labels) > 0)
].reset_index(drop=True)

In [66]:
full_add_df_no_duplicates_v2_scored.to_csv("/home/vova/data/exps/birdclef_2024/dfs/full_noduplsV2_scored_meta_prev_comps_extended.csv", index=False)

In [67]:
train_metadata_with_prev_comp = pd.concat([train_metadata, full_add_df_no_duplicates_v2_scored]).reset_index(drop=True)

In [68]:
train_metadata_with_prev_comp.url.value_counts()

url
https://www.xeno-canto.org/134896    1
https://www.xeno-canto.org/540753    1
https://www.xeno-canto.org/592535    1
https://www.xeno-canto.org/592502    1
https://www.xeno-canto.org/578469    1
                                    ..
https://xeno-canto.org/747424        1
https://xeno-canto.org/747422        1
https://xeno-canto.org/747408        1
https://xeno-canto.org/746321        1
https://www.xeno-canto.org/315587    1
Name: count, Length: 28408, dtype: int64

In [69]:
train_metadata_with_prev_comp.to_csv("/home/vova/data/exps/birdclef_2024/dfs/train_2024_with_prev_extended.csv", index=False)