In [1]:
import pandas as pd
import os
import joblib
import librosa
import numpy as np

In [None]:
# BirdCLEF-2021 metadata; each clip's path is built as
# <train_short_audio>/<primary_label>/<filename>.
df_2021 = pd.read_csv("../inputs/pretrain/birdclef-2021/train_metadata.csv").assign(
    path=lambda meta: (
        "../inputs/pretrain/birdclef-2021/train_short_audio/"
        + meta["primary_label"]
        + "/"
        + meta["filename"]
    )
)

In [None]:
# BirdCLEF-2022 metadata; "filename" is appended directly to the train_audio root.
df_2022 = pd.read_csv("../inputs/pretrain/birdclef-2022/train_metadata.csv").assign(
    path=lambda meta: "../inputs/pretrain/birdclef-2022/train_audio/" + meta["filename"]
)

In [None]:
# BirdCLEF-2023 metadata; "filename" is appended directly to the train_audio root.
df_2023 = pd.read_csv("../inputs/pretrain/birdclef-2023/train_metadata.csv").assign(
    path=lambda meta: "../inputs/pretrain/birdclef-2023/train_audio/" + meta["filename"]
)

In [2]:
# NOTE: absolute, machine-specific location of the BirdCLEF-2025 raw data;
# adjust it when running on another machine.
data_dir = '/data1/Mamba/Kaggle/BirdCLEF-2025/Data_Raw'

# BirdCLEF-2025 metadata with the full audio path of every clip.
df_2025 = pd.read_csv(f"{data_dir}/train.csv").assign(
    path=lambda meta: f"{data_dir}/train_audio/" + meta["filename"]
)

In [3]:
def get_audio_meta(path):
    """Decode one audio file and summarise it.

    Parameters
    ----------
    path : str or path-like
        Audio file handed to ``librosa.load`` (decoded as mono).

    Returns
    -------
    tuple
        ``(n_samples, sr, peak)``: the number of samples in the decoded
        signal, the sampling rate returned by ``librosa.load`` and the
        largest absolute sample value (``0.0`` for an empty signal).

    Notes
    -----
    NOTE(review): ``librosa.load`` is called without ``sr``, so librosa's
    default target rate applies (22050 Hz per the librosa docs) and ``sr``
    reflects that rather than the file's native rate -- confirm this is
    intended.
    """
    x, sr = librosa.load(path, mono=True)
    # Take the absolute value BEFORE the max: abs(x.max()) would miss a peak
    # that sits on the negative side of the waveform.
    # initial=0.0 keeps an empty signal from raising and cannot change the
    # result for non-empty input, since absolute values are never negative.
    peak = np.abs(x).max(initial=0.0)
    return len(x), sr, peak

In [None]:
# Run get_audio_meta over every 2021 clip with 10 parallel jobs.
meta_2021 = joblib.Parallel(n_jobs=10, verbose=5)(
    joblib.delayed(get_audio_meta)(audio_path)
    for audio_path in df_2021["path"]
)

In [None]:
# Run get_audio_meta over every 2022 clip with 10 parallel jobs.
meta_2022 = joblib.Parallel(n_jobs=10, verbose=5)(
    joblib.delayed(get_audio_meta)(audio_path)
    for audio_path in df_2022["path"]
)

In [None]:
# Run get_audio_meta over every 2023 clip with 10 parallel jobs.
meta_2023 = joblib.Parallel(n_jobs=10, verbose=5)(
    joblib.delayed(get_audio_meta)(audio_path)
    for audio_path in df_2023["path"]
)

In [4]:
# Run get_audio_meta over every 2025 clip with 10 parallel jobs.
meta_2025 = joblib.Parallel(n_jobs=10, verbose=5)(
    joblib.delayed(get_audio_meta)(audio_path)
    for audio_path in df_2025["path"]
)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    2.8s
[Parallel(n_jobs=10)]: Done 376 tasks      | elapsed:    4.7s
[Parallel(n_jobs=10)]: Done 942 tasks      | elapsed:    9.7s
[Parallel(n_jobs=10)]: Done 1590 tasks      | elapsed:   13.1s
[Parallel(n_jobs=10)]: Done 2382 tasks      | elapsed:   17.6s
[Parallel(n_jobs=10)]: Done 3318 tasks      | elapsed:   23.7s
[Parallel(n_jobs=10)]: Done 4398 tasks      | elapsed:   30.8s
[Parallel(n_jobs=10)]: Done 5622 tasks      | elapsed:   38.5s
[Parallel(n_jobs=10)]: Done 6990 tasks      | elapsed:   47.6s
[Parallel(n_jobs=10)]: Done 8502 tasks      | elapsed:   58.4s
[Parallel(n_jobs=10)]: Done 10158 tasks      | elapsed:  1.1min
[Parallel(n_jobs=10)]: Done 13646 tasks      | elapsed:  1.5min
[Parallel(n_jobs=10)]: Done 17534 tasks      | elapsed:  1.9min
[Parallel(n_jobs=10)]: Done 21710 tasks      | elapsed:  2.3min
[Parallel(n_jobs=10)]: Done 26174 tasks 

In [7]:
def merge_metadata(meta_pairs, base_df):
    """Attach per-file audio statistics to a metadata frame, row by row.

    Parameters
    ----------
    meta_pairs : iterable of 3-tuples
        One ``(duration, sr, max_amplitude)`` entry per row of ``base_df``,
        in the same order. ``duration`` is divided by ``sr`` below, so it is
        expected to be a sample count.
    base_df : pandas.DataFrame
        Metadata frame to extend; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``base_df`` plus the columns ``duration``, ``sr``, ``max_amplitude``
        and ``duration_sec`` (``duration / sr``), keeping ``base_df``'s index.

    Raises
    ------
    ValueError
        If the number of entries in ``meta_pairs`` differs from the number
        of rows in ``base_df``.
    """
    df_meta = pd.DataFrame(meta_pairs, columns=["duration", "sr", "max_amplitude"])
    if len(df_meta) != len(base_df):
        raise ValueError(
            f"got {len(df_meta)} metadata entries for {len(base_df)} rows"
        )
    df_meta["duration_sec"] = df_meta["duration"] / df_meta["sr"]

    # concat(axis=1) aligns on index labels; reuse base_df's index so rows are
    # paired by position even when base_df does not carry a default RangeIndex.
    df_meta.index = base_df.index
    new_df = pd.concat([base_df, df_meta], axis=1)
    return new_df

In [None]:
# Enrich the 2021 metadata and persist it under its own year.
# (The file name previously said 2022, so the 2022 cell overwrote it and
# "train_metadata_rich_2021.parquet", which load_df reads, was never written.)
df_2021_rich = merge_metadata(meta_2021, df_2021)
df_2021_rich.to_parquet("train_metadata_rich_2021.parquet")

In [None]:
# Enrich the 2022 metadata with the audio statistics and write it to parquet.
df_2022_rich = merge_metadata(meta_pairs=meta_2022, base_df=df_2022)
df_2022_rich.to_parquet("train_metadata_rich_2022.parquet")

In [None]:
# Enrich the 2023 metadata with the audio statistics and write it to parquet.
df_2023_rich = merge_metadata(meta_pairs=meta_2023, base_df=df_2023)
df_2023_rich.to_parquet("train_metadata_rich_2023.parquet")

In [9]:
# Enrich the 2025 metadata with the audio statistics and write it to parquet.
df_2025_rich = merge_metadata(meta_pairs=meta_2025, base_df=df_2025)
df_2025_rich.to_parquet("train_metadata_rich_2025.parquet")

## Merge 2021-2023 data

In [None]:
def load_df():
    """Read the enriched 2021, 2022 and 2023 parquet files and stack them.

    Returns one DataFrame whose index is renumbered from 0.
    """
    yearly_frames = [
        pd.read_parquet("train_metadata_rich_2021.parquet"),
        pd.read_parquet("train_metadata_rich_2022.parquet"),
        pd.read_parquet("train_metadata_rich_2023.parquet"),
    ]
    stacked = pd.concat(yearly_frames)
    return stacked.reset_index(drop=True)

df = load_df()

# Per-label weight = (relative label frequency) ** -0.5, so rarer labels
# receive larger weights.
primary_label_count = df["primary_label"].value_counts()
sample_weights = (primary_label_count / primary_label_count.sum()).pow(-0.5)

# Look up each row's label weight, then rescale so the mean row weight is 1.
df["weight"] = sample_weights[df["primary_label"].values].values
df["weight"] = df["weight"] / df["weight"].mean()

In [10]:
def load_df():
    """Read the enriched BirdCLEF-2025 metadata parquet file."""
    return pd.read_parquet("train_metadata_rich_2025.parquet")

df = load_df()

# Per-label weight = (relative label frequency) ** -0.5, so rarer labels
# receive larger weights.
primary_label_count = df["primary_label"].value_counts()
sample_weights = (primary_label_count / primary_label_count.sum()).pow(-0.5)

# Look up each row's label weight, then rescale so the mean row weight is 1.
df["weight"] = sample_weights[df["primary_label"].values].values
df["weight"] = df["weight"] / df["weight"].mean()

In [11]:
# Check that every referenced audio file exists; on failure, name a few of the
# missing paths (the message is only evaluated when the assertion fails).
assert df["path"].apply(os.path.exists).all(), (
    "missing audio files, e.g. "
    f"{[p for p in df['path'] if not os.path.exists(p)][:5]}"
)
# Replace the stored paths with their absolute form.
df["path"] = df["path"].apply(os.path.abspath)

# Save the weighted dataframe.
pretrain_filename = "train_metadata_rich_2025_weight.pkl"
df.to_pickle(pretrain_filename)