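**About**: This notebook prepares the data: it loads the metadata tables and converts the audio files (competition, Xeno-canto, no-call and unlabeled soundscapes) to HDF5 feature files.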

In [1]:
# Enable the autoreload extension (mode 2) so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Move into the source folder so the project modules (params, data, util) can
# be imported. The explicit `%cd` magic does not rely on automagic being
# enabled. NOTE: the path is relative, so run this cell only once per kernel.
%cd ../src/

/home/tviel/work/kaggle_birdclef_2024/src


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


### Imports

In [3]:
import glob
import json
import os
import shutil
import sys
import warnings
from copy import deepcopy

import h5py
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import delayed
from sklearn.metrics import *
from tqdm import tqdm

warnings.simplefilter(action="ignore", category=UserWarning)
pd.set_option('display.width', 500)
pd.set_option('max_colwidth', 100)

In [4]:
from params import *
from data.preparation import *
from util.plots import display_audio
from data.processing import create_target_path, ProgressParallel, get_load_librosa_save_h5py

## Competition data

### Data

In [10]:
# Load the competition metadata table.
# NOTE(review): `prepare_data_2` is defined outside this notebook (presumably
# in data.preparation); the older `prepare_data` call is kept for reference.
# df = prepare_data()
df = prepare_data_2()

In [11]:
# prepare_folds()

In [12]:
# Preview the first rows of the prepared metadata table.
df.head()

Unnamed: 0,id,filename,primary_label,secondary_labels,rating,path,path_ft,fold
0,XC134896,asbfly/XC134896.ogg,asbfly,[],5.0,../input/train_audio/asbfly/XC134896.ogg,../input/train_features/asbfly/XC134896.hdf5,0
1,XC164848,asbfly/XC164848.ogg,asbfly,[],2.5,../input/train_audio/asbfly/XC164848.ogg,../input/train_features/asbfly/XC164848.hdf5,2
2,XC175797,asbfly/XC175797.ogg,asbfly,[],2.5,../input/train_audio/asbfly/XC175797.ogg,../input/train_features/asbfly/XC175797.hdf5,1
3,XC207738,asbfly/XC207738.ogg,asbfly,[],4.0,../input/train_audio/asbfly/XC207738.ogg,../input/train_features/asbfly/XC207738.hdf5,2
4,XC209218,asbfly/XC209218.ogg,asbfly,[],4.0,../input/train_audio/asbfly/XC209218.ogg,../input/train_features/asbfly/XC209218.hdf5,1


### Convert to h5py

In [None]:
# Output folder for the converted competition audio.
SAVE_FOLDER = f"{DATA_PATH}train_features/"

In [None]:
# Start from a clean output folder. `shutil.rmtree` avoids interpolating the
# path into a shell command; `ignore_errors=True` mirrors `rm -rf`, which
# does not fail when the folder is missing.
shutil.rmtree(SAVE_FOLDER, ignore_errors=True)

In [None]:
# List every file found one sub-folder below train_audio.
audio_pattern = DATA_PATH + "train_audio/*/*"
audios = glob.glob(audio_pattern)
print(f"-> Found {len(audios)} files")

In [None]:
# One output path per input audio file.
# NOTE(review): `create_target_path` is a project helper; presumably it maps
# an audio path to its feature path under SAVE_FOLDER - confirm.
targets = [create_target_path(SAVE_FOLDER, audio) for audio in audios]

# Create the root folder and every parent folder of the targets.
os.makedirs(SAVE_FOLDER, exist_ok=True)
target_dirs = {os.path.dirname(target) for target in targets}
for target_dir in target_dirs:
    os.makedirs(target_dir, exist_ok=True)

# Counts every entry directly under SAVE_FOLDER.
n_folders = len(os.listdir(SAVE_FOLDER))
print(f"-> Created {n_folders} folders")

In [None]:
# Build the per-file conversion callable (32 kHz, no normalization).
# NOTE(review): presumably loads with librosa and writes HDF5 - confirm in
# data.processing.
fct = get_load_librosa_save_h5py(sr=32000, do_normalize=False)
# Optional smoke test on a single file:
# fct(audios[0], targets[0])

In [None]:
# Convert every (input, output) pair in parallel with a progress bar.
jobs = (delayed(fct)(src_path, dst_path) for src_path, dst_path in zip(audios, targets))
_ = ProgressParallel(n_jobs=16, total=len(audios))(jobs)

In [None]:
# Count the HDF5 files written one folder below SAVE_FOLDER.
hdf5_pattern = SAVE_FOLDER + "*/*.hdf5"
saved_targets = glob.glob(hdf5_pattern)
print(f"Saved {len(saved_targets)} files")

In [None]:
# Show the metadata again; its `path_ft` column points at the feature files.
df.head()

## Xenocanto extra data

### Data

In [None]:
# Build the Xeno-canto metadata table.
# NOTE(review): `prepare_xenocanto_data` is defined outside this notebook.
df_xc = prepare_xenocanto_data()

In [None]:
# df_xc[df_xc.duplicated(subset="id", keep=False)].groupby('id').agg(list)
# df_map = pd.read_csv(DATA_PATH + "eBird_Taxonomy_v2021.csv").sort_values('PRIMARY_COM_NAME')
# df_map[df_map.SCI_NAME.apply(lambda x: x.lower().startswith('calonectris'))]
# df_xc[df_xc.secondary_labels.apply(lambda x: "unk" in x)]

### Convert to h5py

In [None]:
# List every file found one sub-folder below xenocanto/audio.
audio_pattern = DATA_PATH + "xenocanto/audio/*/*"
audios = glob.glob(audio_pattern)
print(f"-> Found {len(audios)} files")

In [None]:
# Output folder for the converted Xeno-canto audio.
SAVE_FOLDER = f"{DATA_PATH}xenocanto/features/"

In [None]:
# Start from a clean output folder. `shutil.rmtree` avoids interpolating the
# path into a shell command; `ignore_errors=True` mirrors `rm -rf`, which
# does not fail when the folder is missing.
shutil.rmtree(SAVE_FOLDER, ignore_errors=True)

In [None]:
# One output path per input audio file.
# NOTE(review): `create_target_path` is a project helper; presumably it maps
# an audio path to its feature path under SAVE_FOLDER - confirm.
targets = [create_target_path(SAVE_FOLDER, audio) for audio in audios]

# Create the root folder and every parent folder of the targets.
os.makedirs(SAVE_FOLDER, exist_ok=True)
target_dirs = {os.path.dirname(target) for target in targets}
for target_dir in target_dirs:
    os.makedirs(target_dir, exist_ok=True)

# Counts every entry directly under SAVE_FOLDER.
n_folders = len(os.listdir(SAVE_FOLDER))
print(f"-> Created {n_folders} folders")

In [None]:
# Build the per-file conversion callable (32 kHz, no normalization).
fct = get_load_librosa_save_h5py(sr=32000, do_normalize=False)

# Smoke-test on the first file before launching the parallel run.
first_audio, first_target = audios[0], targets[0]
fct(first_audio, first_target)

In [None]:
# Convert every (input, output) pair in parallel with a progress bar.
jobs = (delayed(fct)(src_path, dst_path) for src_path, dst_path in zip(audios, targets))
_ = ProgressParallel(n_jobs=16, total=len(audios))(jobs)

In [None]:
# Count the HDF5 files written one folder below SAVE_FOLDER.
hdf5_pattern = SAVE_FOLDER + "*/*.hdf5"
saved_targets = glob.glob(hdf5_pattern)
print(f"Saved {len(saved_targets)} files")

### Nocall

In [None]:
# Folders, relative to DATA_PATH, that are scanned for no-call / background
# recordings.
nocall_folders = (
    "background_noise/birdclef2021_nocall/",
    "background_noise/birdclef2020_nocall/",
    "background_noise/freefield/",
    "background_noise/warblrb/",
    "background_noise/birdvox/",
    "background_noise/rainforest/",
    "background_noise/environment/",
    "nocall_dieter/aicrowd2020_noise_30sec/noise_30sec/",
    "nocall_dieter/ff1010bird_nocall/nocall/",
    "nocall_dieter/train_soundscapes/nocall/",
)
paths = [DATA_PATH + folder for folder in nocall_folders]

In [None]:
# Build one table per no-call source folder: at most `max_samples` files,
# their duration, and the source name; keep only 5-60 second clips.
max_samples = 200  # cap on the number of files kept per source folder
SAMPLE_SEED = 42   # fixed seed so the sampled subset is the same on every run

dfs = []
for path in paths:
    found = glob.glob(path + "*")
    # Use loop-local names so the notebook-level `df` / `audios` are not
    # clobbered by this loop.
    df_src = pd.DataFrame({"path": found})

    if len(df_src) > max_samples:
        df_src = df_src.sample(max_samples, random_state=SAMPLE_SEED)

    df_src["duration"] = df_src["path"].apply(lambda x: librosa.get_duration(path=x))

    # Fourth "/"-separated component of the folder path is used as source name.
    # NOTE(review): this assumes DATA_PATH has exactly two components
    # (e.g. "../input/") - confirm.
    src = path.split("/", 3)[-1].split('/')[0]
    df_src["source"] = src

    # Keep clips whose duration is between 5 and 60 seconds (inclusive).
    keep = (df_src["duration"] >= 5) & (df_src["duration"] <= 60)
    dfs.append(df_src[keep])

    print(f"-> Found {len(found)} files in {src}")

In [None]:
# Merge the per-source tables.
# NOTE: this rebinds `df`, which earlier held the competition metadata.
df = pd.concat(dfs, ignore_index=True)

# Left: number of clips per source. Right: distribution of clip durations.
fig, (ax_source, ax_duration) = plt.subplots(1, 2, figsize=(15, 5))
sns.countplot(y=df["source"].values, ax=ax_source)
sns.histplot(x=df["duration"].values, ax=ax_duration)
plt.show()

In [None]:
# Output folder and input files for the no-call conversion.
SAVE_FOLDER = f"{DATA_PATH}nocall_features/"
audios = df.loc[:, "path"].values

In [None]:
# One output path per input audio file.
# NOTE(review): `create_target_path` is a project helper; presumably it maps
# an audio path to its feature path under SAVE_FOLDER - confirm.
targets = [create_target_path(SAVE_FOLDER, audio) for audio in audios]

# Create the root folder and every parent folder of the targets.
os.makedirs(SAVE_FOLDER, exist_ok=True)
target_dirs = {os.path.dirname(target) for target in targets}
for target_dir in target_dirs:
    os.makedirs(target_dir, exist_ok=True)

# Counts every entry directly under SAVE_FOLDER.
n_folders = len(os.listdir(SAVE_FOLDER))
print(f"-> Created {n_folders} folders")

In [None]:
# Build the per-file conversion callable (32 kHz, no normalization).
fct = get_load_librosa_save_h5py(sr=32000, do_normalize=False)

# Smoke-test on the first file before launching the parallel run.
first_audio, first_target = audios[0], targets[0]
fct(first_audio, first_target)

In [None]:
# Convert every (input, output) pair in parallel with a progress bar.
jobs = (delayed(fct)(src_path, dst_path) for src_path, dst_path in zip(audios, targets))
_ = ProgressParallel(n_jobs=16, total=len(audios))(jobs)

In [None]:
# Count the HDF5 files written one folder below SAVE_FOLDER.
hdf5_pattern = SAVE_FOLDER + "*/*.hdf5"
saved_targets = glob.glob(hdf5_pattern)
print(f"Saved {len(saved_targets)} files")

## Unlabeled data

### Data

### Convert to h5py

In [21]:
# List every file directly under unlabeled_soundscapes.
audio_pattern = DATA_PATH + "unlabeled_soundscapes/*"
audios = glob.glob(audio_pattern)
print(f"-> Found {len(audios)} files")

-> Found 8444 files


In [22]:
# Output folder for the converted unlabeled soundscapes.
SAVE_FOLDER = f"{DATA_PATH}unlabeled_features/"

In [23]:
# Start from a clean output folder. `shutil.rmtree` avoids interpolating the
# path into a shell command; `ignore_errors=True` mirrors `rm -rf`, which
# does not fail when the folder is missing.
shutil.rmtree(SAVE_FOLDER, ignore_errors=True)

In [24]:
# One output path per input audio file.
# NOTE(review): `create_target_path` is a project helper; presumably it maps
# an audio path to its feature path under SAVE_FOLDER - confirm.
targets = [create_target_path(SAVE_FOLDER, audio) for audio in audios]

# Create the root folder and every parent folder of the targets.
os.makedirs(SAVE_FOLDER, exist_ok=True)
target_dirs = {os.path.dirname(target) for target in targets}
for target_dir in target_dirs:
    os.makedirs(target_dir, exist_ok=True)

# Counts every entry directly under SAVE_FOLDER.
n_folders = len(os.listdir(SAVE_FOLDER))
print(f"-> Created {n_folders} folders")

-> Created 1 folders


In [25]:
# Build the per-file conversion callable (32 kHz, no normalization).
fct = get_load_librosa_save_h5py(sr=32000, do_normalize=False)

# Smoke-test on the first file before launching the parallel run.
first_audio, first_target = audios[0], targets[0]
fct(first_audio, first_target)

In [26]:
# Convert every (input, output) pair in parallel with a progress bar.
jobs = (delayed(fct)(src_path, dst_path) for src_path, dst_path in zip(audios, targets))
_ = ProgressParallel(n_jobs=16, total=len(audios))(jobs)

  0%|          | 0/8444 [00:00<?, ?it/s]

100%|██████████| 8444/8444 [06:28<00:00, 21.75it/s]


In [27]:
# Count the HDF5 files written one folder below SAVE_FOLDER.
hdf5_pattern = SAVE_FOLDER + "*/*.hdf5"
saved_targets = glob.glob(hdf5_pattern)
print(f"Saved {len(saved_targets)} files")

Saved 8444 files


: 

### Nocall

In [None]:
# Folders, relative to DATA_PATH, that are scanned for no-call / background
# recordings.
nocall_folders = (
    "background_noise/birdclef2021_nocall/",
    "background_noise/birdclef2020_nocall/",
    "background_noise/freefield/",
    "background_noise/warblrb/",
    "background_noise/birdvox/",
    "background_noise/rainforest/",
    "background_noise/environment/",
    "nocall_dieter/aicrowd2020_noise_30sec/noise_30sec/",
    "nocall_dieter/ff1010bird_nocall/nocall/",
    "nocall_dieter/train_soundscapes/nocall/",
)
paths = [DATA_PATH + folder for folder in nocall_folders]

In [None]:
# Build one table per no-call source folder: at most `max_samples` files,
# their duration, and the source name; keep only 5-60 second clips.
max_samples = 200  # cap on the number of files kept per source folder
SAMPLE_SEED = 42   # fixed seed so the sampled subset is the same on every run

dfs = []
for path in paths:
    found = glob.glob(path + "*")
    # Use loop-local names so the notebook-level `df` / `audios` are not
    # clobbered by this loop.
    df_src = pd.DataFrame({"path": found})

    if len(df_src) > max_samples:
        df_src = df_src.sample(max_samples, random_state=SAMPLE_SEED)

    df_src["duration"] = df_src["path"].apply(lambda x: librosa.get_duration(path=x))

    # Fourth "/"-separated component of the folder path is used as source name.
    # NOTE(review): this assumes DATA_PATH has exactly two components
    # (e.g. "../input/") - confirm.
    src = path.split("/", 3)[-1].split('/')[0]
    df_src["source"] = src

    # Keep clips whose duration is between 5 and 60 seconds (inclusive).
    keep = (df_src["duration"] >= 5) & (df_src["duration"] <= 60)
    dfs.append(df_src[keep])

    print(f"-> Found {len(found)} files in {src}")

In [None]:
# Merge the per-source tables.
# NOTE: this rebinds `df`, which earlier held the competition metadata.
df = pd.concat(dfs, ignore_index=True)

# Left: number of clips per source. Right: distribution of clip durations.
fig, (ax_source, ax_duration) = plt.subplots(1, 2, figsize=(15, 5))
sns.countplot(y=df["source"].values, ax=ax_source)
sns.histplot(x=df["duration"].values, ax=ax_duration)
plt.show()

In [None]:
# Output folder and input files for the no-call conversion.
SAVE_FOLDER = f"{DATA_PATH}nocall_features/"
audios = df.loc[:, "path"].values

In [None]:
# One output path per input audio file.
# NOTE(review): `create_target_path` is a project helper; presumably it maps
# an audio path to its feature path under SAVE_FOLDER - confirm.
targets = [create_target_path(SAVE_FOLDER, audio) for audio in audios]

# Create the root folder and every parent folder of the targets.
os.makedirs(SAVE_FOLDER, exist_ok=True)
target_dirs = {os.path.dirname(target) for target in targets}
for target_dir in target_dirs:
    os.makedirs(target_dir, exist_ok=True)

# Counts every entry directly under SAVE_FOLDER.
n_folders = len(os.listdir(SAVE_FOLDER))
print(f"-> Created {n_folders} folders")

In [None]:
# Build the per-file conversion callable (32 kHz, no normalization).
fct = get_load_librosa_save_h5py(sr=32000, do_normalize=False)

# Smoke-test on the first file before launching the parallel run.
first_audio, first_target = audios[0], targets[0]
fct(first_audio, first_target)

In [None]:
# Convert every (input, output) pair in parallel with a progress bar.
jobs = (delayed(fct)(src_path, dst_path) for src_path, dst_path in zip(audios, targets))
_ = ProgressParallel(n_jobs=16, total=len(audios))(jobs)

In [None]:
# Count the HDF5 files written one folder below SAVE_FOLDER.
hdf5_pattern = SAVE_FOLDER + "*/*.hdf5"
saved_targets = glob.glob(hdf5_pattern)
print(f"Saved {len(saved_targets)} files")

### Duplicates

In [None]:
# The no-call cells earlier in this notebook rebind `df` to the noise table
# (columns path / duration / source, no `id`). Concatenating that table would
# give every noise row a missing id, and `duplicated` treats missing ids as
# equal, so they would all be reported as duplicates. Load the competition
# metadata again under its own name instead.
# NOTE(review): assumes `prepare_data_2()` is cheap to call twice and that
# `df_xc` has an `id` column - confirm.
df_comp = prepare_data_2()

df_tot = pd.concat([df_comp, df_xc])

# keep=False flags every row of a duplicated id group, not only the repeats;
# sorting by id puts the members of a group next to each other.
is_dup = df_tot.duplicated(subset="id", keep=False)
dups = df_tot[is_dup].sort_values("id")

In [None]:
# dups[dups['fold'] == -1].id.values.tolist()

In [None]:
# Inspect the first ten rows flagged as sharing an id.
dups.head(10)

In [None]:
# Position in `dups` of the first recording to listen to; the cells below
# play rows `idx` and `idx + 1`.
idx = 0

In [None]:
# Play / display the recording at position `idx` of the duplicates table.
display_audio(dups["path"].values[idx])

In [None]:
# Play / display the next row for comparison (`dups` is sorted by id, so
# adjacent rows share an id when `idx` starts a group).
display_audio(dups["path"].values[idx + 1])

Done !