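**About**: This notebook prepares the data. It builds the competition dataframe and folds, then extracts audio features to `.hdf5` files for the competition data, the previous-competition data, the Xenocanto extra data and the unlabeled soundscapes.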

In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Work from the src/ directory; the project imports in a later cell (params, data.*, util.*)
# presumably rely on it. The path is relative, so the kernel is assumed to start in a
# sibling directory of src/ (or in src/ itself).
cd ../src/

/home/tviel/work/kaggle_birdclef_2024/src


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


### Imports

In [3]:
import os
import sys
import glob
import json
import h5py
import shutil
import librosa
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from copy import deepcopy
from joblib import delayed
from sklearn.metrics import *

# Session-wide configuration: hide UserWarning messages and widen pandas output.
warnings.simplefilter("ignore", UserWarning)

for option_name, option_value in (("display.width", 500), ("max_colwidth", 100)):
    pd.set_option(option_name, option_value)

In [4]:
from params import *
from data.preparation import *
from util.plots import display_audio
from data.processing import create_target_path, ProgressParallel, get_load_librosa_save_h5py

### Competition data

In [None]:
# Build the competition dataframe with the project helper prepare_data_2
# (made available by the star imports above; its logic is not shown in this notebook).
df = prepare_data_2()

In [None]:
# Project helper called for its side effects only (the return value is not used).
# Presumably computes and stores the cross-validation folds — TODO confirm what it writes.
prepare_folds()

In [None]:
# Preview the first rows of the competition dataframe.
df.head()

In [None]:
# Output folder for the competition features. The trailing "/" matters: later cells
# build paths by plain string concatenation (e.g. SAVE_FOLDER + "*/*.hdf5").
SAVE_FOLDER = DATA_PATH + "train_features/"

In [None]:
# Start from a clean slate: delete any features left over from a previous run.
# shutil.rmtree replaces `!rm -rf $SAVE_FOLDER`, so the path is never re-parsed by a
# shell (safe with spaces or special characters) and the cell does not need `rm`.
# ignore_errors=True keeps the `-f` behaviour: a missing folder is not an error.
shutil.rmtree(SAVE_FOLDER, ignore_errors=True)

In [None]:
# List every entry located two levels below train_audio/ (pattern: <subfolder>/<file>).
audio_pattern = DATA_PATH + "train_audio/*/*"
audios = glob.glob(audio_pattern)
print(f"-> Found {len(audios)} files")

In [None]:
# One output path per audio file, built by create_target_path from SAVE_FOLDER and the audio path.
targets = [create_target_path(SAVE_FOLDER, audio_path) for audio_path in audios]

# Create the root folder first, then every distinct parent directory of the targets.
os.makedirs(SAVE_FOLDER, exist_ok=True)
target_dirs = {os.path.dirname(target) for target in targets}
for target_dir in target_dirs:
    os.makedirs(target_dir, exist_ok=True)

# Report how many entries now exist directly under SAVE_FOLDER.
print(f"-> Created {len(os.listdir(SAVE_FOLDER))} folders")

In [None]:
# Build the per-file processing callable. Later cells call it as fct(load_path, save_path).
# Judging by the factory's name it loads audio with librosa and saves it with h5py;
# do_normalize / sr=32000 are forwarded to that project helper — confirm their exact effect there.
fct = get_load_librosa_save_h5py(do_normalize=False, sr=32000)
# Optional single-file smoke test, disabled by default:
# fct(audios[0], targets[0])

In [None]:
# One delayed fct(load_path, save_path) call per (audio, target) pair, generated lazily.
tasks = (delayed(fct)(src_path, dst_path) for src_path, dst_path in zip(audios, targets))

# Run the tasks through ProgressParallel with n_jobs=16; `_ =` discards the returned
# results so the cell displays nothing.
_ = ProgressParallel(n_jobs=16, total=len(audios))(tasks)

In [None]:
# Count the .hdf5 files found one directory level below SAVE_FOLDER.
hdf5_pattern = SAVE_FOLDER + "*/*.hdf5"
saved_targets = glob.glob(hdf5_pattern)
print(f"Saved {len(saved_targets)} files")

In [None]:
# Display the first rows of the competition dataframe again, now that the features have been written.
df.head()

### Previous comp data
- Download the precomputed features to save time.

In [16]:
# Load the extra-competition metadata. Its `path` column provides the source audio files below.
# Note: this rebinds `df`, which previously held the competition dataframe.
df = pd.read_csv(DATA_PATH + "df_extra_comp.csv")

In [17]:
# Output folder for the previous-competition features. The trailing "/" matters: later cells
# build paths by plain string concatenation (e.g. SAVE_FOLDER + "*/*.hdf5").
SAVE_FOLDER = DATA_PATH + "prev_comps_features/"

In [18]:
# The source audio paths come straight from the dataframe's `path` column.
audios = df["path"].values
print(f"-> Found {len(audios)} files")

-> Found 166 files


In [13]:
# One output path per audio file, built by create_target_path from SAVE_FOLDER and the audio path.
targets = [create_target_path(SAVE_FOLDER, audio_path) for audio_path in audios]

# Create the root folder first, then every distinct parent directory of the targets.
os.makedirs(SAVE_FOLDER, exist_ok=True)
target_dirs = {os.path.dirname(target) for target in targets}
for target_dir in target_dirs:
    os.makedirs(target_dir, exist_ok=True)

# Report how many entries now exist directly under SAVE_FOLDER.
print(f"-> Created {len(os.listdir(SAVE_FOLDER))} folders")

-> Created 9 folders


In [14]:
# Build the per-file processing callable. The next cell calls it as fct(load_path, save_path).
# Judging by the factory's name it loads audio with librosa and saves it with h5py;
# do_normalize / sr=32000 are forwarded to that project helper — confirm their exact effect there.
fct = get_load_librosa_save_h5py(do_normalize=False, sr=32000)
# Optional single-file smoke test, disabled by default:
# fct(audios[0], targets[0])

In [16]:
# Process the files one by one (no parallel pool here), with a tqdm progress bar.
path_pairs = zip(audios, targets)
for src_path, dst_path in tqdm(path_pairs, total=len(audios)):
    fct(src_path, dst_path)

  0%|          | 0/166 [00:00<?, ?it/s]

100%|██████████| 166/166 [00:03<00:00, 53.75it/s] 


In [17]:
# Count the .hdf5 files found one directory level below SAVE_FOLDER.
hdf5_pattern = SAVE_FOLDER + "*/*.hdf5"
saved_targets = glob.glob(hdf5_pattern)
print(f"Saved {len(saved_targets)} files")

Saved 166 files


In [20]:
# Every extra-competition row gets fold = -1.
# NOTE(review): presumably meaning "not assigned to a validation fold" — confirm against the training code.
df["fold"] = -1
# Store the feature path of each row; `targets` was built from df.path, so the order matches.
df["path_ft"] = targets
# Write back to the file that was read (DATA_PATH + "df_extra_comp.csv") instead of a
# hard-coded '../input/' path, so reading and writing stay consistent if DATA_PATH changes.
df.to_csv(DATA_PATH + "df_extra_comp.csv", index=False)

### Xenocanto extra data

In [None]:
# Build the Xenocanto metadata dataframe with the project helper (its logic is not shown in this notebook).
df_xc = prepare_xenocanto_data()

# Disabled alternative kept for reference.
# NOTE(review): it passes `df`, not `df_xc` — confirm which dataframe is intended before re-enabling.
# df_xc = add_xeno_low_freq(df, upsample_to=0, low_freq=500).reset_index(drop=True)

In [None]:
# Keep only the downloaded recordings that are referenced in df_xc.
# The lookup is built once as a set: testing membership against the numpy array
# (`x in df_xc.filename.values`) rescans the whole column for every single file.
xc_filenames = set(df_xc.filename.values)

audios = glob.glob(DATA_PATH + "xenocanto/audio/*/*")
# split("/", 4)[-1] keeps whatever follows the 4th "/" of the path; that remainder is
# what gets compared with df_xc.filename.
# NOTE(review): this depends on how many "/" DATA_PATH contains — re-check if DATA_PATH changes.
audios = [a for a in audios if a.split("/", 4)[-1] in xc_filenames]
print(f"-> Found {len(audios)} files")

In [None]:
# Output folder for the Xenocanto features. The trailing "/" matters: later cells
# build paths by plain string concatenation (e.g. SAVE_FOLDER + "*/*.hdf5").
SAVE_FOLDER = DATA_PATH + "xenocanto/features/"

In [None]:
# Start from a clean slate: delete any features left over from a previous run.
# shutil.rmtree replaces `!rm -rf $SAVE_FOLDER`, so the path is never re-parsed by a
# shell (safe with spaces or special characters) and the cell does not need `rm`.
# ignore_errors=True keeps the `-f` behaviour: a missing folder is not an error.
shutil.rmtree(SAVE_FOLDER, ignore_errors=True)

In [None]:
# One output path per audio file, built by create_target_path from SAVE_FOLDER and the audio path.
targets = [create_target_path(SAVE_FOLDER, audio_path) for audio_path in audios]

# Create the root folder first, then every distinct parent directory of the targets.
os.makedirs(SAVE_FOLDER, exist_ok=True)
target_dirs = {os.path.dirname(target) for target in targets}
for target_dir in target_dirs:
    os.makedirs(target_dir, exist_ok=True)

# Report how many entries now exist directly under SAVE_FOLDER.
print(f"-> Created {len(os.listdir(SAVE_FOLDER))} folders")

In [None]:
# Build the per-file processing callable, used as fct(load_path, save_path).
# Judging by the factory's name it loads audio with librosa and saves it with h5py;
# do_normalize / sr=32000 are forwarded to that project helper — confirm their exact effect there.
fct = get_load_librosa_save_h5py(do_normalize=False, sr=32000)
# Single-file smoke test before launching the full parallel run.
# It raises IndexError if `audios` is empty.
fct(audios[0], targets[0])

In [None]:
# One delayed fct(load_path, save_path) call per (audio, target) pair, generated lazily.
tasks = (delayed(fct)(src_path, dst_path) for src_path, dst_path in zip(audios, targets))

# Run the tasks through ProgressParallel with n_jobs=16; `_ =` discards the returned
# results so the cell displays nothing.
_ = ProgressParallel(n_jobs=16, total=len(audios))(tasks)

In [None]:
# Count the .hdf5 files found one directory level below SAVE_FOLDER.
hdf5_pattern = SAVE_FOLDER + "*/*.hdf5"
saved_targets = glob.glob(hdf5_pattern)
print(f"Saved {len(saved_targets)} files")

### Unlabeled soundscapes

In [None]:
# List every entry located directly inside unlabeled_soundscapes/.
audio_pattern = DATA_PATH + "unlabeled_soundscapes/*"
audios = glob.glob(audio_pattern)
print(f"-> Found {len(audios)} files")

In [None]:
# Output folder for the unlabeled-soundscape features. The trailing "/" matters: later cells
# build paths by plain string concatenation (e.g. SAVE_FOLDER + "*/*.hdf5").
SAVE_FOLDER = DATA_PATH + "unlabeled_features/"

In [None]:
# Start from a clean slate: delete any features left over from a previous run.
# shutil.rmtree replaces `!rm -rf $SAVE_FOLDER`, so the path is never re-parsed by a
# shell (safe with spaces or special characters) and the cell does not need `rm`.
# ignore_errors=True keeps the `-f` behaviour: a missing folder is not an error.
shutil.rmtree(SAVE_FOLDER, ignore_errors=True)

In [None]:
# One output path per audio file, built by create_target_path from SAVE_FOLDER and the audio path.
targets = [create_target_path(SAVE_FOLDER, audio_path) for audio_path in audios]

# Create the root folder first, then every distinct parent directory of the targets.
os.makedirs(SAVE_FOLDER, exist_ok=True)
target_dirs = {os.path.dirname(target) for target in targets}
for target_dir in target_dirs:
    os.makedirs(target_dir, exist_ok=True)

# Report how many entries now exist directly under SAVE_FOLDER.
print(f"-> Created {len(os.listdir(SAVE_FOLDER))} folders")

In [None]:
# Build the per-file processing callable, used as fct(load_path, save_path).
# Judging by the factory's name it loads audio with librosa and saves it with h5py;
# do_normalize / sr=32000 are forwarded to that project helper — confirm their exact effect there.
fct = get_load_librosa_save_h5py(do_normalize=False, sr=32000)
# Single-file smoke test before launching the full parallel run.
# It raises IndexError if `audios` is empty.
fct(audios[0], targets[0])

In [None]:
# One delayed fct(load_path, save_path) call per (audio, target) pair, generated lazily.
tasks = (delayed(fct)(src_path, dst_path) for src_path, dst_path in zip(audios, targets))

# Run the tasks through ProgressParallel with n_jobs=16; `_ =` discards the returned
# results so the cell displays nothing.
_ = ProgressParallel(n_jobs=16, total=len(audios))(tasks)

In [None]:
# Count the .hdf5 files found one directory level below SAVE_FOLDER.
hdf5_pattern = SAVE_FOLDER + "*/*.hdf5"
saved_targets = glob.glob(hdf5_pattern)
print(f"Saved {len(saved_targets)} files")

Done!