## Interactive audio sample cleaning

In [198]:
use_case = {
            'experiment' : 1,
            'project_root': '/home/olly/Desktop/Kaggle_BC25',
            #'project_root': r'C:\Users\ollyp\OneDrive\Desktop\Kaggle_BC25'
            #'project_root': '/media/olly/Red_SSD/Kaggle_BC25'
            }

In [199]:
import librosa
import time
from time import sleep
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from pathlib import Path
from glob import glob
from tqdm import tqdm
import torchaudio
import pickle
import matplotlib.animation as animation
import IPython.display as ipd
import torchaudio.transforms as T
from IPython.display import display, HTML, Audio  # Fixed deprecated import
import librosa.display
import time
import IPython.display as ipd
from joblib import Parallel, delayed
#from IPython.core.display import display, HTML

import ipywidgets as widgets
button = widgets.Button(description="Continue")
output = widgets.Output()

%matplotlib widget  
torch.set_num_threads(1)

In [200]:
class FilePaths:
    def __init__(self, options=None):
        _project_dir = Path(options['project_root'])
        self.DATA_FOLDER = _project_dir / 'Data'
        self.ORIGINAL_AUDIO = self.DATA_FOLDER / 'Original_Data/birdclef-2025/Extra_Samples'
        self.KAGGLE_LABELS = self.DATA_FOLDER / 'Original_Data/birdclef-2025/train_extra.csv'
        self.CROP_LABELS = self.DATA_FOLDER / 'Train_Metadata' / 'marked_labels.parquet'
        self.OUTPUT_CSV_PATH = self.DATA_FOLDER / 'Train_Metadata/train.csv'
        self.OUTPUT_NAMING_CSV_PATH = self.DATA_FOLDER / 'Train_Metadata/naming.csv'
        self.OUTPUT_JSON_PATH = self.DATA_FOLDER / 'Train_Metadata/species_names.json'

paths = FilePaths(use_case)

In [201]:
class VoiceDetector():
    def __init__(self, chunk_len, threshold=0.1, no_voice=0, voice=20):
        model, (get_speech_timestamps, _, read_audio, _, _) = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                                                 model='silero_vad', verbose=False)
        self.model=model
        self.chunk_len=chunk_len
        self.threshold=threshold
        self.get_stamps = get_speech_timestamps

    def detect(self, np_wav):
        speech_timestamps = self.get_stamps(torch.Tensor(np_wav), self.model, threshold=self.threshold)
        voice_detect = np.zeros_like(np_wav)
        for st in speech_timestamps:
            voice_detect[st['start']: st['end']] = 20
    
        #downsample to match the power plot axis
        voice_detect = np.pad(voice_detect, 
                            (0, int(np.ceil(len(voice_detect) / self.chunk_len) * self.chunk_len - len(voice_detect))))
        voice_detect = voice_detect.reshape((-1, self.chunk_len)).max(axis=1)  # Use max to preserve speech detection
        return voice_detect
    
def calc_signal_pwr(wav, chunk_len, sr=32000):
    power = wav ** 2 
    power = np.pad(power, (0, int(np.ceil(len(power) / chunk_len) * chunk_len - len(power))))
    power = power.reshape((-1, chunk_len)).sum(axis=1)
    return power

class MelSpecMaker():
    def __init__(self, sr=32000, n_mels=128, n_fft=2048, f_min = 20, f_max=14000):
        self.sr = sr
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.f_min = f_min
        self.f_max = f_max
        self.mel_transform = T.MelSpectrogram(sample_rate=self.sr,
                                              n_mels=self.n_mels,
                                              f_min=self.f_min,
                                              f_max=self.f_max,
                                              n_fft=self.n_fft)

    def create_melspec(self, waveform):
        waveform = torch.tensor(waveform).unsqueeze(0)  #We need a channel-first Torch tensor
        mel_spec = self.mel_transform(waveform)
        mel_spec_db = torchaudio.functional.amplitude_to_DB(
            mel_spec, 
            multiplier=10.0, 
            amin=1e-10,  
            db_multiplier=0.0  
        ).squeeze(0).numpy()

        num_frames = mel_spec_db.shape[1]
        duration = waveform.shape[1] / self.sr 
        time_axis = np.linspace(0, duration, num=num_frames)
        mel_frequencies = librosa.mel_frequencies(n_mels=self.n_mels, fmin=20, fmax=14000)

        return mel_spec_db, time_axis, mel_frequencies

def interactive_plot(mel_spec_db,
                     mel_frequencies,
                     power,
                     segmentation,
                     chunk_duration,
                     common_nm,
                     zoo_cls):
    """Interactive plot with click-based marking, auto-spacing, and drag-to-mark functionality."""
    
    duration = len(power) * chunk_duration
    t_power = np.arange(len(power)) * chunk_duration
    t_seg = np.arange(len(segmentation)) * chunk_duration
    t_melspec = np.linspace(0, duration, num=mel_spec_db.shape[1])

    marked_times = []
    marked_lines = []
    dragging = False
    start_time = None
    
    fig, ax = plt.subplots(figsize=(8, 2))
    ax.set_title(f'Recording of a {common_nm} ({zoo_cls})')

    img = ax.imshow(mel_spec_db, aspect='auto', origin='lower', cmap='magma',
                    extent=[t_melspec[0], t_melspec[-1], mel_frequencies[0], mel_frequencies[-1]],
                    zorder=1)

    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Frequency (Hz)", color='m')
    ax.tick_params(axis='y', labelcolor='m')

    ax2 = ax.twinx()
    ax2.plot(t_power, 10 * np.log10(power), 'b', label='Power', zorder=2)  
    ax2.plot(t_seg, segmentation, 'k', label='Voice', zorder=2)
    ax2.set_ylabel("Power (dB) / Voice Detection", color='b')
    ax2.tick_params(axis='y', labelcolor='b')
    ax2.legend(loc="upper right")

    def add_marker(time):
        if time not in marked_times:
            marked_times.append(time)
            line1 = ax.axvline(time - 6, color='gray', linestyle='--', zorder=3)
            line2 = ax.axvline(time + 6, color='gray', linestyle='--', zorder=3)
            area = ax.axvspan(time - 6, time + 6, color='gray', alpha=0.5)
            line3 = ax.axvline(time, color='g', linestyle='--', zorder=4)
            marked_lines.append((line1, line2, line3, area))
            fig.canvas.draw_idle()

    def onclick(event):
        nonlocal dragging, start_time
        if event.inaxes is not None and event.button == 1:
            dragging = True
            start_time = round(event.xdata, 1)
            add_marker(start_time)

    def onmotion(event):
        if dragging and event.inaxes is not None:
            current_time = round(event.xdata, 1)
            if current_time > start_time:
                time_offset = current_time - start_time
                next_marker = start_time + 12 * (time_offset // 12)
                if next_marker <= duration and next_marker not in marked_times:
                    add_marker(next_marker)

    def onclick(event):
        nonlocal dragging, start_time
        if event.inaxes is not None:
            if event.button == 1:  # Left click to add markers
                dragging = True
                start_time = round(event.xdata, 1)
                add_marker(start_time)
            elif event.button == 3:  # Right click to remove the most recent mark
                if marked_times:
                    marked_times.pop()
                    line_set = marked_lines.pop()
                    for line in line_set:
                        line.remove()
                    fig.canvas.draw_idle()

    def onrelease(event):
        nonlocal dragging
        if event.button == 1:
            dragging = False
    
    def onkeypress(event):
        if event.key == 'a':
            for line_set in marked_lines:
                for line in line_set:
                    line.remove()
            marked_lines.clear()
            marked_times.clear()
            
            end_buffer = 6
            start_buffer= 6
            max_spacing = 12
            time_to_cover = max(0, duration - start_buffer - end_buffer)
            num_spaces = time_to_cover // max_spacing + 1
            spacing = time_to_cover / num_spaces
            #spacing = time_to_cover/num_marks
            time = start_buffer
            while time <= (duration - end_buffer):
                add_marker(round(time,1))
                time += spacing
        elif event.key == ' ':  # Spacebar to clear all
            for line_set in marked_lines:
                for line in line_set:
                    line.remove()
            marked_lines.clear()
            marked_times.clear()
        
        fig.canvas.draw_idle()

    fig.canvas.mpl_connect('button_press_event', onclick)
    fig.canvas.mpl_connect('motion_notify_event', onmotion)
    fig.canvas.mpl_connect('button_release_event', onrelease)
    fig.canvas.mpl_connect('key_press_event', onkeypress)

    plt.show()
    return marked_times





def mark_one_sample(filename, common_nm, zoo_cls):
    wav, sr = librosa.load(filename, sr=None)  #returns a mono-channel NumPy array
    chunk_duration = 0.1
    chunk_len  = int(chunk_duration * sr)
    specmaker = MelSpecMaker(sr=sr)
    voice_detector = VoiceDetector(chunk_len)
    mel_spec_db, time_axis, mel_frequencies = specmaker.create_melspec(wav)
    power = calc_signal_pwr(wav, chunk_len)
    voice_detections = voice_detector.detect(wav)
    # Set the audio player width to match the plot width
    display(HTML("<style>audio { width: 800px; margin-left: 35px; }</style>"))
    display(ipd.Audio(filename)) 
    marked_times = interactive_plot(mel_spec_db,
                                    mel_frequencies,
                                    power,
                                    voice_detections,
                                    chunk_duration,
                                    common_nm=common_nm,
                                    zoo_cls=zoo_cls,)
    return marked_times

def plot_duration_mix(df, threshold):
    df['duration_category'] = df['duration'].apply(lambda x: f'< {threshold} s' if x < threshold else f'> {threshold} s')
    df_counts = df.groupby(['primary_label', 'duration_category']).size().reset_index(name='count')
    df_counts = df_counts.sort_values(by='count', ascending=False)
    custom_colors = {f'< {threshold} s': 'blue', f'> {threshold} s': 'red'}

    fig = px.bar(
        df_counts, 
        x="primary_label", 
        y="count", 
        color="duration_category",
        title="Stacked Occurrences by Duration",
        labels={"duration_category": "Duration", "count": "Occurrences", "primary_label": "Label"},
        barmode="stack",
        color_discrete_map=custom_colors,
        opacity=1.0  # Ensure full opacity
    )
    fig.show()


def get_audio_duration(filepath):
    info = torchaudio.info(filepath)
    return info.num_frames / info.sample_rate  # Duration in seconds

def get_audio_duration(path):
    try:
        return torchaudio.info(path).num_frames / torchaudio.info(path).sample_rate
    except Exception as e:
        print(f"Failed to read {path}: {e}")
        return None

## Process Kaggle Samples
Start by loading the auxiliary dataframe parquet with a 'duration' column produce by the EDA notebook

In [202]:
#df = pd.read_parquet(paths.CROP_LABELS, engine="pyarrow")
df = pd.read_csv(paths.KAGGLE_LABELS)  #, sep=';'
df.head(3)

Unnamed: 0,filename,primary_label,url,class,common_name,License
0,1139490/2391.wav,1139490,http://orthoptera.archive.speciesfile.org/Comm...,insecta,Ragoniella pulchella,CC-BY
1,42113/XC975063.wav,42113,https://xeno-canto.org/admin.php/975063/download,aves,Collared Peccary,Creative Commons Attribution-NonCommercial-Sha...
2,66016/vaillanti-escape1.mp3,66016,https://www.fonozoo.com/sonidos/Lithobates%20v...,amphibia,Vaillant's Frog,


In [203]:
df['filepath'] = str(paths.ORIGINAL_AUDIO) + '/' +  df['filename']
df.head()

Unnamed: 0,filename,primary_label,url,class,common_name,License,filepath
0,1139490/2391.wav,1139490,http://orthoptera.archive.speciesfile.org/Comm...,insecta,Ragoniella pulchella,CC-BY,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
1,42113/XC975063.wav,42113,https://xeno-canto.org/admin.php/975063/download,aves,Collared Peccary,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
2,66016/vaillanti-escape1.mp3,66016,https://www.fonozoo.com/sonidos/Lithobates%20v...,amphibia,Vaillant's Frog,,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
3,66016/vaillanti-escape3.mp3,66016,https://www.fonozoo.com/sonidos/Lithobates%20v...,amphibia,Vaillant's Frog,,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
4,66016/vaillanti-escape4.mp3,66016,https://www.fonozoo.com/sonidos/Lithobates%20v...,amphibia,Vaillant's Frog,,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...


In [204]:
filepaths = df['filepath'].to_list()
df["duration"] = Parallel(n_jobs=4)(
    delayed(get_audio_duration)(fp) for fp in tqdm(filepaths))

#/home/olly/Desktop/Kaggle_BC25/Data/Extra_Samples/turvul

100%|██████████| 15/15 [00:01<00:00, 12.85it/s]


Failed to read /home/olly/Desktop/Kaggle_BC25/Data/Original_Data/birdclef-2025/Extra_Samples/turlvul/XC39894.mp3: Failed to open the input "/home/olly/Desktop/Kaggle_BC25/Data/Original_Data/birdclef-2025/Extra_Samples/turlvul/XC39894.mp3" (No such file or directory).
Exception raised from get_input_format_context at /__w/_temp/conda_environment_8430229400/conda-bld/torchaudio_1711422726100/work/src/libtorio/ffmpeg/stream_reader/stream_reader.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x741d38180d87 in /home/olly/miniconda3/envs/bird_audio_ml/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x741d3813175f in /home/olly/miniconda3/envs/bird_audio_ml/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #2: <unknown function> + 0x42904 (0x741ce25bf904 in /home/olly/miniconda3/envs/bird_audio_ml/lib/python3.11/site-packages

Let's look at the frequency of the duration column

In [205]:
import plotly.express as px

fig = px.histogram(df, x='duration', nbins=30)
fig.update_layout(title='Histogram of Duration', xaxis_title='Duration', yaxis_title='Frequency')
fig.show()

In [206]:
bins = [5, 10, 12, 20, 30, 40, 60, 120]
counts = [(df['duration'] > b).sum() for b in bins]
df_counts = pd.DataFrame({'Duration Threshold': bins, 'Count > Threshold': counts})
fig = px.line(df_counts, x='Duration Threshold', y='Count > Threshold', markers=True, title='Reverse Cumulative Histogram of Duration')
fig.update_layout(xaxis_title='Duration', yaxis_title='Count of Values > Threshold')
fig.show()

So in total we have 20,255 samples with length over 12 seconds.  Let's take a closer look at a few of these rows

In [207]:
df = df.sort_values(by='duration', ascending=False)
df.head(15)

Unnamed: 0,filename,primary_label,url,class,common_name,License,filepath,duration
8,turvul/XC381486.mp3,turvul,https://xeno-canto.org/381486,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-NoD...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,152.976
9,turvul/XC520288.mp3,turvul,https://xeno-canto.org/520288,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,114.239
0,1139490/2391.wav,1139490,http://orthoptera.archive.speciesfile.org/Comm...,insecta,Ragoniella pulchella,CC-BY,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,86.146259
5,66578/Pristimantis_bogotensis15.wav,66578,https://amphibiaweb.org/sounds/Pristimantis_bo...,amphibia,Bogota Robber Frog,CC-BY-NC,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,56.42449
13,turvul/XC780516.wav,turvul,https://xeno-canto.org/780516,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,34.833708
14,turvul/XC904279.wav,turvul,https://xeno-canto.org/904279,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,27.299184
11,turvul/XC748979.mp3,turvul,https://xeno-canto.org/748979,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,13.866009
1,42113/XC975063.wav,42113,https://xeno-canto.org/admin.php/975063/download,aves,Collared Peccary,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,8.514062
6,868458/2388.WAV,868458,http://orthoptera.archive.speciesfile.org/Comm...,insecta,Typophyllum inflatum,CC-BY,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,8.18976
12,turvul/XC764680.wav,turvul,https://xeno-canto.org/764680,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-NoD...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,6.669615


Let's get a new column with the count by primary label for later filtering.

In [208]:
df['label_count'] = df.groupby('primary_label')['primary_label'].transform('count')
df.head()

Unnamed: 0,filename,primary_label,url,class,common_name,License,filepath,duration,label_count
8,turvul/XC381486.mp3,turvul,https://xeno-canto.org/381486,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-NoD...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,152.976,8
9,turvul/XC520288.mp3,turvul,https://xeno-canto.org/520288,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,114.239,8
0,1139490/2391.wav,1139490,http://orthoptera.archive.speciesfile.org/Comm...,insecta,Ragoniella pulchella,CC-BY,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,86.146259,1
5,66578/Pristimantis_bogotensis15.wav,66578,https://amphibiaweb.org/sounds/Pristimantis_bo...,amphibia,Bogota Robber Frog,CC-BY-NC,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,56.42449,1
13,turvul/XC780516.wav,turvul,https://xeno-canto.org/780516,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,34.833708,8


In [209]:
#common_df = df[df['label_count'] >= 400]
df_gt_100 = df[df['label_count'] >= 100].copy()
df_lt_100 = df[df['label_count'] <= 100].copy()
df_lt_120 = df[df['label_count'] <= 120].copy()
#df = df[(df['label_count'] <=500) | (df['duration'] <= 12)]
df.head()

Unnamed: 0,filename,primary_label,url,class,common_name,License,filepath,duration,label_count
8,turvul/XC381486.mp3,turvul,https://xeno-canto.org/381486,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-NoD...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,152.976,8
9,turvul/XC520288.mp3,turvul,https://xeno-canto.org/520288,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,114.239,8
0,1139490/2391.wav,1139490,http://orthoptera.archive.speciesfile.org/Comm...,insecta,Ragoniella pulchella,CC-BY,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,86.146259,1
5,66578/Pristimantis_bogotensis15.wav,66578,https://amphibiaweb.org/sounds/Pristimantis_bo...,amphibia,Bogota Robber Frog,CC-BY-NC,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,56.42449,1
13,turvul/XC780516.wav,turvul,https://xeno-canto.org/780516,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,34.833708,8


In [210]:
bins = [5, 10, 12, 20, 30, 40, 60, 120]
counts = [(df['duration'] > b).sum() for b in bins]

df_counts = pd.DataFrame({'Duration Threshold': bins, 'Count > Threshold': counts})

fig = px.line(df_counts, x='Duration Threshold', y='Count > Threshold', markers=True, title='Reverse Cumulative Histogram of Duration')
fig.update_layout(xaxis_title='Duration', yaxis_title='Count of Values > Threshold')
fig.show()

In [211]:
counts

[11, 7, 7, 6, 5, 4, 3, 1]

In [212]:
#filtered_df = df[df['label_count'] < 100]
duration_counts = df_lt_100.groupby('primary_label')['duration'].apply(lambda x: (x > 12).sum()).reset_index()
duration_counts.columns = ['primary_label', 'count_over_12']

fig = px.bar(duration_counts, x='primary_label', y='count_over_12', 
             title='Count of Samples with Duration > 12 for Labels with < 100 Total Samples',
             labels={'primary_label': 'Primary Label', 'count_over_12': 'Count of Duration > 12'},
             text='count_over_12')

fig.update_layout(xaxis={'categoryorder':'total ascending'}, xaxis_tickangle=-45)
fig.show()


In [213]:
len(df_lt_100)

15

In [214]:
len(df_lt_120)

15

In [215]:
df_lt_100['primary_label'].nunique()

6

In [216]:
df_lt_120['primary_label'].nunique()

6

So if I marked up all 4,159 samples from classes rarest first I would have 118 classes fully marked with up to 100 samples.  The question is what to do with the rest of the classes.  What proportion of these are over 12 seconds?

In [217]:
df_gt_100.head()

Unnamed: 0,filename,primary_label,url,class,common_name,License,filepath,duration,label_count


In [218]:
plot_duration_mix(df_gt_100, 20)

secfly1, strher & snoegr could be double in size if their long samples were included, and they only have approx 50 short samples.  But for the rest, the long samples could sensibly be filtered out for the purpose of having a clean dataset, and the loss of data would be small.  This should be tried as an experiment.

Or maybe we throw away anything over 30 seconds?  A bit less clean, but a bit more data?   Or over 30 seconds up until there are 200 samples, then over 20 seconds up to 300 samples, then anything over 12 seconds after that?

In [219]:
plot_duration_mix(df_gt_100, 30)

#### Marking Strategy
- Mark all 26 of Fabio Sierra's first, since they have a consistant voice-sample format
- Mark the remaining 118 CSA samples
- Mark the 4159 samples from the rarest 120 or so classes.
- Mark as many of the remaining 17,000 samples over 12 seconds as I can, from rarest first.
- Throw away any remaining samples over 20 seconds, and experiment later with a progressive duration/counts approach for this.

In [220]:
df.head()

Unnamed: 0,filename,primary_label,url,class,common_name,License,filepath,duration,label_count
8,turvul/XC381486.mp3,turvul,https://xeno-canto.org/381486,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-NoD...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,152.976,8
9,turvul/XC520288.mp3,turvul,https://xeno-canto.org/520288,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,114.239,8
0,1139490/2391.wav,1139490,http://orthoptera.archive.speciesfile.org/Comm...,insecta,Ragoniella pulchella,CC-BY,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,86.146259,1
5,66578/Pristimantis_bogotensis15.wav,66578,https://amphibiaweb.org/sounds/Pristimantis_bo...,amphibia,Bogota Robber Frog,CC-BY-NC,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,56.42449,1
13,turvul/XC780516.wav,turvul,https://xeno-canto.org/780516,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,34.833708,8


In [221]:
len(df)

15

In [222]:
df=df[df['duration'] > 12]
len(df)

7

In [223]:
#df['sort_csa'] = df['collection'].eq('CSA')  # True if 'csa', else False
#df['sort_fabio'] = df['author'].eq('Fabio A. Sarria-S')  # True if 'Fabio', else False
#df = df.sort_values(by=[ 'sort_fabio', 'sort_csa','label_count',], ascending=[False, False, True])
#df = df.drop(columns=['label_count', 'sort_csa', 'sort_fabio']).reset_index(drop=True)
if 'centres' not in df.columns:
    df['centres'] = [[] for _ in range(len(df))]
if 'date_reviewed' not in df.columns:
    df['date_reviewed'] = pd.NaT  # Assigns missing datetime values
df.head(12)

Unnamed: 0,filename,primary_label,url,class,common_name,License,filepath,duration,label_count,centres,date_reviewed
8,turvul/XC381486.mp3,turvul,https://xeno-canto.org/381486,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-NoD...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,152.976,8,[],NaT
9,turvul/XC520288.mp3,turvul,https://xeno-canto.org/520288,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,114.239,8,[],NaT
0,1139490/2391.wav,1139490,http://orthoptera.archive.speciesfile.org/Comm...,insecta,Ragoniella pulchella,CC-BY,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,86.146259,1,[],NaT
5,66578/Pristimantis_bogotensis15.wav,66578,https://amphibiaweb.org/sounds/Pristimantis_bo...,amphibia,Bogota Robber Frog,CC-BY-NC,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,56.42449,1,[],NaT
13,turvul/XC780516.wav,turvul,https://xeno-canto.org/780516,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,34.833708,8,[],NaT
14,turvul/XC904279.wav,turvul,https://xeno-canto.org/904279,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,27.299184,8,[],NaT
11,turvul/XC748979.mp3,turvul,https://xeno-canto.org/748979,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,13.866009,8,[],NaT


In [224]:
reviewed = df[~df['date_reviewed'].isna()].reset_index(drop=True).copy()
not_reviewed = df[df['date_reviewed'].isna()].reset_index(drop=True).copy()

In [225]:
not_reviewed.head(3)

Unnamed: 0,filename,primary_label,url,class,common_name,License,filepath,duration,label_count,centres,date_reviewed
0,turvul/XC381486.mp3,turvul,https://xeno-canto.org/381486,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-NoD...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,152.976,8,[],NaT
1,turvul/XC520288.mp3,turvul,https://xeno-canto.org/520288,aves,Turkey Vulture,Creative Commons Attribution-NonCommercial-Sha...,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,114.239,8,[],NaT
2,1139490/2391.wav,1139490,http://orthoptera.archive.speciesfile.org/Comm...,insecta,Ragoniella pulchella,CC-BY,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,86.146259,1,[],NaT


In [226]:
#print(f"Processing row with primary_label: {not_reviewed.at[0,'primary_label']}")
filepath = paths.ORIGINAL_AUDIO / not_reviewed.at[0,'filename'] #.iloc[0]
print(filepath)
common_name = not_reviewed.at[0, 'common_name']
zoo_class =  not_reviewed.at[0, 'class']
secondary = not_reviewed.at[0, 'secondary_labels']
print(f'Secondary Labels: {secondary}')
logged = False
marked_times = False
marked_times = mark_one_sample(filepath, common_name, zoo_class)

/home/olly/Desktop/Kaggle_BC25/Data/Original_Data/birdclef-2025/Extra_Samples/turvul/XC381486.mp3


KeyError: 'secondary_labels'

In [None]:
if marked_times and not logged:
    not_reviewed.at[0, 'centres'] = marked_times
    not_reviewed.at[0, 'date_reviewed'] = pd.Timestamp.now()
    reviewed = pd.concat([reviewed, not_reviewed.iloc[[0]]], ignore_index=True)
    not_reviewed = not_reviewed.iloc[1:].reset_index(drop=True)  # Drop first row and reset index
    updated_df = pd.concat([reviewed, not_reviewed], ignore_index=True)
    updated_df.to_parquet(paths.CROP_LABELS, engine="pyarrow")
    print(f'The following time centres getting added: {marked_times}')
    plt.close()
    logged=True

In [None]:
reviewed.tail(1)

Unnamed: 0,filename,primary_label,collection,author,common_name,class,secondary_labels,duration,centres,date_reviewed
5613,strowl1/XC577721.ogg,strowl1,XC,JAYRSON ARAUJO DE OLIVEIRA,Striped Owl,aves,[''],350.955094,"[6.0, 17.7, 29.4, 41.1, 52.8, 64.4, 76.1, 87.8...",2025-04-10 16:19:39.394977


rejects:  
cargra1/iNat969137.ogg  
41663/iNat181697.ogg  (actually I think it's the one after this, a racoon)  
41663/iNat1187502.ogg  
41663/iNat1001216.ogg  
gybmar/XC277037.ogg  
babwar/iNat399853.ogg
babwar/iNat247468.ogg