In [65]:
use_case = {
            'experiment' : 1,
            #'project_root': '/home/olly/Desktop/Kaggle_BC25',
            #'project_root': r'C:\Users\ollyp\OneDrive\Desktop\Kaggle_BC25'
            'project_root': '/media/olly/Red_SSD/Kaggle_BC25',
            'remake_crops': False
            }

In [66]:
import shutil
import numpy as np
import pandas as pd
import soundfile as sf
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from joblib import Parallel, delayed
import librosa
from IPython.display import Audio 
from tqdm import tqdm
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from pydub import AudioSegment
import torchaudio
from scipy.signal import resample
pd.set_option('display.max_colwidth', None)

In [67]:
class Stop: #bold red
    S = '\033[1m' + '\033[91m'
    E = '\033[0m'
    
class Go: #bold green
    S = '\033[1m' + '\033[32m'
    E = '\033[0m'
    
class Blue: #for general info
    S = '\033[94m'
    E = '\033[0m'

In [68]:
class FilePaths:
    def __init__(self, options=None):
        _project_dir = Path(options['project_root'])
        self.DATA_FOLDER = _project_dir / 'Data'
        self.ORIGINAL_AUDIO = self.DATA_FOLDER / 'Original_Data/birdclef-2025/train_audio'
        self.MODIFIED_AUDIO = self.DATA_FOLDER / 'Cropped_Train_Audio'
        self.MARKED_LABELS = self.DATA_FOLDER / 'Train_Metadata' / 'marked_labels.parquet'
        self.KAGGLE_LABELS_PLUS_DURATION = self.DATA_FOLDER / 'Train_Metadata' / 'aux_labels.parquet'
        self.CROP_LABELS_PATH = self.DATA_FOLDER / 'Train_Metadata/cropped_audio_labels.parquet'

        if not self.MODIFIED_AUDIO.exists:
            shutil.rmtree(self.MODIFIED_AUDIO)  # remove the old one
        self.MODIFIED_AUDIO.mkdir(parents=True, exist_ok=True)

paths = FilePaths(use_case)

In [69]:
class Config():
    def __init__(self, options=None):
        self.DEFAULT_LENGTH = 48
        self.OUTPUT_FILE_TYPE = '.flac'
        self.OUTPUT_DEPTH = 16
        self.SAVE_SOUNDFILES = options['remake_crops'] if options is not None else True 
        self.SR = 32000
        self.DEBUG = False

In [70]:
cfg = Config(use_case)

In [71]:
#bird_map = BirdCodeConverter(paths.INPUT_BIRD_MAP)

## Helper Functions

In [72]:
def load_file(path):
    try:
        waveform, sr = torchaudio.load(path)
        y = waveform.numpy()
        if y.shape[0] == 2:
            y = y.mean(axis=0)
        else:
            y = y.squeeze()

        loaded_length = len(y)//sr
        if loaded_length <= 1:
            print(f'Warning, the loaded clip length is only {loaded_length} seconds for the file {path}')
        return y, sr
    except Exception as e:
        print(f"Error processing file {path}: {e}")
        return None, None

In [73]:
def plot_continuous(df, column_name, x_max=None, x_min=None, bins=None):
    plt.figure(figsize=(6, 4))
    ax = sns.histplot(df[column_name], bins=bins, kde=True)
    plt.title(f'Distribution of {column_name} with {bins} Bins and KDE')
    plt.xlabel(column_name)
    ax.set(xlim=(x_min, x_max) if x_min is not None and x_max is not None else None)
    plt.ylabel('Count')
    plt.show()
   
def map_names(row, mapping):
    common_name = mapping[row['primary_label']][0]
    scientific_name = mapping[row['primary_label']][1]
    return pd.Series([common_name, scientific_name], index=['common_name', 'scientific_name'])

def process_time(x):
    if pd.isna(x):
        return np.NaN
    else:
        minutes, seconds = map(int, x.split(':'))
        return int(minutes * 60 + seconds)

## Load Metadata

In [74]:
df_marked = pd.read_parquet(paths.MARKED_LABELS, engine="pyarrow")
df_marked.head(3)

Unnamed: 0,filename,primary_label,collection,author,common_name,class,secondary_labels,duration,centres,date_reviewed
0,528041/CSA36365.ogg,528041,CSA,Fabio A. Sarria-S,Orophus conspersus,insecta,[''],112.417563,[4.1],2025-03-30 17:26:38.846914
1,528041/CSA36359.ogg,528041,CSA,Fabio A. Sarria-S,Orophus conspersus,insecta,[''],101.033469,[4.1],2025-03-30 17:26:52.355833
2,1139490/CSA36385.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],98.853375,[4.0],2025-03-30 17:27:03.258653


In [75]:
df_marked.shape

(20255, 10)

In [76]:
df = pd.read_parquet(paths.KAGGLE_LABELS_PLUS_DURATION, engine="pyarrow")
df.head(3)

Unnamed: 0,filename,primary_label,collection,author,common_name,class,secondary_labels,duration
0,1139490/CSA36385.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],98.853375
1,1139490/CSA36389.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],96.537719
2,1192948/CSA36358.ogg,1192948,CSA,Fabio A. Sarria-S,Oxyprora surinamensis,insecta,[''],116.599812


In [77]:
df.shape

(28564, 8)

Let's check an example

In [78]:
random_row = df.sample(n=1)
fn= str(paths.ORIGINAL_AUDIO / random_row['filename'].item())
duration = random_row['duration'].item()
print(f'The file {fn} has a duration of {duration} seconds')
Audio(fn, rate=32000)

The file /media/olly/Red_SSD/Kaggle_BC25/Data/Original_Data/birdclef-2025/train_audio/spepar1/iNat38721.ogg has a duration of 18.5 seconds


In [79]:
info = sf.info(fn)
print(f"Sample rate: {info.samplerate}")
print(f"Channels: {info.channels}")
print(f"Bit depth: {info.subtype}")  # This shows things like 'PCM_16', 'PCM_24'

Sample rate: 32000
Channels: 1
Bit depth: VORBIS


In [80]:
array, rate = load_file(fn)
array.shape

(592000,)

In [81]:
#sf.write("/home/olly/Desktop/example.ogg", array, rate)
import torch
audio_tensor = torch.unsqueeze(torch.from_numpy(array),0)
torchaudio.save("/home/olly/Desktop/example.ogg", audio_tensor, rate)

In [82]:
Audio("/home/olly/Desktop/example.ogg", rate=32000)

## Determine crop positions

- Only work with samples > 48 seconds
- If no overlap with the 48 position, then crop at 48
- If there is an overlap, crop at the top of that block of 12 seconds.
- The lists for the remaining part must be reset by that amount, and it goes back into the cue

In [83]:
df_long = df_marked[df_marked['duration']> 48].copy()
df_long.head(40)

Unnamed: 0,filename,primary_label,collection,author,common_name,class,secondary_labels,duration,centres,date_reviewed
0,528041/CSA36365.ogg,528041,CSA,Fabio A. Sarria-S,Orophus conspersus,insecta,[''],112.417563,[4.1],2025-03-30 17:26:38.846914
1,528041/CSA36359.ogg,528041,CSA,Fabio A. Sarria-S,Orophus conspersus,insecta,[''],101.033469,[4.1],2025-03-30 17:26:52.355833
2,1139490/CSA36385.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],98.853375,[4.0],2025-03-30 17:27:03.258653
3,1139490/CSA36389.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],96.537719,[4.6],2025-03-30 17:27:12.371260
4,1462711/CSA36371.ogg,1462711,CSA,Fabio A. Sarria-S,Cocconotus aratifrons,insecta,[''],107.817344,[4.1],2025-03-30 17:27:24.786244
5,1462711/CSA36379.ogg,1462711,CSA,Fabio A. Sarria-S,Cocconotus aratifrons,insecta,[''],99.432906,[4.4],2025-03-30 17:27:34.326689
6,1462711/CSA36390.ogg,1462711,CSA,Fabio A. Sarria-S,Cocconotus aratifrons,insecta,[''],98.302344,[4.5],2025-03-30 17:27:42.771245
7,1192948/CSA36358.ogg,1192948,CSA,Fabio A. Sarria-S,Oxyprora surinamensis,insecta,[''],116.599812,[4.2],2025-03-30 17:27:52.559547
8,1192948/CSA36366.ogg,1192948,CSA,Fabio A. Sarria-S,Oxyprora surinamensis,insecta,[''],105.446313,[4.5],2025-03-30 17:28:02.315068
9,1192948/CSA36373.ogg,1192948,CSA,Fabio A. Sarria-S,Oxyprora surinamensis,insecta,[''],103.631469,[4.2],2025-03-30 17:28:11.034505


In [84]:
df_long.shape

(5861, 10)

That's 5861 samples over 48 seconds long.  Breaking these into multiple samples will have quite a big impact on the training distribution.  Let's work with this sample from row 28 to come up with a suitable process.

In [85]:
row = df_long.iloc[28]
row

filename                                    476537/CSA35459.ogg
primary_label                                            476537
collection                                                  CSA
author              Eliana Barona-Cortés | Daniela García-Cobos
common_name                                Colombian Plump Frog
class                                                  amphibia
secondary_labels                                           ['']
duration                                                157.296
centres               [5.4, 21.0, 33.9, 47.0, 61.9, 74.0, 80.4]
date_reviewed                        2025-03-31 07:24:40.504696
Name: 29, dtype: object

In [86]:
centres = row['centres'].tolist()
duration = row['duration']
centres

[5.4, 21.0, 33.9, 47.0, 61.9, 74.0, 80.4]

In [87]:
duration

157.296

In [88]:
[print(num) for num in range(0,120,20)]

0
20
40
60
80
100


[None, None, None, None, None, None]

In [89]:
default_length = 48
buffer = 6

def crop_with_centres(duration, centres, default_length, buffer):
    crop_centres = [c for c in centres if c < default_length - buffer]
    remaining_centres = [v for v in centres if v >= default_length - buffer]
    last_crop_centre = max(crop_centres, default=default_length - buffer)
    next_crop_centre = min(remaining_centres, default=default_length + buffer)
    crop_length = last_crop_centre + buffer
    new_centres = [round(c - next_crop_centre + buffer,2) for c in remaining_centres] #So the first should always = buffer
    new_duration = round(duration - crop_length,2)
    next_start = round(next_crop_centre - buffer,2)
    return next_start, crop_length, crop_centres, new_duration, new_centres


def split_row(row, default_length, buffer):
    #print(row['centres'])
    centres = row['centres'].tolist()
    duration = row['duration']

    crop_dict = {}
    counter = 0
    position = 0
    if isinstance(centres, list) and centres:
        while centres:
            next_start, crop_length, crop_centres,  duration, centres = crop_with_centres(duration, centres, default_length, buffer)
            new_fn = row['filename'].removesuffix(".ogg") + '_' + str(counter) + '.ogg'
            crop_dict[new_fn] = (position, crop_length, crop_centres)
            position = next_start
            counter +=1
    else:
        #do cropping uniformly, dict has None for the list
        for idx, start_time in enumerate(range(0, int(duration), default_length)):
            if duration - start_time > 2* buffer: #don't bother using anything less than 12 seconds
                new_fn = new_fn = row['filename'].removesuffix(".ogg") + '_' + str(idx) + '.ogg'
                length = default_length if duration - start_time > default_length else round(duration - start_time,1)
                crop_dict[new_fn] = (start_time, length, None)

    return crop_dict

crop_dicts = {}
for _, row in df_long.iterrows():
    orignal_fn = row['filename']
    crop_dict = split_row(row, default_length, buffer)
    crop_dicts[orignal_fn] = crop_dict

len(crop_dicts)


5861

These were from the CSA collection, where there was only a short amount of usable sound at the beginning, but the segment was really long.  So it's correct

In [90]:
for idx, key in enumerate(crop_dicts):
    if idx < 5:
        print(crop_dicts[key])

{'528041/CSA36365_0.ogg': (0, 10.1, [4.1])}
{'528041/CSA36359_0.ogg': (0, 10.1, [4.1])}
{'1139490/CSA36385_0.ogg': (0, 10.0, [4.0])}
{'1139490/CSA36389_0.ogg': (0, 10.6, [4.6])}
{'1462711/CSA36371_0.ogg': (0, 10.1, [4.1])}


In [91]:
for idx, key in enumerate(crop_dicts):
    if 100 < idx < 105:
        print(crop_dicts[key])

{'52884/CSA35167_0.ogg': (0, 38.0, [8.0, 20.0, 32.0]), '52884/CSA35167_1.ogg': (38.0, 36.0, [6.0, 18.0, 30.0]), '52884/CSA35167_2.ogg': (36.0, 36.0, [6.0, 18.0, 30.0]), '52884/CSA35167_3.ogg': (36.0, 36.0, [6.0, 18.0, 30.0]), '52884/CSA35167_4.ogg': (36.0, 12.0, [6.0])}
{'52884/CSA18799_0.ogg': (0, 44.1, [7.5, 19.5, 27.6, 38.1]), '52884/CSA18799_1.ogg': (63.4, 41.2, [6.0, 21.8, 35.2]), '52884/CSA18799_2.ogg': (61.9, 45.1, [6.0, 13.8, 24.7, 39.1]), '52884/CSA18799_3.ogg': (50.0, 12.0, [6.0])}
{'52884/CSA35130_0.ogg': (0, 39.2, [9.2, 21.2, 33.2]), '52884/CSA35130_1.ogg': (39.2, 36.0, [6.0, 18.0, 30.0]), '52884/CSA35130_2.ogg': (36.0, 36.0, [6.0, 18.0, 30.0]), '52884/CSA35130_3.ogg': (36.0, 12.0, [6.0])}
{'52884/CSA11080_0.ogg': (0, 46.7, [16.7, 28.7, 40.7]), '52884/CSA11080_1.ogg': (46.7, 36.0, [6.0, 18.0, 30.0]), '52884/CSA11080_2.ogg': (36.0, 36.0, [6.0, 18.0, 30.0]), '52884/CSA11080_3.ogg': (36.0, 36.0, [6.0, 18.0, 30.0])}


In [92]:
for idx, key in enumerate(crop_dicts):
    if 3000 < idx < 3005:
        print(crop_dicts[key])

{'yercac1/XC842227_0.ogg': (0, 48, None), 'yercac1/XC842227_1.ogg': (48, 48, None), 'yercac1/XC842227_2.ogg': (96, 48, None)}
{'yercac1/XC122293_0.ogg': (0, 48, None), 'yercac1/XC122293_1.ogg': (48, 48, None), 'yercac1/XC122293_2.ogg': (96, 46.7, None)}
{'yercac1/XC129007_0.ogg': (0, 48, None), 'yercac1/XC129007_1.ogg': (48, 48, None), 'yercac1/XC129007_2.ogg': (96, 41.8, None)}
{'yercac1/XC59783_0.ogg': (0, 48, None), 'yercac1/XC59783_1.ogg': (48, 48, None), 'yercac1/XC59783_2.ogg': (96, 38.9, None)}


In [93]:
for idx, key in enumerate(crop_dicts):
    if idx > 5856:
        print(crop_dicts[key])

{'grekis/XC806345_0.ogg': (0, 48, None)}
{'grekis/iNat926084_0.ogg': (0, 48, None)}
{'grekis/XC518343_0.ogg': (0, 48, None)}
{'grekis/XC317696_0.ogg': (0, 48, None)}


## Modify the labels dataframe

In [94]:
def process_row(row, crop_dict):
    rows = []
    for crop_fn in crop_dict:
        new_row = row.copy()  # important! don't modify the original
        new_row['start_position'] = crop_dict[crop_fn][0]
        new_row['duration'] = crop_dict[crop_fn][1]
        new_row['centres'] = crop_dict[crop_fn][2]
        new_row['original_file'] = row['filename']
        new_row['filename'] = crop_fn
        rows.append(new_row)

    return rows

new_rows = []
for _, row in df_long.iterrows():
    orignal_fn = row['filename']
    crops = crop_dicts[orignal_fn]
    replacements = process_row(row, crops)  # returns list of pd.Series
    new_rows.extend(replacements)

new_df = pd.DataFrame(new_rows)
new_df.head()

Unnamed: 0,filename,primary_label,collection,author,common_name,class,secondary_labels,duration,centres,date_reviewed,start_position,original_file
0,528041/CSA36365_0.ogg,528041,CSA,Fabio A. Sarria-S,Orophus conspersus,insecta,[''],10.1,[4.1],2025-03-30 17:26:38.846914,0.0,528041/CSA36365.ogg
1,528041/CSA36359_0.ogg,528041,CSA,Fabio A. Sarria-S,Orophus conspersus,insecta,[''],10.1,[4.1],2025-03-30 17:26:52.355833,0.0,528041/CSA36359.ogg
2,1139490/CSA36385_0.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],10.0,[4.0],2025-03-30 17:27:03.258653,0.0,1139490/CSA36385.ogg
3,1139490/CSA36389_0.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],10.6,[4.6],2025-03-30 17:27:12.371260,0.0,1139490/CSA36389.ogg
4,1462711/CSA36371_0.ogg,1462711,CSA,Fabio A. Sarria-S,Cocconotus aratifrons,insecta,[''],10.1,[4.1],2025-03-30 17:27:24.786244,0.0,1462711/CSA36371.ogg


In [95]:
new_df.iloc[1000:1015]

Unnamed: 0,filename,primary_label,collection,author,common_name,class,secondary_labels,duration,centres,date_reviewed,start_position,original_file
1166,crbtan1/XC236091_1.ogg,crbtan1,XC,Niels Krabbe,Crimson-backed Tanager,aves,[''],36.0,"[6.0, 18.0, 30.0]",2025-04-01 07:39:09.965107,45.6,crbtan1/XC236091.ogg
1166,crbtan1/XC236091_2.ogg,crbtan1,XC,Niels Krabbe,Crimson-backed Tanager,aves,[''],18.1,"[6.0, 12.1]",2025-04-01 07:39:09.965107,36.0,crbtan1/XC236091.ogg
1167,crbtan1/XC373218_0.ogg,crbtan1,XC,Kent Livezey,Crimson-backed Tanager,aves,[''],42.3,"[2.2, 12.3, 24.3, 36.3]",2025-04-01 07:39:40.670814,0.0,crbtan1/XC373218.ogg
1167,crbtan1/XC373218_1.ogg,crbtan1,XC,Kent Livezey,Crimson-backed Tanager,aves,[''],39.2,"[6.0, 18.0, 33.2]",2025-04-01 07:39:40.670814,42.3,crbtan1/XC373218.ogg
1168,labter1/XC384529_0.ogg,labter1,XC,Jan Cubilla,Large-billed Tern,aves,[''],36.8,"[6.8, 18.8, 30.8]",2025-04-01 07:39:57.817478,0.0,labter1/XC384529.ogg
1168,labter1/XC384529_1.ogg,labter1,XC,Jan Cubilla,Large-billed Tern,aves,[''],43.4,"[6.0, 18.0, 30.0, 37.4]",2025-04-01 07:39:57.817478,36.8,labter1/XC384529.ogg
1169,labter1/XC186487_0.ogg,labter1,XC,João Gava,Large-billed Tern,aves,[''],46.8,"[4.8, 16.8, 28.8, 40.8]",2025-04-01 07:40:09.450959,0.0,labter1/XC186487.ogg
1169,labter1/XC186487_1.ogg,labter1,XC,João Gava,Large-billed Tern,aves,[''],24.0,"[6.0, 18.0]",2025-04-01 07:40:09.450959,46.8,labter1/XC186487.ogg
1170,labter1/XC356722_0.ogg,labter1,XC,Miguel Angel Torres,Large-billed Tern,aves,[''],47.1,"[5.1, 17.1, 29.1, 41.1]",2025-04-01 07:40:24.900409,0.0,labter1/XC356722.ogg
1170,labter1/XC356722_1.ogg,labter1,XC,Miguel Angel Torres,Large-billed Tern,aves,[''],19.4,"[6.0, 13.4]",2025-04-01 07:40:24.900409,47.1,labter1/XC356722.ogg


## Validate Metadata

In [96]:
non_numeric_values = {}
columns_to_check = ['duration', 'start_position']
for col in columns_to_check:
    numeric_values = pd.to_numeric(new_df[col], errors='coerce')
    non_numeric_values[col] = new_df[numeric_values.isna()]

for col, non_numeric_df in non_numeric_values.items():
    if len(non_numeric_df) != 0:
        print(f"Non-numeric values in {col}:")
        print(non_numeric_df)
    else:
        print(Go.S + f'There are no non-numeric values in {col}' + Go.E)

[1m[32mThere are no non-numeric values in duration[0m
[1m[32mThere are no non-numeric values in start_position[0m


In [97]:
new_df.describe()

Unnamed: 0,duration,date_reviewed,start_position
count,14156.0,4422,14156.0
mean,39.639771,2025-04-02 18:35:38.755640064,52.351556
min,8.4,2025-03-30 17:26:38.846914,0.0
25%,36.0,2025-04-01 09:12:57.903235072,0.0
50%,48.0,2025-04-02 15:40:46.719691008,36.0
75%,48.0,2025-04-03 20:04:07.631607040,48.0
max,48.0,2025-04-06 10:44:38.913199,1728.0
std,11.602878,,106.635509


## Crop and save the new files

In [98]:
paths.MODIFIED_AUDIO

PosixPath('/media/olly/Red_SSD/Kaggle_BC25/Data/Cropped_Train_Audio')

In [99]:
unique_labels = sorted(list(df['primary_label'].unique()))
for bird_e_name in unique_labels:
    sub_folder_path = paths.MODIFIED_AUDIO / bird_e_name
    sub_folder_path.mkdir(exist_ok=True)

In [100]:
def process_clip(item_tuple, 
                 in_folder = paths.ORIGINAL_AUDIO,
                 out_folder = paths.MODIFIED_AUDIO,
                 default_sr = cfg.SR):
    rel_path, df = item_tuple
    in_path = str(in_folder / rel_path)

    #print(in_path)
    y, sr = load_file(in_path)

    df['start'] = (df['start_position'] * sr).astype(int)
    df['end'] = (df['start'] + df['duration'] * sr).astype(int)

    if y is not None:
        for _, row in df.iterrows():
            start=row.start
            end = row.end
            if end > len(y):
                end = len(y)
            part = y[start:end]
            if len(part) >=1:
                out_path = str(out_folder / row.filename)
                if sr != default_sr:
                    num_samples = int(len(part) * default_sr / sr)
                    part = resample(part, num_samples)
                sf.write(out_path, part, default_sr)
    return

In [101]:
if cfg.SAVE_SOUNDFILES:
   grouped_df = new_df.groupby('original_file')
   Parallel(n_jobs=6)(delayed(process_clip)(group) for group in tqdm(grouped_df))


Let's remove any samples from the dataframe that for any reason didn't get saved to their expected locations

In [102]:
if cfg.SAVE_SOUNDFILES:
    original_length = len(new_df)
    training_samples = set([path.parent.name + '/' + path.name for path in Path(paths.MODIFIED_AUDIO).rglob('*') if path.suffix in {'.ogg', '.flac'}])

    new_df = new_df[new_df['filename'].isin(training_samples)]
    new_length = len(new_df)

    print(Blue.S + 'The original length was: ' +Blue.E, original_length)
    print(Blue.S + 'The final length was: ' +Blue.E, new_length)
    if original_length > new_length:
        print(Blue.S + 'Samples removed: ' + Blue.E, original_length - new_length)

Finally reshape the dataframe for training.  Keep the original filename in case I want to use this for split selection.


In [103]:
df_out = new_df[['filename', 'primary_label', 'secondary_labels', 'class', 'centres', 'original_file']]
df_out.head()

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file
0,528041/CSA36365_0.ogg,528041,[''],insecta,[4.1],528041/CSA36365.ogg
1,528041/CSA36359_0.ogg,528041,[''],insecta,[4.1],528041/CSA36359.ogg
2,1139490/CSA36385_0.ogg,1139490,[''],insecta,[4.0],1139490/CSA36385.ogg
3,1139490/CSA36389_0.ogg,1139490,[''],insecta,[4.6],1139490/CSA36389.ogg
4,1462711/CSA36371_0.ogg,1462711,[''],insecta,[4.1],1462711/CSA36371.ogg


In [104]:
df_out.to_parquet(paths.CROP_LABELS_PATH, index=False)

In [105]:
df_check = pd.read_parquet(paths.CROP_LABELS_PATH, engine="pyarrow")
df_check.head()

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file
0,528041/CSA36365_0.ogg,528041,[''],insecta,[4.1],528041/CSA36365.ogg
1,528041/CSA36359_0.ogg,528041,[''],insecta,[4.1],528041/CSA36359.ogg
2,1139490/CSA36385_0.ogg,1139490,[''],insecta,[4.0],1139490/CSA36385.ogg
3,1139490/CSA36389_0.ogg,1139490,[''],insecta,[4.6],1139490/CSA36389.ogg
4,1462711/CSA36371_0.ogg,1462711,[''],insecta,[4.1],1462711/CSA36371.ogg


In [106]:
df_check.shape

(14156, 6)