# Final Data Exploration & Preprocessing

### Observations

### Data Preprocessing Steps
- Modify the filenames to be relative to the data folder
- Combine all datasets into a single format
- Replot label distributions
- Remove any unwanted labels from the secondaries, or anything not in the primaries

In [18]:
use_case = {
            'experiment' : 1,
            'project_root': '/home/olly/Desktop/Kaggle_BC25',
            #'project_root': r'C:\Users\ollyp\OneDrive\Desktop\Kaggle_BC25'
            #'project_root': '/media/olly/Red_SSD/Kaggle_BC25',
            }

In [19]:
from pathlib import Path
import pandas as pd
import plotly.express as px
from joblib import Parallel, delayed
import json
from tqdm.notebook import tqdm
from collections import Counter
from IPython.display import Audio
import ast
import torchaudio
import torch
import numpy as np

In [20]:
class FilePaths:
    def __init__(self, options=None):
        _project_dir = Path(options['project_root'])
        self.DATA_FOLDER = _project_dir / 'Data'
        self.AUDIO_FOLDER = self.DATA_FOLDER / 'Original_Data' / 'birdclef-2025' / 'train_audio'
        self.MODIFIED_AUDIO = self.DATA_FOLDER / 'Cropped_Train_Audio'
        self.MODIFIED_AUDIO_EXTRAS = self.DATA_FOLDER / 'Cropped_Train_Audio_Extras'
        self.KAGGLE_LABELS = self.DATA_FOLDER / 'Train_Metadata/aux_labels.parquet'
        self.CROP_LABELS_PATH = self.DATA_FOLDER / 'Train_Metadata/cropped_audio_labels.parquet'
        self.EXTRA_LABELS_PATH = self.DATA_FOLDER / 'Train_Metadata/extra_cropped_audio_labels.parquet'
        self.MARKED_LABELS = self.DATA_FOLDER / 'Train_Metadata' / 'marked_labels.parquet'
        self.TRAIN_LABELS_PATH = self.DATA_FOLDER / 'Train_Metadata/train.parquet'
        self.OUTPUT_NAMING_CSV_PATH = self.DATA_FOLDER / 'Train_Metadata/naming.csv'
        self.OUTPUT_JSON_PATH = self.DATA_FOLDER / 'Train_Metadata/species_names.json'
        self.SHORT_FILES_PATH = self.DATA_FOLDER / 'Train_Metadata/audio_labels_under_6_sec.parquet'

paths = FilePaths(options=use_case)

In [21]:
df_cropped = pd.read_parquet(paths.CROP_LABELS_PATH, engine="pyarrow")
df_cropped.head(3)

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file
0,528041/CSA36365_0.ogg,528041,[''],insecta,[4.1],528041/CSA36365.ogg
1,528041/CSA36359_0.ogg,528041,[''],insecta,[4.1],528041/CSA36359.ogg
2,1139490/CSA36385_0.ogg,1139490,[''],insecta,[4.0],1139490/CSA36385.ogg


In [22]:
df_extra = pd.read_parquet(paths.EXTRA_LABELS_PATH, engine="pyarrow")
df_extra.head(20)

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file
0,turvul/XC381486_0.flac,turvul,,aves,"[6.0, 17.8, 29.5, 41.2]",turvul/XC381486.mp3
1,turvul/XC381486_1.flac,turvul,,aves,"[6.0, 17.8, 29.5, 41.2]",turvul/XC381486.mp3
2,turvul/XC381486_2.flac,turvul,,aves,"[6.0, 17.8, 29.5, 41.2]",turvul/XC381486.mp3
3,turvul/XC381486_3.flac,turvul,,aves,[6.0],turvul/XC381486.mp3
4,turvul/XC520288_0.flac,turvul,,aves,"[6.0, 17.4, 28.7, 40.1]",turvul/XC520288.mp3
5,turvul/XC520288_1.flac,turvul,,aves,"[6.0, 17.3, 28.7, 40.1]",turvul/XC520288.mp3
6,turvul/XC520288_2.flac,turvul,,aves,"[6.0, 17.4]",turvul/XC520288.mp3
7,1139490/2391_0.flac,1139490,,insecta,"[6.0, 16.6, 27.2, 37.8]",1139490/2391.wav
8,1139490/2391_1.flac,1139490,,insecta,"[6.0, 16.6, 27.2, 37.8]",1139490/2391.wav
9,66578/Pristimantis_bogotensis15_0.flac,66578,,amphibia,[12.8],66578/Pristimantis_bogotensis15.wav


In [23]:
df_marked= pd.read_parquet(paths.MARKED_LABELS, engine="pyarrow")
df_marked.head(3)

Unnamed: 0,filename,primary_label,collection,author,common_name,class,secondary_labels,duration,centres,date_reviewed
0,528041/CSA36365.ogg,528041,CSA,Fabio A. Sarria-S,Orophus conspersus,insecta,[''],112.417563,[4.1],2025-03-30 17:26:38.846914
1,528041/CSA36359.ogg,528041,CSA,Fabio A. Sarria-S,Orophus conspersus,insecta,[''],101.033469,[4.1],2025-03-30 17:26:52.355833
2,1139490/CSA36385.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],98.853375,[4.0],2025-03-30 17:27:03.258653


In [24]:
df_marked.shape

(20255, 10)

In [25]:
df_cropped['primary_label'].nunique()

185

In [26]:
df_marked['primary_label'].nunique()

201

In [27]:
df_original = pd.read_parquet(paths.KAGGLE_LABELS, engine='pyarrow')
df_original.head(3)

Unnamed: 0,filename,primary_label,collection,author,common_name,class,secondary_labels,duration
0,1139490/CSA36385.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],98.853375
1,1139490/CSA36389.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],96.537719
2,1192948/CSA36358.ogg,1192948,CSA,Fabio A. Sarria-S,Oxyprora surinamensis,insecta,[''],116.599812


In [28]:
df_original.shape

(28564, 8)

First job is to remove the rows in the marked dataframe that have been split up.

In [29]:
use_cols = ['filename', 'primary_label', 'secondary_labels', 'class', 'centres']
df_marked= df_marked[~df_marked['filename'].isin(df_cropped['original_file'])]
df_marked_filtered = df_marked[use_cols].copy()
df_marked_filtered['original_file'] = df_marked_filtered['filename']
df_marked_filtered.head(3)


Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file
26,548639/CSA34187.ogg,548639,[''],amphibia,"[2.6, 10.7]",548639/CSA34187.ogg
34,1194042/CSA18802.ogg,1194042,[''],insecta,"[5.0, 18.9, 31.4]",1194042/CSA18802.ogg
35,1194042/CSA18783.ogg,1194042,[''],insecta,"[3.5, 11.5, 23.7]",1194042/CSA18783.ogg


In [30]:
df_marked_filtered.shape

(14394, 6)

We're still missing all the shorter samples that are neither marked nor cropped!

In [31]:
df_original['centres'] = None
df_original= df_original[~df_original['filename'].isin(df_cropped['original_file'])]
df_original= df_original[~df_original['filename'].isin(df_marked_filtered['original_file'])]
df_original_filtered = df_original[use_cols+['duration']].copy()

In [32]:
df_original_filtered

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,duration
32,135045/iNat327127.ogg,135045,[''],amphibia,,9.000000
44,1564122/CSA34195.ogg,1564122,[''],insecta,,2.033969
45,1564122/CSA34196.ogg,1564122,[''],insecta,,1.180687
46,1564122/CSA34197.ogg,1564122,[''],insecta,,1.907312
47,1564122/CSA34198.ogg,1564122,[''],insecta,,0.994375
...,...,...,...,...,...,...
28540,ywcpar/iNat1169084.ogg,ywcpar,[''],aves,,8.480000
28546,ywcpar/iNat267843.ogg,ywcpar,[''],aves,,4.395844
28559,ywcpar/iNat77392.ogg,ywcpar,[''],aves,,3.392000
28560,ywcpar/iNat78624.ogg,ywcpar,[''],aves,,7.659688


Now let's extend the filename column to cope with different directories for the new audio crops

In [33]:
parent_folders = '/'.join(paths.AUDIO_FOLDER.parts[-3:]) + '/'
df_original_filtered['original_file'] = df_original_filtered['filename']
df_original_filtered['filename'] = parent_folders + df_original_filtered['filename']
df_marked_filtered['filename'] = parent_folders + df_marked_filtered['filename']
df_marked_filtered.head(3)

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file
26,Original_Data/birdclef-2025/train_audio/548639...,548639,[''],amphibia,"[2.6, 10.7]",548639/CSA34187.ogg
34,Original_Data/birdclef-2025/train_audio/119404...,1194042,[''],insecta,"[5.0, 18.9, 31.4]",1194042/CSA18802.ogg
35,Original_Data/birdclef-2025/train_audio/119404...,1194042,[''],insecta,"[3.5, 11.5, 23.7]",1194042/CSA18783.ogg


In [34]:
df_original_filtered.head()

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,duration,original_file
32,Original_Data/birdclef-2025/train_audio/135045...,135045,[''],amphibia,,9.0,135045/iNat327127.ogg
44,Original_Data/birdclef-2025/train_audio/156412...,1564122,[''],insecta,,2.033969,1564122/CSA34195.ogg
45,Original_Data/birdclef-2025/train_audio/156412...,1564122,[''],insecta,,1.180687,1564122/CSA34196.ogg
46,Original_Data/birdclef-2025/train_audio/156412...,1564122,[''],insecta,,1.907312,1564122/CSA34197.ogg
47,Original_Data/birdclef-2025/train_audio/156412...,1564122,[''],insecta,,0.994375,1564122/CSA34198.ogg


In [35]:
df_marked_filtered.shape

(14394, 6)

In [36]:
parent_folders = '/'.join(paths.MODIFIED_AUDIO.parts[-1:]) + '/'
df_cropped['filename'] = parent_folders + df_cropped['filename']
df_cropped.head(3)

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file
0,Cropped_Train_Audio/528041/CSA36365_0.ogg,528041,[''],insecta,[4.1],528041/CSA36365.ogg
1,Cropped_Train_Audio/528041/CSA36359_0.ogg,528041,[''],insecta,[4.1],528041/CSA36359.ogg
2,Cropped_Train_Audio/1139490/CSA36385_0.ogg,1139490,[''],insecta,[4.0],1139490/CSA36385.ogg


In [37]:
parent_folders = '/'.join(paths.MODIFIED_AUDIO_EXTRAS.parts[-1:]) + '/'
df_extra['filename'] = parent_folders + df_extra['filename']
df_extra.head(3)

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file
0,Cropped_Train_Audio_Extras/turvul/XC381486_0.flac,turvul,,aves,"[6.0, 17.8, 29.5, 41.2]",turvul/XC381486.mp3
1,Cropped_Train_Audio_Extras/turvul/XC381486_1.flac,turvul,,aves,"[6.0, 17.8, 29.5, 41.2]",turvul/XC381486.mp3
2,Cropped_Train_Audio_Extras/turvul/XC381486_2.flac,turvul,,aves,"[6.0, 17.8, 29.5, 41.2]",turvul/XC381486.mp3


In [38]:
df = pd.concat([df_marked_filtered, df_original_filtered, df_cropped, df_extra], ignore_index=True)
df.head()

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file,duration
0,Original_Data/birdclef-2025/train_audio/548639...,548639,[''],amphibia,"[2.6, 10.7]",548639/CSA34187.ogg,
1,Original_Data/birdclef-2025/train_audio/119404...,1194042,[''],insecta,"[5.0, 18.9, 31.4]",1194042/CSA18802.ogg,
2,Original_Data/birdclef-2025/train_audio/119404...,1194042,[''],insecta,"[3.5, 11.5, 23.7]",1194042/CSA18783.ogg,
3,Original_Data/birdclef-2025/train_audio/119404...,1194042,[''],insecta,[3.4],1194042/CSA18794.ogg,
4,Original_Data/birdclef-2025/train_audio/134650...,1346504,[''],insecta,"[5.1, 19.4, 30.9]",1346504/CSA18784.ogg,


In [39]:
df.shape

(36880, 7)

In [40]:
def plot_class_distribution(df, plot_col='primary_label', title='primary_label frequency', height=400):
    total_counts = df[plot_col].value_counts().reset_index()
    total_counts.columns = [plot_col, 'total_count']
    fig = px.bar(total_counts, x=total_counts[plot_col], log_y=True, y=total_counts.total_count, template='seaborn',
    hover_data=[plot_col, 'total_count'], color=plot_col, height=height)
    fig.update_layout(title=title)
    fig.update_layout(showlegend=False)
    fig.update_xaxes(categoryorder='total descending')
    fig.show()

plot_class_distribution(df, plot_col='primary_label', title='Frequency by primary_label')

The good news here is that none of the most common classes have increased in frequncy, but a few of the rarest ones have, from the splitting process on longer samples.

In [41]:
plot_class_distribution(df[df['class']=='aves'], plot_col='primary_label', title='Frequency by primary_label')

In [42]:
plot_class_distribution(df[df['class']=='insecta'], plot_col='primary_label', title='Insecta Label Counts')

In [43]:
plot_class_distribution(df[df['class']=='amphibia'], plot_col='primary_label', title='Amphibia Label Counts')

In [44]:
plot_class_distribution(df[df['class']=='mamalia'], plot_col='primary_label', title='Mammalia Label Counts')

## Get the lengths of each sample

In [45]:
secondary_labels = df['secondary_labels'].to_list()
seconds_list = [eval(string) for string in secondary_labels]
flattened_seconds = [item for sublist in seconds_list for item in sublist]
unique_seconds = sorted(list(set(flattened_seconds)))
len(unique_seconds)

TypeError: eval() arg 1 must be a string, bytes or code object

In [None]:
unique_seconds

NameError: name 'unique_seconds' is not defined

In [None]:
primary_list = df['primary_label'].unique()
extra_seconds = set(unique_seconds) - set(primary_list)
extra_seconds

NameError: name 'unique_seconds' is not defined

In [None]:
def filter_list(secondaries, allowed):
    if not isinstance(secondaries, (list, str)):
        return secondaries  # Return as is if it's not a list or string
    # Ensure secondaries is a list if it's a string
    if isinstance(secondaries, str):
        secondaries = eval(secondaries)  # Convert string representation of list to actual list
    # Filter the list
    filtered_list = [item for item in secondaries if item in allowed]
    return str(filtered_list)  # Convert back to string if needed

df['secondary_labels'] = df['secondary_labels'].apply(lambda x: filter_list(x, primary_list))

NameError: name 'primary_list' is not defined

In [46]:
secondary_labels = df['secondary_labels'].to_list()
seconds_list = [eval(string) for string in secondary_labels]
flattened_seconds = [item for sublist in seconds_list for item in sublist]
item_counts = dict(Counter(flattened_seconds))
total_secondaries = len(flattened_seconds)
total_secondaries

TypeError: eval() arg 1 must be a string, bytes or code object

In [47]:
df.head()

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file,duration
0,Original_Data/birdclef-2025/train_audio/548639...,548639,[''],amphibia,"[2.6, 10.7]",548639/CSA34187.ogg,
1,Original_Data/birdclef-2025/train_audio/119404...,1194042,[''],insecta,"[5.0, 18.9, 31.4]",1194042/CSA18802.ogg,
2,Original_Data/birdclef-2025/train_audio/119404...,1194042,[''],insecta,"[3.5, 11.5, 23.7]",1194042/CSA18783.ogg,
3,Original_Data/birdclef-2025/train_audio/119404...,1194042,[''],insecta,[3.4],1194042/CSA18794.ogg,
4,Original_Data/birdclef-2025/train_audio/134650...,1346504,[''],insecta,"[5.1, 19.4, 30.9]",1346504/CSA18784.ogg,


In [48]:
df.tail()

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file,duration
36875,Cropped_Train_Audio_Extras/turvul/XC764680.flac,turvul,,aves,[3.4],turvul/XC764680.wav,
36876,Cropped_Train_Audio_Extras/turvul/XC552488.flac,turvul,,aves,[3.2],turvul/XC552488.mp3,
36877,Cropped_Train_Audio_Extras/66016/vaillanti-esc...,66016,,amphibia,[0.6],66016/vaillanti-escape3.mp3,
36878,Cropped_Train_Audio_Extras/66016/vaillanti-esc...,66016,,amphibia,[0.6],66016/vaillanti-escape1.mp3,
36879,Cropped_Train_Audio_Extras/66016/vaillanti-esc...,66016,,amphibia,[0.8],66016/vaillanti-escape4.mp3,


In [49]:
df.shape

(36880, 7)

Now to remove any files that were found to be erroneous

In [50]:
df.shape

(36880, 7)

In [51]:
bad_files = [
    'cargra1/iNat969137.ogg', 
    #41663/iNat181697.ogg  (actually I think it's the one after this, a racoon)  
    '41663/iNat1187502.ogg', 
    '41663/iNat1001216.ogg',
    'gybmar/XC277037.ogg',
    'babwar/iNat399853.ogg',
    'babwar/iNat247468.ogg'
    ]

df = df[~df['original_file'].isin(bad_files)]
df.shape

(36874, 7)

In [52]:
common_labels = df['primary_label'].value_counts()[lambda x: x > 600].index.tolist()
common_labels

['grekis',
 'compau',
 'trokin',
 'roahaw',
 'whtdov',
 'laufal1',
 'banana',
 'socfly1',
 'yeofly1',
 'trsowl',
 'wbwwre1',
 'bobfly1']

In [53]:
len(common_labels)

12

In [55]:
next_common_labels = df['primary_label'].value_counts()[lambda x: (x > 150) & (x <= 600) ].index.tolist()
print(next_common_labels)

['sobtyr1', 'yercac1', 'soulap1', 'gycwor1', 'saffin', 'strcuc1', 'tropar', 'compot1', 'amekes', 'bubwre1', 'blbgra1', 'bbwduc', 'strfly1', 'bkmtou1', 'pirfly1', 'littin1', 'linwoo1', 'greegr', 'butsal1', 'chbant1', 'speowl1', 'whbman1', 'smbani', 'bugtan', 'yecspi2', 'yebela1', 'creoro1', 'rutjac1', 'paltan1', 'rumfly1', 'stbwoo2', 'cotfly1', 'yehcar1', 'solsan', 'orcpar', 'rinkin1', 'colcha1', 'blhpar1', 'bkcdon', 'cocwoo1', 'yebsee1', 'babwar', 'plbwoo1', 'strowl1', 'yebfly1', 'crcwoo1', 'baymac', 'thbeup1', 'blbwre1', 'ywcpar', 'purgal2', 'gohman1', 'y00678', 'greani1', 'rtlhum', 'pavpig2', 'secfly1', 'chfmac1', 'gybmar', 'blcjay1', 'whfant1', 'strher', 'cattyr', 'rugdov', 'grnkin', 'whbant1', 'watjac1']


In [56]:
len(next_common_labels)

67

In [None]:
rare_labels = df['primary_label'].value_counts()[lambda x: x < 20].index.tolist()
print(rare_labels)

['shghum1', '65349', '41970', 'sahpar1', '21038', '64862', 'plctan1', '1462737', '24272', '1346504', '41778', '24292', '787625', '65547', '126247', '65336', '555142', '1564122', '46010', '476537', '66016', '548639', '714022', '963335', '868458', '66893', '476538', '523060', '134933', '66578', '1139490', '1192948', '65419', '1194042', '1462711', '66531', '42113', '42087', '21116', '528041', '47067', '81930', '67082']


In [None]:
print(len(rare_labels))

43


So now we've got actual lists in the secondary labels instead of a string.   Let's save this out in .parquet form to keep the list structures intact.

In [None]:
df.to_parquet(paths.TRAIN_LABELS_PATH, index=False, engine='pyarrow')

In [None]:
df_check = pd.read_parquet(paths.TRAIN_LABELS_PATH, engine="pyarrow")
df_check.head()

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file,duration
0,Original_Data/birdclef-2025/train_audio/548639...,548639,[],amphibia,"[2.6, 10.7]",548639/CSA34187.ogg,
1,Original_Data/birdclef-2025/train_audio/119404...,1194042,[],insecta,"[5.0, 18.9, 31.4]",1194042/CSA18802.ogg,
2,Original_Data/birdclef-2025/train_audio/119404...,1194042,[],insecta,"[3.5, 11.5, 23.7]",1194042/CSA18783.ogg,
3,Original_Data/birdclef-2025/train_audio/119404...,1194042,[],insecta,[3.4],1194042/CSA18794.ogg,
4,Original_Data/birdclef-2025/train_audio/134650...,1346504,[],insecta,"[5.1, 19.4, 30.9]",1346504/CSA18784.ogg,


In [None]:
df_check['primary_label'].nunique()

206

In [None]:
df_check.shape

(36853, 7)

Finally let's get a list of all files that are less than 6 seconds, for concatenation during training.

In [None]:
short_files = df_original[df_original['duration'] <=6]['filename'].to_list()
short_files[:5]

['1564122/CSA34195.ogg',
 '1564122/CSA34196.ogg',
 '1564122/CSA34197.ogg',
 '1564122/CSA34198.ogg',
 '1564122/CSA34199.ogg']

In [None]:
len(short_files)

3203

In [None]:
original_list = df['original_file'].to_list()
original_list[:5]

['548639/CSA34187.ogg',
 '1194042/CSA18802.ogg',
 '1194042/CSA18783.ogg',
 '1194042/CSA18794.ogg',
 '1346504/CSA18784.ogg']

In [None]:
short_files_df = df[df['original_file'].isin(short_files)]
short_files_df.head()

Unnamed: 0,filename,primary_label,secondary_labels,class,centres,original_file,duration
14395,Original_Data/birdclef-2025/train_audio/156412...,1564122,[],insecta,,1564122/CSA34195.ogg,2.033969
14396,Original_Data/birdclef-2025/train_audio/156412...,1564122,[],insecta,,1564122/CSA34196.ogg,1.180687
14397,Original_Data/birdclef-2025/train_audio/156412...,1564122,[],insecta,,1564122/CSA34197.ogg,1.907312
14398,Original_Data/birdclef-2025/train_audio/156412...,1564122,[],insecta,,1564122/CSA34198.ogg,0.994375
14399,Original_Data/birdclef-2025/train_audio/156412...,1564122,[],insecta,,1564122/CSA34199.ogg,2.222625


In [None]:
short_files_df.to_parquet(paths.SHORT_FILES_PATH, index=False, engine='pyarrow')