# Data Exploration & Preprocessing

### Observations

### Data Preprocessing Steps
- Combine the three datasets into a single `.csv` with the dataset folder in the file path
- create an output `.json` file with Common Name & Scientific Name using ebird code as the key
- Remove any unwanted labels, such as 'spybird' from the secondaries, or anything not in the primaries

In [3]:
use_case = {
            'experiment' : 1,
            'project_root': '/home/olly/Desktop/Kaggle_BC25',
            #'project_root': r'C:\Users\ollyp\OneDrive\Desktop\Kaggle_BC25'
            #'project_root': '/media/olly/Red_SSD/Kaggle_BC25',
            }

In [4]:
from pathlib import Path
import pandas as pd
import plotly.express as px
from joblib import Parallel, delayed
import json
from tqdm.notebook import tqdm
from collections import Counter
from IPython.display import Audio
import ast
import torchaudio
import torch
import numpy as np

In [5]:
class FilePaths:
    def __init__(self, options=None):
        _project_dir = Path(options['project_root'])
        self.DATA_FOLDER = _project_dir / 'Data'
        self.IMAGE_FOLDER = self.DATA_FOLDER / 'Original_Data' / 'birdclef-2025' / 'train_audio'
        self.KAGGLE_DS = self.DATA_FOLDER / 'Train_Xeno_Canto'
        self.EXTRA_DATA = self.DATA_FOLDER / 'Extra_Samples' / 'metadata.csv'
        self.METADATA = self.DATA_FOLDER /  'Original_Data' / 'birdclef-2025' / 'train.csv'
        self.OUTPUT_CSV_PATH = self.DATA_FOLDER / 'Train_Metadata/train.csv'
        self.OUTPUT_NAMING_CSV_PATH = self.DATA_FOLDER / 'Train_Metadata/naming.csv'
        self.OUTPUT_JSON_PATH = self.DATA_FOLDER / 'Train_Metadata/species_names.json'
        self.OUTPUT_AUX_LABELS_PATH = self.DATA_FOLDER / 'Train_Metadata' / 'aux_labels.parquet'

paths = FilePaths(options=use_case)

Now I'm going to extract these configuration settings to variables for the rest of the script.  Doing it this way to make it consistancy with the rest of the project, and so that the analysis can be updated for any changes in dataset settings.

In [6]:
#use_cols = ['filename', 'primary_label', 'secondary_labels', 'start', 'end', 'source_fn']
df = pd.read_csv(paths.METADATA)
df['filepath'] = str(paths.IMAGE_FOLDER) + '/' + df['filename'] 
print(f'There are {len(df)} rows')
df.head(3)

There are 28564 rows


Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,common_name,author,license,filepath
0,1139490,[''],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
1,1139490,[''],[''],1139490/CSA36389.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
2,1192948,[''],[''],1192948/CSA36358.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...


In [7]:
use_cols = ['primary_label', 'secondary_labels', 'filename', 'collection', 'latitude',
            'longitude', 'scientific_name', 'common_name', 'author', 'filepath']
df = df[use_cols]
df.head(3)

Unnamed: 0,primary_label,secondary_labels,filename,collection,latitude,longitude,scientific_name,common_name,author,filepath
0,1139490,[''],1139490/CSA36385.ogg,CSA,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
1,1139490,[''],1139490/CSA36389.ogg,CSA,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
2,1192948,[''],1192948/CSA36358.ogg,CSA,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...


Let's add another column for the taxonomic class

In [8]:
df[df['common_name'].str.contains('Daedadelus', case=False, na=False)]  
df[df['scientific_name'].str.contains('Daedadelus waehnerorum', case=False, na=False)] 

Unnamed: 0,primary_label,secondary_labels,filename,collection,latitude,longitude,scientific_name,common_name,author,filepath
44,1564122,[''],1564122/CSA34195.ogg,CSA,-3.8333,-70.3333,Daedadelus waehnerorum,Daedadelus waehnerorum,Fernando Montealegre-Z,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
45,1564122,[''],1564122/CSA34196.ogg,CSA,-3.8333,-70.3333,Daedadelus waehnerorum,Daedadelus waehnerorum,Fernando Montealegre-Z,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
46,1564122,[''],1564122/CSA34197.ogg,CSA,-3.8333,-70.3333,Daedadelus waehnerorum,Daedadelus waehnerorum,Fernando Montealegre-Z,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
47,1564122,[''],1564122/CSA34198.ogg,CSA,-3.8333,-70.3333,Daedadelus waehnerorum,Daedadelus waehnerorum,Fernando Montealegre-Z,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
48,1564122,[''],1564122/CSA34199.ogg,CSA,-3.8333,-70.3333,Daedadelus waehnerorum,Daedadelus waehnerorum,Fernando Montealegre-Z,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...
49,1564122,[''],1564122/CSA34200.ogg,CSA,-3.8333,-70.3333,Daedadelus waehnerorum,Daedadelus waehnerorum,Fernando Montealegre-Z,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...


In [9]:
num_classes = len(list(df['primary_label'].unique()))
num_classes

206

In [10]:
class_mapping = {'amphibia':["Spotted Foam-nest Frog", "Emerald Glass Frog","Andean Poison Frog",
                            "Common Rocket Frog","Yellow-striped Poison Dart Frog","Spotted-thighed Poison Frog",
                            "Lesser Antillean whistling frog","Whistling Grass Frog","Mexican White-lipped Frog",
                            "Masked Tree Frog","Caracas Snouted Tree Frog","Red Snouted Tree Frog","Colombian Plump Frog",
                            "Panama Humming Frog","Giant Toad","Copiphora gracilis","Rusty Tree Frog",
                            "Chirique-Flusse Tree Frog","Colombian Niputidea Nurse Frog","Bogert's Aquatic Tree Frog",
                            "Boettger's Colombian Tree Frog","Harlequin Treefrog","Yellow Tree Frog","Esmereldas Robber Frog",
                            "Túngara Frog","Tatayo's Glass Frog","Savage's Thin-toed Frog","Vaillant's Frog",
                            "Cachabi Robber Frog","Bogota Robber Frog","Banded Rain Frog","Palm Rocket Frog",
                            "Veined Tree Frog",'Santa Rita Rocket Frog',"Rivero's Toad"
                            ],
                 'aves':['Amazon Kingfisher','American Kestrel','American Pygmy Kingfisher',
                         'Anhinga','Bay-breasted Warbler','Bare-faced Ibis','Bananaquit',
                         'Blue-and-yellow Macaw','Black-bellied Whistling-Duck','Bicolored Wren',
                         'Black-capped Donacobius','Yellow-throated Toucan','Blue-black Grassquit',
                         'Black-bellied Wren','Black-crested Antshrike','Black-collared Hawk',
                         'Black-chested Jay','Black-crowned Tityra','Blue-headed Parrot','Black Vulture',
                         'Boat-billed Flycatcher','Boat-billed Heron', 'Brown-throated Parakeet',
                         'Blue-billed Curassow','Buff-breasted Wren','Whooping Motmot','Blue-gray Tanager',
                         'Buff-throated Saltator','Carib Grackle','Cattle Tyrant','Chestnut-backed Antbird',
                         'Chestnut-fronted Macaw', 'Cinnamon Becard', 'Cocoi Heron', 'Cocoa Woodcreeper',
                         'Collared Aracari', 'Colombian Chachalaca', 'Common Pauraque', 'Common Potoo', 
                         'Common Tody-Flycatcher', 'Crimson-backed Tanager', 'Crimson-crested Woodpecker',
                         'Crested Bobwhite', 'Crested Guan', 'Crested Oropendola', 'Eared Dove',
                         'Fork-tailed Flycatcher', 'Golden-headed Manakin', 'Olivaceous Saltator',
                         'Great Black Hawk', 'Greater Ani', 'Great Egret', 'Green Ibis', 'Great Kiskadee',
                         'Great Potoo', 'Great Tinamou', 'Green Kingfisher', 'Gray Seedeater', 'Gray-breasted Martin',
                         'Gray-cowled Wood-Rail', 'Large-billed Tern', 'Laughing Falcon', 'Least Grebe',
                         'Lineated Woodpecker', 'Little Tinamou', 'Masked Tityra', 'Neotropic Cormorant',
                         'Northern Screamer', 'Olivaceous Piculet', 'Orange-chinned Parakeet',
                         'Pale-legged Hornero', 'Palm Tanager', 'Pale-vented Pigeon', 'Pied Puffbird',
                         'Piratic Flycatcher', 'Pied Water-Tyrant', 'Plain-brown Woodcreeper',
                         'Plain-colored Tanager', 'Plumbeous Kite', 'Purple Gallinule', 'Red-and-green Macaw',
                         'Red-breasted Meadowlark', 'Red-crowned Woodpecker', 'Ringed Kingfisher', 'Roadside Hawk',
                         'Roseate Spoonbill', 'Royal Flycatcher', 'Rufous-tailed Hummingbird', 'Ruddy-breasted Seedeater',
                         'Rufous Motmot', 'Ruddy Ground Dove', 'Rusty-margined Flycatcher', 'Rufescent Tiger-Heron',
                         'Rufous-tailed Jacamar', 'Russet-throated Puffbird', 'Saffron Finch', 'Saffron-headed Parrot',
                         'Savanna Hawk', 'Sepia-capped Flycatcher', 'Shining-green Hummingbird', 'Slate-headed Tody-Flycatcher',
                         'Smooth-billed Ani', 'Snowy Egret', 'Southern Beardless-Tyrannulet', 'Social Flycatcher',
                         'Solitary Sandpiper', 'Southern Lapwing', 'Spot-breasted Woodpecker', 'Spectacled Owl',
                         'Spectacled Parrotlet', 'Southern Rough-winged Swallow', 'Straight-billed Woodcreeper',
                         'Striped Cuckoo', 'Streaked Flycatcher', 'Striated Heron', 'Striped Owl', 'Thick-billed Seed-Finch',
                         'Thick-billed Euphonia', 'Brown-winged Schiffornis', 'Tropical Kingbird', 'Tropical Parula',
                         'Tropical Screech-Owl', 'Turkey Vulture', 'Vermilion Flycatcher', 'Wattled Jacana',
                         'White-breasted Wood-Wren', 'White-bellied Antbird', 'White-bearded Manakin',
                         'White-fringed Antwren', 'White-headed Marsh Tyrant', 'White-tipped Dove',
                         'White-tailed Trogon', 'White-winged Swallow', 'Wood Stork', 'Crested Caracara',
                         'Yellow-bellied Elaenia', 'Yellow-breasted Flycatcher', 'Yellow-bellied Seedeater',
                         'Yellow-chinned Spinetail', 'Yellow-crowned Tyrannulet', 'Yellow-hooded Blackbird',
                         'Yellow-headed Caracara', 'Yellow Oriole', 'Yellow-olive Flycatcher', 'Yellow-rumped Cacique',
                         'Yellow-crowned Parrot'
                         ],
                 'insecta':['Ragoniella pulchella', 'Oxyprora surinamensis', 'Copiphora colombiae',
                            'Cocconotus aratifrons', 'Neoconocephalus brachypterus', 'Katydids',
                            'Typical Cicadas', 'Eschatoceras bipunctatus', 'Orophus conspersus',
                            'Panoploscelis specularis', 'True Crickets', 'Subria sylvestris', 
                            'Typophyllum inflatum', 'Daedadelus waehnerorum', 'Copris susanae','Docidocercus fasciatus'
                            ],
                 'mamalia':['Common Raccoon', 'Neotropical River Otter','Jaguar','Mountain Lion','Crab-eating Fox',
                            'Collared Peccary','Red-tailed Squirrel','Brown-throated Three-toed Sloth','Colombian Red Howler Monkey'
                            ]
                }

In [11]:
num_classes = sum([len(value) for value in class_mapping.values()])
num_classes

206

In [12]:
common_names = {item for sublist in class_mapping.values() for item in sublist}
missing = set(df['common_name'].unique()) - common_names
missing

{'Daedadelus waehnerorum '}

In [13]:
#inverse_map = {value:key for value, key in dict_list for dict_list, key in class_mapping.items()}
inverse_map = {value: key for key, values in class_mapping.items() for value in values}
inverse_map['Daedadelus waehnerorum '] = 'insecta'
df['class'] = df['common_name'].map(inverse_map)
df.head()

Unnamed: 0,primary_label,secondary_labels,filename,collection,latitude,longitude,scientific_name,common_name,author,filepath,class
0,1139490,[''],1139490/CSA36385.ogg,CSA,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta
1,1139490,[''],1139490/CSA36389.ogg,CSA,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta
2,1192948,[''],1192948/CSA36358.ogg,CSA,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta
3,1192948,[''],1192948/CSA36366.ogg,CSA,7.28,-73.8582,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta
4,1192948,[''],1192948/CSA36373.ogg,CSA,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta


In [14]:
df['collection'].unique()

array(['CSA', 'XC', 'iNat'], dtype=object)

In [15]:
print(f'There are {len(df)} rows in the train dataframe')

There are 28564 rows in the train dataframe


In [16]:
df_names = df.drop_duplicates(subset='primary_label', keep='first')[['primary_label', 'common_name', 'scientific_name', 'class']].sort_values(by='primary_label')
df_names.head(3)

Unnamed: 0,primary_label,common_name,scientific_name,class
0,1139490,Ragoniella pulchella,Ragoniella pulchella,insecta
2,1192948,Oxyprora surinamensis,Oxyprora surinamensis,insecta
6,1194042,Copiphora colombiae,Copiphora colombiae,insecta


In [17]:
df_names.tail(3)

Unnamed: 0,primary_label,common_name,scientific_name,class
27595,yeofly1,Yellow-olive Flycatcher,Tolmomyias sulphurescens,aves
28120,yercac1,Yellow-rumped Cacique,Cacicus cela,aves
28422,ywcpar,Yellow-crowned Parrot,Amazona ochrocephala,aves


Let's save this out to CSV, but in the format that matches my BirdNamer class

In [18]:
naming_dict = {'primary_label':'eBird', 'common_name':'CommonName', 'scientific_name': 'ScientificName', 'class':'Class'}

df_names.rename(columns=naming_dict, inplace=True)
df_names.head()

Unnamed: 0,eBird,CommonName,ScientificName,Class
0,1139490,Ragoniella pulchella,Ragoniella pulchella,insecta
2,1192948,Oxyprora surinamensis,Oxyprora surinamensis,insecta
6,1194042,Copiphora colombiae,Copiphora colombiae,insecta
9,126247,Spotted Foam-nest Frog,Leptodactylus insularum,amphibia
15,1346504,Neoconocephalus brachypterus,Neoconocephalus brachypterus,insecta


In [19]:
df_names.to_csv(paths.OUTPUT_NAMING_CSV_PATH, index=False)

In [20]:
def plot_class_distribution(df, plot_col='primary_label', title='primary_label frequency', height=400):
    total_counts = df[plot_col].value_counts().reset_index()
    total_counts.columns = [plot_col, 'total_count']
    fig = px.bar(total_counts, x=total_counts[plot_col], log_y=True, y=total_counts.total_count, template='seaborn',
    hover_data=[plot_col, 'total_count'], color=plot_col, height=height)
    fig.update_layout(title=title)
    fig.update_layout(showlegend=False)
    fig.update_xaxes(categoryorder='total descending')
    fig.show()

plot_class_distribution(df, plot_col='primary_label', title='Frequency by primary_label')

In [50]:
total_counts = df['primary_label'].value_counts().reset_index()
total_counts.columns = ['primary_label', 'total_count']
total_counts.sort_values('total_count', ascending=True)
total_counts.tail(50)

Unnamed: 0,primary_label,total_count
156,rutpuf1,17
157,65344,16
158,65349,16
159,shghum1,15
160,41970,15
161,sahpar1,14
162,67252,14
163,norscr1,14
164,24322,13
165,turvul,11


In [51]:
total_counts.to_csv(paths.DATA_FOLDER / 'primary_label_counts.csv')

In [22]:
plot_class_distribution(df[df['class']=='aves'], plot_col='primary_label', title='Bird Label Counts')

In [23]:
plot_class_distribution(df[df['class']=='insecta'], plot_col='primary_label', title='Insecta Label Counts')

In [24]:
plot_class_distribution(df[df['class']=='amphibia'], plot_col='common_name', title='Amphibia Label Counts')

In [25]:
plot_class_distribution(df[df['class']=='mamalia'], plot_col='common_name', title='Mammalia Label Counts')

In [26]:
df_names.to_csv(paths.OUTPUT_NAMING_CSV_PATH, index=False)
result_list = df_names.to_dict(orient='records')

with paths.OUTPUT_JSON_PATH.open('w') as file:
    json.dump(result_list, file, indent=4)

In [27]:
total_counts_primary = df['common_name'].value_counts().reset_index()
total_counts_primary.tail(10)

Unnamed: 0,common_name,count
196,Brown-throated Three-toed Sloth,2
197,Vaillant's Frog,2
198,Orophus conspersus,2
199,Colombian Plump Frog,2
200,Palm Rocket Frog,2
201,Colombian Niputidea Nurse Frog,2
202,Rivero's Toad,2
203,Neotropical River Otter,2
204,Crab-eating Fox,2
205,Ragoniella pulchella,2


## Get the lengths of each sample

In [28]:
df.head()

Unnamed: 0,primary_label,secondary_labels,filename,collection,latitude,longitude,scientific_name,common_name,author,filepath,class
0,1139490,[''],1139490/CSA36385.ogg,CSA,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta
1,1139490,[''],1139490/CSA36389.ogg,CSA,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta
2,1192948,[''],1192948/CSA36358.ogg,CSA,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta
3,1192948,[''],1192948/CSA36366.ogg,CSA,7.28,-73.8582,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta
4,1192948,[''],1192948/CSA36373.ogg,CSA,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta


In [29]:
def get_audio_duration(filepath):
    info = torchaudio.info(filepath)
    return info.num_frames / info.sample_rate  # Duration in seconds

filepaths = df['filepath'].to_list()

df["duration"] = Parallel(n_jobs=4)(
    delayed(get_audio_duration)(fp) for fp in tqdm(filepaths)
)

  0%|          | 0/28564 [00:00<?, ?it/s]

In [30]:
df.head()

Unnamed: 0,primary_label,secondary_labels,filename,collection,latitude,longitude,scientific_name,common_name,author,filepath,class,duration
0,1139490,[''],1139490/CSA36385.ogg,CSA,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta,98.853375
1,1139490,[''],1139490/CSA36389.ogg,CSA,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta,96.537719
2,1192948,[''],1192948/CSA36358.ogg,CSA,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta,116.599812
3,1192948,[''],1192948/CSA36366.ogg,CSA,7.28,-73.8582,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta,105.446313
4,1192948,[''],1192948/CSA36373.ogg,CSA,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta,103.631469


In [31]:
df_aux = df[['filename', 'primary_label',  'collection', 'author', 'common_name', 'class', 'secondary_labels', 'duration']]
df_aux.head()

Unnamed: 0,filename,primary_label,collection,author,common_name,class,secondary_labels,duration
0,1139490/CSA36385.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],98.853375
1,1139490/CSA36389.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],96.537719
2,1192948/CSA36358.ogg,1192948,CSA,Fabio A. Sarria-S,Oxyprora surinamensis,insecta,[''],116.599812
3,1192948/CSA36366.ogg,1192948,CSA,Fabio A. Sarria-S,Oxyprora surinamensis,insecta,[''],105.446313
4,1192948/CSA36373.ogg,1192948,CSA,Fabio A. Sarria-S,Oxyprora surinamensis,insecta,[''],103.631469


In [32]:
df_aux.to_parquet(paths.OUTPUT_AUX_LABELS_PATH)

In [33]:
check_df = pd.read_parquet(paths.OUTPUT_AUX_LABELS_PATH)
check_df.head()

Unnamed: 0,filename,primary_label,collection,author,common_name,class,secondary_labels,duration
0,1139490/CSA36385.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],98.853375
1,1139490/CSA36389.ogg,1139490,CSA,Fabio A. Sarria-S,Ragoniella pulchella,insecta,[''],96.537719
2,1192948/CSA36358.ogg,1192948,CSA,Fabio A. Sarria-S,Oxyprora surinamensis,insecta,[''],116.599812
3,1192948/CSA36366.ogg,1192948,CSA,Fabio A. Sarria-S,Oxyprora surinamensis,insecta,[''],105.446313
4,1192948/CSA36373.ogg,1192948,CSA,Fabio A. Sarria-S,Oxyprora surinamensis,insecta,[''],103.631469


In [34]:
check_df[check_df["duration"].isna()]    # Rows where column is NaN

Unnamed: 0,filename,primary_label,collection,author,common_name,class,secondary_labels,duration


In [35]:
primary_list = df['primary_label'].unique()
def filter_list(secondaries, allowed):
    # Convert string representation of list to an actual list
    #actual_list = ast.literal_eval(list_str)
    # Filter the list
    filtered_list = [item for item in secondaries if item in allowed]
    return str(filtered_list)  # Convert back to string if needed

def filter_list(secondaries, allowed):
    if not isinstance(secondaries, (list, str)):
        return secondaries  # Return as is if it's not a list or string
    # Ensure secondaries is a list if it's a string
    if isinstance(secondaries, str):
        secondaries = eval(secondaries)  # Convert string representation of list to actual list
    # Filter the list
    filtered_list = [item for item in secondaries if item in allowed]
    return str(filtered_list)  # Convert back to string if needed

df['secondary_labels'] = df['secondary_labels'].apply(lambda x: filter_list(x, primary_list))

In [36]:
secondary_labels = df['secondary_labels'].to_list()
seconds_list = [eval(string) for string in secondary_labels]
flattened_seconds = [item for sublist in seconds_list for item in sublist]
item_counts = dict(Counter(flattened_seconds))
total_secondaries = len(flattened_seconds)
total_secondaries

3941

It would be good to plot this.

In [37]:
unique_seconds = set(flattened_seconds)
len(unique_seconds)

132

In [38]:
df.head()

Unnamed: 0,primary_label,secondary_labels,filename,collection,latitude,longitude,scientific_name,common_name,author,filepath,class,duration
0,1139490,[],1139490/CSA36385.ogg,CSA,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta,98.853375
1,1139490,[],1139490/CSA36389.ogg,CSA,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta,96.537719
2,1192948,[],1192948/CSA36358.ogg,CSA,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta,116.599812
3,1192948,[],1192948/CSA36366.ogg,CSA,7.28,-73.8582,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta,105.446313
4,1192948,[],1192948/CSA36373.ogg,CSA,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,/home/olly/Desktop/Kaggle_BC25/Data/Original_D...,insecta,103.631469


In [39]:
df=df[['filename', 'primary_label', 'secondary_labels', 'collection']]
df.head()

Unnamed: 0,filename,primary_label,secondary_labels,collection
0,1139490/CSA36385.ogg,1139490,[],CSA
1,1139490/CSA36389.ogg,1139490,[],CSA
2,1192948/CSA36358.ogg,1192948,[],CSA
3,1192948/CSA36366.ogg,1192948,[],CSA
4,1192948/CSA36373.ogg,1192948,[],CSA


In [40]:
list(df['secondary_labels'].unique())

['[]',
 "['65448', '22976', '476538']",
 "['compau']",
 "['65448']",
 "['65547']",
 "['22976']",
 "['soulap1']",
 "['bubwre1']",
 "['rinkin1']",
 "['yectyr1']",
 "['grekis']",
 "['trokin']",
 "['recwoo1', 'trokin', 'rumfly1', 'grekis']",
 "['rugdov']",
 "['neocor', 'banana']",
 "['whtdov']",
 "['banana']",
 "['rutjac1', 'paltan1']",
 "['rutjac1', 'bafibi1', 'banana', 'grekis']",
 "['bobfly1']",
 "['socfly1']",
 "['littin1']",
 "['sobtyr1']",
 "['chbant1']",
 "['yeofly1']",
 "['greegr']",
 "['saffin']",
 "['paltan1']",
 "['baymac']",
 "['rtlhum']",
 "['bobfly1', 'rumfly1', 'cinbec1']",
 "['plbwoo1']",
 "['pirfly1']",
 "['rumfly1']",
 "['orcpar']",
 "['trokin', 'grekis']",
 "['whbant1', 'rebbla1', 'whtdov', 'cinbec1', 'rugdov']",
 "['pavpig2']",
 "['whttro1', 'blcjay1', 'trokin']",
 "['blbwre1', 'crbtan1']",
 "['yebfly1']",
 "['yebela1']",
 "['creoro1', 'cocwoo1']",
 "['bugtan']",
 "['orcpar', 'saffin']",
 "['cotfly1', 'yebela1']",
 "['cotfly1']",
 "['bkmtou1']",
 "['saffin', 'bkcdon']",

In [41]:
len(list(df['secondary_labels'].unique()))

744

## Varify the audiofiles

I would like to be sure that the audio files can be opend OK

In [42]:
def crop_or_pad(y, length,  train='train', background_paths=None):
    initial_length = len(y)
    if 3 * initial_length < length:
        y = np.concatenate([y,np.zeros(initial_length),y])
    elif 2 * initial_length < length:
        y = np.concatenate([y,np.zeros(initial_length//2),y])
    if len(y) < length:
        y = np.concatenate([y, y]) 
    
    def Normalize(array):
        max_vol = np.abs(array).max()
        return array * 1 / max_vol

    if len(y) < length:
        difference = length - len(y)
        fill=np.zeros(difference)
        y = np.concatenate([y, fill])
    else:
        if train != 'train':
            start = 0
        else:
            start = 0
            start = np.random.randint(len(y) - length)
        y = y[start: start + length]
    y = Normalize(y)
    return y

def open_audio_clip(path, starts=None):    
    try:  
        y, _ = torchaudio.load(path)
        if y.ndim == 2 and y.shape[0] == 2:
            print(f'converting {path} to mono')
            y = torch.mean(y, dim=0).unsqueeze(0)  # from stereo to mono
        y = y.squeeze().numpy() 
    except Exception as e:
        y = np.random.randn(5*320000) 
        print(f'could not open {path}')
        print(e)
    
    if not np.isfinite(y).all():
        y[np.isnan(y)] = np.mean(y)
        y[np.isinf(y)] = np.mean(y)

    y = crop_or_pad(y, 10)  # background_paths=self.back_pths
    return y

In [43]:
tier1_files = [str(paths.DATA_FOLDER / fn) for fn in df['filename'].to_list()]

tier1_files[:5]

['/home/olly/Desktop/Kaggle_BC25/Data/1139490/CSA36385.ogg',
 '/home/olly/Desktop/Kaggle_BC25/Data/1139490/CSA36389.ogg',
 '/home/olly/Desktop/Kaggle_BC25/Data/1192948/CSA36358.ogg',
 '/home/olly/Desktop/Kaggle_BC25/Data/1192948/CSA36366.ogg',
 '/home/olly/Desktop/Kaggle_BC25/Data/1192948/CSA36373.ogg']

In [44]:
def play_audio(file_path):
    audio_abe, sr_abe = torchaudio.load(file_path)
    print(sr_abe)
    print(audio_abe.shape)
    return Audio(data=audio_abe.numpy(), rate=sr_abe)

play_audio(tier1_files[0])

RuntimeError: Failed to open the input "/home/olly/Desktop/Kaggle_BC25/Data/1139490/CSA36385.ogg" (No such file or directory).
Exception raised from get_input_format_context at /__w/_temp/conda_environment_8430229400/conda-bld/torchaudio_1711422726100/work/src/libtorio/ffmpeg/stream_reader/stream_reader.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x772e09b7dd87 in /home/olly/miniconda3/envs/bird_audio_ml/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x772e09b2e75f in /home/olly/miniconda3/envs/bird_audio_ml/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #2: <unknown function> + 0x42904 (0x772db8dbf904 in /home/olly/miniconda3/envs/bird_audio_ml/lib/python3.11/site-packages/torio/lib/libtorio_ffmpeg6.so)
frame #3: torio::io::StreamingMediaDecoder::StreamingMediaDecoder(std::string const&, std::optional<std::string> const&, std::optional<std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > > const&) + 0x14 (0x772db8dc2304 in /home/olly/miniconda3/envs/bird_audio_ml/lib/python3.11/site-packages/torio/lib/libtorio_ffmpeg6.so)
frame #4: <unknown function> + 0x3ab5e (0x772d67589b5e in /home/olly/miniconda3/envs/bird_audio_ml/lib/python3.11/site-packages/torio/lib/_torio_ffmpeg6.so)
frame #5: <unknown function> + 0x32737 (0x772d67581737 in /home/olly/miniconda3/envs/bird_audio_ml/lib/python3.11/site-packages/torio/lib/_torio_ffmpeg6.so)
frame #6: <unknown function> + 0x201b06 (0x564388f2fb06 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #7: _PyObject_MakeTpCall + 0x253 (0x564388f0e8b3 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #8: <unknown function> + 0x2301b6 (0x564388f5e1b6 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #9: <unknown function> + 0x2190dc (0x564388f470dc in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #10: <unknown function> + 0x1e0be1 (0x564388f0ebe1 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #11: <unknown function> + 0xf874 (0x772e09af0874 in /home/olly/miniconda3/envs/bird_audio_ml/lib/python3.11/site-packages/torchaudio/lib/_torchaudio.so)
frame #12: _PyObject_MakeTpCall + 0x253 (0x564388f0e8b3 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #13: _PyEval_EvalFrameDefault + 0x716 (0x564388f1c3b6 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #14: _PyFunction_Vectorcall + 0x181 (0x564388f3f981 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #15: <unknown function> + 0x218d3c (0x564388f46d3c in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #16: _PyObject_MakeTpCall + 0x233 (0x564388f0e893 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #17: _PyEval_EvalFrameDefault + 0x716 (0x564388f1c3b6 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #18: <unknown function> + 0x2a5a8d (0x564388fd3a8d in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #19: PyEval_EvalCode + 0x9f (0x564388fd311f in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #20: <unknown function> + 0x2bc6ee (0x564388fea6ee in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #21: _PyEval_EvalFrameDefault + 0x38c0 (0x564388f1f560 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #22: <unknown function> + 0x2b9c39 (0x564388fe7c39 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #23: _PyEval_EvalFrameDefault + 0x3387 (0x564388f1f027 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #24: <unknown function> + 0x2b9c39 (0x564388fe7c39 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #25: _PyEval_EvalFrameDefault + 0x3387 (0x564388f1f027 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #26: <unknown function> + 0x2b9c39 (0x564388fe7c39 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #27: <unknown function> + 0x2ba5b7 (0x564388fe85b7 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #28: _PyEval_EvalFrameDefault + 0x3a36 (0x564388f1f6d6 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #29: <unknown function> + 0x2308d4 (0x564388f5e8d4 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #30: <unknown function> + 0x2300be (0x564388f5e0be in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #31: PyObject_Call + 0x130 (0x564388f496b0 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #32: _PyEval_EvalFrameDefault + 0x4a44 (0x564388f206e4 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #33: <unknown function> + 0x2b9c39 (0x564388fe7c39 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #34: _PyEval_EvalFrameDefault + 0x3387 (0x564388f1f027 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #35: <unknown function> + 0x2b9c39 (0x564388fe7c39 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #36: _PyEval_EvalFrameDefault + 0x3387 (0x564388f1f027 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #37: <unknown function> + 0x2b9c39 (0x564388fe7c39 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #38: _PyEval_EvalFrameDefault + 0x3387 (0x564388f1f027 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #39: <unknown function> + 0x2b9c39 (0x564388fe7c39 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #40: _PyEval_EvalFrameDefault + 0x3387 (0x564388f1f027 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #41: <unknown function> + 0x2b9c39 (0x564388fe7c39 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #42: _PyEval_EvalFrameDefault + 0x3387 (0x564388f1f027 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #43: <unknown function> + 0x2b9c39 (0x564388fe7c39 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #44: <unknown function> + 0x78b6 (0x772e526f68b6 in /home/olly/miniconda3/envs/bird_audio_ml/lib/python3.11/lib-dynload/_asyncio.cpython-311-x86_64-linux-gnu.so)
frame #45: <unknown function> + 0x2003fa (0x564388f2e3fa in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #46: <unknown function> + 0x199fb5 (0x564388ec7fb5 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #47: <unknown function> + 0x19c065 (0x564388eca065 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #48: <unknown function> + 0x1fafbf (0x564388f28fbf in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #49: _PyEval_EvalFrameDefault + 0x8c34 (0x564388f248d4 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #50: <unknown function> + 0x2a5a8d (0x564388fd3a8d in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #51: PyEval_EvalCode + 0x9f (0x564388fd311f in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #52: <unknown function> + 0x2bc6ee (0x564388fea6ee in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #53: <unknown function> + 0x1fafbf (0x564388f28fbf in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #54: PyObject_Vectorcall + 0x2c (0x564388f28eac in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #55: _PyEval_EvalFrameDefault + 0x716 (0x564388f1c3b6 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #56: _PyFunction_Vectorcall + 0x181 (0x564388f3f981 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #57: <unknown function> + 0x2cf158 (0x564388ffd158 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #58: Py_RunMain + 0x139 (0x564388ffcad9 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #59: Py_BytesMain + 0x37 (0x564388fc3027 in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)
frame #60: <unknown function> + 0x2a1ca (0x772e5302a1ca in /lib/x86_64-linux-gnu/libc.so.6)
frame #61: __libc_start_main + 0x8b (0x772e5302a28b in /lib/x86_64-linux-gnu/libc.so.6)
frame #62: <unknown function> + 0x294ecd (0x564388fc2ecd in /home/olly/miniconda3/envs/bird_audio_ml/bin/python)


In [None]:
play_audio(tier1_files[100])

In [None]:
play_audio(tier1_files[50])