# [BirdCLEF23](https://www.kaggle.com/competitions/birdclef-2023) New Dataset From Classifier Predictions


### Motivation

Use my existing classifier, which has LRAP > 0.8 to predict which 8 second chunks are most likely to contain the primary bird, then turn that into a cleaner and more balanced dataset.

I have saved to ogg this time, because I'm sharing my data around with colab, and my own machine.  In theory .wav will load a bit faster, but the file sizes were less practical for me.

### Sampling rules
- The 5 second clips are staggered by 3 seconds.  So for each sound file, order every adjacent 8 second pair, sum the probability of the primary bird, then order the pairings
- Maximum 1000 samples per bird class
- Min 2 samples from each clip, taken from first and last 8 seconds
- Randomly choose to 0 to 4 more samples only if they have equal or better scores than the first & last, to a maxumum depending on class frequency

### Output
- A CSV with the primary label, and also all labels, with one-hot encoding, including any secondary labels.  1 or 0, ignore the predicted probabilities
- A CSV with only the primary label (for use with CE loss)
- A cropped 8 second chunk dataset, with filenames identified in the above csv, in a single folder

### Usage
- The primary only one will drop into my existing trainer
- The OHE version will only work with BCE loss, and will need some modification

In [46]:
import os
import numpy as np
import pandas as pd
import soundfile as sf
from pathlib import Path
import random 
import librosa
import platform
from IPython.display import Audio 
from tqdm import tqdm
import ast
import torchaudio
import plotly.express as px
import gc
from multiprocessing import Pool

In [47]:
in_kaggle = os.environ.get('PWD') == '/kaggle/working'
in_windows = platform.system() == 'Windows'
in_linux = platform.system() == 'Linux'

if not in_kaggle and in_linux:
    working = "/home/olly/Desktop/BirdCLEF23/Kaggle_Kernel/"
elif in_windows:
    working = "C:/Users/ollyp/Desktop/BirdClef23/Kaggle_Kernel"

if not in_kaggle:
    os.chdir(working) # Set to folder containing the data files
print("Current working directory: {0}".format(os.getcwd()))

Current working directory: C:\Users\ollyp\Desktop\BirdClef23\Kaggle_Kernel


Configuration for this notebook

In [48]:
class Config:
    SR = 32000 # Sampling rate of all the source files
    CHUNK_DURATION = 8  # Clips the files to this number of seconds.
    FILE_TYPE = 'ogg'  # Save to .wav will potentially mean faster loading, but larger files
    MAX_SELECT = 500 # Maximum number of clips to be kept per bird
    SHORT_CLIP_THRESHOLD = 6  #Boost the score if clip is less that 3x(N-1)+5  eg 20 seconds if N=5
    KEEP_ALL_THRESHOLD = 12 #Just keep all clips if there are less than this many for a given primary_label
    KEEP_MIN_THRESHOLD = 600  #Everthing over this is culled by MAX_DROP_FACTOR
    MAX_EXTRAS = 12 # The maximum number of extra samples to be taken from each clip
    DROP_FACTOR = 1.5 # The minimum ratio of discards per sound clip.  EG if there could be 6, only 3 would be selected.  (Up to the MAX_EXTRAS limit)
    # If MIN_RATIO = 1 then short clips will keep all chunks in the middle, up to MAX_EXTRAS
    MIN_EXTRAS = 4
    NUM_WORKERS = 4 # For parallel processing
    MAKE_SOUNDFILES = False #  If false, will write out the sound files as well
    ALL_SOUNDCHUNKS = False # If true will not do any selection.  The entire dataset will be windowed in 3s steps
    YEAR = 23
    EXP = 79

In [49]:
# data_folder = Path('kaggle/input')  # modify to suit
#in_csv = data_folder / f'call_detects_{Config.YEAR}_{Config.EXP}' / f'detect_labels_{Config.YEAR}.csv' # for header format only
in_csv = 'kaggle/input/call_detects_23_65/detect_labels_23.csv'
out_dataset_name =  f'birdclef{Config.YEAR}-extras-{str(Config.CHUNK_DURATION)}-sec-{Config.FILE_TYPE}' #for the CSV rows
new_dataset_path = '/kaggle/input/' + out_dataset_name
if in_kaggle:
    out_folder = Path('/kaggle/working/') #/ out_dataset_name  # to save and make into a dataset
else: 
    out_folder = Path(f'kaggle/working/birdclef{Config.YEAR}-extras-{str(Config.CHUNK_DURATION)}-sec-{Config.FILE_TYPE}')

out_csv_multilabel = out_folder / f'birdclef{Config.YEAR}-extra-labels-multilabel.csv'
out_csv_multiclass = out_folder / f'birdclef{Config.YEAR}-extra-labels-multiclass.csv'
out_csv_scores = out_folder / f'birdclef{Config.YEAR}-extra-labels-multiclass.csv'
out_soundfile_folder = out_folder / 'train_audio'
os.makedirs(out_soundfile_folder, exist_ok=True)
pd.set_option('display.max_colwidth', None)

In [50]:
df = pd.read_csv(in_csv)
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].astype(np.float16)
df['start'] = df['start'].astype(int)
df['file_count'] = df.groupby('filepath')['filepath'].transform('count')
df.head(3)

  df['file_count'] = df.groupby('filepath')['filepath'].transform('count')


Unnamed: 0,filepath,start,primary_label,secondary_labels,type,abethr1,abhori1,abythr1,afbfly1,afdfly1,...,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1,file_count
0,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,0,abethr1,[],['song'],0.042603,0.0013,0.0009,0.0001,0.0008,...,0.002199,0.0011,0.0008,0.0002,0.0003,0.0001,0.0002,0.0001,0.0001,15
1,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,3,abethr1,[],['song'],0.010002,0.0002,0.0004,0.0,0.0002,...,0.0004,0.0002,0.0001,0.0001,0.0005,0.0001,0.0001,0.0,0.0,15
2,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,6,abethr1,[],['song'],0.015198,0.0007,0.002199,0.0001,0.0007,...,0.0008,0.0009,0.0004,0.0001,0.001,0.0002,0.0001,0.0002,0.0001,15


In [51]:
df.shape[0]

223582

In [52]:
len(df['filepath'].unique().tolist())

16941

In [53]:
all_birds = df['primary_label'].unique().tolist()
bird_headers = df.columns.to_list()
not_in_common = set(bird_headers) - set(all_birds)
not_in_common

{'file_count',
 'filepath',
 'no-call',
 'primary_label',
 'secondary_labels',
 'start',
 'type'}

In [54]:
def score_clips(row):
    second_birds =  ast.literal_eval(row['secondary_labels'])
    first_bird = row['primary_label']
    bird_scores=[]
    second_bird_list = []
    for bird in second_birds:
        if bird in all_birds:
            bird_scores.append(float(row[bird]))
            if float(row[bird]) > 0.5:
                second_bird_list.append(bird)
        else:
            bird_scores.append(0)
    secondaries = '[' + ', '.join([f"'{item}'" for item in second_bird_list]) + ']'
    
    score = float( - max(bird_scores + [float(row['no-call'])]))  #row[first_bird]
    
    row['secondary_labels'] = secondaries
    row['score'] = score
    return row
    

df['score'] = 0
df = df.apply(score_clips, axis=1)

pd.set_option('display.max_rows', 20)
df.head()

  df['score'] = 0


Unnamed: 0,filepath,start,primary_label,secondary_labels,type,abethr1,abhori1,abythr1,afbfly1,afdfly1,...,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1,file_count,score
0,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,0,abethr1,[],['song'],0.042603,0.0013,0.0009,0.0001,0.0008,...,0.0011,0.0008,0.0002,0.0003,0.0001,0.0002,0.0001,0.0001,15,-0.005001
1,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,3,abethr1,[],['song'],0.010002,0.0002,0.0004,0.0,0.0002,...,0.0002,0.0001,0.0001,0.0005,0.0001,0.0001,0.0,0.0,15,-0.004601
2,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,6,abethr1,[],['song'],0.015198,0.0007,0.002199,0.0001,0.0007,...,0.0009,0.0004,0.0001,0.001,0.0002,0.0001,0.0002,0.0001,15,-0.009399
3,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,9,abethr1,[],['song'],0.048004,0.0016,0.0013,0.0001,0.0003,...,0.0009,0.0004,0.0001,0.0005,0.0002,0.0002,0.0001,0.0,15,-0.0065
4,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,12,abethr1,[],['song'],0.054688,0.005699,0.0005,0.0003,0.0008,...,0.002001,0.002001,0.0008,0.0015,0.0007,0.0008,0.0003,0.0003,15,-0.029205


In [55]:
cols = ['filepath', 'primary_label', 'score', 'start', 'secondary_labels', 'type']
df_view = df.loc[:, cols]
df_copy = df_view.copy()
del df

df_copy['c_score'] = (df_copy['score'] + df_copy['score'].shift(-1))/2
changed_rows = df_copy[df_copy['filepath'] != df_copy['filepath'].shift(-1)]
df_copy.loc[changed_rows.index, 'c_score'] = -5   # set the last in every series from a file to -5 so they are never chosen
df=df_copy.copy()
del df_copy
df.head()

Unnamed: 0,filepath,primary_label,score,start,secondary_labels,type,c_score
0,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,abethr1,-0.005001,0,[],['song'],-0.004801
1,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,abethr1,-0.004601,3,[],['song'],-0.007
2,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,abethr1,-0.009399,6,[],['song'],-0.00795
3,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,abethr1,-0.0065,9,[],['song'],-0.017853
4,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,abethr1,-0.029205,12,[],['song'],-0.017153


In [56]:
def select_clips(one_bird_df):
    bird = one_bird_df.iloc[0]['primary_label']
    if one_bird_df.shape[0] <= 10:
        print(f'{bird} has only {one_bird_df.shape[0]} samples')
    all_files = one_bird_df['filepath'].unique().tolist()
    num_files = len(all_files)
    if num_files < 50:
        max_extras = Config.MAX_EXTRAS
    elif num_files > 400:
        max_extras = Config.MIN_EXTRAS
    else:
        max_extras = Config.MAX_EXTRAS-int(np.floor((Config.MAX_EXTRAS-Config.MIN_EXTRAS)*(num_files-50)/350))
    #afpkin1 has only 2 samples   I checked, there is just one file and it is 7 seconds long
    #whhsaw1 has 3 samples.  # A single file, 9 seconds long
    #whctur2 has only 3 sample  # Single file, 9 seconds long bird of interest near the middle
    #golher1 has only 4 samples
    
    clip_list = []
    for file in all_files:
        one_file_df = one_bird_df[one_bird_df['filepath'] == file]
        one_file_df = one_file_df.sort_values('start', ascending=True)
        num_chunks = one_file_df.shape[0]
        if num_chunks >= 4:
            max_extras = max(2, min(max_extras, int(num_chunks//Config.DROP_FACTOR)))  
            one_file_df = one_file_df.iloc[1:-2]
            one_file_df = one_file_df.sort_values('c_score', ascending=False)
            one_file_df = one_file_df.head(max_extras)
            clip_list = clip_list + list(zip(one_file_df['filepath'], one_file_df['start'])) 
    return clip_list

In [57]:
bird_clips = {}
for bird in tqdm(all_birds):
    cols = ['filepath', 'primary_label', 'c_score', 'start']
    one_bird_df = df[cols][df['primary_label'] == bird]
    bird_clips[bird] = select_clips(one_bird_df)  
    del one_bird_df
    gc.collect()
out_clips = [item for sublist in bird_clips.values() for item in sublist] # a list of lists tuples (filepath,start_time)
print(f'There are {len(out_clips)} clips in the list')
out_df = df[df[['filepath', 'start']].apply(tuple, axis=1).isin(out_clips)]
print(f'The new dataframe has {out_df.shape[0]} rows')
out_df.head(5)

  5%|▍         | 12/264 [00:02<00:56,  4.48it/s]

afpkin1 has only 2 samples


 20%|█▉        | 52/264 [00:12<00:45,  4.63it/s]

brtcha1 has only 9 samples


 31%|███       | 82/264 [00:20<00:42,  4.25it/s]

dotbar1 has only 9 samples


 39%|███▉      | 103/264 [00:26<00:35,  4.51it/s]

golher1 has only 4 samples


 53%|█████▎    | 141/264 [00:35<00:26,  4.73it/s]

lotlap1 has only 6 samples


 69%|██████▉   | 182/264 [00:44<00:17,  4.68it/s]

rehblu1 has only 6 samples


 91%|█████████ | 240/264 [00:58<00:04,  5.10it/s]

whctur2 has only 3 samples


 92%|█████████▏| 242/264 [00:58<00:04,  5.13it/s]

whhsaw1 has only 3 samples


100%|██████████| 264/264 [01:04<00:00,  4.09it/s]


There are 29208 clips in the list
The new dataframe has 29208 rows


Unnamed: 0,filepath,primary_label,score,start,secondary_labels,type,c_score
1,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,abethr1,-0.004601,3,[],['song'],-0.007
2,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,abethr1,-0.009399,6,[],['song'],-0.00795
5,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,abethr1,-0.0051,15,[],['song'],-0.007849
6,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,abethr1,-0.010597,18,[],['song'],-0.006099
7,kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg,abethr1,-0.0016,21,[],['song'],-0.003551


In [58]:
out_clips = [item for sublist in bird_clips.values() for item in sublist] # a list of lists tuples (filepath,start_time)
print(f'There are {len(out_clips)} clips in the list')

# Convert list of tuples to set of tuples
out_clips_set = set(out_clips)
print(f'There are {len(out_clips_set)} unique clips in the set')

# Convert dataframe columns to set of tuples
df_columns_set = set(df[['filepath', 'start']].apply(tuple, axis=1))
print(f'There are {len(df_columns_set)} unique tuples in the dataframe')

# Find the intersection between the sets
common_set = out_clips_set.intersection(df_columns_set)
print(f'There are {len(common_set)} common tuples in both sets')

# Subset the dataframe with the common tuples
out_df = df[df[['filepath', 'start']].apply(tuple, axis=1).isin(common_set)]
print(f'The output dataframe has {len(out_df)} rows')

There are 29208 clips in the list
There are 29208 unique clips in the set
There are 223582 unique tuples in the dataframe
There are 29208 common tuples in both sets
The output dataframe has 29208 rows


In [59]:
cols = ['filepath', 'start','primary_label', 'secondary_labels', 'type', 'score', 'c_score']
out_df[cols].to_csv(out_csv_scores)
out_iterable = list(zip(out_df['filepath'], out_df['start']))

def get_new_filepath(row):
    old_stem = str(Path(row['filepath']).stem)
    start_time = str(row['start'])
    row['filepath'] = new_dataset_path + '/train_audio/' + old_stem + '_' + start_time + '.ogg'
    return row 

multiclass_df = out_df.copy()
multiclass_df = multiclass_df.apply(get_new_filepath, axis=1)
cols = ['filepath','primary_label', 'secondary_labels', 'type']
multiclass_df[cols].to_csv(out_csv_multiclass, index=False)

multilabel_df = out_df.copy()
multilabel_df = multilabel_df.apply(get_new_filepath, axis=1)
multilabel_df = multilabel_df[cols]
multilabel_df = pd.concat([multilabel_df, pd.get_dummies(multilabel_df['primary_label'])], axis=1)

for index, row in multilabel_df.iterrows():
    second_birds =  ast.literal_eval(row['secondary_labels'])
    for bird in second_birds:
        row[bird] = 1

multilabel_df.to_csv(out_csv_multilabel, index=False)
multilabel_df.head()

Unnamed: 0,filepath,primary_label,secondary_labels,type,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
1,/kaggle/input/birdclef23-extras-8-sec-ogg/train_audio/XC128013_3.ogg,abethr1,[],['song'],1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,/kaggle/input/birdclef23-extras-8-sec-ogg/train_audio/XC128013_6.ogg,abethr1,[],['song'],1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,/kaggle/input/birdclef23-extras-8-sec-ogg/train_audio/XC128013_15.ogg,abethr1,[],['song'],1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,/kaggle/input/birdclef23-extras-8-sec-ogg/train_audio/XC128013_18.ogg,abethr1,[],['song'],1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,/kaggle/input/birdclef23-extras-8-sec-ogg/train_audio/XC128013_21.ogg,abethr1,[],['song'],1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
def load_ogg(path):
    y, _ = torchaudio.load(path)
    y = y.squeeze().numpy()
    #y = np.mean(y, 1) # For any sterio (X, 2) arrays
    if not np.isfinite(y).all():
        y[np.isnan(y)] = np.zeros_like(y)
        y[np.isinf(y)] = np.max(y)
    return y, len(y)


def play_audio(file_path):
    audio_abe, sr_abe = librosa.load(file_path)
    return Audio(data=audio_abe, rate=sr_abe)


def modify_path(old_path, number):
    new_name = str(Path(old_path).stem) + f'_{number}.ogg'
    new_path = out_soundfile_folder / new_name 
    return new_path



def file_to_chunks(item_tuple):
    old_path, start_time = item_tuple
    new_path = modify_path(old_path, start_time)
    if in_kaggle:
        y, length = load_ogg('/' + old_path)
    else:
        y, length = load_ogg(old_path)
    start_time=int(start_time)
    start = start_time * Config.SR
    end = start + Config.SR * Config.CHUNK_DURATION
    if length > end:
        chunk = y[start:end]
    else:
        chunk = y[start:]
    sf.write(new_path, chunk, Config.SR,  format='OGG', subtype='VORBIS')
    return 

In [61]:
print(out_iterable[:10])
print(len(out_iterable))

[('kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg', 3), ('kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg', 6), ('kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg', 15), ('kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg', 18), ('kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg', 21), ('kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg', 24), ('kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg', 27), ('kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg', 30), ('kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg', 33), ('kaggle/input/birdclef-2023/train_audio/abethr1/XC128013.ogg', 36)]
29208


In [62]:
if Config.MAKE_SOUNDFILES:
    if in_kaggle or in_linux:
        if __name__ == '__main__':
            with Pool(processes=4) as pool:
                results = list(tqdm(pool.imap(file_to_chunks, out_iterable), total=len(out_iterable)))
    else:
        for thing in tqdm(out_iterable):
            file_to_chunks(thing)

In [63]:
print(f'There are a total of {len(os.listdir(out_soundfile_folder))} new sound files in the output folder')
print(f'There are {multiclass_df.shape[0]} rows in the multiclass dataframe')

There are a total of 0 new sound files in the output folder
There are 29208 rows in the multiclass dataframe


Below I'm just checking a random sample of the file paths written to the csv

In [64]:
rand_list = random.sample(range(1,len(os.listdir(out_soundfile_folder))),200)
for num in rand_list[:4]:
    print(multiclass_df.iloc[num]['filepath'])

ValueError: Sample larger than population or is negative

And to varify a few files have saved correctly contain birds:

In [None]:
def path_from_csv(idx):
    if in_kaggle:
        return '/kaggle/working' + multiclass_df.iloc[rand_list[idx]]['filepath'].replace(new_dataset_path, '')
    else:
        return 'kaggle/working' + multiclass_df.iloc[rand_list[idx]]['filepath'].replace('kaggle/input/', '')
path_from_csv(5)

'kaggle/working/birdclef23-extras-8-sec-ogg/train_audio/XC419349_3.ogg'

In [None]:
play_audio(path_from_csv(0))

In [None]:
play_audio(path_from_csv(1))

In [None]:
play_audio(path_from_csv(20))

In [None]:
play_audio(path_from_csv(45))

In [None]:
play_audio(path_from_csv(70))

In [None]:
play_audio(path_from_csv(150))

In [None]:
play_audio(path_from_csv(190))

In [None]:
def plot_by_class(df):
    #df = pd.DataFrame({'names': names, 'counts': counts, 'scores': scores})
    fig = px.bar(df, x='mean_score', y='primary_label', color='frequency', orientation='h', hover_data=['frequency', 'mean_score'], range_x=[-.2, 1])
    fig.update_layout(height=1600)
    fig.show()
    return

In [None]:
plotting_df = multiclass_df[['primary_label', 'score']].copy()
plotting_df = plotting_df.groupby('primary_label')['score'].agg(['mean', 'count'])
plotting_df = plotting_df.rename(columns={'mean': 'mean_score', 'count': 'frequency'}).reset_index()
plotting_df = plotting_df.sort_values('frequency', ascending=True)
pd.set_option('display.max_rows', None)
plotting_df.head(10)

Unnamed: 0,primary_label,mean_score,frequency
177,rehblu1,-0.015202,2
80,dotbar1,-0.008499,2
239,witswa1,-0.0018,4
51,brtcha1,-0.002375,4
137,lotcor1,-0.072296,5
90,fatwid1,-0.097866,5
124,joygre1,-0.007702,5
74,crefra2,-0.0088,6
45,brcwea1,-0.024167,6
35,blksaw1,-0.0194,7


In [None]:
plotting_df[plotting_df['frequency']<=4]

Unnamed: 0,primary_label,mean_score,frequency
177,rehblu1,-0.015202,2
80,dotbar1,-0.008499,2
239,witswa1,-0.0018,4
51,brtcha1,-0.002375,4


In [None]:
plot_by_class(plotting_df)