In [20]:
import pandas as pd
import numpy as np
import glob
import librosa
import torchaudio.transforms
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
import torch
import torchaudio
import os

In [21]:
DATASET_PATH = '../../data/raw/pam20'

In [22]:
SR = 32000

In [23]:
meta = pd.read_csv(f'{DATASET_PATH}/annotations.csv')
meta.head()

Unnamed: 0,Filename,Start Time (s),End Time (s),Low Freq (Hz),High Freq (Hz),Species eBird Code
0,PER_001_S01_20190116_100007Z.flac,539.0,541.4,1250,2468,blfant1
1,PER_001_S01_20190116_100007Z.flac,520.5,644.6,961,2884,grasal3
2,PER_001_S01_20190116_100007Z.flac,596.9,598.8,1437,2375,greant1
3,PER_001_S01_20190116_100007Z.flac,655.0,656.8,812,1593,undtin1
4,PER_001_S01_20190116_100007Z.flac,466.3,468.2,552,2658,butwoo1


In [24]:
meta.iloc[:,3].value_counts()

Low Freq (Hz)
1428    257
1388    247
1349    236
1309    223
1230    216
       ... 
1903      1
186       1
1870      1
3046      1
1590      1
Name: count, Length: 1884, dtype: int64

# Convert Zenodo dataset to 5sec format
Assumes format from 2023 competition Kenya dataset, which has start=end time

In [25]:
SEG_LEN = 5
def process_file(group: pd.DataFrame):
    last_event = group['Start Time (s)'].max()
    num_segments = int(last_event // SEG_LEN + 1)
    
    
    # create labels list
    label_bins = [list() for _ in range(num_segments)]
    for idx, row in group.iterrows():
        bin_start = int(row.iloc[1] // SEG_LEN)
        bin_end = min(int(row.iloc[2] // SEG_LEN), len(label_bins)-1)
        bird = row.iloc[5]
        if bird == '????':
            continue
        print(bin_start,bin_end, bird, len(label_bins))
        for bin in range(bin_start, bin_end+1):
            if bird not in label_bins[bin]:
                label_bins[bin].append(bird)
        
    # create new dataframe
    filename = group.iloc[0,0]
    offsets = np.arange(0,num_segments)*SEG_LEN
    filenames = [f'{filename[:-5]}_{o}.ogg' for o in offsets]
    labels = [str(l) for l in label_bins]
    
    return pd.DataFrame({
        'filename': filenames,
        'offset': offsets,
        'labels': labels
    })

meta_5s = meta.groupby('Filename').apply(process_file).reset_index(drop=True)
meta_5s.head()

107 108 blfant1 720
104 128 grasal3 720
119 119 greant1 720
131 131 undtin1 720
93 93 butwoo1 720
100 100 blfant1 720
101 102 butwoo1 720
85 85 blfant1 720
86 87 grasal3 720
88 88 grasal3 720
104 104 blfant1 720
103 104 undtin1 720
110 111 bucmot4 720
111 111 bucmot4 720
112 112 blfant1 720
114 115 butwoo1 720
115 116 blfant1 720
123 124 undtin1 720
126 126 undtin1 720
125 125 bucmot4 720
127 127 blfant1 720
129 129 butwoo1 720
131 131 blfant1 720
132 132 undtin1 720
133 134 undtin1 720
134 135 blfant1 720
135 137 cintin1 720
137 138 butwoo1 720
137 138 bucmot4 720
139 139 blfant1 720
139 141 cintin1 720
140 142 elewoo1 720
145 146 undtin1 720
151 152 butwoo1 720
154 155 butwoo1 720
159 160 butwoo1 720
165 165 blfant1 720
167 168 undtin1 720
169 169 blfant1 720
169 172 whwbec1 720
172 173 butwoo1 720
160 160 blfant1 720
129 129 cintin1 720
173 173 undtin1 720
173 174 blfant1 720
177 178 blfant1 720
176 177 undtin1 720
179 180 butwoo1 720
180 180 undtin1 720
185 185 blfant1 720
190 190 

  meta_5s = meta.groupby('Filename').apply(process_file).reset_index(drop=True)


Unnamed: 0,filename,offset,labels
0,PER_001_S01_20190116_100007Z_0.ogg,0,[]
1,PER_001_S01_20190116_100007Z_5.ogg,5,[]
2,PER_001_S01_20190116_100007Z_10.ogg,10,[]
3,PER_001_S01_20190116_100007Z_15.ogg,15,[]
4,PER_001_S01_20190116_100007Z_20.ogg,20,[]


In [26]:
import ast

# Set the primary label, arbitrarily selects a bird, can be used for splitter
def set_primary(row):
    labels = ast.literal_eval(row['labels'])
    if len(labels) > 0:
        row['primary_label'] = labels[0]
    return row

meta_5s['rating'] = 5
meta_5s['primary_label'] = 'silent'
meta_5s = meta_5s.apply(set_primary, axis=1)
meta_5s.head()

Unnamed: 0,filename,offset,labels,rating,primary_label
0,PER_001_S01_20190116_100007Z_0.ogg,0,[],5,silent
1,PER_001_S01_20190116_100007Z_5.ogg,5,[],5,silent
2,PER_001_S01_20190116_100007Z_10.ogg,10,[],5,silent
3,PER_001_S01_20190116_100007Z_15.ogg,15,[],5,silent
4,PER_001_S01_20190116_100007Z_20.ogg,20,[],5,silent


In [27]:
meta_5s[meta_5s['labels'] != '[]'][meta_5s['filename'].str.startswith('PER_001_S01_20190116_100007Z')].sort_values('offset').head(20)

  meta_5s[meta_5s['labels'] != '[]'][meta_5s['filename'].str.startswith('PER_001_S01_20190116_100007Z')].sort_values('offset').head(20)


Unnamed: 0,filename,offset,labels,rating,primary_label
19,PER_001_S01_20190116_100007Z_95.ogg,95,['bucmot4'],5,bucmot4
61,PER_001_S01_20190116_100007Z_305.ogg,305,['butwoo1'],5,butwoo1
75,PER_001_S01_20190116_100007Z_375.ogg,375,['elewoo1'],5,elewoo1
76,PER_001_S01_20190116_100007Z_380.ogg,380,['elewoo1'],5,elewoo1
77,PER_001_S01_20190116_100007Z_385.ogg,385,"['elewoo1', 'cintin1']",5,elewoo1
81,PER_001_S01_20190116_100007Z_405.ogg,405,['undtin1'],5,undtin1
82,PER_001_S01_20190116_100007Z_410.ogg,410,"['undtin1', 'grasal3']",5,undtin1
83,PER_001_S01_20190116_100007Z_415.ogg,415,"['grasal3', 'undtin1']",5,grasal3
84,PER_001_S01_20190116_100007Z_420.ogg,420,"['grasal3', 'undtin1']",5,grasal3
85,PER_001_S01_20190116_100007Z_425.ogg,425,"['blfant1', 'grasal3']",5,blfant1


In [28]:
meta_5s.to_csv(f'{DATASET_PATH}/train_metadata.csv', index=False)

# Create 5 sec segments

In [29]:
import soundfile as sf

if not os.path.exists(f'{DATASET_PATH}/train_audio'):
    os.makedirs(f'{DATASET_PATH}/train_audio')

audio_loaded = None
audio_loaded_name = None
for i, row in tqdm(meta_5s.iterrows(), total=len(meta_5s)):
    source_file = '_'.join(row['filename'].split('_')[:-1])
    target_file = f'{DATASET_PATH}/train_audio/{row["filename"]}.ogg'
    if os.path.exists(target_file):
        continue   
    
    if audio_loaded_name != source_file:
        audio_loaded, _  = librosa.load(f'{DATASET_PATH}/soundscape_data/{source_file}.flac', sr=SR)
        audio_loaded_name = source_file
    
    # select segment by offset
    offset = row['offset'] * SR
    audio = audio_loaded[offset:offset+min(len(audio_loaded), SR*SEG_LEN)]
    
    # crop or pad
    if len(audio) > SR * SEG_LEN:
        audio = audio[:SR * SEG_LEN]
    if len(audio) < SR * SEG_LEN:
        audio = np.pad(audio, (0, SR * SEG_LEN - len(audio)))
    
    # save audio to ogg
    sf.write(target_file[:-4], audio, SR, format='ogg')

  0%|          | 0/15102 [00:00<?, ?it/s]