In [1]:
import pandas as pd
import numpy as np
import glob
import librosa
import torchaudio.transforms
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
import torch
import torchaudio
import os

In [2]:
# Root folder of the dataset; every path read or written below is built from it
# (annotations.csv, soundscape_data/, train_metadata.csv, train_audio/).
DATASET_PATH = '../data/raw/kenya'

In [3]:
# Sample rate (Hz) used when loading the source recordings and when writing the segments.
SR = 32_000

In [4]:
# Load the per-event annotation table that ships alongside the recordings.
meta = pd.read_csv(filepath_or_buffer=f'{DATASET_PATH}/annotations.csv')
meta.head()

Unnamed: 0,Filename,Start Time (s),End Time (s),Species eBird Code
0,KEN_001_20211207_153852.flac,67.8,67.8,slcbou1
1,KEN_001_20211207_153852.flac,106.8,106.8,slcbou1
2,KEN_001_20211207_153852.flac,107.5,107.5,hamerk1
3,KEN_001_20211207_153852.flac,118.4,118.4,slcbou1
4,KEN_001_20211207_153852.flac,209.1,209.1,hamerk1


In [5]:
# Number of annotated events per species. The column is selected by name rather
# than by position so the cell keeps working if the CSV column order changes.
meta['Species eBird Code'].value_counts()

Species eBird Code
combul2    702
reccor     508
rbsrob1    494
wbswea1    469
gnbcam2    406
          ... 
yeccan1      1
whbcan1      1
chespa1      1
brcsta1      1
lawgol       1
Name: count, Length: 176, dtype: int64

# Convert the Zenodo dataset to 5-second segment format
Assumes the annotation format of the Kenya dataset from the 2023 competition, in which every annotation is a point event (start time = end time).

In [6]:
# Length of one output segment, in seconds.
SEG_LEN = 5


def process_file(group: pd.DataFrame) -> pd.DataFrame:
    """Expand one recording's point annotations into a table of fixed-length segments.

    The recording is covered by consecutive ``SEG_LEN``-second segments, from
    offset 0 up to and including the segment that holds the latest annotated
    event. Each event is assigned to the segment containing its start time.

    Parameters
    ----------
    group : pd.DataFrame
        Annotation rows of a single recording. Must provide the columns
        ``'Start Time (s)'`` and ``'Species eBird Code'``. The source file name
        is read from the ``'Filename'`` column when present, otherwise from
        ``group.name``.

    Returns
    -------
    pd.DataFrame
        One row per segment with the columns:
        ``filename`` (``<source stem>_<offset>.ogg``),
        ``offset`` (segment start in seconds) and
        ``labels`` (``str()`` of the list of distinct species in the segment,
        in order of first appearance; ``'[]'`` for segments without events).
    """
    start_times = group['Start Time (s)']
    species_codes = group['Species eBird Code']

    # Columns are addressed by name, not position, so a reordered CSV cannot break this.
    # NOTE(review): the group.name fallback assumes pandas sets the group key as
    # `.name` on the frame handed to groupby.apply when the grouping column is
    # excluded -- confirm against the installed pandas version.
    if 'Filename' in group.columns:
        source_name = group['Filename'].iloc[0]
    else:
        source_name = group.name

    last_event = start_times.max()
    num_segments = int(last_event // SEG_LEN + 1)

    # Collect the distinct species of every segment, keeping first-seen order.
    label_bins = [[] for _ in range(num_segments)]
    segment_index = (start_times // SEG_LEN).astype(int)
    for segment, bird in zip(segment_index, species_codes):
        if bird not in label_bins[segment]:
            label_bins[segment].append(bird)

    # Strip the real extension instead of assuming it is exactly five characters long.
    stem = os.path.splitext(source_name)[0]
    offsets = np.arange(num_segments) * SEG_LEN
    filenames = [f'{stem}_{o}.ogg' for o in offsets]
    # Labels are stored as the string form of the list; later cells compare
    # against '[]' and parse the value back with ast.literal_eval.
    labels = [str(names) for names in label_bins]

    return pd.DataFrame({
        'filename': filenames,
        'offset': offsets,
        'labels': labels
    })

# Build the segment table recording by recording, then flatten the per-file
# results into one frame with a fresh 0..n-1 index.
meta_5s = (
    meta
    .groupby('Filename')
    .apply(process_file)
    .reset_index(drop=True)
)
meta_5s.head()

  meta_5s = meta.groupby('Filename').apply(process_file).reset_index(drop=True)


Unnamed: 0,filename,offset,labels
0,KEN_001_20211207_153852_0.ogg,0,[]
1,KEN_001_20211207_153852_5.ogg,5,[]
2,KEN_001_20211207_153852_10.ogg,10,[]
3,KEN_001_20211207_153852_15.ogg,15,[]
4,KEN_001_20211207_153852_20.ogg,20,[]


In [7]:
import ast

# Choose a primary label per segment (simply the first listed species); useful
# as the key for a dataset splitter.
def set_primary(row):
    """Overwrite ``row['primary_label']`` with the first entry of ``row['labels']``.

    ``row['labels']`` holds the string form of a list and is parsed with
    ``ast.literal_eval``. When the parsed list is empty the row is returned
    unchanged. The row is modified in place and also returned.
    """
    parsed = ast.literal_eval(row['labels'])
    if len(parsed) == 0:
        return row
    row['primary_label'] = parsed[0]
    return row

# Give every segment rating 5 and the placeholder label 'silent', then let
# set_primary replace the placeholder wherever the segment has at least one label.
meta_5s = (
    meta_5s
    .assign(rating=5, primary_label='silent')
    .apply(set_primary, axis=1)
)
meta_5s.head()

Unnamed: 0,filename,offset,labels,rating,primary_label
0,KEN_001_20211207_153852_0.ogg,0,[],5,silent
1,KEN_001_20211207_153852_5.ogg,5,[],5,silent
2,KEN_001_20211207_153852_10.ogg,10,[],5,silent
3,KEN_001_20211207_153852_15.ogg,15,[],5,silent
4,KEN_001_20211207_153852_20.ogg,20,[],5,silent


In [8]:
# Preview only the segments that contain at least one annotated species.
meta_5s.loc[meta_5s['labels'].ne('[]')].head()

Unnamed: 0,filename,offset,labels,rating,primary_label
13,KEN_001_20211207_153852_65.ogg,65,['slcbou1'],5,slcbou1
21,KEN_001_20211207_153852_105.ogg,105,"['slcbou1', 'hamerk1']",5,slcbou1
23,KEN_001_20211207_153852_115.ogg,115,['slcbou1'],5,slcbou1
41,KEN_001_20211207_153852_205.ogg,205,['hamerk1'],5,hamerk1
42,KEN_001_20211207_153852_210.ogg,210,['hamerk1'],5,hamerk1


In [9]:
# Persist the segment-level metadata next to the raw data (row index not written).
meta_5s.to_csv(path_or_buf=f'{DATASET_PATH}/train_metadata.csv', index=False)

# Create 5-second audio segments

In [10]:
import soundfile as sf

# Cut every source recording into SEG_LEN-second clips, one per row of meta_5s,
# and write each clip as an OGG file under train_audio/.
segment_dir = f'{DATASET_PATH}/train_audio'
# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(segment_dir, exist_ok=True)

# Number of samples in one full segment.
segment_samples = SR * SEG_LEN

# Cache of the most recently decoded recording. Rows of the same source file are
# adjacent in meta_5s, so each recording is loaded once per run.
audio_loaded = None
audio_loaded_name = None
for i, row in tqdm(meta_5s.iterrows(), total=len(meta_5s)):
    # row['filename'] already carries the '.ogg' extension. Appending a second
    # '.ogg' produced a path that never existed on disk, so the skip below never
    # triggered and every re-run regenerated all segments.
    target_file = f'{segment_dir}/{row["filename"]}'
    if os.path.exists(target_file):
        continue

    # Drop the trailing '_<offset>.ogg' to recover the source recording's stem.
    source_file = row['filename'].rsplit('_', 1)[0]
    if audio_loaded_name != source_file:
        audio_loaded, _ = librosa.load(f'{DATASET_PATH}/soundscape_data/{source_file}.flac', sr=SR)
        audio_loaded_name = source_file

    # select segment by offset (seconds -> samples); slicing past the end of the
    # recording simply yields a shorter array
    offset = int(row['offset']) * SR
    audio = audio_loaded[offset:offset + segment_samples]

    # zero-pad the last segment of a recording up to the full length
    if len(audio) < segment_samples:
        audio = np.pad(audio, (0, segment_samples - len(audio)))

    # save audio to ogg
    sf.write(target_file, audio, SR, format='ogg')

  0%|          | 0/23209 [00:00<?, ?it/s]