### Load dependencies

In [4]:
import os
import shutil
import time
import pickle
import pandas as pd
import matplotlib.pyplot as pyplot
import librosa
import librosa.display
import gc
import numpy as np
import urllib

def download_rec(uri, rec_dir, filename):
    """Download the recording at ``uri`` into ``rec_dir`` as ``filename``.

    The download is skipped when ``rec_dir/filename`` already exists, so the
    function can be called repeatedly without re-fetching or overwriting a
    recording. Failures are reported on stdout instead of being raised, so one
    unreachable recording does not abort a batch.

    Parameters
    ----------
    uri : str
        URL of the recording (anything ``urllib.request.urlretrieve`` accepts).
    rec_dir : str
        Existing directory that receives the file.
    filename : str
        Name of the file to create inside ``rec_dir``.

    Returns
    -------
    None
    """
    # Imported locally: a bare ``import urllib`` does not guarantee that the
    # ``urllib.request`` submodule has been loaded.
    import urllib.request

    destination = os.path.join(rec_dir, filename)
    # Check the path that is actually written below; checking any other name
    # would never match and the recording would be fetched again every time.
    if os.path.exists(destination):
        return
    try:
        urllib.request.urlretrieve(uri, destination)
    except Exception as e:
        print(e)
        print('Error downloading '+uri)
        # The destination did not exist before this call, so anything present
        # now is a truncated download that a later call must not mistake for
        # a complete recording.
        if os.path.exists(destination):
            try:
                os.remove(destination)
            except OSError:
                pass
    return
    

### Specify data paths:

1. train_set_dir - Folder where training data will be stored
2. recording_dir - Folder where audio recordings will be stored
3. sound_annotation_files - List of CSV files storing template matching validation metadata
4. (Optional) sampling_rate - rate to resample training data recordings to
    

In [5]:
# Annotation CSV file(s) listing the ROIs of detected sounds (animal calls).
# Columns read by the cells below:
#     species    - species label (used as the output sub-folder name)
#     x1, x2     - start / end time of the sound
#     uri        - recording file path
# The Arbimon URL-conversion cell additionally reads:
#     site_id, year, month, recording
sound_annotation_files = [
    './example_annotations.csv',
]

# Sample rate that recordings are resampled to when they are loaded.
sampling_rate = 48_000

# Folder holding the audio recordings (missing ones are downloaded into it).
recording_dir = '../data/recordings/'

# Folder where the generated training images are written.
train_set_dir = '../data/train_tp/'

### Run remaining cells to generate training data

In [6]:
# Make sure the recording and training-set folders exist.
# os.makedirs also creates missing parent folders (os.mkdir raises when the
# parent, e.g. '../data', is absent), and exist_ok=True keeps the cell safe to
# re-run.
for _data_dir in (recording_dir, train_set_dir):
    os.makedirs(_data_dir, exist_ok=True)

In [7]:
# Read every annotation CSV and stack them into one ROI table.
# An empty list is an error: continuing would either fail later with a
# NameError or silently reuse a `rois` table left over from an earlier run.
if len(sound_annotation_files) == 0:
    raise ValueError('Must provide an annotation file')

# Each list entry is a file path, so it is passed to read_csv directly.
# ignore_index gives the combined table a clean 0..n-1 index.
rois = pd.concat(
    [pd.read_csv(annotation_path) for annotation_path in sound_annotation_files],
    ignore_index=True,
)
rois.head()

Unnamed: 0,id,recording,site,year,month,day,hour,min,species,songtype,x1,x2,y1,y2,validated,uri,score,site_id
0,24735680,sabana_seca1-2014-07-15_05-50.flac,Sabana Seca,2014,7,15,5,50,Eleutherodactylus juanariveroi,Common Song,42.922667,44.032,6562.5,8343.75,(not validated),https://s3.amazonaws.com/arbimon2/project_1/de...,0.300001,506
1,24721339,sabana_seca1-2014-06-29_05-10.flac,Sabana Seca,2014,6,29,5,10,Eleutherodactylus juanariveroi,Common Song,17.36,18.469333,6562.5,8343.75,(not validated),https://s3.amazonaws.com/arbimon2/project_1/de...,0.300001,506
2,24728745,sabana_seca1-2014-04-04_03-20.flac,Sabana Seca,2014,4,4,3,20,Eleutherodactylus juanariveroi,Common Song,52.981333,54.090667,6562.5,8343.75,(not validated),https://s3.amazonaws.com/arbimon2/project_1/de...,0.300004,506
3,24723268,sabana_seca1-2014-03-16_01-40.flac,Sabana Seca,2014,3,16,1,40,Eleutherodactylus juanariveroi,Common Song,38.938667,40.048,6562.5,8343.75,(not validated),https://s3.amazonaws.com/arbimon2/project_1/de...,0.300005,506
4,24737104,sabana_seca1-2014-02-09_19-30.flac,Sabana Seca,2014,2,9,19,30,Eleutherodactylus juanariveroi,Common Song,1.477333,2.586667,6562.5,8343.75,(not validated),https://s3.amazonaws.com/arbimon2/project_1/de...,0.300007,506


In [8]:
# For using Arbimon 2 Pattern Matching results - convert uri to full download URL
def _arbimon_recording_url(roi):
    """Return the download URL of the recording that ``roi`` was detected in.

    ``roi`` is one row of the ROI table. Its ``uri`` is cut at the first
    'detections' and 'site_<site_id>/<year>/<month>/<recording>' is appended.
    A ``uri`` without 'detections' is returned unchanged, so rows that are not
    Arbimon detection URIs are left alone and re-running the cell does not
    append the suffix a second time.
    """
    if 'detections' not in roi.uri:
        return roi.uri
    return (roi.uri.split('detections')[0]
            + 'site_' + str(roi.site_id) + '/'
            + str(roi.year) + '/'
            + str(roi.month) + '/'
            + roi.recording)


rois['uri'] = [_arbimon_recording_url(roi) for _row_label, roi in rois.iterrows()]

In [9]:
# Report how many ROIs are available for every species in the table.
print('Number of ROIs for each species\n')

species_column = rois.species
for species_name in set(species_column):
    matching_rois = rois[species_column == species_name]
    roi_count = len(matching_rois)
    print('\t\t'.join([str(species_name), str(roi_count)]))

Number of ROIs for each species

Eleutherodactylus juanariveroi		5


In [14]:
def _recording_filename(uri):
    """Return the local file name under which the recording at ``uri`` is stored.

    Arbimon S3 URLs are flattened ('/' -> '_') and everything after the
    'arbimon2_' bucket prefix is kept. Any other URI or path falls back to its
    last path component, so every recording gets its own name instead of
    reusing the name computed for a previous recording.
    """
    flattened = uri.replace('/', '_')
    if 's3.amazonaws' in uri and 'arbimon2_' in flattened:
        return flattened.split('arbimon2_')[1]
    return os.path.basename(uri) or flattened


window_length = 2 # sample time-window length in seconds
window_samples = round(sampling_rate*window_length) # window length in samples
k = 0
t0 = time.time()

# groupby hands over each recording's ROIs in a single pass over the table
# (rows whose uri is missing are skipped).
for uri, recording_rois in rois.groupby('uri', sort=False): # loop over recordings

    k = k+1
    print(k)

    audio_filename = _recording_filename(uri)
    audio_path = os.path.join(recording_dir, audio_filename)
    # The audio is decoded lazily, at most once per recording, and only when at
    # least one of its images still has to be generated.
    rec_loaded = False

    for _row_label, roi in recording_rois.iterrows(): # loop over spectrogram ROIs

        try:

            sound_start, sound_end = roi['x1'], roi['x2']
            species = roi.species.replace(' ','_')

            species_dir = os.path.join(train_set_dir, str(species))
            os.makedirs(species_dir, exist_ok=True)

            # Centre the fixed-length window on the ROI; shft is negative when
            # the ROI is shorter than the window. Clamp to the recording start.
            shft = ((sound_end-sound_start)-window_length)/2
            start_sample = max(round(sampling_rate*(sound_start+shft)), 0)

            window_start = round(start_sample/sampling_rate, 2)
            window_end = round((start_sample/sampling_rate)+window_length, 2)
            # splitext strips only the extension, so recordings whose names
            # contain extra dots do not collapse onto the same image name.
            filename = (os.path.splitext(audio_filename)[0]
                        +'_'+str(window_start)+'-'+str(window_end)+'.png')
            image_path = os.path.join(species_dir, filename)

            if os.path.exists(image_path):
                continue # image already generated on a previous run

            if not rec_loaded:
                if not os.path.exists(audio_path):
                    download_rec(uri, recording_dir, audio_filename)
                try:
                    # The returned rate equals the requested one; it is not
                    # assigned back so the configured sampling_rate is never
                    # overwritten from inside the loop.
                    audio_data, _loaded_rate = librosa.load(audio_path, sr=sampling_rate)
                    rec_loaded = True
                except Exception as e:
                    print(e)
                    continue

            # NOTE: a window that runs past the end of the recording yields a
            # shorter clip (the slice is simply truncated).
            clip = audio_data[int(start_sample): int(start_sample+window_samples)]
            S = librosa.feature.melspectrogram(y = clip,
                                               sr = sampling_rate,
                                               n_fft=2048,
                                               hop_length=512,
                                               win_length=1024)
            dpi=100
            fig = pyplot.figure(num=None, figsize=(300/dpi, 300/dpi), dpi=dpi)
            try:
                # NOTE(review): the subplot(222) call is kept as-is so newly
                # generated images match ones produced earlier - confirm
                # whether it is actually needed.
                pyplot.subplot(222)
                ax = pyplot.axes()
                ax.set_axis_off()
                librosa.display.specshow(librosa.power_to_db(S, ref=np.max))
                pyplot.savefig(image_path, bbox_inches='tight', transparent=True, pad_inches=0.0)
            finally:
                # Always release the figure, even when plotting or saving fails.
                pyplot.close(fig)

        except Exception as e:
            print(e)
            continue

1
2
3
4
5
