#### This script gets all annotation coordinates from the Excel file
##### -computes acoustic features
##### -cleans the data and shows its metrics
##### -writes a new .csv file for the Dash app

In [1]:
import pandas as pd
from glob import glob
from pathlib import Path
import maad
import librosa
import soundfile as sf
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

In [3]:

# Root folder that is searched recursively for .wav recordings.
datapath = '/Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/COUA/wav'
# datapath = '/Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/wav/xenocanto'
# CSV file with the ROI annotations, read into df_annot further down.
# NOTE(review): the recorded run of the read_csv cell fails with FileNotFoundError
# on this path - confirm that the file still lives here.
annot_path = '/Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/tables/annot.csv'

In [4]:
# Index every .wav file found under datapath (recursively).
# The parent folder name is split on '_' into genus (first part) and species
# (second part); the file name without its extension becomes the sound id.
filelist = glob(datapath+'/**/*.wav', recursive = True)

# Collect plain dict records and build the frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every iteration.
soundfile_records = []
for file in filelist:
    folder_parts = Path(file).parts[-2].split('_')
    species = folder_parts[1]
    gen = folder_parts[0]
    soundfile_records.append({'fullfilename': file,
                              'sound_id': Path(file).stem,
                              'species': species,
                              'gen': gen})

# Explicit columns keep the schema available even when no .wav file was found,
# so later lookups such as df_data.sound_id do not fail on an empty index.
df_data = pd.DataFrame(soundfile_records,
                       columns=['fullfilename', 'sound_id', 'species', 'gen'])

In [6]:
# Save the recording index (one row per .wav file) as a CSV table.
soundfiles_csv = '/Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/COUA/tables/df_soundfiles_paths.csv'
df_data.to_csv(soundfiles_csv)

In [7]:
# Load the annotation table from annot_path into df_annot.
df_annot = pd.read_csv(annot_path)
# df_annot['sound_id']=df_annot['record'].astype(str)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/tables/annot.csv'

In [5]:
# (rows, columns) of the annotation table as loaded.
df_annot.shape

(3375, 17)

In [6]:
# List the genera of annotations whose sound_id has no matching .wav in df_data.
wav_missing = ~df_annot.sound_id.astype(str).isin(df_data.sound_id)
f"Theses are the missing species:{df_annot[wav_missing]['gen'].unique()}"


'Theses are the missing species:[]'

In [7]:
# Shape of the annotation subset whose sound_id is present in df_data.
wav_found = df_annot.sound_id.astype(str).isin(df_data.sound_id)
df_annot.loc[wav_found].shape

(3375, 17)

In [8]:

# Ids are compared as strings, so cast the annotation ids first.
df_annot['sound_id'] = df_annot['sound_id'].astype(str)
# Keep only annotations whose recording is indexed in df_data
# (in case someone lost a wav file or an annotation).
has_recording = df_annot['sound_id'].isin(df_data['sound_id'])
df_annot = df_annot.loc[has_recording]


In [9]:
# (rows, columns) of the annotation table after filtering on available recordings.
df_annot.shape

(3375, 17)

In [10]:
# Names of the maad spectral and temporal alpha features computed on each
# manually contoured ROI. The full lists are used as column templates for the
# per-ROI feature tables; a subset of them is selected later on.
SPECTRAL_FEATURES = [
    "MEANf", "VARf", "SKEWf", "KURTf",
    "NBPEAKS", "LEQf", "ENRf", "BGNf", "SNRf", "Hf",
    "EAS", "ECU", "ECV", "EPS", "EPS_KURT", "EPS_SKEW",
    "ACI", "NDSI", "rBA", "AnthroEnergy", "BioEnergy", "BI", "ROU",
    "ADI", "AEI", "LFC", "MFC", "HFC",
    "ACTspFract", "ACTspCount", "ACTspMean",
    "EVNspFract", "EVNspMean", "EVNspCount",
    "TFSD", "H_Havrda", "H_Renyi", "H_pairedShannon", "H_gamma", "H_GiniSimpson",
    "RAOQ", "AGI", "ROItotal", "ROIcover",
]

TEMPORAL_FEATURES = [
    "ZCR", "MEANt", "VARt", "SKEWt", "KURTt",
    "LEQt", "BGNt", "SNRt", "MED", "Ht",
    "ACTtFraction", "ACTtCount", "ACTtMean",
    "EVNtFraction", "EVNtMean", "EVNtCount",
]


In [11]:

# Build df_annot_final: one row per annotated ROI, holding the annotation
# metadata, the maad shape features and the maad spectral/temporal alpha
# indices computed on the audio trimmed to the ROI time span.
#
# Reads: filelist, df_annot, SPECTRAL_FEATURES, TEMPORAL_FEATURES.
# Writes: df_annot_final (plus the per-file working frames, which stay
# available after the loop for inspection).
per_file_tables = []  # one table per annotated file, concatenated once at the end
n_files = len(filelist)
for i, file in enumerate(filelist, start=1):
    print(f'----------------------loading ({i}/{n_files}): {file}...')
    df_rois_shape = pd.DataFrame()

    # annotations of this recording (sound_id is the file name without extension)
    df_roi_annot = df_annot[df_annot['sound_id']==Path(file).stem].reset_index(drop=True)
    if len(df_roi_annot) == 0:
        # nothing to extract: skip before paying for audio loading and the spectrogram
        print('no annotation available')
        continue

    #load .wav
    # NOTE(review): the audio is decoded with librosa and re-written to tmp.wav
    # before maad loads it - presumably to hand maad a file it can read; confirm.
    temp, sr = librosa.load(file, sr=None)
    sf.write('tmp.wav', temp, sr)
    s, fs = maad.sound.load('tmp.wav')
    maxAmp = np.abs(s).max()  # used to normalize
    #get spectro for tn and fn
    Sxx_power, tn, fn, ext = maad.sound.spectrogram(
        s/maxAmp, fs, flims=(0, 20000), display=False)

    #get ROI from annotation and convert x y to t f
    df_roi_annot = maad.util.format_features(df_roi_annot,tn,fn)

    #INFO
    df_info = df_roi_annot.loc[:,['species','gen','Family','Order','Biotope','sound_id','n_roi']]

    #SHAPES
    df_rois_shape_temp = maad.features.all_shape_features(
        s, fs, df_roi_annot, resolution='med', display=False)
    df_rois_shape = pd.concat([df_rois_shape,df_rois_shape_temp.reset_index(drop=True)],axis=0,ignore_index=True)

    #ALPHA INDICES
    # The empty, column-only frames fix the column order; the per-ROI rows are
    # collected in lists and concatenated once instead of growing a frame per ROI.
    spectral_rows = [pd.DataFrame(columns=SPECTRAL_FEATURES)]
    temporal_rows = [pd.DataFrame(columns=TEMPORAL_FEATURES)]

    for index, row in df_roi_annot.iterrows():
        s_trim = maad.sound.trim(
            s, fs, row.min_t, row.max_t)
        s_trim = s_trim - np.mean(s_trim)
        # s_trim = s_trim / np.max(np.abs(s_trim))

        try:
            # separate axis names so the full-file tn/fn are not overwritten
            Sxx_trim_power, tn_trim, fn_trim, _ = maad.sound.spectrogram(s_trim, fs,
                                                                         verbose=False, display=False,
                                                                         savefig=None)

            spectral_features_temp, _ = maad.features.all_spectral_alpha_indices(
                    Sxx_trim_power, tn_trim, fn_trim, display=False)

            temporal_features_temp = maad.features.all_temporal_alpha_indices(s_trim, fs)

        except Exception as err:
            print(f'{file}: error at index (chunk too small?): {index}'
                  f'\n s_trim looks like:{s_trim.shape}'
                  f'\n temporal size:{row.max_t - row.min_t}'
                  f'\n reason: {err!r}')
            # A failed ROI still needs exactly one row, otherwise the feature
            # rows no longer line up with df_info / df_rois_shape. Use NaN
            # instead of re-using the previous ROI's values (or crashing with
            # NameError when the very first ROI fails).
            spectral_features_temp = pd.DataFrame(np.nan, index=[0], columns=SPECTRAL_FEATURES)
            temporal_features_temp = pd.DataFrame(np.nan, index=[0], columns=TEMPORAL_FEATURES)

        spectral_rows.append(spectral_features_temp)
        temporal_rows.append(temporal_features_temp)

    df_spectral_features = pd.concat(spectral_rows, axis=0, ignore_index=True)
    df_temporal_features = pd.concat(temporal_rows, axis=0, ignore_index=True)
    df_rois_all_features_temp = pd.concat([df_spectral_features,df_temporal_features],axis=1).reset_index(drop=True)

    # side-by-side: metadata | alpha indices | shape features (row i = ROI i)
    df_annot_final_temp = pd.concat([df_info,df_rois_all_features_temp,df_rois_shape],axis=1)
    per_file_tables.append(df_annot_final_temp)


# pd.concat refuses an empty list, so fall back to an empty frame when no file was annotated
df_annot_final = pd.concat(per_file_tables, axis=0) if per_file_tables else pd.DataFrame()
df_annot_final= df_annot_final.reset_index(drop=True)

# drop_duplicates works on rows, so transpose to remove columns with identical content
df_annot_final = df_annot_final.T.drop_duplicates().T
    

----------------------loading (1/127): /Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/wav/xenocanto/Calendulauda_africanoides/449420.wav...
number of rois : 33
----------------------loading (2/127): /Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/wav/xenocanto/Cisticola_galactotes/452901.wav...
no annotation available
----------------------loading (3/127): /Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/wav/xenocanto/Cisticola_galactotes/201332.wav...
no annotation available
----------------------loading (4/127): /Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/wav/xenocanto/Cisticola_galactotes/710853.wav...
no annotation available
----------------------loading (5/127): /Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/wav/xenocanto/Cyanomitra_olivacea/721728.wav...
number of rois : 22
----------------------loading (6/127): /Users/Paul/Paul/Desktop/My_projects/Bioacoustics/

# Check your data

In [12]:
# Show the metadata columns only, hiding rows where any of them is missing.
info_columns = ['species','gen','Family','Order','Biotope','sound_id']
df_annot_final[info_columns].dropna()

Unnamed: 0,species,gen,Family,Order,Biotope,sound_id
0,africanoides,Calendulauda,Alaudidae,Passeriformes,Grasslands,449420
1,africanoides,Calendulauda,Alaudidae,Passeriformes,Grasslands,449420
2,africanoides,Calendulauda,Alaudidae,Passeriformes,Grasslands,449420
3,africanoides,Calendulauda,Alaudidae,Passeriformes,Grasslands,449420
4,africanoides,Calendulauda,Alaudidae,Passeriformes,Grasslands,449420
...,...,...,...,...,...,...
3370,lugubris,Vanellus,Charadriidae,Charadriiformes,Grasslands,411262
3371,calvus,Treron,Colombidae,Colombiformes,Woodlands,104019
3372,calvus,Treron,Colombidae,Colombiformes,Woodlands,104019
3373,calvus,Treron,Colombidae,Colombiformes,Woodlands,104019


In [13]:
# Show the ROI size/position columns.
# 'df' is listed twice, so it also appears twice in the displayed table.
roi_geometry_columns = ['df','bandwidth_f','bandwidth_y','df','duration_t','duration_x',
                        'area_xy', 'centroid_f', 'centroid_t', 'area_tf']
df_annot_final[roi_geometry_columns]


Unnamed: 0,df,bandwidth_f,bandwidth_y,df.1,duration_t,duration_x,area_xy,centroid_f,centroid_t,area_tf
0,2620.059,2627.050781,0.0,2620.059,0.12771,60.998821,2684,5124.902344,3.030204,335.5
1,2179.059,2196.386719,0.0,2179.059,0.20898,50.99102,3672,5426.367188,3.215964,459.0
2,1530.529,1507.324219,0.0,1530.529,0.05805,34.992472,700,4306.640625,3.494603,87.5
3,2256.882,2239.453125,0.0,2256.882,0.12771,51.989478,2288,6244.628906,3.622313,286.0
4,1011.706,990.527344,0.0,1011.706,0.06966,23.010975,552,6890.625,3.726803,69.0
...,...,...,...,...,...,...,...,...,...,...
3370,387.097,421.875,0.0,387.097,0.245333,9.002667,828,1875.0,30.144,103.5
3371,1884.712,1894.921875,0.0,1884.712,0.464399,44.001814,7040,1464.257812,1.741497,880.0
3372,1820.46,1851.855469,0.0,1820.46,0.394739,43.003356,5848,1464.257812,4.435011,731.0
3373,1927.546,1937.988281,0.0,1927.546,0.394739,45.000272,6120,1550.390625,8.846803,765.0


In [14]:
# Number of distinct recordings that contributed at least one row.
df_annot_final['sound_id'].nunique()

51

In [15]:
# Print the name of every column that contains at least one NaN.
for column_name in map(str, df_annot_final.columns):
    n_missing = df_annot_final[column_name].isna().sum()
    if n_missing > 0:
        print([column_name])

['ACTspMean']
['LEQt']
['ACTtMean']
['shp_001']
['shp_002']
['shp_003']
['shp_004']
['shp_005']
['shp_006']
['shp_007']
['shp_008']
['shp_009']
['shp_010']
['shp_011']
['shp_012']
['shp_013']
['shp_014']
['shp_015']
['shp_016']
['shp_017']
['shp_018']
['shp_019']
['shp_020']
['shp_021']
['shp_022']
['shp_023']
['shp_024']
['shp_025']
['shp_026']
['shp_027']
['shp_028']
['shp_029']
['shp_030']
['shp_031']
['shp_032']
['shp_033']
['shp_034']
['shp_035']
['shp_036']
['shp_037']
['shp_038']
['shp_039']
['shp_040']
['shp_041']
['shp_042']
['shp_043']
['shp_044']
['shp_045']
['shp_046']
['shp_047']
['shp_048']


In [16]:
# First 100 column names of the feature table.
df_annot_final.columns[:100]


Index(['species', 'gen', 'Family', 'Order', 'Biotope', 'sound_id', 'n_roi',
       'MEANf', 'VARf', 'SKEWf', 'KURTf', 'NBPEAKS', 'LEQf', 'ENRf', 'BGNf',
       'SNRf', 'Hf', 'EAS', 'ECU', 'ECV', 'EPS', 'EPS_KURT', 'EPS_SKEW', 'ACI',
       'NDSI', 'rBA', 'AnthroEnergy', 'BioEnergy', 'BI', 'ROU', 'ADI', 'AEI',
       'LFC', 'MFC', 'HFC', 'ACTspFract', 'ACTspCount', 'ACTspMean',
       'EVNspFract', 'EVNspMean', 'EVNspCount', 'TFSD', 'H_Havrda', 'H_Renyi',
       'H_pairedShannon', 'H_gamma', 'H_GiniSimpson', 'RAOQ', 'AGI',
       'ROItotal', 'ROIcover', 'ZCR', 'MEANt', 'VARt', 'SKEWt', 'KURTt',
       'LEQt', 'BGNt', 'SNRt', 'MED', 'Ht', 'ACTtFraction', 'ACTtCount',
       'ACTtMean', 'EVNtFraction', 'EVNtMean', 'EVNtCount', 'Unnamed: 0',
       'record', 'min_t', 'max_t', 'min_f', 'max_f', 'dt', 'df', 'APD', 'id',
       'min_y', 'min_x', 'max_y', 'max_x', 'shp_001', 'shp_002', 'shp_003',
       'shp_004', 'shp_005', 'shp_006', 'shp_007', 'shp_008', 'shp_009',
       'shp_010', 'shp_01

In [17]:
# Remaining column names (position 100 onwards).
df_annot_final.columns[100:]


Index(['shp_020', 'shp_021', 'shp_022', 'shp_023', 'shp_024', 'shp_025',
       'shp_026', 'shp_027', 'shp_028', 'shp_029', 'shp_030', 'shp_031',
       'shp_032', 'shp_033', 'shp_034', 'shp_035', 'shp_036', 'shp_037',
       'shp_038', 'shp_039', 'shp_040', 'shp_041', 'shp_042', 'shp_043',
       'shp_044', 'shp_045', 'shp_046', 'shp_047', 'shp_048', 'centroid_y',
       'centroid_x', 'duration_x', 'bandwidth_y', 'area_xy', 'centroid_f',
       'centroid_t', 'duration_t', 'bandwidth_f', 'area_tf'],
      dtype='object')

# Reduce to interesting and non-null columns

In [22]:
# Distinct biotope labels. The lower-case 'biotope' column only exists once the
# rename cell further down has been run.
# NOTE(review): the recorded output lists both 'Woodlands' and 'Woodlands '
# (trailing space) - the labels probably need to be stripped; confirm.
df_annot_final.biotope.unique()

array(['Grasslands', 'Forest', 'Wetlands', 'Woodlands', 'Marshlands',
       'Woodlands '], dtype=object)

In [20]:
# Restrict the table to the columns of interest, in this order:
# metadata, ROI time/frequency geometry, shape features and a subset of the
# spectral and temporal alpha indices.
kept_info = ['species','gen', 'Family', 'Order','Biotope', 'sound_id']
kept_roi = ['min_t', 'max_t', 'min_f', 'max_f','dt', 'df',
            'centroid_f','centroid_t', 'duration_t', 'bandwidth_f', 'area_tf']
# shp_002 ... shp_048 (shp_001 is not part of the selection)
kept_shape = [f'shp_{k:03d}' for k in range(2, 49)]
kept_spectral = ['MEANf', 'VARf', 'SKEWf', 'KURTf', 'NBPEAKS', 'LEQf', 'ENRf',
                 'BGNf', 'SNRf', 'Hf', 'EAS', 'ECU', 'ECV', 'EPS', 'EPS_KURT',
                 'EPS_SKEW', 'ACI', 'NDSI', 'ROU']
kept_temporal = ['ZCR', 'MEANt', 'VARt', 'SKEWt', 'KURTt', 'Ht']

kept_columns = kept_info + kept_roi + kept_shape + kept_spectral + kept_temporal
df_annot_final = df_annot_final.loc[:, kept_columns]

KeyError: "['gen', 'Family', 'Order', 'Biotope', 'n_roi'] not in index"

In [None]:
# Final export: rename the metadata columns (lower case, 'gen' -> 'genus')
# and write the resulting table to disk.
column_renames = {"Family": "family",
                  "Order": "order",
                  'Biotope': 'biotope',
                  'gen': 'genus'}
df_annot_final = df_annot_final.rename(columns=column_renames)

export_csv = '/Users/Paul/Paul/Desktop/My_projects/Bioacoustics/Maputo_Dash/datasets/tables/annot_new2.csv'
df_annot_final.to_csv(export_csv)

In [None]:
# Names of the columns that contain no NaN at all.
df_annot_final.dropna(axis=1, how='any').columns

In [None]:
# Total NaN count over the temporal-feature columns.
# df_temporal_features is the working frame left over from the extraction loop.
# NOTE(review): if the column-selection cell was run first, some of these
# columns are no longer in df_annot_final - confirm the intended cell order.
df_annot_final[df_temporal_features.columns].isna().sum().sum()

In [None]:
# Total NaN count over the spectral-feature columns.
# df_spectral_features is the working frame left over from the extraction loop.
# NOTE(review): if the column-selection cell was run first, some of these
# columns are no longer in df_annot_final - confirm the intended cell order.
df_annot_final[df_spectral_features.columns].isna().sum().sum()

In [None]:
# Total NaN count over the columns of df_rois_shape, the shape-feature frame
# left over from the last file handled by the extraction loop.
df_annot_final[df_rois_shape.columns].isna().sum().sum()

In [None]:
# Total number of NaN cells in the whole table.
df_annot_final.isna().sum().sum()

In [None]:
# Display the shape-feature frame left over from the last file handled by the extraction loop.
df_rois_shape