In [1]:
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
from collections import OrderedDict
from plotly import tools

# Audio characteristics of calls
Now that we have a better understanding of where the storm petrel calls occur, let's examine their audio characteristics.

In [2]:
# Load the WarbleR feature table; the first CSV column is used as the row index.
FEATURES_CSV = '/mnt/data/Birdman/results/warbler/features_warbler_buffer_250ms.csv'
features = pd.read_csv(FEATURES_CSV, index_col=0)

In [3]:
# (column name, human-readable description) for every acoustic feature we plot.
# The descriptions are used as subplot titles, so they need to be accurate:
# Q25/Q75/IQR are quartile statistics, and `dfslope` is the slope of the
# dominant *frequency*.
legend_pairs = (
    ('meanfreq', 'mean frequency (in kHz)'),
    ('sd', 'standard deviation of frequency'),
    ('freq.median', 'median frequency (in kHz)'),
    ('freq.Q25', 'first quartile (in kHz)'),
    ('freq.Q75', 'third quartile (in kHz)'),
    ('freq.IQR', 'interquartile range (in kHz)'),
    ('time.Q25', 'first quartile time'),
    ('time.Q75', 'third quartile time'),
    ('time.IQR', 'interquartile time range'),
    ('skew', 'skewness - asymmetry of the spectrum'),
    ('kurt', 'kurtosis - peakedness of the spectrum'),
    ('sp.ent', 'spectral entropy'),
    ('sfm', 'spectral flatness'),
    ('meanfun', 'average of fundamental frequency'),
    ('minfun', 'minimum fundamental frequency'),
    ('maxfun', 'maximum fundamental frequency'),
    ('meandom', 'average of dominant frequency'),
    ('mindom', 'minimum of dominant frequency'),
    ('maxdom', 'maximum of dominant frequency'),
    ('dfrange', 'range of dominant frequency'),
    ('modindx', 'modulation index'),
    ('startdom', 'dominant frequency measurement at the start of the signal'),
    ('enddom', 'dominant frequency measurement at the end of the signal'),
    ('dfslope', 'slope of the change in dominant frequency (kHz/s)'),
    ('peakf', 'peak frequency'),
    ('meanpeakf', 'mean peak frequency'))

# Ordered name -> description lookup, plus the bare column names in the same order.
feature_legend = OrderedDict(legend_pairs)
feature_names = [name for name, _ in legend_pairs]

In [4]:
# Split the rows by the ground-truth label column: 1 -> petrels, 0 -> noise.
ground_truth = features['storm_petrel_ground_truth']
petrels = features.loc[ground_truth == 1]
noise = features.loc[ground_truth == 0]

In [5]:
# Restrict each class to the acoustic feature columns listed in `feature_names`.
petrel_features = petrels.loc[:, feature_names]
noise_features = noise.loc[:, feature_names]

In [6]:
# Summary statistics for the petrel rows, transposed so each feature is a row.
petrel_features.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
meanfreq,982.0,2.728596,0.19545,2.03859,2.610405,2.715565,2.830605,3.643788
sd,982.0,1.117594,0.151754,0.785409,1.011516,1.094598,1.198213,1.81876
freq.median,982.0,2.660556,0.235561,1.582573,2.530839,2.64317,2.768138,3.948985
freq.Q25,982.0,2.035085,0.320272,1.252862,1.800548,2.050442,2.260748,3.064304
freq.Q75,982.0,3.136562,0.31766,2.221577,2.938562,3.066591,3.276469,4.392531
freq.IQR,982.0,1.101477,0.38483,0.324324,0.814525,1.054227,1.33412,2.984592
time.Q25,982.0,0.204377,0.094487,0.016632,0.124131,0.180577,0.288417,0.486325
time.Q75,982.0,0.491649,0.112396,0.173479,0.389196,0.494889,0.556622,1.014839
time.IQR,982.0,0.287273,0.089492,0.041032,0.239503,0.288514,0.346724,0.63326
skew,982.0,3.303855,0.893688,1.473235,2.658779,3.211698,3.815779,7.797921


In [7]:
# Summary statistics for the noise rows, transposed so each feature is a row.
noise_features.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
meanfreq,102.0,2.841197,0.18395,2.312464,2.725747,2.816775,3.021934,3.133814
sd,102.0,1.675167,0.145909,1.131204,1.683698,1.70579,1.761159,1.806238
freq.median,102.0,2.253447,0.264111,1.811001,2.046055,2.161033,2.558956,3.081328
freq.Q25,102.0,1.44612,0.158454,1.2784,1.32104,1.354679,1.583652,2.018013
freq.Q75,102.0,3.926626,0.423113,2.485957,3.770793,4.02805,4.271944,4.357886
freq.IQR,102.0,2.480507,0.485134,0.891342,2.454118,2.692837,2.736048,2.910363
time.Q25,102.0,0.221188,0.075741,0.057576,0.165246,0.204918,0.272721,0.44792
time.Q75,102.0,0.525758,0.13857,0.346752,0.437837,0.485956,0.550821,1.044516
time.IQR,102.0,0.30457,0.111767,0.098578,0.247419,0.31263,0.357921,0.672053
skew,102.0,3.567559,0.647194,1.810486,3.097125,3.720198,4.011264,4.969608


By looking at these tables side by side, we can see that the two classes differ: for example, the mean `sd` is about 1.12 for petrels versus 1.68 for noise, and the mean `freq.IQR` is about 1.10 versus 2.48. Let's make a [box plot](https://en.wikipedia.org/wiki/Box_plot) to compare the distributions. We will also standardize features by removing the mean and scaling to unit variance. Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set.

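We can already see that a few features have missing values: the fundamental-frequency features (`meanfun`, `minfun`, `maxfun`). We fill each missing value with the median of that feature within the same class (petrel or non-petrel).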

In [8]:
# Fill missing fundamental-frequency values with the per-class median, where the
# class is given by the ground-truth label column.
fundamental_cols = ['meanfun', 'minfun', 'maxfun']
fundamental_by_class = features.groupby('storm_petrel_ground_truth')[fundamental_cols]
features[fundamental_cols] = fundamental_by_class.transform(lambda col: col.fillna(col.median()))

In [9]:
# Point markers for the box plots: red for petrels, blue for non-petrels.
marker_petrels = dict(color='#FF4136', size=2)
marker_nonpetrels = dict(color='#0000FF', size=2)

In [10]:
# Build one horizontal box-plot row per WarbleR feature, comparing petrels with
# non-petrels, and upload the figure to Plotly.
no_features = len(feature_names)

# print_grid=False keeps make_subplots from dumping a line per subplot
# ("This is the format of your plot grid: ...") into the cell output.
fig = tools.make_subplots(
    rows=no_features,
    subplot_titles=tuple(desc for _, desc in legend_pairs),
    print_grid=False,
)

# NOTE(review): `petrels` and `noise` were sliced from `features` before the
# fundamental-frequency NaNs were imputed, so these plots show the un-imputed
# values. Re-slice after the imputation if the filled values should be plotted.
class_traces = (
    ('petrels', petrels, marker_petrels),
    ('non-petrels', noise, marker_nonpetrels),
)

for row, name in enumerate(feature_names, start=1):
    # Same trace order as before: petrels first, then non-petrels, in each row.
    for label, subset, marker in class_traces:
        trace = go.Box(x=subset[name], name=label, marker=marker, boxpoints='all', boxmean='sd')
        fig.append_trace(trace, row, 1)

# 400 px per feature row keeps each box plot readable in the tall single-column grid.
fig['layout'].update(height=no_features * 400, width=1200, title='Boxplot for WarbleR features', showlegend=False)
py.plot(fig, filename='WarbleR boxplot')

This is the format of your plot grid:
[ (1,1) x1,y1 ]   
[ (2,1) x2,y2 ]   
[ (3,1) x3,y3 ]   
[ (4,1) x4,y4 ]   
[ (5,1) x5,y5 ]   
[ (6,1) x6,y6 ]   
[ (7,1) x7,y7 ]   
[ (8,1) x8,y8 ]   
[ (9,1) x9,y9 ]   
[ (10,1) x10,y10 ]
[ (11,1) x11,y11 ]
[ (12,1) x12,y12 ]
[ (13,1) x13,y13 ]
[ (14,1) x14,y14 ]
[ (15,1) x15,y15 ]
[ (16,1) x16,y16 ]
[ (17,1) x17,y17 ]
[ (18,1) x18,y18 ]
[ (19,1) x19,y19 ]
[ (20,1) x20,y20 ]
[ (21,1) x21,y21 ]
[ (22,1) x22,y22 ]
[ (23,1) x23,y23 ]
[ (24,1) x24,y24 ]
[ (25,1) x25,y25 ]
[ (26,1) x26,y26 ]

