In [11]:
import os
import glob
import pandas as pd
from collections import OrderedDict, namedtuple
from plotly import tools

Load features computed for the predetermined sound samples.

In [2]:
# Feature table for the predetermined sound samples; pandas assigns its
# default integer row index because no index column is requested.
path = '/mnt/data/Birdman/samples/features/features_petrels_bp1-8_wl256_th4.csv'
features = pd.read_csv(path)

In [3]:
# (column name, human-readable description) pairs for the columns we keep.
# The order of this tuple is also the column order of the filtered table.
selected_features = (
    ('petrel', 'presence of storm petrel'),
    ('sound.files', 'name of the file'),
    ('meanfreq', 'mean frequency (in kHz)'),
    ('sd', 'standard deviation of frequency'),
    ('freq.Q75', 'third quantile (in kHz)'),
    ('freq.IQR', 'interquantile range (in kHz)'),
    ('skew', 'skewness - asymmetry of the spectrum'),
    ('kurt', 'kurtosis - peakedness of the spectrum'),
    ('sp.ent', 'spectral entropy'),
    ('sfm', 'spectral flatness'),
    ('meanfun', 'average of fundamental frequency'),
    ('maxfun', 'maximum fundamental frequency'),
    ('meandom', 'average of dominant frequency'),
    ('dfrange', 'range of dominant frequency'),
    ('modindx', 'modulation index'),
    ('meanpeakf', 'mean peak frequency'),
)

# Ordered lookup: column name -> description.
feature_legend = OrderedDict(selected_features)
# Iterating an OrderedDict yields its keys in insertion order.
selected_features_names = list(feature_legend)
# Keep only the selected columns and replace missing values with 0.
features = features.loc[:, selected_features_names].fillna(0)

In [4]:
# Class balance: number of rows per value of the 'petrel' label column.
petrel_labels = features['petrel']
petrel_count = petrel_labels.value_counts()
print(petrel_count)

0    2482
1    1281
Name: petrel, dtype: int64


`1` denotes petrel, `0` the lack thereof (we'll call it *noise* for brevity). The classes are imbalanced in favour of non-petrel (noise) samples, primarily because of three files whose selected regions contain only noise.

### Subsampling
The `STHELENA-02_20140605_*` files contain relatively few samples compared to the top 3 files. For training, we're going to take all samples from `STHELENA-02_20140605_*` and subsample the top 3. It should be noted that the top 3 files contain no petrels.

In [5]:
# Samples contributed by each sound file; value_counts() sorts descending,
# so the first three index entries are the three largest files.
count_per_file = features['sound.files'].value_counts()
top3_names = count_per_file.index[:3].tolist()
rest_names = count_per_file.index[3:].tolist()
print(f'There are {len(features)} samples in total\n')
print(count_per_file)

There are 3763 samples in total

STHELENA-01_20140106_210000_0-15min.wav       840
STHELENA-02_20140108_210100_110-120min.wav    600
STHELENA-01_20140101_210000_55-105min.wav     600
STHELENA-02_20140605_200000_1.wav             209
STHELENA-02_20140605_200000_10.wav            185
STHELENA-02_20140605_200000_7.wav             178
STHELENA-02_20140605_200000_11.wav            164
STHELENA-02_20140605_200000_2.wav             156
STHELENA-02_20140605_200000_8.wav             138
STHELENA-02_20140605_200000_4.wav             135
STHELENA-02_20140605_200000_3.wav             134
STHELENA-02_20140605_200000_5.wav             132
STHELENA-02_20140605_200000_12.wav            104
STHELENA-02_20140605_200000_9.wav              89
STHELENA-02_20140605_200000_6.wav              51
STHELENA-02_20140605_200000_13.wav             48
Name: sound.files, dtype: int64


In [6]:
# Split the samples into those from the three largest files and those from
# all remaining files.
features_top3 = features[features['sound.files'].isin(top3_names)]
features_rest = features[features['sound.files'].isin(rest_names)]

# Count labels with the vectorised Series.sum() rather than the builtin
# sum(), which would iterate the boolean Series element by element.
noise_samples_top3 = (features_top3['petrel'] == 0).sum()
noise_samples_rest = (features_rest['petrel'] == 0).sum()
petrels_samples_top3 = (features_top3['petrel'] == 1).sum()
petrels_samples_rest = (features_rest['petrel'] == 1).sum()

# The subsampling below draws noise only from the top 3 files, so they must
# not contain any petrel samples.
assert petrels_samples_top3 == 0, (
    f'expected no petrel samples in the top 3 files, found {petrels_samples_top3}')

# Noise rows missing from the remaining files to match their petrel count.
extra_noise_to_sample = petrels_samples_rest - noise_samples_rest
print(f'We need {extra_noise_to_sample} extra samples to have equal population of noise and petrel features')

We need 839 extra samples to have equal population of noise and petrel features


In [7]:
# Draw the extra noise rows from the top-3 files with a fixed seed so the
# selection is repeatable; the rows that were not drawn are set aside.
extra_noise_top3_df = features_top3.sample(n=extra_noise_to_sample, random_state=42)
features_top3_extra_removed_df = features_top3.drop(index=extra_noise_top3_df.index)
print(f'The leftover {len(features_top3_extra_removed_df)} will be used for testing purposes')

The leftover 1201 will be used for testing purposes


In [8]:
# Stack the sampled noise rows on top of everything from the remaining files.
# drop=False (the default) keeps the old row labels as an 'index' column,
# which is visible in the preview below.
df = pd.concat([extra_noise_top3_df, features_rest], axis=0).reset_index(drop=False)
df.head().transpose()

Unnamed: 0,0,1,2,3,4
index,1756,1440,1508,2407,969
petrel,0,0,0,0,0
sound.files,STHELENA-02_20140108_210100_110-120min.wav,STHELENA-02_20140108_210100_110-120min.wav,STHELENA-02_20140108_210100_110-120min.wav,STHELENA-01_20140101_210000_55-105min.wav,STHELENA-01_20140106_210000_0-15min.wav
meanfreq,3.60811,3.58382,3.59702,3.45414,3.93983
sd,1.82977,1.82665,1.8226,1.7458,1.75462
freq.Q75,5.202,5.164,5.158,4.883,5.453
freq.IQR,3.314,3.267,3.237,2.974,3.06
skew,1.89412,1.83392,1.78846,1.48091,0.658565
kurt,9.50197,9.44598,8.89957,6.3322,3.33522
sp.ent,0.979052,0.978609,0.978441,0.977618,0.983968


Now that we see that the results are OK, we can preprocess all the data. **One change though**: we will concatenate all the noise together and, for ML training, simply cut off 1201 samples from the bottom.

In [12]:
# All raw feature tables to preprocess. glob returns matches in a
# filesystem-dependent order, so sort them to make the processing order
# (and the printed log) reproducible across machines.
features_path = sorted(glob.glob('/mnt/data/Birdman/samples/features/raw/features*.csv'))

In [13]:
def balance_noise_with_petrels(features, n_top_files=3, random_state=42):
    """Reorder a feature table so noise and petrel samples can be balanced.

    The ``n_top_files`` files with the most rows (by ``'sound.files'``) are
    treated as the noise pool. From that pool, as many rows are sampled as
    the remaining files are short of noise rows relative to their petrel
    rows (``'petrel' == 1`` minus ``'petrel' == 0``).

    Parameters
    ----------
    features : pandas.DataFrame
        Table with at least the columns ``'sound.files'`` and ``'petrel'``.
    n_top_files : int, default 3
        Number of largest files that form the noise pool.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the selection is repeatable.

    Returns
    -------
    pandas.DataFrame
        Rows in the order: unsampled noise-pool rows, sampled noise-pool
        rows, rows of all remaining files; with a fresh 0..n-1 index.

    Raises
    ------
    AssertionError
        If the noise pool contains any petrel sample.
    ValueError
        If the number of rows to sample is negative or exceeds the pool size.
    """
    # value_counts() sorts descending, so the first entries are the largest files.
    samples_per_file = features['sound.files'].value_counts()
    top_names = samples_per_file.index[:n_top_files].tolist()
    rest_names = samples_per_file.index[n_top_files:].tolist()

    pool = features[features['sound.files'].isin(top_names)]
    rest = features[features['sound.files'].isin(rest_names)]

    # Vectorised counts instead of the builtin sum() over a boolean Series.
    petrels_in_pool = int((pool['petrel'] == 1).sum())
    petrels_in_rest = int((rest['petrel'] == 1).sum())
    noise_in_rest = int((rest['petrel'] == 0).sum())

    # The pool is sampled as pure noise, so it must not contain petrels.
    assert petrels_in_pool == 0, (
        f'expected no petrel samples in the top {n_top_files} files, '
        f'found {petrels_in_pool}')

    n_extra = petrels_in_rest - noise_in_rest
    if not 0 <= n_extra <= len(pool):
        # DataFrame.sample would also raise ValueError here; fail with a
        # message that names the actual numbers instead.
        raise ValueError(
            f'cannot sample {n_extra} noise rows from a pool of {len(pool)} rows')

    sampled = pool.sample(n=n_extra, random_state=random_state)
    leftover = pool.drop(index=sampled.index)

    # NOTE(review): the leftover rows are placed FIRST, while the markdown
    # above talks about cutting samples off "from the bottom" - confirm which
    # end downstream training code is expected to trim.
    # drop=True discards the old row labels directly, instead of adding an
    # 'index' column and dropping it again.
    return pd.concat([leftover, sampled, rest]).reset_index(drop=True)


# Preprocess every raw feature table with the same recipe.
for path in features_path:
    print(path)
    filename = os.path.splitext(os.path.basename(path))[0]
    features = pd.read_csv(path, index_col=None).fillna(0)
    df = balance_noise_with_petrels(features)
    # The result is written to the current working directory under the
    # input's base name; running from the raw directory itself would
    # overwrite the inputs.
    df.to_csv(filename + '.csv', index=False)

/mnt/data/Birdman/samples/features/raw/features_petrels_bp1-6_wl256_th2.csv
/mnt/data/Birdman/samples/features/raw/features_petrels_bp1-8_wl256_th10.csv
/mnt/data/Birdman/samples/features/raw/features_petrels_bp1-8_wl256_th2.csv
/mnt/data/Birdman/samples/features/raw/features_petrels_bp1-8_wl256_th4.csv
/mnt/data/Birdman/samples/features/raw/features_petrels_bp1-12_wl256_th4.csv
