In [29]:
%matplotlib inline

from comet_ml import Experiment

import os
import warnings
import plotutils
import xgboost as xg
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from collections import OrderedDict, namedtuple
from plotly import tools
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, accuracy_score

warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)

Load features computed for the predetermined sound samples.

In [2]:
# Location of the pre-computed acoustic features. The original absolute path
# stays the default so existing setups keep working; set PETREL_FEATURES_CSV
# to load the same CSV from another location.
path = os.environ.get(
    'PETREL_FEATURES_CSV',
    '/mnt/data/Birdman/samples/features/features_petrels_bp1-8_wl256_th4.csv')
features = pd.read_csv(path, index_col=None)

## Feature selection
The features were selected based on an analysis of their covariance and on visual inspection, presented [here](https://plot.ly/~tracewsl/43).

In [3]:
# Columns kept from the raw feature table, each mapped to a short description.
# Insertion order is the column order used by the cells that follow.
feature_legend = OrderedDict([
    ('petrel', 'presence of storm petrel'),
    ('sound.files', 'name of the file'),
    ('meanfreq', 'mean frequency (in kHz)'),
    ('sd', 'standard deviation of frequency'),
    ('freq.Q75', 'third quantile (in kHz)'),
    ('freq.IQR', 'interquantile range (in kHz)'),
    ('skew', 'skewness - asymmetry of the spectrum'),
    ('kurt', 'kurtosis - peakedness of the spectrum'),
    ('sp.ent', 'spectral entropy'),
    ('sfm', 'spectral flatness'),
    ('meanfun', 'average of fundamental frequency'),
    ('maxfun', 'maximum fundamental frequency'),
    ('meandom', 'average of dominant frequency'),
    ('dfrange', 'range of dominant frequency'),
    ('modindx', 'modulation index'),
    ('meanpeakf', 'mean peak frequency'),
])

# The same content as a tuple of (name, description) pairs and as a list of names.
selected_features = tuple(feature_legend.items())
selected_features_names = list(feature_legend)
# Restrict the table to the selected columns and replace missing values with 0.
features = features.loc[:, selected_features_names].fillna(0)

In [4]:
# Class balance: number of samples per value of the `petrel` label.
petrel_count = features.loc[:, 'petrel'].value_counts()
print(petrel_count)

0    2482
1    1281
Name: petrel, dtype: int64


`1` denotes a petrel, `0` the lack thereof (we'll call it *noise* for brevity). The classes are imbalanced in favour of non-petrel samples, primarily because of three files whose selected regions contain only noise.

### Subsampling
The STHELENA-02_20140605_* files contain relatively few samples compared to the following three files:
```
STHELENA-01_20140106_210000_0-15min.wav       840
STHELENA-02_20140108_210100_110-120min.wav    600
STHELENA-01_20140101_210000_55-105min.wav     600
```
For training, we're going to take all samples from STHELENA-02_20140605_* and subsample the three files listed above. The data set is already organised such that the first 1201 rows from `path` are non-petrels coming from those three files. We're going to split the dataframe into two: noise-only and mixed.

In [25]:
# Rows before `split_idx` form the noise-only frame; the remainder is the
# mixed frame that the rest of the notebook works on.
# NOTE(review): the accompanying text speaks of the first 1201 rows, while this
# split puts 1200 rows into the noise-only frame - confirm which is intended.
split_idx = 1200
noise_only_df = features.iloc[:split_idx]
df = features.iloc[split_idx:]

# Partition the mixed frame by label.
petrels = df[df['petrel'] == 1]
nonpetrels = df[df['petrel'] == 0]

print(
    f'Noise-only count: {len(noise_only_df)}',
    f'Mixed noise: {len(nonpetrels)}',
    f'Mixed petrels: {len(petrels)}',
    sep='\n',
)

Noise-only count: 1200
Mixed noise: 1282
Mixed petrels: 1281


In [26]:
# Peek at the first three rows of the mixed frame.
df.iloc[:3]

Unnamed: 0,petrel,sound.files,meanfreq,sd,freq.Q75,freq.IQR,skew,kurt,sp.ent,sfm,meanfun,maxfun,meandom,dfrange,modindx,meanpeakf
1200,0,STHELENA-01_20140101_210000_55-105min.wav,3.449775,1.749425,4.891,3.015,1.475437,6.202251,0.977536,0.786448,4.855395,5.333333,1.575387,2.9375,20.787234,0.913194
1201,0,STHELENA-02_20140108_210100_110-120min.wav,3.60811,1.829769,5.202,3.314,1.894124,9.501967,0.979052,0.816575,5.333333,5.333333,1.278226,5.125,6.146341,0.97619
1202,0,STHELENA-02_20140108_210100_110-120min.wav,3.583817,1.826647,5.164,3.267,1.833918,9.445981,0.978609,0.825397,5.333333,5.333333,1.424731,5.0625,11.407407,0.97619


In [27]:
# The file name is an identifier, not a feature: remove that column.
df = df.drop(labels=['sound.files'], axis='columns')

In [28]:
# `pop` removes the label column from `df` in place, so after this cell `df`
# holds only the feature columns that make up `X`. Re-running this cell
# without first re-creating `df` raises a KeyError ('petrel' is already gone).
y = df.pop('petrel')
X = df.values

In [52]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [55]:
# Fit the standardisation on the training split only, then apply the same
# transformation to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [54]:
# Grid of candidate values handed to the grid search, one list per parameter.
hyperparams = dict(
    n_estimators=[100, 300],
    learning_rate=[0.1],
    gamma=[0.0, 0.5],
    max_depth=[2, 3, 5],
    min_child_weight=[1, 2],
    subsample=[1, 0.8],
    reg_alpha=[0, 0.1],
    reg_lambda=[1],
)

In [None]:
# The Comet API key is a secret and must not be stored in the notebook: it is
# read from the COMET_API_KEY environment variable instead. The key that used
# to be hard-coded here should be treated as leaked and revoked.
experiment = Experiment(api_key=os.environ.get('COMET_API_KEY'),
                        project_name="general", workspace="tracewsl")

In [None]:
# Attach the hyper-parameter search grid to the experiment. Note that this is
# the full grid of candidates, not the combination eventually selected.
experiment.log_multiple_params(hyperparams)

In [56]:
# Base learner: gradient-boosted trees with a binary logistic objective.
estimator = xg.XGBClassifier(n_jobs=-1, objective='binary:logistic')

# Exhaustive search over `hyperparams` using 4-fold cross-validation.
# No `scoring` is passed, so the library default applies.
clf = model_selection.GridSearchCV(
    estimator=estimator,
    param_grid=hyperparams,
    cv=4,
)

In [57]:
# Run the grid search on the training split. `GridSearchCV.fit` returns the
# search object itself, so `fit_params` is just another name for the fitted `clf`.
clf.fit(X_train, y_train)
fit_params = clf

In [None]:
# Best model found by the grid search.
est = fit_params.best_estimator_

In [None]:
# Per-feature importances of the selected model.
# presumably in the column order of `X` - confirm.
est.feature_importances_

In [None]:
# Hyper-parameter combination chosen by the grid search.
fit_params.best_params_

In [None]:
# Predictions of the selected model on the held-out (already scaled) test split.
test_pred = est.predict(X_test)

In [None]:
# Only `from sklearn import ...` forms are used elsewhere in the notebook, and
# those do not bind the name `sklearn`; import the package itself so the
# version lookup does not fail with a NameError.
import sklearn

sklearn.__version__

In [None]:
# `log_multiple_metrics` needs a dict of metric name -> value; calling it with
# no argument fails. Score the held-out predictions with the metric functions
# imported at the top of the notebook and log them with the experiment.
test_metrics = {
    'accuracy': float(accuracy_score(y_test, test_pred)),
    'precision': float(precision_score(y_test, test_pred)),
    'recall': float(recall_score(y_test, test_pred)),
    'f1': float(f1_score(y_test, test_pred)),
}
experiment.log_multiple_metrics(test_metrics)

# Show the values in the notebook as well.
test_metrics

In [None]:
# Importances labelled with their feature names and sorted, most important
# first. At this point `df` holds exactly the columns `X` was built from
# ('sound.files' was dropped and 'petrel' popped earlier), so the positions
# line up with the columns the model was trained on.
pd.Series(est.feature_importances_, index=df.columns).sort_values(ascending=False)

In [None]:
# First five rows, transposed so that each feature is shown on its own line.
df.iloc[:5].transpose()

In [30]:
import numpy as np

In [47]:
# Toy labels for inspecting `confusion_matrix` when the predicted label (0)
# never occurs among the true labels (1 and 2).
y_true = np.array((1, 2))
y_pred = np.array((0, 0))

In [48]:
# Confusion matrix for the toy labels above.
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[0, 0, 0],
       [1, 0, 0],
       [1, 0, 0]])