In [1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel. Mode 3 reloads all modules before
# each execution and also picks up objects newly added to a module
# (see the IPython autoreload documentation).
%load_ext autoreload
%autoreload 3

In [2]:
from pyha_analyzer import PyhaTrainer, PyhaTrainingArguments, extractors

# Loading in Data

In [3]:
# Sorry TQ, I don't have these files mounted yet...
# peru132_extr = extractors.Peru132Extractor()
# peru_132_ads = peru132_extr("/data/XC_wav")

In [4]:
# peru_132_ads = peru_132_ads.get_provenance()

In [5]:
# Build the BirdSet extractor and use it to load the "HSN" subset.
# NOTE(review): `birdset_extactor` is a misspelling of `birdset_extractor`;
# left unchanged here in case other cells reference the name.
birdset_extactor = extractors.Birdset()

hsn_ads = birdset_extactor("HSN")
# per_ads = birdset_extactor("PER")

# Display the resulting DatasetDict (splits, feature columns, row counts).
hsn_ads

DatasetDict({
    train: Dataset({
        features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length', 'microphone', 'license', 'source', 'local_time', 'detected_events', 'event_cluster', 'peaks', 'quality', 'recordist', 'genus', 'species_group', 'order', 'genus_multilabel', 'species_group_multilabel', 'order_multilabel', 'audio_in', 'labels'],
        num_rows: 4368
    })
    valid: Dataset({
        features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length', 'microphone', 'license', 'source', 'local_time', 'detected_events', 'event_cluster', 'peaks', 'quality', 'recordist', 'genus', 'species_group', 'order', 'genus_multilabel', 'species_group_multilabel', 'order_multilabel', 'audio_in', 'labels'],
        num_rows: 1092
    }

# Online Preprocessing example

Suppose we just want spectrograms, with no other audio preprocessing.

In [6]:
# Peek at the raw "audio" entry of the first test example before any transform
# is attached: per the output it is a dict with 'bytes' and 'path' keys
# (a reference to the file on disk), not decoded samples.
hsn_ads["test"][0]["audio"]

{'bytes': None,
 'path': '/home/s.perry.543/.cache/huggingface/datasets/downloads/extracted/e7a5318118cabfab47a509edeb627860a60537535aeea20a19fced4c110d579e/HSN_001_20150708_061805_000_005.ogg'}

In [7]:
# Show the complete raw record of the first training example, including all
# of its metadata columns.
hsn_ads["train"][0]

{'audio': {'bytes': None,
  'path': '/home/s.perry.543/.cache/huggingface/datasets/downloads/extracted/053832b39de8ba5377824eb2d8364a1e31d9355ede4974f3741a1daa9822e8ab/XC535222.ogg'},
 'filepath': '/home/s.perry.543/.cache/huggingface/datasets/downloads/extracted/053832b39de8ba5377824eb2d8364a1e31d9355ede4974f3741a1daa9822e8ab/XC535222.ogg',
 'start_time': None,
 'end_time': None,
 'low_freq': None,
 'high_freq': None,
 'ebird_code': 15,
 'ebird_code_multilabel': [15],
 'ebird_code_secondary': [],
 'call_type': 'song',
 'sex': None,
 'lat': 67.00975,
 'long': -154.70158,
 'length': 44,
 'microphone': 'focal',
 'license': '//creativecommons.org/licenses/by-nc-sa/4.0/',
 'source': 'xenocanto',
 'local_time': '08:46:17',
 'detected_events': [[2.112, 3.408],
  [10.096, 11.536],
  [19.2, 20.672],
  [27.168, 28.656],
  [35.456, 36.896]],
 'event_cluster': [0, -1, 0, 0, -1],
 'peaks': [10.81238268483804,
  11.00885157447716,
  11.205226714049646,
  19.90463914710638,
  20.115545547006878,
  2

In [8]:
from pyha_analyzer.preprocessors import MelSpectrogramPreprocessors

# The preprocessor object is used like a function; building it from a class
# lets us fix its configuration (clip duration, class list) up front.
hsn_class_names = hsn_ads["train"].features["labels"].feature.names
preprocessor = MelSpectrogramPreprocessors(
    duration=5,
    class_list=hsn_class_names,
)

# Transforms are attached per split. Every split shares the same preprocessor
# here; if it applied data augmentations, the evaluation splits (valid/test)
# would probably want a separate, augmentation-free preprocessor.
for split_name in ("train", "valid", "test"):
    hsn_ads[split_name].set_transform(preprocessor)

# Fetch two training rows through the transform and show the first "audio" entry.
hsn_ads["train"][[0, 1]]["audio"][0]

array([[[0.00392157, 0.01568628, 0.08235294, ..., 0.00392157,
         0.00784314, 0.01176471],
        [0.        , 0.00392157, 0.00784314, ..., 0.00392157,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ]]], shape=(1, 128, 216), dtype=float32)

In [9]:
# Labels for the same two training rows after the transform: per the output,
# one float vector per example with 1.0 at the labeled class index
# (row 0 is hot at index 15, matching its raw ebird_code of 15).
hsn_ads["train"][[0, 1]]["labels"]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1.]])

# Model Training

As a demo, we implement a model here because we haven't fleshed out the AudioDataset API yet.

In the future, this will live in `/pyha_analyzer/models/`.

In [10]:
from pyha_analyzer.models.demo_CNN import ResnetConfig, ResnetModel

# Going to make notes on anything that should be handled not here
# This is one of these things, this should be handled by potentially pyha_trainer
#
# Size the output layer from the same "labels" class list that the
# preprocessor is configured with, so the model's output dimension always
# matches the label vectors it is trained against (rather than reading the
# class count from a different column, "ebird_code").
resnet50d_config = ResnetConfig(
    num_classes=len(hsn_ads["train"].features["labels"].feature.names),
    input_channels=1,  # the preprocessed spectrograms have one channel: shape (1, 128, 216)
)
model = ResnetModel(resnet50d_config)

In [11]:
# Configure a short demo run: one pass (epoch) over the training split.
args = PyhaTrainingArguments(
    working_dir="working_dir"
)
args.num_train_epochs = 1
# NOTE(review): the recorded run below finished after 69 steps, so an
# eval_steps of 500 is never reached within this single epoch — presumably why
# the Step / Training Loss / Validation Loss table is empty. Confirm, and lower
# this value if mid-training evaluation is wanted.
args.eval_steps = 500

# The trainer receives the model and the whole DatasetDict (all splits).
trainer = PyhaTrainer(
    model=model,
    dataset=hsn_ads,
    training_args=args
)
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mshperry[0m ([33macoustic-species-identification[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss


TrainOutput(global_step=69, training_loss=0.4575408703999795, metrics={'train_runtime': 382.1448, 'train_samples_per_second': 11.43, 'train_steps_per_second': 0.181, 'total_flos': 0.0, 'train_loss': 0.4575408703999795, 'epoch': 1.0})

In [12]:
# Evaluate the trained model on the "test" split, passing "Soundscape" as the
# metric key prefix.
trainer.evaluate(eval_dataset=hsn_ads["test"], metric_key_prefix="Soundscape")

#