In [1]:
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
import numpy as np
import argparse
import h5py
import math
import time
import logging
import matplotlib.pyplot as plt

import pandas as pd
import librosa

import torch
torch.backends.cudnn.benchmark=True
torch.manual_seed(0)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data

from models import *

Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit


In [2]:
MODEL_TYPE = "Cnn6" #Transfer_
MODEL_PATH = "pretrained_models/Cnn6_mAP=0.343.pth"
LABELS_PATH = "filtered_birds_df.csv"
DATA_PATH = "audiodata/wav"

NB_SPECIES = 527

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
df = pd.read_csv(LABELS_PATH)

In [4]:
df.head()

Unnamed: 0,iD,quality,length,country,url,birdName
0,548389,A,59,United Kingdom,www.xeno-canto.org/548389/download,Sturnus vulgaris
1,546936,A,32,France,www.xeno-canto.org/546936/download,Sturnus vulgaris
2,546935,A,65,France,www.xeno-canto.org/546935/download,Sturnus vulgaris
3,543800,A,36,Sweden,www.xeno-canto.org/543800/download,Sturnus vulgaris
4,542770,A,44,Poland,www.xeno-canto.org/542770/download,Sturnus vulgaris


In [5]:
class Transfer_Cnn6(nn.Module):
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num, freeze_base):
        """Classifier for a new task using pretrained Cnn14 as a sub module.
        """
        super(Transfer_Cnn6, self).__init__()
        audioset_classes_num = 527
        
        self.base = Cnn6(sample_rate, window_size, hop_size, mel_bins, fmin, 
            fmax, audioset_classes_num)

        # Transfer to another task layer
        self.fc_transfer = nn.Linear(512, classes_num, bias=True)

        if freeze_base:
            # Freeze AudioSet pretrained layers
            for param in self.base.parameters():
                param.requires_grad = False

        self.init_weights()

    def init_weights(self):
        init_layer(self.fc_transfer)

    def load_from_pretrain(self, pretrained_checkpoint_path):
        checkpoint = torch.load(pretrained_checkpoint_path, map_location=torch.device(DEVICE))
        self.base.load_state_dict(checkpoint['model'])

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, data_length)
        """
        output_dict = self.base(input, mixup_lambda)
        embedding = output_dict['embedding']

        clipwise_output =  torch.log_softmax(self.fc_transfer(embedding), dim=-1)
        output_dict['clipwise_output'] = clipwise_output
 
        return output_dict

In [6]:
window_size = 1024
hop_size = 320
mel_bins = 64
fmin = 50
fmax = 14000
model_type = MODEL_TYPE
pretrained_checkpoint_path = MODEL_PATH
freeze_base = False

sample_rate = 14000
classes_num = NB_SPECIES
pretrain = True if pretrained_checkpoint_path else False

In [7]:
# Model
Model = eval(model_type)
model = Model(sample_rate, window_size, hop_size, mel_bins, fmin, fmax, 
    classes_num)#, freeze_base)

print(model)

# Load pretrained model
# if pretrain:
#     logging.info('Load pretrained model from {}'.format(pretrained_checkpoint_path))
#     model.load_from_pretrain(pretrained_checkpoint_path)

checkpoint = torch.load(pretrained_checkpoint_path, map_location=torch.device(DEVICE))
model.load_state_dict(checkpoint['model'])



# Parallel
print('GPU number: {}'.format(torch.cuda.device_count()))
model = torch.nn.DataParallel(model)

if 'cuda' in DEVICE:
    model.to(DEVICE)

print('Load pretrained model successfully!')



Cnn6(
  (spectrogram_extractor): Spectrogram(
    (stft): STFT(
      (conv_real): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
      (conv_imag): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
    )
  )
  (logmel_extractor): LogmelFilterBank()
  (spec_augmenter): SpecAugmentation(
    (time_dropper): DropStripes()
    (freq_dropper): DropStripes()
  )
  (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_block1): ConvBlock5x5(
    (conv1): Conv2d(1, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv_block2): ConvBlock5x5(
    (conv1): Conv2d(64, 128, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), bias=False)
    (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv_block3): ConvBlock5x5(
    (conv1): Conv2d(128, 256, kernel_size=(5, 5), stride

In [8]:
def move_data_to_device(x, device):
    if 'float' in str(x.dtype):
        x = torch.Tensor(x)
    elif 'int' in str(x.dtype):
        x = torch.LongTensor(x)
    else:
        return x

    return x.to(device)

In [40]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
offset=5
duration=10
birdName = "linaria_cannabina"
filename = "linaria_cannabina_120708.wav"
audio_path = f"{DATA_PATH}/{birdName}/{filename}"

(waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True,offset=offset,duration=duration)

waveform = waveform[None, :]    # (1, audio_length)
waveform = move_data_to_device(waveform, device)
batch_output_dict = model(waveform, None)

In [41]:
clipwise_output = batch_output_dict['clipwise_output'].data.cpu().numpy()[0]

sorted_indexes = np.argsort(clipwise_output)[::-1]

In [42]:
#Print audio tagging top probabilities
labels = pd.read_csv("audioset_tagging_cnn_class_labels_indices.csv")
for k in range(10):
    print('{}: {:.3f}'.format(np.array(labels)[sorted_indexes[k]], 
        clipwise_output[sorted_indexes[k]]))

[137 '/m/04rlf' 'Music']: 0.394
[0 '/m/09x0r' 'Speech']: 0.063
[300 '/m/07yv9' 'Vehicle']: 0.059
[138 '/m/04szw' 'Musical instrument']: 0.038
[329 '/m/07jdr' 'Train']: 0.028
[239 '/m/02lkt' 'Electronic music']: 0.023
[496 '/m/07rcgpl' 'Hum']: 0.019
[332 '/m/01g50p' 'Railroad car, train wagon']: 0.019
[72 '/m/0jbk' 'Animal']: 0.018
[328 '/m/06d_3' 'Rail transport']: 0.018


In [30]:
np.array(labels)[34]

array([34, '/t/dd00005', 'Child singing'], dtype=object)

In [31]:
np.argmax(clipwise_output)

137

In [44]:
np.max(clipwise_output)

0.39386606

In [24]:
len(embedding)

512