# Notes

In this kernel, I'm going to use a classical **ResNeSt50** for bird identification.

In [1]:
# Use the ResNeSt package if it is already importable; otherwise install it
# with pip from the local path below.
try:
    import resnest
except ModuleNotFoundError:
    # `-q` keeps pip's output quiet.
    !pip install -q "../input/resnest50-fast-package/resnest-0.0.6b20200701/resnest"

In [2]:
import numpy as np
import librosa as lb
import soundfile as sf
import pandas as pd
import cv2
from pathlib import Path
import librosa
import re

import torch
from torch import nn
from  torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from tqdm.notebook import tqdm

import time
from resnest.torch import resnest50_fast_1s1x64d
from resnest.torch import resnest50
import torchvision
import pytorch_lightning as pl
!pip install ../input/audiomentations/audiomentations-0.15.0-py3-none-any.whl

import audiomentations
from torchvision.models.resnet import ResNet, Bottleneck

Processing /kaggle/input/audiomentations/audiomentations-0.15.0-py3-none-any.whl
Installing collected packages: audiomentations
Successfully installed audiomentations-0.15.0


# Configs

In [3]:
# Number of target classes.
# NOTE(review): the visible cells use ModelConfig.num_classes / a literal 397
# rather than this constant — confirm it is still referenced further down.
NUM_CLASSES = 397 
# Sample rate; used as the default `sr` of BirdCLEFDataset.
SR = 32_000
# NOTE(review): THRESH and WEIGHT are not used in the visible cells; presumably a
# decision threshold and a blending weight — confirm against the inference code.
THRESH = 0.18
WEIGHT = 0.00

# Window length in seconds; BirdCLEFDataset multiplies it by the sample rate
# to get the number of samples per window.
DURATION = 5
TOP_N = 3 # max preds = 3

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", DEVICE)

# Default inputs: the test soundscapes and the sample submission file.
TEST_AUDIO_ROOT = Path("../input/birdclef-2021/test_soundscapes")
SAMPLE_SUB_PATH = "../input/birdclef-2021/sample_submission.csv"
TARGET_PATH = None
    
# When the test folder holds no ".ogg" files, switch to the train soundscapes,
# drop the sample-submission path, and point TARGET_PATH at their label file.
if not len(list(TEST_AUDIO_ROOT.glob("*.ogg"))):
    TEST_AUDIO_ROOT = Path("../input/birdclef-2021/train_soundscapes")
    SAMPLE_SUB_PATH = None
    # SAMPLE_SUB_PATH = "../input/birdclef-2021/sample_submission.csv"
    TARGET_PATH = Path("../input/birdclef-2021/train_soundscape_labels.csv")

DEVICE: cuda


# Data

In [4]:
class MelSpecComputer:
    """Callable that converts a waveform into dB-scaled mel spectrograms.

    Args:
        sr: Sample rate handed to librosa.
        n_mels: Number of mel bands.
        fmin: Lowest frequency of the mel filter bank.
        fmax: Highest frequency of the mel filter bank.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``librosa.feature.melspectrogram``. ``n_fft`` defaults to
            ``sr // 10`` and ``hop_length`` to ``sr // 40`` when not supplied.
    """

    def __init__(self, sr, n_mels, fmin, fmax, **kwargs):
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        kwargs.setdefault("n_fft", self.sr // 10)
        kwargs.setdefault("hop_length", self.sr // (10 * 4))
        self.kwargs = kwargs
        # Only needed by the normalised test-time-augmentation view, which is
        # currently disabled in __call__.
        self.tta1 = audiomentations.Normalize(p = 1.0)

    def __call__(self, y):
        """Return a list of float32 dB mel spectrograms, one per waveform view.

        Only one view (an untouched copy of ``y``) is produced at the moment, so
        the returned list has a single element.
        """
        # A second, normalised view could be appended here:
        #   self.tta1(y.copy(), sample_rate=32000)
        waveforms = [y.copy()]

        melspecs = []
        for waveform in waveforms:
            # The signal is passed by keyword: librosa >= 0.10 rejects a
            # positional waveform, while older releases accept `y=` as well.
            melspec = lb.feature.melspectrogram(
                y=waveform, sr=self.sr, n_mels=self.n_mels,
                fmin=self.fmin, fmax=self.fmax, **self.kwargs,
            )
            melspecs.append(lb.power_to_db(melspec).astype(np.float32))
        return melspecs

In [5]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Standardise an array and rescale it to the uint8 range [0, 255].

    Args:
        X: Numeric numpy array (e.g. a dB mel spectrogram).
        eps: Small constant added to ``std`` and used as the minimum value
            range below which the result is all zeros.
        mean: Value(s) subtracted from ``X``; ``X.mean()`` when ``None``.
        std: Value(s) ``X`` is divided by; ``X.std()`` when ``None``.

    Returns:
        A ``uint8`` array with the same shape as ``X``.
    """
    # `is None` instead of `or`: an explicit 0.0 must be honoured, and
    # array-valued statistics must not hit an ambiguous truth-value error.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        # Min-max scale to [0, 255]; values are truncated by the uint8 cast.
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # (Nearly) constant input: no usable dynamic range.
        V = np.zeros_like(X, dtype=np.uint8)

    return V

def crop_or_pad(y, length):
    """Force a 1-D signal to exactly ``length`` samples.

    Shorter signals are zero-padded at the end, longer ones are truncated, and
    signals that already have the requested length are returned unchanged.

    Args:
        y: 1-D array-like signal.
        length: Desired number of samples.

    Returns:
        The signal with ``length`` samples.
    """
    n_samples = len(y)
    if n_samples < length:
        # Append the missing samples as zeros, keeping the input dtype.
        padding = np.zeros(length - n_samples, dtype=np.asarray(y).dtype)
        y = np.concatenate([y, padding])
    elif n_samples > length:
        y = y[:length]
    return y

In [6]:
class BirdCLEFDataset(Dataset):
    """Dataset that turns each audio file into 3-channel spectrogram images.

    Each item is a pair ``(full_image, window_images)``: one image computed
    from the whole recording and a stack of images computed from fixed-length
    windows of that recording.

    Args:
        data: DataFrame with a ``"filepath"`` column, indexed so that
            ``data.loc[idx, "filepath"]`` resolves for every dataset index.
        sr: Target sample rate.
        n_mels: Number of mel bands.
        fmin: Lowest mel frequency.
        fmax: Highest mel frequency; ``sr // 2`` when falsy.
        duration: Window length in seconds.
        step: Hop between window starts in samples; one full window when falsy.
        res_type: Resampling method handed to ``librosa.resample``.
        resample: Whether to resample files whose rate differs from ``sr``.
    """

    def __init__(self, data, sr=SR, n_mels=128, fmin=0, fmax=None, duration=DURATION, step=None, res_type="kaiser_fast", resample=True):

        self.data = data

        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax or self.sr//2

        self.duration = duration
        self.audio_length = self.duration*self.sr
        self.step = step or self.audio_length

        self.res_type = res_type
        self.resample = resample

        self.mel_spec_computer = MelSpecComputer(sr=self.sr, n_mels=self.n_mels, fmin=self.fmin,
                                                 fmax=self.fmax)

        # ImageNet channel statistics, shaped (3, 1, 1) so they broadcast over
        # a (3, H, W) image.
        self.mean_train = torch.tensor(np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1))
        self.std_train = torch.tensor(np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1))
        self.stats = (self.mean_train, self.std_train) # ImageNet Stats

    def __len__(self):
        return len(self.data)

    def normalize(self, image):
        """Standardise ``image`` with the stored per-channel mean and std."""
        image = image - self.mean_train
        image = image / self.std_train
        return image

    def audio_to_image(self, audio):
        """Convert a waveform into a 3-channel image tensor.

        The channels are, in order: PCEN of the scaled mel spectrogram, the
        scaled mel spectrogram itself, and the scaled mel spectrogram raised
        to the power 1.5.
        """
        melspecs = self.mel_spec_computer(audio)
        images = []
        for melspec in melspecs:
            # uint8 image rescaled to [0, 1].
            mel = torch.tensor(mono_to_color(melspec)) / 255.0
            pcen = torch.tensor(lb.pcen(mel.numpy()))
            power = mel ** 1.5
            # `mel` and `power` are already tensors; stacking copies them, so
            # no extra torch.tensor(...) wrapping (which only warns) is needed.
            images.append(torch.stack([pcen, mel, power], dim=0))
        # Only the image of the first spectrogram view is used.
        return torch.stack(images)[0]

    def read_file(self, filepath):
        """Load one recording and build its full-length and windowed images.

        Returns:
            ``(images2, images)`` where ``images2`` is the image of the whole
            recording with its last time frame removed and ``images`` stacks
            one image per full-length window.

        Raises:
            ValueError: If the recording is shorter than one window.
        """
        old_audio, orig_sr = sf.read(filepath, dtype="float32")

        if self.resample and orig_sr != self.sr:
            # Rates are passed by keyword: librosa >= 0.10 rejects positional
            # rates, while older releases accept these keyword names too.
            old_audio = lb.resample(old_audio, orig_sr=orig_sr, target_sr=self.sr,
                                    res_type=self.res_type)

        # Window starts 0, step, 2*step, ...; the last window may run past the
        # end of the recording and is then dropped.
        starts = range(0, len(old_audio) + self.step - self.audio_length, self.step)
        audios = [old_audio[start:start + self.audio_length] for start in starts]
        if audios and len(audios[-1]) < self.audio_length:
            audios = audios[:-1]

        if not audios:
            raise ValueError(
                f"{filepath}: recording has {len(old_audio)} samples, fewer than "
                f"one window of {self.audio_length} samples"
            )

        images = torch.stack([self.audio_to_image(audio) for audio in audios])

        # Chop off the last time frame of the full-recording image (per the
        # original note this gives a clean 24000-frame image).
        images2 = self.audio_to_image(old_audio)[:, :, :-1]

        return images2, images

    def __getitem__(self, idx):
        return self.read_file(self.data.loc[idx, "filepath"])

In [7]:
# Build one row per ".ogg" file under TEST_AUDIO_ROOT. The file stem is split on
# "_" and must yield exactly three parts to fill the id / site / date columns.
data = pd.DataFrame(
     [(path.stem, *path.stem.split("_"), path) for path in Path(TEST_AUDIO_ROOT).glob("*.ogg")],
    columns = ["filename", "id", "site", "date", "filepath"]
)
print(data.shape)
data.head()

(20, 5)


Unnamed: 0,filename,id,site,date,filepath
0,20152_SSW_20170805,20152,SSW,20170805,../input/birdclef-2021/train_soundscapes/20152...
1,57610_COR_20190904,57610,COR,20190904,../input/birdclef-2021/train_soundscapes/57610...
2,7843_SSW_20170325,7843,SSW,20170325,../input/birdclef-2021/train_soundscapes/7843_...
3,42907_SSW_20170708,42907,SSW,20170708,../input/birdclef-2021/train_soundscapes/42907...
4,7019_COR_20190904,7019,COR,20190904,../input/birdclef-2021/train_soundscapes/7019_...


In [8]:
# Training metadata; only its "primary_label" column is used here.
df_train = pd.read_csv("../input/birdclef-2021/train_metadata.csv")

# Map each distinct primary label to an integer id, assigned in sorted label order.
LABEL_IDS = {label: label_id for label_id,label in enumerate(sorted(df_train["primary_label"].unique()))}
# Reverse lookup: integer id -> label.
INV_LABEL_IDS = {val: key for key,val in LABEL_IDS.items()}


# Model Config (Load Learned Weights)

# CNN 14

In [9]:
def init_layer(layer):
    """Xavier-uniform initialise ``layer.weight`` and zero its bias, if any."""
    nn.init.xavier_uniform_(layer.weight)

    # Layers without a `bias` attribute, or built with bias disabled, are left alone.
    bias = getattr(layer, "bias", None)
    if bias is not None:
        bias.data.fill_(0.)


def init_bn(bn):
    """Reset a batch-norm layer's affine parameters: bias to 0, weight to 1."""
    for param, value in ((bn.bias, 0.), (bn.weight, 1.0)):
        param.data.fill_(value)


def interpolate(x: torch.Tensor, ratio: int):
    """Upsample along the time axis by repeating every time step.

    Used to undo the time-resolution loss caused by a CNN's downsampling.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, number of copies made of each time step
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    # The unpacking doubles as a check that `x` is 3-dimensional.
    batch_size, time_steps, classes_num = x.shape
    return torch.repeat_interleave(x, ratio, dim=1)


def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Extend a framewise output to ``frames_num`` frames.

    The extra frames are copies of the last existing frame.

    Args:
      framewise_output: (batch_size, frames, classes_num)
      frames_num: int, total number of frames wanted
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    missing = frames_num - framewise_output.shape[1]
    last_frame = framewise_output[:, -1:, :]
    # (batch_size, missing, classes_num): the last frame repeated.
    filler = last_frame.repeat(1, missing, 1)
    return torch.cat((framewise_output, filler), dim=1)


class ConvBlock(nn.Module):
    """Two (3x3 conv -> batch norm -> ReLU) stages followed by a pooling step."""

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()

        # Both convolutions share the same geometry and have no bias term.
        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, **conv_kwargs)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        """(Re-)initialise the convolutions and the batch-norm layers."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for norm in (self.bn1, self.bn2):
            init_bn(norm)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Apply both conv stages, then pool according to ``pool_type``.

        ``pool_type`` is one of ``'max'``, ``'avg'`` or ``'avg+max'``; any other
        value raises ``Exception``. For ``'avg'`` with a pool size other than
        ``(1, 1)`` the pooling uses ``padding=1`` and ``stride=2``.
        """
        x = F.relu_(self.bn1(self.conv1(input)))
        x = F.relu_(self.bn2(self.conv2(x)))

        if pool_type == 'max':
            return F.max_pool2d(x, kernel_size=pool_size)

        if pool_type == 'avg':
            if pool_size == (1, 1):
                return F.avg_pool2d(x, kernel_size = (1, 1))
            return F.avg_pool2d(x, kernel_size=pool_size, padding = 1, stride = 2)

        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(x, kernel_size=pool_size)
            maxed = F.max_pool2d(x, kernel_size=pool_size)
            return averaged + maxed

        raise Exception('Incorrect argument!')


class AttBlock(nn.Module):
    """Attention pooling over time built from two 1x1 ``Conv1d`` layers."""

    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear",
                 temperature=1.0):
        super().__init__()

        self.activation = activation
        self.temperature = temperature

        def pointwise_conv():
            # 1x1 convolution mapping in_features -> out_features per time step.
            return nn.Conv1d(
                in_channels=in_features,
                out_channels=out_features,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=True)

        self.att = pointwise_conv()
        self.cla = pointwise_conv()

        self.bn_att = nn.BatchNorm1d(out_features)
        self.init_weights()

    def init_weights(self):
        """Initialise both convolutions and the batch-norm layer."""
        for conv in (self.att, self.cla):
            init_layer(conv)
        init_bn(self.bn_att)

    def forward(self, x):
        """Pool ``x`` of shape (n_samples, n_in, n_time) over the time axis.

        Returns the attention-weighted sum, the attention weights (softmax over
        time of the clamped attention logits) and the per-step classifier output.
        """
        att_logits = torch.clamp(self.att(x), -10, 10)
        norm_att = torch.softmax(att_logits, dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        pooled = (norm_att * cla).sum(dim=2)
        return pooled, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation; returns ``None`` for unknown names."""
        if self.activation == 'linear':
            return x
        if self.activation == 'sigmoid':
            return torch.sigmoid(x)
        return None

In [10]:
class DFTBase(nn.Module):
    """Base class that builds DFT and inverse-DFT matrices."""

    def __init__(self):
        super(DFTBase, self).__init__()

    def _root_power_matrix(self, n, sign):
        """Return the (n, n) matrix whose (i, j) entry is ``exp(sign*2*pi*1j/n) ** (i*j)``."""
        indices = np.arange(n)
        root = np.exp(sign * 2 * np.pi * 1j / n)
        return np.power(root, np.outer(indices, indices))

    def dft_matrix(self, n):
        """Forward DFT matrix of size ``n`` (negative exponent)."""
        return self._root_power_matrix(n, -1)

    def idft_matrix(self, n):
        """Unnormalised inverse DFT matrix of size ``n`` (positive exponent)."""
        return self._root_power_matrix(n, 1)
    
    
class STFT(DFTBase):
    def __init__(self, n_fft=2048, hop_length=None, win_length=None,
        window='hann', center=True, pad_mode='reflect', freeze_parameters=True):
        """STFT implemented with two ``Conv1d`` layers (real and imaginary part).

        Args:
          n_fft: FFT size; also the convolution kernel size.
          hop_length: Stride between frames; ``win_length // 4`` when ``None``.
          win_length: Window length; ``n_fft`` when ``None``.
          window: Window specification handed to ``librosa.filters.get_window``.
          center: If true, pad ``n_fft // 2`` samples on both sides before framing.
          pad_mode: ``'constant'`` or ``'reflect'``; padding mode used when centering.
          freeze_parameters: If true, the convolution weights are not trainable.
        """
        super(STFT, self).__init__()

        assert pad_mode in ['constant', 'reflect']

        self.n_fft = n_fft
        self.center = center
        self.pad_mode = pad_mode

        # By default, use the entire frame
        if win_length is None:
            win_length = n_fft

        # Set the default hop, if it's not already specified
        if hop_length is None:
            hop_length = int(win_length // 4)

        fft_window = librosa.filters.get_window(window, win_length, fftbins=True)

        # Pad the window out to n_fft size. `size` is passed by keyword because
        # librosa >= 0.10 no longer accepts it positionally; older releases
        # accept the keyword as well.
        fft_window = librosa.util.pad_center(fft_window, size=n_fft)

        # DFT matrix
        self.W = self.dft_matrix(n_fft)

        # Only the non-redundant half of the spectrum is kept.
        out_channels = n_fft // 2 + 1

        self.conv_real = nn.Conv1d(in_channels=1, out_channels=out_channels,
            kernel_size=n_fft, stride=hop_length, padding=0, dilation=1,
            groups=1, bias=False)

        self.conv_imag = nn.Conv1d(in_channels=1, out_channels=out_channels,
            kernel_size=n_fft, stride=hop_length, padding=0, dilation=1,
            groups=1, bias=False)

        # Windowed DFT basis; its real and imaginary parts become the kernels.
        windowed_basis = self.W[:, 0 : out_channels] * fft_window[:, None]

        self.conv_real.weight.data = torch.Tensor(
            np.real(windowed_basis).T)[:, None, :]
        # (n_fft // 2 + 1, 1, n_fft)

        self.conv_imag.weight.data = torch.Tensor(
            np.imag(windowed_basis).T)[:, None, :]
        # (n_fft // 2 + 1, 1, n_fft)

        if freeze_parameters:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, input):
        """input: (batch_size, data_length)
        Returns:
          real: (batch_size, 1, time_steps, n_fft // 2 + 1)
          imag: (batch_size, 1, time_steps, n_fft // 2 + 1)
        """

        x = input[:, None, :]   # (batch_size, channels_num, data_length)

        if self.center:
            x = F.pad(x, pad=(self.n_fft // 2, self.n_fft // 2), mode=self.pad_mode)

        real = self.conv_real(x)
        imag = self.conv_imag(x)
        # (batch_size, n_fft // 2 + 1, time_steps)

        real = real[:, None, :, :].transpose(2, 3)
        imag = imag[:, None, :, :].transpose(2, 3)
        # (batch_size, 1, time_steps, n_fft // 2 + 1)

        return real, imag
    
    
class Spectrogram(nn.Module):
    def __init__(self, n_fft=2048, hop_length=None, win_length=None,
        window='hann', center=True, pad_mode='reflect', power=2.0,
        freeze_parameters=True):
        """Spectrogram computed in PyTorch on top of the Conv1d-based STFT.

        Args:
          n_fft, hop_length, win_length, window, center, pad_mode: forwarded
            to ``STFT``.
          power: Exponent of the magnitude; 2.0 gives a power spectrogram.
          freeze_parameters: Forwarded to ``STFT``; if true the STFT kernels
            are not trainable.
        """
        super(Spectrogram, self).__init__()

        self.power = power

        # `freeze_parameters` is passed through instead of being forced to True.
        self.stft = STFT(n_fft=n_fft, hop_length=hop_length,
            win_length=win_length, window=window, center=center,
            pad_mode=pad_mode, freeze_parameters=freeze_parameters)

    def forward(self, input):
        """input: whatever ``self.stft`` accepts (``STFT`` takes (batch_size, data_length))
        Returns:
          spectrogram: same shape as the STFT's real/imaginary outputs, i.e.
            (batch_size, 1, time_steps, n_fft // 2 + 1) for ``STFT``
        """

        (real, imag) = self.stft.forward(input)

        # Squared magnitude.
        spectrogram = real ** 2 + imag ** 2

        if self.power != 2.0:
            # |X|^power == (|X|^2)^(power / 2). Uses `self.power`: the bare
            # name `power` is not defined inside this method.
            spectrogram = spectrogram ** (self.power / 2.0)

        return spectrogram

    
class LogmelFilterBank(nn.Module):
    def __init__(self, sr=32000, n_fft=2048, n_mels=64, fmin=50, fmax=14000, is_log=True,
        ref=1.0, amin=1e-10, top_db=80.0, freeze_parameters=True):
        """(Log-)mel filter bank applied to a spectrogram in PyTorch.

        Args:
          sr, n_fft, n_mels, fmin, fmax: forwarded to ``librosa.filters.mel``.
          is_log: If true, the output is converted to decibels.
          ref: Reference power for the decibel conversion.
          amin: Lower clamp applied before taking the logarithm.
          top_db: If not ``None``, values more than ``top_db`` below the maximum
            are clamped; must be non-negative.
          freeze_parameters: If true, the mel matrix is not trainable.
        """
        super(LogmelFilterBank, self).__init__()

        self.is_log = is_log
        self.ref = ref
        self.amin = amin
        self.top_db = top_db

        mel_matrix = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels,
            fmin=fmin, fmax=fmax).T
        # (n_fft // 2 + 1, mel_bins)

        self.melW = nn.Parameter(torch.Tensor(mel_matrix))

        if freeze_parameters:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, input):
        """input: (..., time_steps, n_fft // 2 + 1)

        Output: (..., time_steps, mel_bins)
        """

        # Mel spectrogram
        mel_spectrogram = torch.matmul(input, self.melW)

        # Logmel spectrogram
        if self.is_log:
            output = self.power_to_db(mel_spectrogram)
        else:
            output = mel_spectrogram

        return output


    def power_to_db(self, input):
        """Convert power values to decibels (PyTorch port of librosa's power_to_db).

        Raises:
          ValueError: if ``top_db`` is negative.
        """
        # A built-in exception is raised: `ParameterError` is not defined in
        # this scope, so referencing it would fail with a NameError.
        if self.top_db is not None and self.top_db < 0:
            raise ValueError('top_db must be non-negative')

        ref_value = self.ref
        log_spec = 10.0 * torch.log10(torch.clamp(input, min=self.amin, max=np.inf))
        log_spec -= 10.0 * np.log10(np.maximum(self.amin, ref_value))

        if self.top_db is not None:
            # Limit the dynamic range to `top_db` below the global maximum.
            log_spec = torch.clamp(log_spec, min=log_spec.max().item() - self.top_db, max=np.inf)

        return log_spec

In [11]:
class DropStripes(nn.Module):
    def __init__(self, dim, drop_width, stripes_num):
        """Zero out random stripes along one axis (training mode only).

        Args:
          dim: int, axis along which stripes are dropped (2: time, 3: frequency)
          drop_width: int, exclusive upper bound on the width of a stripe
          stripes_num: int, number of stripes dropped per sample
        """
        super(DropStripes, self).__init__()

        assert dim in [2, 3]    # dim 2: time; dim 3: frequency

        self.dim = dim
        self.drop_width = drop_width
        self.stripes_num = stripes_num

    def forward(self, input):
        """input: (batch_size, channels, time_steps, freq_bins)

        The input is modified in place and returned; in eval mode it is
        returned untouched.
        """
        assert input.ndimension() == 4

        if not self.training:
            return input

        total_width = input.shape[self.dim]
        for sample_idx in range(input.shape[0]):
            # input[sample_idx] is a view, so the stripes land in `input`.
            self.transform_slice(input[sample_idx], total_width)
        return input

    def transform_slice(self, e, total_width):
        """e: (channels, time_steps, freq_bins), modified in place."""
        for _ in range(self.stripes_num):
            # Draw the stripe width first, then a start that keeps it in range.
            width = int(torch.randint(low=0, high=self.drop_width, size=(1,))[0])
            start = int(torch.randint(low=0, high=total_width - width, size=(1,))[0])
            stop = start + width

            if self.dim == 2:
                e[:, start:stop, :] = 0
            elif self.dim == 3:
                e[:, :, start:stop] = 0


class SpecAugmentation(nn.Module):
    def __init__(self, time_drop_width, time_stripes_num, freq_drop_width,
        freq_stripes_num):
        """SpecAugment-style masking: time stripes first, then frequency stripes.

        [ref] Park, D.S., Chan, W., Zhang, Y., Chiu, C.C., Zoph, B., Cubuk, E.D.
        and Le, Q.V., 2019. Specaugment: A simple data augmentation method
        for automatic speech recognition. arXiv preprint arXiv:1904.08779.
        Args:
          time_drop_width: int
          time_stripes_num: int
          freq_drop_width: int
          freq_stripes_num: int
        """
        super(SpecAugmentation, self).__init__()

        # Axis 2 is time, axis 3 is frequency.
        self.time_dropper = DropStripes(
            dim=2, drop_width=time_drop_width, stripes_num=time_stripes_num)
        self.freq_dropper = DropStripes(
            dim=3, drop_width=freq_drop_width, stripes_num=freq_stripes_num)

    def forward(self, input):
        """Apply the time dropper, then the frequency dropper, to ``input``."""
        return self.freq_dropper(self.time_dropper(input))

In [12]:
class PANNsCNN14Att(nn.Module):
    """CNN14-style stack of six ``ConvBlock``s used as a feature extractor.

    The constructor also builds a spectrogram/log-mel front end, a spec
    augmenter, ``bn0``, ``fc1`` and an attention block, but ``forward`` only
    runs the convolutional blocks.
    """

    def __init__(self):
        super().__init__()

        # Front-end and architecture hyper-parameters.
        sample_rate = 32000
        window_size = 1024
        hop_size = 320
        mel_bins = 64
        fmin = 50
        fmax = 14000
        classes_num = 264
        self.interpolate_ratio = 32  # Downsampled ratio

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(mel_bins)

        # Channel widths double from block to block: 1 -> 64 -> ... -> 2048.
        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.att_block = AttBlock(2048, classes_num, activation='sigmoid')

        self.init_weight()

    def init_weight(self):
        """Initialise ``bn0`` and ``fc1``."""
        init_bn(self.bn0)
        init_layer(self.fc1)

    def cnn_feature_extractor(self, x):
        """Run the six conv blocks, each followed by dropout (p=0.2)."""
        # Blocks 1-5 pool with a (3, 3) window; the last block uses (1, 1).
        stages = (
            (self.conv_block1, (3, 3)),
            (self.conv_block2, (3, 3)),
            (self.conv_block3, (3, 3)),
            (self.conv_block4, (3, 3)),
            (self.conv_block5, (3, 3)),
            (self.conv_block6, (1, 1)),
        )
        for block, pool_size in stages:
            x = block(x, pool_size=pool_size, pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)
        return x

    def forward(self, x, mixup_lambda=None):
        """Extract convolutional features from a 4-D input.

        The last two axes of ``x`` are swapped before the conv blocks and
        swapped back afterwards. ``mixup_lambda`` is accepted but unused.
        """
        # Per the original comment the blocks see (B, C, T, F).
        x = x.transpose(2, 3)
        x = self.cnn_feature_extractor(x)
        return x.transpose(2, 3)

In [13]:
def load_model(name):
    """Build an (untrained) backbone chosen by substrings of ``name``.

    Checked in this order: ``"resnest50-1s"``, ``"resnest"``, ``"CNN14"``,
    ``"densenet121"``, ``"densenet169"``, ``"densenet201"``; anything else
    falls back to ResNeXt50-32x4d.

    Densenet models are returned as built. Every other model gets a
    ``conv_block1`` attribute holding a fresh 3-channel ``ConvBlock`` and,
    unless ``name`` contains ``"CNN14"``, a new 397-way ``fc`` layer.

    Args:
        name: Model name.

    Returns:
        The constructed ``nn.Module``.
    """
    # (log tag, constructor) for the densenet variants, in lookup order.
    densenet_variants = {
        'densenet121': ('dense121', torchvision.models.densenet121),
        'densenet169': ('dense169', torchvision.models.densenet169),
        'densenet201': ('dense201', torchvision.models.densenet201),
    }

    if "resnest50-1s" in name:
        print('resnest1s')
        model = resnest50_fast_1s1x64d()
    elif 'resnest' in name:
        print("resnest")
        model = resnest50()
    elif 'CNN14' in name:
        print('cnn14')
        model = PANNsCNN14Att()
    else:
        for key, (tag, builder) in densenet_variants.items():
            if key in name:
                print(tag)
                # Densenets keep their stock classifier and get no conv_block1.
                return builder()
        print('resnext')
        model = torchvision.models.resnext50_32x4d(pretrained=False)

    if 'CNN14' not in name:
        # Replace the classification layer with a 397-class one.
        nb_ft = model.fc.in_features
        del model.fc
        num_cls = 397
        model.fc = nn.Linear(nb_ft, num_cls)

    # NOTE(review): this is applied to the ResNe(S/X)t models as well as CNN14,
    # exactly as before; presumably only CNN14 needs it — confirm before changing.
    model.conv_block1 = ConvBlock(in_channels = 3, out_channels = 64)
    return model
class FeatureExtractor(pl.LightningModule):
    """Wraps a backbone from ``load_model`` and exposes its feature map."""

    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name
        self.model = load_model(self.model_name)

        keeps_whole_model = self.model_name == 'CNN14' or 'densenet' in self.model_name
        if not keeps_whole_model:
            # ResNet-style backbone: keep only the stem and the four stages as
            # direct children, then drop the wrapper (and with it the fc layer).
            stem_and_stages = (
                ("conv1", "conv1"),
                ("bn1", "bn1"),
                ("act1", "relu"),
                ("maxpool", "maxpool"),
                ("layer1", "layer1"),
                ("layer2", "layer2"),
                ("layer3", "layer3"),
                ("layer4", "layer4"),
            )
            for own_name, backbone_name in stem_and_stages:
                setattr(self, own_name, getattr(self.model, backbone_name))
            del self.model

    def forward(self, x):
        """Return the backbone's feature map for ``x``."""
        if 'densenet' in self.model_name:
            return self.model.features(x)
        if self.model_name == 'CNN14':
            return self.model(x)

        # NOTE: the stem applies the activation *before* batch norm
        # (conv -> act -> bn -> maxpool); this order is kept as is.
        x = self.conv1(x)
        x = self.act1(x)
        x = self.bn1(x)
        x = self.maxpool(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        return x

In [14]:
class ModelConfig:
    # Configuration for the model ensemble. The *_path, *_names and
    # *_feature_extractor_dim lists of each group are parallel: TestingModel
    # builds model i from names[i] and dim[i] and loads its weights from path[i],
    # so entries must be commented in/out together.

    # Checkpoints of the SED models.
    sed_model_path = [
        #'../input/sedresnest/models/model_2.pth',
        #'../input/cnn14-sed/models/model_1.pth',
        '../input/sed1s-bird-clef/models/model_2.pth',
        #'../input/densenetsed/models/model_1.pth'
    ]
    # Backbone names (as understood by load_model) for the SED models.
    sed_model_names = [
        #'resnest',
        #'CNN14',
        'resnest50-1s',
        #'densenet121'
    ]
    
    # Checkpoints of the classification models.
    cls_model_path = [
        '../input/birdcleftrained/models/model.pth',
        '../input/birdclefbaselinefold1/models/model_1.pth',
        '../input/birdclefbaselinefold3/models/model_3.pth',
        '../input/resnextbirdclef/models/model_2.pth',
        '../input/densenet121birdclef/models/model_1.pth',
        '../input/densenet169birdclef/models/model_1.pth',
        '../input/densenet201birdclef/models/model_4.pth'
    ]
    # Backbone names for the classification models.
    cls_model_names = [
        'resnest',
        'resnest',
        'resnest',
        'resnext',
        'densenet121',
        'densenet169',
        'densenet201'
    ]
    # Number of output classes; read by AttentionHead and SEDAttention.
    num_classes = 397
    # Feature dimension handed to the head of each SED model.
    sed_feature_extractor_dim = [
    #    2048,
    #    2048,
        2048,
    #    1024
    ]
    # Feature dimension handed to the head of each classification model.
    cls_feature_extractor_dim = [
        2048,
        2048,
        2048,
        2048,
        1024,
        1664,
        1920
    ]
    # NOTE(review): transformer_dim, num_heads, model_name, num_layers and head
    # are not read by any visible code — confirm whether they are still needed.
    transformer_dim = 768
    num_heads = 8
    
    model_name = 'resnest'
    # Dropout probability used by SEDHead.
    dropout = 0.5
    num_layers = 2
    head = 'Conv2DAtt'

In [15]:
class AttentionHead(pl.LightningModule):
    """2-D attention classification head (Heng's 2D attention head).

    Builds a spatial attention map over the feature map, re-weights the
    features with it (plus a residual connection), average-pools globally and
    classifies. The linear layer has one more output than
    ``ModelConfig.num_classes``; that last output is dropped in ``forward``.

    Args:
        feature_dim: Number of channels of the incoming feature map.
    """

    def __init__(self, feature_dim):
        super().__init__()
        self.in_features = feature_dim
        self.num_classes = ModelConfig.num_classes + 1
        self.attention = nn.Sequential(*[
            nn.Conv2d(self.in_features, self.in_features * 2, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(self.in_features * 2),
            nn.Tanh(),
        ]) #use 8x1 to make location-aware convolution
        self.attention1 = nn.Conv2d(self.in_features // 2, 1, kernel_size=1)
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.Linear =  nn.Linear(self.in_features, self.num_classes)

    def forward(self, x):
        """Classify a feature map ``x`` of shape (B, C, Freq, T).

        NOTE(review): the reshape below turns (B, 2C, T) into
        (B, C // 2, Freq, -1), which only lines up with ``x`` again when
        Freq == 4 (the original comment mentions a 4x9 feature map) — confirm.

        Returns:
            Tensor of shape (B, ModelConfig.num_classes).
        """
        B, C, Freq, T = x.shape
        a = self.attention(x).mean(dim = 2) # (B, 2C, T)
        a = a.reshape(B, self.in_features // 2, Freq, -1)
        a = self.attention1(a) # (B, 1, Freq, T)
        # Softmax over all spatial positions of each sample.
        a = F.softmax(a.reshape(B ,-1),-1).reshape(B, 1, Freq, -1)

        x = (a * x + x) # attention-weighted features plus residual
        # flatten(1) keeps the batch axis even for a batch of one, where
        # torch.squeeze would drop it and break the 2-D slice below.
        x = self.global_avg_pool(x).flatten(1) # (B, C)
        return self.Linear(x)[:, :-1] # Cut off the last one.

In [16]:
class SEDAttention(pl.LightningModule):
    """SED attention layer producing segment-wise and clip-wise predictions."""

    def __init__(self, feature_dim):
        super().__init__()
        self.in_features =feature_dim
        self.out_features = ModelConfig.num_classes

        # Per-frame class scores and per-frame attention scores (1x1 convs).
        self.framewise = nn.Conv1d(self.in_features, self.out_features, 1)
        self.attention = nn.Conv1d(self.in_features, self.out_features, 1)

        self.ratio = 32        # upsampling factor applied to the frame axis
        self.duration = 600
        self.seg_length = 5
        # Number of segments the upsampled frame axis is split into.
        self.num_clips = self.duration // self.seg_length

    def interpolate(self, x):
        """Upsample along the time axis by repeating each step ``self.ratio`` times.

        Args:
          x: (batch_size, time_steps, classes_num)
        Returns:
          upsampled: (batch_size, time_steps * self.ratio, classes_num)
        """
        # The unpacking doubles as a check that `x` is 3-dimensional.
        batch_size, time_steps, classes_num = x.shape
        return torch.repeat_interleave(x, self.ratio, dim=1)

    def forward(self, x):
        """Map features (B, in_features, L) to ``(segwise, clipwise)``.

        Returns:
          segwise: (num_clips, B, C) mean of the upsampled frame scores per segment
          clipwise: (B, C) attention-weighted sum of the frame scores
        """
        # Attention weights are normalised over the time axis.
        att_weights = F.softmax(self.attention(x), dim = -1).transpose(1, 2) # (B, L, C)
        frame_scores = self.framewise(x).transpose(1, 2) # (B, L, C)

        # Clip-wise: attention-weighted sum over time.
        clipwise = (att_weights * frame_scores).sum(dim = 1) # (B, C)

        # Segment-wise: upsample, split the frame axis into num_clips equal
        # chunks and average each chunk.
        upsampled = self.interpolate(frame_scores) # (B, ratio * L, C)
        batch, frames, classes = upsampled.shape
        segwise = upsampled.view(batch, self.num_clips, frames // self.num_clips, classes)
        segwise = segwise.transpose(0, 1).mean(dim = 2) # (num_clips, B, C)
        return segwise, clipwise
        
        
    
class SEDHead(pl.LightningModule):
    """SED head: frequency pooling, temporal smoothing, FC layer, ``SEDAttention``."""

    def __init__(self, feature_dim):
        super().__init__()
        self.feature_extractor = feature_dim  # number of input feature channels
        self.n_mels = 128
        self.dim_reduce = 32
        self.drop_prob = ModelConfig.dropout

        self.drop1 = nn.Dropout(self.drop_prob)
        self.drop2 = nn.Dropout(self.drop_prob)

        self.fc = nn.Linear(self.feature_extractor, self.feature_extractor)
        self.relu = nn.ReLU(inplace = True)

        self.attention = SEDAttention(self.feature_extractor)

    def forward(self, x):
        """x: (B, channels, F, L) feature map; returns what ``SEDAttention`` returns."""
        # Collapse the frequency axis.
        pooled = torch.mean(x, dim=2)  # (B, channels, L)

        # Smooth over time with the sum of a max- and an average-pool (window 3).
        smoothed = (
            F.max_pool1d(pooled, kernel_size=3, stride=1, padding=1)
            + F.avg_pool1d(pooled, kernel_size=3, stride=1, padding=1)
        )  # (B, channels, L)

        # Dropout, then the FC layer applied per time step.
        hidden = self.drop1(smoothed).transpose(1, 2)  # (B, L, channels)
        hidden = self.relu(self.fc(hidden)).transpose(1, 2)  # (B, channels, L)
        hidden = self.drop2(hidden)

        return self.attention(hidden)

In [17]:
def scale_logits(o):
    """Affinely rescale ``o`` so that it has mean 0.5 and standard deviation 0.1."""
    target_mean, target_std = 0.5, 0.1
    current_mean, current_std = o.mean(), o.std()
    return (o - current_mean) * (target_std / current_std) + target_mean
class FullModel(pl.LightningModule):
    """A ``FeatureExtractor`` backbone followed by an SED or attention head."""

    def __init__(self, model_name, feature_dim, sed = True):
        super().__init__()
        self.feature_extractor = FeatureExtractor(model_name)
        # `sed` selects the head: SEDHead when true, AttentionHead otherwise.
        if sed:
            self.head = SEDHead(feature_dim)
        else:
            self.head = AttentionHead(feature_dim)

    def forward(self, x):
        """Run the backbone, then the head, on ``x``."""
        return self.head(self.feature_extractor(x))
class TestingModel(pl.LightningModule):
    """Inference-time ensemble of SED models and classifier models.

    Both model groups are described by ``ModelConfig`` (names, feature
    dimensions and checkpoint paths). Every member is a ``FullModel`` whose
    weights are loaded from its checkpoint with ``map_location = DEVICE``.
    """
    def __init__(self):
        super().__init__()
        self.sed_model_path = ModelConfig.sed_model_path
        self.sed_model_names = ModelConfig.sed_model_names
        self.cls_model_path = ModelConfig.cls_model_path
        self.cls_model_names = ModelConfig.cls_model_names
        self.cls_feature = ModelConfig.cls_feature_extractor_dim
        self.sed_feature = ModelConfig.sed_feature_extractor_dim

        self.sed_model = self._build_ensemble(
            self.sed_model_names, self.sed_feature, self.sed_model_path, sed = True
        )
        self.cls_model = self._build_ensemble(
            self.cls_model_names, self.cls_feature, self.cls_model_path, sed = False
        )

    @staticmethod
    def _build_ensemble(names, feature_dims, weight_paths, sed):
        """Create one FullModel per checkpoint path, then load each checkpoint."""
        members = nn.ModuleList(
            [FullModel(names[i], feature_dims[i], sed = sed) for i in range(len(weight_paths))]
        )
        for member, path in zip(members, weight_paths):
            member.load_state_dict(torch.load(path, map_location = DEVICE))
        return members

    def forward(self, x, x2):
        """Average the sigmoid outputs of every ensemble member.

        Args:
            x: input given to each SED model.
            x2: input given to each classifier model.

        Returns:
            The sum of all members' sigmoid probabilities divided by the number
            of members. SED outputs have their size-1 dimensions squeezed away
            before being added.

        Note:
            Calling this switches the module to eval mode as a side effect.
        """
        self.eval()

        with torch.no_grad():
            running_sum = None
            n_members = 0

            # SED members return a pair; only the first element is used here.
            for sed_net in self.sed_model:
                first_out, _ = sed_net(x)
                probs = torch.squeeze(torch.sigmoid(first_out))
                running_sum = probs if running_sum is None else running_sum + probs
                n_members += 1

            # Classifier members return a single tensor.
            for cls_net in self.cls_model:
                probs = torch.sigmoid(cls_net(x2))
                running_sum = probs if running_sum is None else running_sum + probs
                n_members += 1

            return running_sum / n_members
                    
def get_model():
    """Instantiate the ``TestingModel`` ensemble and return it in eval mode."""
    ensemble = TestingModel()
    # nn.Module.eval() returns the module itself.
    return ensemble.eval()

# Inference
- Sliding-window PPA
- Overlapping-window PPA?
- Threshold -> additive averaging.
- Post-process the CSV instead of post-processing during prediction.

In [18]:
@torch.no_grad()
def get_thresh_preds(out, thresh=None, clipwise_thresh = None):
    """Pick, for every row of ``out``, the highest-scoring class indices above a threshold.

    Args:
        out: 2-D tensor of scores; classes are ranked along dim 1, and one
            prediction list is produced per entry along dim 0.
        thresh: minimum score for a class to count. ``None`` falls back to the
            module-level ``THRESH``; an explicit ``0.0`` is honoured.
        clipwise_thresh: accepted for backward compatibility; currently unused.

    Returns:
        A list with one entry per row: the class indices sorted by descending
        score, truncated to ``min(number of scores >= thresh, TOP_N)``. A row
        with no score above the threshold yields an empty list.
    """
    scores = out
    # `thresh or THRESH` would silently replace a legitimate threshold of 0.0.
    if thresh is None:
        thresh = THRESH

    # Negating the scores makes the ascending argsort rank the best class first.
    ranked = (-scores).argsort(1)
    n_above = (scores >= thresh).sum(1)

    preds = []
    for order, n_hit in zip(ranked, n_above):
        keep = min(int(n_hit), TOP_N)
        preds.append(order[:keep].cpu().numpy().tolist())

    return preds
def get_bird_names(preds):
    """Turn lists of class indices into space-separated label strings.

    Args:
        preds: iterable of index lists, one per segment.

    Returns:
        One string per entry: the ``INV_LABEL_IDS`` labels joined by single
        spaces, or ``"nocall"`` when the entry is empty.
    """
    return [
        " ".join([INV_LABEL_IDS[bird_id] for bird_id in pred]) if pred else "nocall"
        for pred in preds
    ]

In [19]:
def predict(nets, test_data, names=True):
    """Run the first network in ``nets`` over every item of ``test_data``.

    Args:
        nets: sequence of callables; only ``nets[0]`` is used.
        test_data: indexable dataset whose items unpack into two tensors.
        names: when true, each output is thresholded and converted to label
            strings; when false, the raw network output is kept.

    Returns:
        A list with one entry per dataset item.
    """
    outputs = []
    with torch.no_grad():
        for item_idx in tqdm(range(len(test_data))):
            first_in, second_in = test_data[item_idx]
            first_in = first_in.to(DEVICE).to(torch.float32)
            second_in = second_in.to(DEVICE).to(torch.float32)

            # A 3-D first input gets a leading axis of size 1 before the model call.
            if first_in.dim() == 3:
                first_in = first_in.unsqueeze(0)

            result = nets[0](first_in, second_in)
            if names:
                result = get_bird_names(get_thresh_preds(result))

            outputs.append(result)
    return outputs

In [20]:
# Build the ensemble (get_model() returns it in eval mode) and move it to DEVICE.
model = get_model().to(DEVICE)

resnest1s
resnest
resnest
resnest
resnext
dense121
dense169
dense201


In [21]:
# Wrap `data` in the dataset class; predict() indexes it and unpacks each item into two tensors.
dataset = BirdCLEFDataset(data = data)

In [22]:
# names=False keeps the raw ensemble outputs so thresholding can be applied as a separate step.
pred_probas = predict([model], dataset, names=False)

  0%|          | 0/20 [00:00<?, ?it/s]



In [23]:
# Threshold each item's raw outputs with THRESH and convert the surviving class indices to label strings.
pred_string = [get_bird_names(get_thresh_preds(pred, thresh=THRESH)) for pred in pred_probas]

In [24]:
def preds_as_df(data, preds):
    """Assemble per-segment predictions into a submission-style DataFrame.

    Args:
        data: DataFrame with ``id`` and ``site`` columns, one row per recording.
        preds: one list of label strings per recording, in the same order as
            ``data``; the i-th string (1-based) is assigned to second ``5*i``.

    Returns:
        DataFrame with ``row_id`` (``"{id}_{site}_{seconds}"``) and ``birds``.
        When ``SAMPLE_SUB_PATH`` is set, the result is left-joined onto that
        file's ``row_id`` column and missing ``birds`` become ``"nocall"``.
    """
    row_ids = []
    birds = []
    for row, clip_preds in zip(data.itertuples(False), preds):
        for seg_no, seg_birds in enumerate(clip_preds, start=1):
            row_ids.append(f"{row.id}_{row.site}_{5*seg_no}")
            birds.append(seg_birds)

    sub = pd.DataFrame({"row_id": row_ids, "birds": birds})

    if not SAMPLE_SUB_PATH:
        return sub

    # Align to the sample submission: its row order wins, unmatched rows get "nocall".
    sample_sub = pd.read_csv(SAMPLE_SUB_PATH, usecols=["row_id"])
    sub = sample_sub.merge(sub, on="row_id", how="left")
    sub["birds"] = sub["birds"].fillna("nocall")
    return sub

In [25]:
# Build the submission table from the label strings and write it without the index column.
sub = preds_as_df(data, pred_string)
sub.to_csv("submission.csv", index=False)