In [None]:
!pip install -q pysndfx SoundFile audiomentations pretrainedmodels efficientnet_pytorch resnest

# No Random Sampling, only first 5 seconds.

In [None]:
%%capture
!pip install colorednoise
import colorednoise as cn
import numpy as np
import librosa as lb
import librosa
import torchvision
import warnings
warnings.filterwarnings('ignore')

import librosa.display as lbd
import soundfile as sf
from  soundfile import SoundFile
import pandas as pd
from  IPython.display import Audio
from pathlib import Path

import glob
import torch
from torch import nn, optim
import torch.nn.functional as F
from  torch.utils.data import Dataset, DataLoader
import torchvision

from resnest.torch import resnest50

from matplotlib import pyplot as plt

import os, random, gc
import re, time, json
from  ast import literal_eval


from IPython.display import Audio
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.model_selection import StratifiedKFold

from tqdm.notebook import tqdm
import joblib
import pytorch_lightning as pl

from efficientnet_pytorch import EfficientNet
import pretrainedmodels
from resnest.torch import resnest50_fast_1s1x64d
from resnest.torch import resnest50

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: integer applied to Python's ``random``, NumPy and PyTorch
            (CPU and CUDA); it is also written to ``PYTHONHASHSEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    # Request deterministic cuDNN kernels.
    torch.backends.cudnn.deterministic = True


seed_everything()
from fastai.vision.all import *

# Config Vars

In [None]:
# Number of output classes: 397 labels plus one extra slot; get_df() and
# preprocess_background() use the last index (NUM_CLASSES - 1) for 'nocall'.
NUM_CLASSES = 397 + 1
# NOTE(review): presumably the audio sample rate in Hz — confirm.
SR = 32_000
# NOTE(review): presumably the clip length in seconds used for the precomputed mels — confirm.
DURATION = 7
# Upper bound on how many leading slices load_data() keeps from each multi-slice .npy file.
MAX_READ_SAMPLES = 1 
# In-memory cache of loaded arrays; filled later by load_data() when still None.
audio_image_store = None
DATA_ROOT = Path("../input/birdclef-2021")


# One metadata CSV / label-id JSON per "part" dataset directory (the `?` matches a single character).
MEL_PATHS = sorted(Path("../input").glob("kkiller-birdclef-mels-computer-d7-part?/rich_train_metadata.csv"))
TRAIN_LABEL_PATHS = sorted(Path("../input").glob("kkiller-birdclef-mels-computer-d7-part?/LABEL_IDS.json"))
SOUNDSCAPES_PATH = '../input/birdclef-2021/train_soundscapes/'
BACKGROUND_PATH = '../input/birdclef-background/audio_images/nocall/'
BACKGROUND_CSV_PATH = '../input/birdclef-background/rich_train_metadata.csv'
# The CSV's unnamed first column becomes the index.
BACKGROUND_CSV = pd.read_csv(BACKGROUND_CSV_PATH).set_index("Unnamed: 0")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Checkpoint locations used by tmp_model / load_prev_model.
ResNestPath = '../input/birds-cp-1/resnest50_fast_1s1x64d_conf_0.pt'
ResNextPath = '../input/birds-cp-2/resnext50_32x4d__0.pt'
CNN14Path = '../input/birdcall-pannsatt-aux-weak/best.pth'
external_soundscapes = '../input/trainsoundscapes/audio_images/'
external_soundscapes_csv = pd.read_csv('../input/trainsoundscapes/rich_train_metadata.csv')
# Rename the 'second_labels' column to 'secondary_labels' (copy, then delete the old one).
external_soundscapes_csv['secondary_labels'] = external_soundscapes_csv['second_labels']
del external_soundscapes_csv['second_labels']
# Drop the single row whose filename is 'SSW49_595'. np.where yields a POSITION; it is
# usable as a drop label only because the frame still has read_csv's default index here
# (set_index runs on the next line). `.item()` raises unless exactly one row matches.
external_soundscapes_csv = external_soundscapes_csv.drop([np.where(external_soundscapes_csv['filename'] == 'SSW49_595')[0].item()])
external_soundscapes_csv = external_soundscapes_csv.set_index("Unnamed: 0")

class DataConfig:
    # Stores config variables pertaining to data.
    soundscapes_val = False # True: use all of the (training) soundscapes as the validation set.
    # If False, the soundscapes themselves are split into folds (see stratified_KFold)
    # and the training part is added to the monophone training data.

# Report which device was selected above.
print("Device:", DEVICE)

# Clean Up DF Functions

In [None]:
def clean_df(df):
    """Drop every row whose ``impath`` does not resolve to exactly one file.

    Rows are selected by *position*. The previous implementation collected
    positional indices and passed them to ``DataFrame.drop``, which interprets
    them as index *labels*; on any frame whose index is not exactly
    ``0..n-1`` (for example after an earlier filter) that removed the wrong
    rows or raised ``KeyError``.

    Args:
        df: DataFrame with an ``impath`` column (path or glob pattern).

    Returns:
        A new DataFrame containing only the rows for which
        ``glob.glob(str(impath))`` returns exactly one match; the original
        index labels and row order are preserved.
    """
    keep_positions = [
        position
        for position, path in enumerate(tqdm(df["impath"]))
        if len(glob.glob(str(path))) == 1
    ]
    return df.iloc[keep_positions]
        

In [None]:
def get_df(mel_paths=MEL_PATHS, train_label_paths=TRAIN_LABEL_PATHS):
    """Load the training metadata and the label -> id mapping.

    Args:
        mel_paths: iterable of ``pathlib.Path`` objects pointing at metadata
            CSV files. Each file's parent directory is expected to hold an
            ``audio_images/<primary_label>/<filename>.npy`` tree.
        train_label_paths: iterable of paths to JSON files holding
            label -> id dictionaries; later files override earlier ones.

    Returns:
        ``(LABEL_IDS, df)`` where ``LABEL_IDS`` is the merged mapping with an
        extra ``'nocall'`` entry set to ``NUM_CLASSES - 1`` and ``df`` is the
        row-wise concatenation of all metadata files, with an ``impath``
        column and ``secondary_labels`` parsed from its string form.

    Raises:
        FileNotFoundError: if ``mel_paths`` is empty (previously this
            surfaced as an opaque ``TypeError`` on ``None``).
    """
    frames = []
    for file_path in mel_paths:
        part = pd.read_csv(str(file_path), index_col=0)
        audio_root = file_path.parent / "audio_images"
        part["impath"] = part.apply(
            lambda row: audio_root / "{}/{}.npy".format(row.primary_label, row.filename),
            axis=1)
        frames.append(part)

    if not frames:
        raise FileNotFoundError("get_df: no metadata CSV files were supplied in mel_paths")

    # A single concat replaces the repeated DataFrame.append calls
    # (DataFrame.append was removed in pandas 2.0 and copied the frame each time).
    df = pd.concat(frames)
    df["secondary_labels"] = df["secondary_labels"].apply(literal_eval)

    LABEL_IDS = {}
    for file_path in train_label_paths:
        with open(str(file_path)) as f:
            LABEL_IDS.update(json.load(f))
    LABEL_IDS['nocall'] = NUM_CLASSES - 1
    return LABEL_IDS, df


LABEL_IDS, df = get_df()

In [None]:
def preprocess_background(df):
    """Turn the background ('nocall') metadata into the training-frame format.

    Rows whose ``<BACKGROUND_PATH><filename>.npy`` file is not present on disk
    are removed. Rows are selected by position: the earlier version passed
    positional indices to ``DataFrame.drop`` (which expects index labels), so
    it only worked when the index happened to be ``0..n-1``.

    Args:
        df: DataFrame with a ``filename`` column of strings.

    Returns:
        A new DataFrame restricted to rows with an existing file, with
        ``impath`` (string path), ``label_id`` (always ``NUM_CLASSES - 1``)
        and ``secondary_labels`` (a fresh empty list per row) set.
    """
    nocall_id = NUM_CLASSES - 1
    # Set membership instead of scanning the glob result list once per row.
    available = set(glob.glob(f"{BACKGROUND_PATH}*"))

    candidate_paths = [BACKGROUND_PATH + name + '.npy' for name in df['filename']]
    keep_positions = [pos for pos, path in enumerate(candidate_paths) if path in available]

    df = df.iloc[keep_positions].copy()
    df['impath'] = [candidate_paths[pos] for pos in keep_positions]
    df['label_id'] = [nocall_id] * len(keep_positions)
    df['secondary_labels'] = [[] for _ in keep_positions]
    return df
def preprocess_external(df, base_path):
    """Add ``impath``, ``label_id`` and ``secondary_labels`` columns to an external dataset.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with ``primary_label`` and ``filename`` columns; every
            ``primary_label`` must be a key of ``LABEL_IDS``.
        base_path: string prefix of the directory holding
            ``<primary_label>/<filename>.npy`` files.

    Returns:
        The same DataFrame object with the three columns set.

    Raises:
        KeyError: if a ``primary_label`` is missing from ``LABEL_IDS``.
    """
    impath = []
    label_id = []
    for _, row in tqdm(df.iterrows()):
        impath.append(f'{base_path}{row.primary_label}/{row.filename}.npy')
        label_id.append(LABEL_IDS[row.primary_label])
    df['impath'] = impath
    df['label_id'] = label_id
    # One independent list per row; `[[]] * len(df)` made every row share the
    # same list object, so mutating one row's labels would change them all.
    df['secondary_labels'] = [[] for _ in range(len(df))]
    return df
def preprocess_soundscapes(df):
    """Annotate the soundscape metadata and split it by origin.

    Adds three columns to ``df`` in place:
      * ``impath``  - ``<external_soundscapes><filename>.npy``
      * ``label_id`` - space-separated ids (via ``LABEL_IDS``) of the distinct
        class names in ``primary_label``, in order of first appearance
        (``None`` when ``primary_label`` contains no names)
      * ``site``    - ``'SSW'`` if that substring occurs in the path, else ``'COR'``

    A row counts as a *train* soundscape when at least one file under
    ``SOUNDSCAPES_PATH`` starts with the row's filename stripped of its last
    ``_``-separated component; all other rows are *external*.

    Returns:
        ``(train, external)`` - two positional selections of the annotated frame.
    """
    impath, label_id, site = [], [], []
    for _, row in df.iterrows():
        path = f'{external_soundscapes}{row.filename}.npy'
        site.append('SSW' if 'SSW' in path else 'COR')
        impath.append(path)

        # dict.fromkeys keeps the first occurrence of each class name, in order.
        distinct_classes = list(dict.fromkeys(row.primary_label.split()))
        if distinct_classes:
            label_id.append(' '.join(str(LABEL_IDS[name]) for name in distinct_classes))
        else:
            label_id.append(None)

    df['impath'] = impath
    df['label_id'] = label_id
    df['site'] = site

    # Segregate into train soundscapes and external soundscapes.
    train_idx, external_idx = [], []
    for position, (_, row) in enumerate(df.iterrows()):
        filename = row.filename
        # Everything before the last underscore; empty when there is none.
        cut = filename.rfind('_')
        base_filename = filename[:cut] if cut != -1 else ''
        matches = glob.glob(f"{SOUNDSCAPES_PATH}{base_filename}*")
        bucket = train_idx if len(matches) > 0 else external_idx
        bucket.append(position)

    return df.iloc[train_idx], df.iloc[external_idx]

# Clean Up Dfs

In [None]:
# Convert the raw metadata frames into the common training format.
# Each name is rebound to its processed version, so this cell is meant to run
# once per kernel session (re-running it would feed processed frames back in).
BACKGROUND_CSV = preprocess_background(BACKGROUND_CSV)
# preprocess_soundscapes returns (train soundscapes, external soundscapes).
external_soundscapes_csv, extra_soundscapes_csv = preprocess_soundscapes(external_soundscapes_csv)
#external_data_csv = preprocess_external(external_data_csv, external_data)
#external_data2_csv = preprocess_external(external_data2_csv, external_data2)
#external_data3_csv = preprocess_external(external_data3_csv, external_data3)


# Remove rows whose `impath` does not resolve to exactly one file on disk.
df = clean_df(df)
BACKGROUND_CSV = clean_df(BACKGROUND_CSV)
external_soundscapes_csv =clean_df(external_soundscapes_csv)
extra_soundscapes_csv = clean_df(extra_soundscapes_csv)
#external_data_csv = clean_df(external_data_csv)
#external_data2_csv = clean_df(external_data2_csv)
#external_data3_csv = clean_df(external_data3_csv)



# Append Extra Data

In [None]:
# Append Extra Data
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# the resulting frame is the same (rows of df followed by the background rows).
df = pd.concat([df, BACKGROUND_CSV])
#df = df.append(external_data_csv) # 40000 -> 100000
# Append External 2
#df = df.append(external_data2_csv) # 100000 -> 130000 # All Xeno-Canto Data Added.
#df = df.append(external_data3_csv)

# Folds

In [None]:
def stratified_KFold():
    """Build the train/validation folds.

    Reads the module-level frames ``df``, ``external_soundscapes_csv`` and
    ``extra_soundscapes_csv`` and the flag ``DataConfig.soundscapes_val``.

    * ``soundscapes_val`` False: ``external_soundscapes_csv`` is split with a
      shuffled ``StratifiedKFold`` (stratified on its ``site`` column). For
      each split, the training part of the soundscapes is added to ``df`` and
      ``extra_soundscapes_csv``; the held-out part is the validation fold.
    * ``soundscapes_val`` True: a single fold - ``df`` plus
      ``extra_soundscapes_csv`` for training, all of
      ``external_soundscapes_csv`` for validation.

    Returns:
        ``(FOLDS, SOUNDSCAPES_FOLDS)`` - two lists of ``(train, validation)``
        DataFrame pairs; the second list contains soundscape rows only.
    """
    # Special splitting strategy: the soundscape files are split separately.
    splitter = StratifiedKFold(shuffle = True, random_state = 42)
    FOLDS = []
    SOUNDSCAPES_FOLDS = []

    if DataConfig.soundscapes_val:
        # No folds: all monophone data goes in train, all soundscapes in val.
        FOLDS.append((pd.concat([df, extra_soundscapes_csv]), external_soundscapes_csv))
        SOUNDSCAPES_FOLDS.append((extra_soundscapes_csv, external_soundscapes_csv))
        return FOLDS, SOUNDSCAPES_FOLDS

    # Only the labels matter for stratification, so X is a dummy array.
    placeholder = np.zeros(len(external_soundscapes_csv))
    for train, test in splitter.split(placeholder, external_soundscapes_csv.site):
        train_soundscapes = external_soundscapes_csv.iloc[train]
        # pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0).
        FOLDS.append((
            pd.concat([df, train_soundscapes, extra_soundscapes_csv]),
            external_soundscapes_csv.iloc[test],
        ))
        SOUNDSCAPES_FOLDS.append((
            pd.concat([train_soundscapes, extra_soundscapes_csv]),
            external_soundscapes_csv.iloc[test],
        ))
    return FOLDS, SOUNDSCAPES_FOLDS

In [None]:
# The folds must be built BEFORE the soundscape rows are appended to `df`
# below, because stratified_KFold() reads the module-level `df`.
FOLDS, SOUNDSCAPES_FOLDS = stratified_KFold()

# Now Append the External soundscapes Data
#if DataConfig.soundscapes_val:
# pd.concat replaces DataFrame.append (removed in pandas 2.0); same row order.
df = pd.concat([df, external_soundscapes_csv, extra_soundscapes_csv])

# Model Definition

In [None]:
def init_layer(layer):
    """Xavier-uniform initialise ``layer.weight`` and zero its bias, if it has one.

    Args:
        layer: module with a ``weight`` tensor and, optionally, a ``bias``
            attribute (which may be ``None``).
    """
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, "bias", None)
    if bias is not None:
        bias.data.fill_(0.)


def init_bn(bn):
    """Reset a batch-norm layer's affine parameters: weight to 1, bias to 0."""
    bn.weight.data.fill_(1.0)
    bn.bias.data.fill_(0.)


def interpolate(x: torch.Tensor, ratio: int):
    """Upsample along the time axis by repeating every frame ``ratio`` times.

    Used to compensate for the time-resolution reduction caused by the
    downsampling inside a CNN.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, number of copies of each time step
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    # Unpacking enforces a 3-D input, as before.
    _batch_size, _time_steps, _classes_num = x.shape
    return x.repeat_interleave(ratio, dim=1)


def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Extend a frame-wise output to ``frames_num`` frames.

    The extra frames are copies of the last existing frame.

    Args:
      framewise_output: (batch_size, frames, classes_num)
      frames_num: int, total number of frames wanted in the result
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    missing_frames = frames_num - framewise_output.shape[1]
    last_frame = framewise_output[:, -1:, :]
    # (batch_size, missing_frames, classes_num)
    tail = last_frame.repeat(1, missing_frames, 1)
    # (batch_size, frames_num, classes_num)
    return torch.cat((framewise_output, tail), dim=1)


class ConvBlock(nn.Module):
    """Two 3x3 conv + batch-norm + ReLU stages followed by a pooling step."""

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()

        # Both convolutions share kernel, stride and padding and have no bias.
        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, **conv_kwargs)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        """Initialise the convolutions (init_layer) and batch norms (init_bn)."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for bn in (self.bn1, self.bn2):
            init_bn(bn)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Run both conv stages, then pool.

        Args:
            input: 4-D tensor accepted by ``nn.Conv2d``.
            pool_size: pooling kernel size.
            pool_type: ``'max'``, ``'avg'`` or ``'avg+max'``. For ``'avg'``,
                a ``pool_size`` equal to the tuple ``(1, 1)`` pools with a
                1x1 kernel; any other size pools with ``padding=1`` and
                ``stride=2``.

        Raises:
            Exception: for any other ``pool_type``.
        """
        x = F.relu_(self.bn1(self.conv1(input)))
        x = F.relu_(self.bn2(self.conv2(x)))

        if pool_type == 'max':
            return F.max_pool2d(x, kernel_size=pool_size)
        if pool_type == 'avg':
            if pool_size == (1, 1):
                return F.avg_pool2d(x, kernel_size = (1, 1))
            return F.avg_pool2d(x, kernel_size=pool_size, padding = 1, stride = 2)
        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(x, kernel_size=pool_size)
            maxed = F.max_pool2d(x, kernel_size=pool_size)
            return averaged + maxed
        raise Exception('Incorrect argument!')


class AttBlock(nn.Module):
    """Attention pooling over the last (time) axis using two 1x1 ``Conv1d`` branches.

    ``temperature`` and ``bn_att`` are stored on the module but are not used
    by ``forward``.
    """

    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear",
                 temperature=1.0):
        super().__init__()

        self.activation = activation
        self.temperature = temperature

        # Attention branch and classification branch: identical 1x1 convolutions.
        branch_kwargs = dict(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.att = nn.Conv1d(**branch_kwargs)
        self.cla = nn.Conv1d(**branch_kwargs)

        self.bn_att = nn.BatchNorm1d(out_features)
        self.init_weights()

    def init_weights(self):
        """Initialise both convolutions and the batch norm."""
        for conv in (self.att, self.cla):
            init_layer(conv)
        init_bn(self.bn_att)

    def forward(self, x):
        """Pool ``x`` of shape (n_samples, n_in, n_time).

        Returns:
            ``(pooled, norm_att, cla)`` - the attention-weighted sum over
            time, the softmax attention weights, and the per-frame outputs.
        """
        attention_logits = torch.clamp(self.att(x), -10, 10)
        norm_att = torch.softmax(attention_logits, dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        pooled = torch.sum(norm_att * cla, dim=2)
        return pooled, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation; returns ``None`` for unknown names."""
        if self.activation == 'sigmoid':
            return torch.sigmoid(x)
        if self.activation == 'linear':
            return x
        return None

In [None]:
class DFTBase(nn.Module):
    """Base class providing the DFT and inverse-DFT matrices."""

    def __init__(self):
        super(DFTBase, self).__init__()

    def dft_matrix(self, n):
        """Return the n x n complex matrix with entries ``omega ** (row * col)``,
        where ``omega = exp(-2*pi*i / n)``."""
        indices = np.arange(n)
        omega = np.exp(-2 * np.pi * 1j / n)
        return np.power(omega, np.outer(indices, indices))

    def idft_matrix(self, n):
        """Same as ``dft_matrix`` but with ``omega = exp(+2*pi*i / n)`` (no 1/n scaling)."""
        indices = np.arange(n)
        omega = np.exp(2 * np.pi * 1j / n)
        return np.power(omega, np.outer(indices, indices))
    
    
class STFT(DFTBase):
    def __init__(self, n_fft=2048, hop_length=None, win_length=None, 
        window='hann', center=True, pad_mode='reflect', freeze_parameters=True):
        """STFT implemented with two ``Conv1d`` layers whose kernels are the
        windowed real and imaginary DFT basis vectors.

        Args:
          n_fft: FFT size; also the convolution kernel length.
          hop_length: stride between frames; defaults to ``win_length // 4``.
          win_length: window length; defaults to ``n_fft``.
          window: window specification forwarded to ``librosa.filters.get_window``.
          center: if True, ``forward`` pads ``n_fft // 2`` samples on both sides.
          pad_mode: ``'constant'`` or ``'reflect'`` (used when ``center`` is True).
          freeze_parameters: if True, the convolution kernels are not trainable.
        """
        super(STFT, self).__init__()

        assert pad_mode in ['constant', 'reflect']

        self.n_fft = n_fft
        self.center = center
        self.pad_mode = pad_mode

        # By default, use the entire frame
        if win_length is None:
            win_length = n_fft

        # Set the default hop, if it's not already specified
        if hop_length is None:
            hop_length = int(win_length // 4)

        fft_window = librosa.filters.get_window(window, win_length, fftbins=True)

        # Pad the window out to n_fft size. `size` is passed by keyword because
        # librosa >= 0.10 makes it keyword-only; the keyword form also works
        # on older releases.
        fft_window = librosa.util.pad_center(fft_window, size=n_fft)

        # DFT matrix (numpy, complex), kept as a plain attribute
        self.W = self.dft_matrix(n_fft)

        out_channels = n_fft // 2 + 1

        self.conv_real = nn.Conv1d(in_channels=1, out_channels=out_channels, 
            kernel_size=n_fft, stride=hop_length, padding=0, dilation=1, 
            groups=1, bias=False)

        self.conv_imag = nn.Conv1d(in_channels=1, out_channels=out_channels, 
            kernel_size=n_fft, stride=hop_length, padding=0, dilation=1, 
            groups=1, bias=False)

        # Only the first n_fft // 2 + 1 DFT columns are kept, each multiplied
        # element-wise by the window.
        windowed_basis = self.W[:, 0 : out_channels] * fft_window[:, None]

        self.conv_real.weight.data = torch.Tensor(
            np.real(windowed_basis).T)[:, None, :]
        # (n_fft // 2 + 1, 1, n_fft)

        self.conv_imag.weight.data = torch.Tensor(
            np.imag(windowed_basis).T)[:, None, :]
        # (n_fft // 2 + 1, 1, n_fft)

        if freeze_parameters:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, input):
        """input: (batch_size, data_length)
        Returns:
          real: (batch_size, 1, time_steps, n_fft // 2 + 1)
          imag: (batch_size, 1, time_steps, n_fft // 2 + 1)
        """

        x = input[:, None, :]   # (batch_size, channels_num, data_length)

        if self.center:
            x = F.pad(x, pad=(self.n_fft // 2, self.n_fft // 2), mode=self.pad_mode)

        real = self.conv_real(x)
        imag = self.conv_imag(x)
        # (batch_size, n_fft // 2 + 1, time_steps)

        real = real[:, None, :, :].transpose(2, 3)
        imag = imag[:, None, :, :].transpose(2, 3)
        # (batch_size, 1, time_steps, n_fft // 2 + 1)

        return real, imag
    
    
class Spectrogram(nn.Module):
    def __init__(self, n_fft=2048, hop_length=None, win_length=None, 
        window='hann', center=True, pad_mode='reflect', power=2.0, 
        freeze_parameters=True):
        """Magnitude/power spectrogram computed with the Conv1d-based ``STFT``.

        Args:
          n_fft, hop_length, win_length, window, center, pad_mode: forwarded
            to ``STFT``.
          power: exponent of the magnitude; 2.0 gives the power spectrogram.
          freeze_parameters: accepted for interface compatibility; the inner
            ``STFT`` is always created with ``freeze_parameters=True``.
        """
        super(Spectrogram, self).__init__()

        self.power = power

        self.stft = STFT(n_fft=n_fft, hop_length=hop_length, 
            win_length=win_length, window=window, center=center, 
            pad_mode=pad_mode, freeze_parameters=True)

    def forward(self, input):
        """input: whatever ``self.stft.forward`` accepts
        (for ``STFT``: (batch_size, data_length))
        Returns:
          spectrogram: ``(real ** 2 + imag ** 2) ** (power / 2)``, with the
          same shape as the STFT outputs
        """

        (real, imag) = self.stft.forward(input)

        spectrogram = real ** 2 + imag ** 2

        if self.power != 2.0:
            # Was `power / 2.0`, an undefined name that raised NameError for
            # every power other than 2.0.
            spectrogram = spectrogram ** (self.power / 2.0)

        return spectrogram

    
class LogmelFilterBank(nn.Module):
    def __init__(self, sr=32000, n_fft=2048, n_mels=64, fmin=50, fmax=14000, is_log=True, 
        ref=1.0, amin=1e-10, top_db=80.0, freeze_parameters=True):
        """(Log-)mel filter bank applied by matrix multiplication.

        The filter matrix comes from ``librosa.filters.mel`` and is stored,
        transposed, as the parameter ``melW``.

        Args:
          sr, n_fft, n_mels, fmin, fmax: forwarded to ``librosa.filters.mel``.
          is_log: if True, ``forward`` converts the result to decibels.
          ref: reference power for the decibel conversion.
          amin: lower clamp applied before taking the logarithm.
          top_db: if not None, values more than ``top_db`` below the maximum
            are clamped; must be non-negative.
          freeze_parameters: if True, ``melW`` is not trainable.
        """
        super(LogmelFilterBank, self).__init__()

        self.is_log = is_log
        self.ref = ref
        self.amin = amin
        self.top_db = top_db

        mel_weights = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels,
            fmin=fmin, fmax=fmax).T
        # (n_fft // 2 + 1, mel_bins)

        self.melW = nn.Parameter(torch.Tensor(mel_weights))

        if freeze_parameters:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, input):
        """input: tensor whose last dimension has n_fft // 2 + 1 entries

        Output: same leading dimensions, last dimension of size mel_bins
        """

        # Mel spectrogram
        mel_spectrogram = torch.matmul(input, self.melW)

        # Logmel spectrogram
        if self.is_log:
            output = self.power_to_db(mel_spectrogram)
        else:
            output = mel_spectrogram

        return output


    def power_to_db(self, input):
        """Convert a power spectrogram to decibels (PyTorch counterpart of
        ``librosa.core.power_to_db``).

        Raises:
          ValueError: if ``top_db`` is negative. (The previous code referenced
            the undefined name ``ParameterError`` and so raised ``NameError``.)
        """
        ref_value = self.ref
        log_spec = 10.0 * torch.log10(torch.clamp(input, min=self.amin, max=np.inf))
        log_spec -= 10.0 * np.log10(np.maximum(self.amin, ref_value))

        if self.top_db is not None:
            if self.top_db < 0:
                raise ValueError('top_db must be non-negative')
            # The floor is relative to the maximum over the whole tensor.
            log_spec = torch.clamp(log_spec, min=log_spec.max().item() - self.top_db, max=np.inf)

        return log_spec

In [None]:
class DropStripes(nn.Module):
    """Zero out random stripes along one axis of a 4-D batch (training mode only)."""

    def __init__(self, dim, drop_width, stripes_num):
        """
        Args:
          dim: int, axis to drop along (2: time, 3: frequency)
          drop_width: int, exclusive upper bound on the width of one stripe
          stripes_num: int, number of stripes drawn per sample
        """
        super(DropStripes, self).__init__()

        assert dim in [2, 3]    # dim 2: time; dim 3: frequency

        self.dim = dim
        self.drop_width = drop_width
        self.stripes_num = stripes_num

    def forward(self, input):
        """input: (batch_size, channels, time_steps, freq_bins)

        In training mode the stripes are written into ``input`` in place and
        the same tensor is returned; otherwise ``input`` is returned untouched.
        """
        assert input.ndimension() == 4

        if self.training is False:
            return input

        total_width = input.shape[self.dim]
        for sample_index in range(input.shape[0]):
            self.transform_slice(input[sample_index], total_width)
        return input


    def transform_slice(self, e, total_width):
        """e: (channels, time_steps, freq_bins), modified in place."""
        for _ in range(self.stripes_num):
            # Draw the stripe width first, then its start, so the stripe fits.
            width = int(torch.randint(low=0, high=self.drop_width, size=(1,))[0])
            start = int(torch.randint(low=0, high=total_width - width, size=(1,))[0])
            stop = start + width

            if self.dim == 2:
                e[:, start:stop, :] = 0
            elif self.dim == 3:
                e[:, :, start:stop] = 0


class SpecAugmentation(nn.Module):
    """SpecAugment-style masking: time stripes first, then frequency stripes.

    [ref] Park, D.S., Chan, W., Zhang, Y., Chiu, C.C., Zoph, B., Cubuk, E.D.
    and Le, Q.V., 2019. Specaugment: A simple data augmentation method
    for automatic speech recognition. arXiv preprint arXiv:1904.08779.
    """

    def __init__(self, time_drop_width, time_stripes_num, freq_drop_width, 
        freq_stripes_num):
        """
        Args:
          time_drop_width: int, forwarded to the time-axis ``DropStripes``
          time_stripes_num: int, forwarded to the time-axis ``DropStripes``
          freq_drop_width: int, forwarded to the frequency-axis ``DropStripes``
          freq_stripes_num: int, forwarded to the frequency-axis ``DropStripes``
        """
        super(SpecAugmentation, self).__init__()

        self.time_dropper = DropStripes(
            dim=2, drop_width=time_drop_width, stripes_num=time_stripes_num)
        self.freq_dropper = DropStripes(
            dim=3, drop_width=freq_drop_width, stripes_num=freq_stripes_num)

    def forward(self, input):
        """Apply the time dropper, then the frequency dropper."""
        return self.freq_dropper(self.time_dropper(input))

In [None]:
class PANNsCNN14Att(nn.Module):
    """Six-``ConvBlock`` CNN used here as a convolutional feature extractor.

    ``forward`` only runs the convolutional blocks. The spectrogram and
    log-mel extractors, the spec augmenter, ``bn0``, ``fc1`` and ``att_block``
    are constructed but never called by ``forward``.
    NOTE(review): they are presumably kept so that the checkpoint loaded in
    ``load_prev_model`` matches this module's state dict - confirm before
    removing any of them.
    """

    def __init__(self):
        super().__init__()

        # Hyper-parameters (each is now assigned once; the original block
        # repeated a whole group of these assignments with identical values).
        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None
        sample_rate = 32000
        window_size =  1024
        hop_size = 320
        mel_bins = 64
        fmin = 50
        fmax = 14000
        classes_num = 264
        self.interpolate_ratio = 32  # Downsampled ratio

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size,
            hop_length=hop_size,
            win_length=window_size,
            window=window,
            center=center,
            pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax,
            ref=ref,
            amin=amin,
            top_db=top_db,
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(mel_bins)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.att_block = AttBlock(2048, classes_num, activation='sigmoid')

        self.init_weight()

    def init_weight(self):
        """Initialise ``bn0`` and ``fc1`` (the blocks initialise themselves)."""
        init_bn(self.bn0)
        init_layer(self.fc1)

    def cnn_feature_extractor(self, x):
        """Run the six conv blocks, each followed by dropout (p=0.2).

        Blocks 1-5 use average pooling with ``pool_size=(3, 3)``; block 6 uses
        ``pool_size=(1, 1)``.
        """
        blocks = (self.conv_block1, self.conv_block2, self.conv_block3,
                  self.conv_block4, self.conv_block5)
        for block in blocks:
            x = block(x, pool_size=(3, 3), pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        return x

    def forward(self, x, mixup_lambda=None):
        """Extract convolutional features.

        Args:
            x: 4-D tensor; dimensions 2 and 3 are swapped before the conv
               blocks and swapped back afterwards.
               NOTE(review): presumably (batch, channels, freq, time) mel
               images rather than raw waveforms - confirm against the caller.
            mixup_lambda: unused.

        Returns:
            The feature map produced by ``cnn_feature_extractor`` with
            dimensions 2 and 3 swapped back.
        """
        x = x.transpose(2, 3) # (B, C, T, F)
        x = self.cnn_feature_extractor(x)

        return x.transpose(2, 3)

In [None]:
class tmp_model(pl.LightningModule):
    """Thin wrapper that stores ``model`` as ``self.model`` and then loads the
    checkpoint at ``ResNestPath`` into the wrapper's state dict."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        checkpoint = torch.load(ResNestPath, map_location = DEVICE)
        self.load_state_dict(checkpoint)
def load_prev_model(name):
    """Build the backbone selected by substrings of ``name``.

    The first matching substring wins, in this order:
    ``"resnest"`` -> ``resnest50_fast_1s1x64d`` (no pretrained weights),
    ``'CNN14'`` -> ``PANNsCNN14Att``,
    ``'densenet'`` -> torchvision ``densenet121`` (pretrained weights);
    anything else -> torchvision ``resnext50_32x4d`` (no pretrained weights).

    Independently of which backbone was built, if ``'CNN14'`` occurs in
    ``name`` the checkpoint at ``CNN14Path`` is loaded into the model and its
    first conv block is then replaced by a fresh 3-channel ``ConvBlock``.

    (An earlier variant that swapped ``model.fc`` for a new ``nn.Linear`` and
    loaded ResNeSt weights through ``tmp_model`` is disabled.)
    """
    builders = (
        ("resnest", lambda: resnest50_fast_1s1x64d(pretrained = False)),
        ('CNN14', lambda: PANNsCNN14Att()),
        ('densenet', lambda: torchvision.models.densenet121(pretrained = True)),
    )
    for key, build in builders:
        if key in name:
            model = build()
            break
    else:
        model = torchvision.models.resnext50_32x4d(pretrained=False)

    if 'CNN14' in name:
        state = torch.load(CNN14Path, map_location = DEVICE)['model_state_dict']
        model.load_state_dict(state)
        # The checkpoint's first block is replaced with a 3-input-channel one.
        model.conv_block1 = ConvBlock(in_channels = 3, out_channels = 64)
    return model
class FeatureExtractor(pl.LightningModule):
    """Backbone wrapper; the backbone is chosen by ``ModelConfig.model_name``."""

    def __init__(self):
        super().__init__()
        self.model_name = ModelConfig.model_name
        self.model = load_prev_model(self.model_name)

    def forward(self, x):
        """Return ``model.features(x)`` for densenet backbones, ``model(x)`` otherwise."""
        uses_densenet = 'densenet' in self.model_name
        backbone = self.model.features if uses_densenet else self.model
        return backbone(x)

# Transformer Blocks, Added on Top of CNN Features

In [None]:
# Not a real Transformer: just stacked multi-head attention blocks (similar to
# SED attention, but used for classification instead of segmenting the audio).
class MultiHeadAttention(pl.LightningModule):
    """Self-attention followed by layer normalisation and dropout.

    Embedding size, head count and dropout probability are read from
    ``ModelConfig`` (``transformer_dim``, ``num_heads``, ``dropout``).
    """

    def __init__(self):
        super().__init__()
        self.num_heads = ModelConfig.num_heads
        self.in_dim = ModelConfig.transformer_dim

        self.drop_prob = ModelConfig.dropout

        self.MultiHeadAttention = nn.MultiheadAttention(self.in_dim, self.num_heads)
        self.LayerNorm = nn.LayerNorm((self.in_dim))
        self.dropout = nn.Dropout(self.drop_prob)

    def forward(self, x):
        """Attend ``x`` to itself (query = key = value), normalise, apply dropout."""
        attended, _weights = self.MultiHeadAttention(key = x, value = x, query = x)
        normalised = self.LayerNorm(attended)
        return self.dropout(normalised)
        
class MAHHead(pl.LightningModule):
    """Classification head: conv -> stacked ``MultiHeadAttention`` -> conv -> sum -> linear.

    NOTE(review): ``ConvBlock`` is called here with seven positional
    arguments, while the ``ConvBlock`` defined earlier in this notebook takes
    only ``(in_channels, out_channels)``. This head presumably relies on a
    different ``ConvBlock`` definition - confirm which one is in scope.
    """

    def __init__(self):
        super().__init__()
        self.in_features = ModelConfig.feature_extractor_dim
        self.inner_features = ModelConfig.transformer_dim
        self.num_classes = ModelConfig.num_classes
        self.ConvHead = ConvBlock(self.in_features, self.inner_features, (4, 1), 0, 1, 1, 1)
        self.MAH = nn.Sequential(*[
            MultiHeadAttention() for i in range(ModelConfig.num_layers)
        ])
        self.ConvStem = ConvBlock(self.inner_features, self.in_features, 1, 0, 1, 1, 1)
        self.Linear = nn.Linear(self.in_features, self.num_classes)

    def forward(self, x):
        """Map a 4-D feature map to class logits.

        The shape comments below are the original author's and assume a
        (B, 2048, 4, 9) input with ``transformer_dim == 768``.
        """
        # X: tensor(B, 2048, 4, 9)
        # Only the collapsed height axis (dim 2) is squeezed. A bare
        # torch.squeeze would also drop the batch axis when B == 1 and break
        # the transposes below.
        x = self.ConvHead(x).squeeze(2).transpose(1, 2) # (B, 9, 768)
        x = x.transpose(0, 1) # sequence-first layout for the attention blocks
        x = self.MAH(x).transpose(0, 1).transpose(1, 2).unsqueeze(2) # (B, 768, 1, 9)
        x = self.ConvStem(x) # (B, 2048, 1, 9)
        x = torch.sum(x, dim = (2, 3)) # (B, 2048)
        return self.Linear(x)
        
class AttentionHead(pl.LightningModule):
    """2-D spatial-attention classification head.

    Heng's 2D attention head (original author's note: not sure whether it
    works well).

    The reshape in ``forward`` folds a (B, 2*C, L) tensor into
    (-1, C // 2, Freq, L); the leading dimension equals B only when
    ``Freq == 4``, so the head effectively requires feature maps of height 4.
    """

    def __init__(self):
        super().__init__()
        self.in_features = ModelConfig.feature_extractor_dim
        self.num_classes = ModelConfig.num_classes
        self.attention = nn.Sequential(
            nn.Conv2d(self.in_features, self.in_features * 2, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(self.in_features * 2),
            nn.Tanh(),
        ) #use 8x1 to make location-aware convolution
        self.attention1 = nn.Conv2d(self.in_features // 2, 1, kernel_size=1)
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.drop_prob = ModelConfig.dropout
        self.Dropout = nn.Dropout(self.drop_prob)
        self.Linear =  nn.Linear(self.in_features, self.num_classes)

    def forward(self, x):
        """Return class logits of shape (B, num_classes) for a (B, C, Freq, L) input."""
        B, C, Freq, L = x.shape
        a = self.attention(x).mean(dim = 2) # (B, 2*C, L); x is a 4x9 feature map

        a = a.reshape(-1, self.in_features // 2, Freq, L)
        a = self.attention1(a) # one attention logit per spatial position
        a = F.softmax(a.reshape(-1, Freq * L),-1).reshape(-1, 1, Freq, L)

        x = (a * x + x) # residual re-weighting, still (B, C, Freq, L)
        # flatten(1) keeps the batch axis. The previous bare torch.squeeze
        # also removed it when B == 1, yielding logits without a batch axis.
        x = self.global_avg_pool(x).flatten(1) # (B, C)
        x = self.Dropout(x)
        return self.Linear(x)
class BaseLineHead(pl.LightningModule):
    """Baseline head: sum over the two spatial axes, dropout, linear classifier.

    ``avgPool`` is created but not used by ``forward``, which sums instead.
    """

    def __init__(self):
        super().__init__()
        self.in_dim = ModelConfig.feature_extractor_dim
        self.num_classes = ModelConfig.num_classes
        self.drop_prob = ModelConfig.dropout
        self.avgPool = nn.AdaptiveAvgPool2d((1, 1))
        self.Dropout = nn.Dropout(self.drop_prob)
        self.Linear = nn.Linear(self.in_dim, self.num_classes)

    def forward(self, x):
        """Map a (B, C, H, W) feature map to (B, num_classes) logits."""
        pooled = x.sum(dim = (2, 3)) # (B, C)
        return self.Linear(self.Dropout(pooled)) # (B, self.num_classes)

In [None]:
class FullModel(pl.LightningModule):
    """Feature extractor followed by the head named in ``ModelConfig.head``.

    ``'mah'`` selects ``MAHHead``, ``'baseline'`` selects ``BaseLineHead``;
    any other value selects ``AttentionHead``.
    """

    def __init__(self):
        super().__init__()
        self.feature_extractor = FeatureExtractor()
        self.model_head = ModelConfig.head
        if self.model_head == 'mah':
            head_cls = MAHHead
        elif self.model_head == 'baseline':
            head_cls = BaseLineHead
        else:
            head_cls = AttentionHead
        self.head = head_cls()

    def forward(self, x):
        """Return the head's output for the features extracted from ``x``."""
        return self.head(self.feature_extractor(x))


In [None]:
class ModelConfig:
    # Central model hyper-parameters; read by the head and backbone classes at construction time.
    # Size of the final linear layer in every head.
    num_classes = NUM_CLASSES
    # NOTE(review): bam_dilate, reduce and expand are not read by any code visible in this part of the notebook.
    bam_dilate = 3
    reduce = 4
    expand = 1.5
    
    
    # Channel count the heads expect from the feature extractor.
    # NOTE(review): must match the chosen backbone's output channels — confirm when changing model_name.
    feature_extractor_dim = 1024
    # Embedding size and head count for MultiHeadAttention.
    transformer_dim = 768
    num_heads = 12
    
    # Substring-matched by load_prev_model / FeatureExtractor ('resnest', 'CNN14', 'densenet', else resnext).
    model_name = 'densenet'
    # Dropout probability used by the heads and attention blocks.
    dropout = 0.5
    # Number of stacked MultiHeadAttention blocks in MAHHead.
    num_layers = 2
    # 'mah' -> MAHHead, 'baseline' -> BaseLineHead, anything else (such as this value) -> AttentionHead.
    head = 'Conv2DAtt'
    num_channels = 3 # 1 for just Melspecs 
    
    use_mixup = False # Whether or not to use mixup augmentation.
    
    

# PreCache The Dataset

In [None]:
def load_data(df):
    """Load every row's ``.npy`` file into memory using four joblib workers.

    Args:
        df: DataFrame with ``impath`` and ``filename`` columns.

    Returns:
        dict mapping ``filename`` to an array whose first axis is at most
        ``MAX_READ_SAMPLES`` long. Arrays with more than two dimensions are
        truncated along axis 0; others get a new leading axis. Rows sharing a
        filename overwrite each other (the last one wins).
    """
    def load_row(row):
        image = np.load(str(row.impath))
        if image.ndim > 2:
            image = image[:MAX_READ_SAMPLES]
        else:
            image = image[np.newaxis, ...]
        return row.filename, image

    delayed_load = joblib.delayed(load_row)
    tasks = [delayed_load(row) for row in df.itertuples(False)]
    loaded_pairs = joblib.Parallel(4)(tqdm(tasks))
    return dict(loaded_pairs)

In [None]:
# We cache the train set to reduce training time.
# `audio_image_store` starts as None in the config cell; the guard below means
# re-running this cell in a live kernel does not reload the data.
#audio_image_store = None
if audio_image_store is None:
    audio_image_store = load_data(df)

# Dataset Definition

In [None]:
def freq_mask(spec, F=30, num_masks=1, replace_with_zero=False):
    """Mask random bands of rows (mel bins) of a spectrogram, SpecAugment-style.

    Args:
        spec: 2-D tensor; dim 0 is treated as the frequency axis.
        F: exclusive upper bound on the band width drawn for each mask.
            Values larger than the number of rows are clamped.
        num_masks: number of bands to attempt.
        replace_with_zero: fill with 0 when True, otherwise with the mean of
            the (partially masked) clone at that moment.

    Returns:
        A masked copy; ``spec`` itself is never modified.
    """
    cloned = spec.clone()
    num_mel_channels = cloned.shape[0]

    # Clamp so that `num_mel_channels - f` below is always >= 1; without this
    # randrange raises ValueError whenever F exceeds the number of rows.
    max_width = min(F, num_mel_channels)
    if max_width <= 0:
        return cloned

    for _ in range(num_masks):
        f = random.randrange(0, max_width)
        f_zero = random.randrange(0, num_mel_channels - f)

        # A zero-width draw would make the randrange below fail on an empty
        # range. Skip just this mask (the old code returned here, silently
        # dropping all remaining masks).
        if f == 0:
            continue

        mask_end = random.randrange(f_zero, f_zero + f)
        fill = 0 if replace_with_zero else cloned.mean()
        cloned[f_zero:mask_end] = fill

    return cloned

#Export
def time_mask(spec, T=40, num_masks=1, replace_with_zero=False):
    """Mask random spans of columns (time frames) of a spectrogram, SpecAugment-style.

    Args:
        spec: 2-D tensor; dim 1 is treated as the time axis.
        T: exclusive upper bound on the span width drawn for each mask.
            Values larger than the number of columns are clamped.
        num_masks: number of spans to attempt.
        replace_with_zero: fill with 0 when True, otherwise with the mean of
            the (partially masked) clone at that moment.

    Returns:
        A masked copy; ``spec`` itself is never modified.
    """
    cloned = spec.clone()
    len_spectro = cloned.shape[1]

    # Clamp so that `len_spectro - t` below is always >= 1; without this
    # randrange raises ValueError whenever T exceeds the number of columns.
    max_width = min(T, len_spectro)
    if max_width <= 0:
        return cloned

    for _ in range(num_masks):
        t = random.randrange(0, max_width)
        t_zero = random.randrange(0, len_spectro - t)

        # A zero-width draw would make the randrange below fail on an empty
        # range. Skip just this mask (the old code returned here, silently
        # dropping all remaining masks).
        if t == 0:
            continue

        mask_end = random.randrange(t_zero, t_zero + t)
        fill = 0 if replace_with_zero else cloned.mean()
        cloned[:, t_zero:mask_end] = fill

    return cloned
def pad_tensor(y, shape=(128, 281)):
    """Right-pad a 2-D spectrogram with zeros along the time axis.

    Args:
        y: 2-D tensor (rows, frames). Its row count must be compatible with
            ``shape[0]`` (equal, or 1 for broadcasting).
        shape: target ``(rows, frames)``; defaults to the (128, 281) size used
            throughout this notebook.

    Returns:
        A new float tensor of size ``shape`` whose leading columns hold ``y``.
        Inputs longer than ``shape[1]`` frames are cropped to fit rather than
        raising a size-mismatch error.
    """
    new_tensor = torch.zeros(shape)
    width = min(y.shape[1], shape[1])
    new_tensor[:, :width] = y[:, :width]
    return new_tensor
def lower_gain(y):
    """Apply a random power-law (gamma) transform to the image.

    The exponent is drawn uniformly from [0.5, 1] and ``y ** exponent`` is
    returned. Note that for values in [0, 1] an exponent below 1 does not
    decrease them, despite the function's name.
    """
    exponent = random.uniform(0.5, 1)
    return y ** exponent
def mixup_with_val(y, idx):
    """Overlay a random 'nocall' soundscape spectrogram onto ``y`` as background noise.

    Args:
        y: spectrogram on the 0-1 scale (BirdClefDataset divides by 255 before
            augmenting), with the shape produced by ``pad_tensor``.
        idx: fold index; backgrounds are drawn from ``SOUNDSCAPES_FOLDS[idx][0]``.

    Returns:
        ``y + w * background`` with ``w`` drawn uniformly from [0.4, 1].
    """
    soundscapes_df = SOUNDSCAPES_FOLDS[idx][0]
    nocall_df = soundscapes_df[soundscapes_df.primary_label == 'nocall']
    row_idx = random.randint(0, len(nocall_df) - 1)
    background = torch.tensor(np.load(nocall_df.iloc[row_idx].impath))

    # The stored image is on a 0-255 scale. Bring it onto the same 0-1 scale
    # as `y` (as cutmix_with_val does); otherwise the background swamps the
    # signal by two orders of magnitude.
    background_mix = pad_tensor(background) / 255.0

    mix_min = 0.4
    mix_max = 1
    mix_quan = random.uniform(mix_min, mix_max)
    return y + mix_quan * background_mix
    
    
def cutmix_with_val(y, idx):
    """Replace a random time span of ``y`` with a 'nocall' soundscape background.

    Args:
        y: 2-D spectrogram; only its columns outside the cut span are kept.
        idx: fold index; backgrounds are drawn from ``SOUNDSCAPES_FOLDS[idx][0]``.

    Returns:
        Concatenation along the last axis of ``y`` before the span, the
        background inside the span (divided by 255.0), and ``y`` after it.
    """
    candidates = SOUNDSCAPES_FOLDS[idx][0]
    candidates = candidates[candidates.primary_label == 'nocall']
    pick = random.randint(0, len(candidates) - 1)
    background = pad_tensor(torch.tensor(np.load(candidates.iloc[pick].impath)))

    _rows, n_frames = background.shape
    cut_from = random.randint(0, n_frames - 1)
    cut_to = random.randint(cut_from + 1, n_frames)

    pieces = [
        y[:, :cut_from],
        background[:, cut_from:cut_to] / 255.0,
        y[:, cut_to:],
    ]
    return torch.cat(pieces, dim=-1)
#def cutout(y):
#    # Cutouts a Box in the Melspec, alternative to specAugment
    
    
def augment(y, idx):
    """Randomly apply the mel-spectrogram augmentations, each gated independently.

    Order and probabilities: time masking (0.25, two masks), frequency
    masking (0.25, two masks), then the power-law gain change (0.5).

    ``idx`` is accepted but unused while the soundscape mixup step
    (``mixup_with_val``) stays disabled.
    NOTE(review): the original author flagged these augmentations as
    "not working right now".
    """
    pipeline = (
        (0.25, lambda spec: time_mask(spec, num_masks=2)),
        (0.25, lambda spec: freq_mask(spec, num_masks=2)),
        (0.5, lambda spec: lower_gain(spec)),
    )
    for probability, transform in pipeline:
        # One gate draw per step, taken just before that step runs.
        if random.random() < probability:
            y = transform(y)
    return y

In [None]:
class BirdClefDataset(Dataset):
    """Serves (image, multi-hot label) pairs from the pre-cached mel spectrograms.

    Args:
        audio_image_store: dict mapping ``filename`` to a cached array whose
            first entry along axis 0 is the image to use.
        idx: fold index into ``FOLDS``.
        sr, duration: stored, and used only to derive ``audio_length``.
        is_train: selects ``FOLDS[idx][0]`` (train, augmented) when True and
            ``FOLDS[idx][1]`` (validation) otherwise.
        num_classes: length of the label vector.
    """

    def __init__(self, audio_image_store, idx, sr=SR, is_train=True, num_classes=NUM_CLASSES, duration=DURATION):
        self.idx = idx
        self.number = 0 if is_train else 1
        self.audio_image_store = audio_image_store
        self.meta = FOLDS[idx][self.number].copy().reset_index(drop=True)
        self.sr = sr
        self.is_train = is_train
        self.num_classes = num_classes
        self.duration = duration

        self.audio_length = self.duration * self.sr

        # ImageNet channel statistics, shaped (3, 1, 1). They are stored on
        # the instance but not applied anywhere in this class.
        self.mean_train = torch.tensor(np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1))
        self.std_train = torch.tensor(np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1))
        self.stats = (self.mean_train, self.std_train)
        self.shape = (128, 281)

    def __len__(self):
        return len(self.meta)

    def _encode_labels(self, label_id):
        """Turn a single class id, or a whitespace-separated string of ids, into a multi-hot vector."""
        target = np.zeros(self.num_classes, dtype=np.float32)
        # Unwrap numpy scalars explicitly; this replaces a bare
        # `try: .item() / except: pass`.
        if isinstance(label_id, np.generic):
            label_id = label_id.item()

        if isinstance(label_id, (int, float)):
            # int() so a float-typed id column does not fail as an array index.
            target[int(label_id)] = 1
        else:
            # Soundscape rows carry several ids in one string.
            for label in label_id.split():
                target[int(label)] = 1
        return target

    def __getitem__(self, idx):
        """Return ``(image, target)``: a float32 (3, 128, 281) image and a float32 multi-hot vector."""
        row = self.meta.iloc[idx]
        image = self.audio_image_store[row.filename]
        # Only the first cached sample is used; scale the stored 0-255 values to 0-1.
        image = torch.tensor(image[0]).to(torch.float32) / 255.0

        # Zero-pad along time to the fixed width.
        padded = torch.zeros(self.shape)
        padded[:, :image.shape[1]] = image
        image = padded
        if self.is_train:
            image = augment(image, self.idx)

        if ModelConfig.num_channels == 3:
            # Channels: PCEN of the mel image, the mel image, and the mel image ** 1.5.
            pcen = torch.as_tensor(lb.pcen(image.numpy()), dtype=torch.float32)
            image = torch.stack([pcen, image, image ** 1.5], dim=0)
        else:
            image = torch.stack([image, image, image])

        target = self._encode_labels(row.label_id)
        return image.to(torch.float32), torch.from_numpy(target)

In [None]:
# Fold-0 training split; is_train=True means items pass through augment().
dataset = BirdClefDataset(audio_image_store, 0, is_train = True)

In [None]:
# Visual sanity check: plot channel 1 of the first 32 samples.
count = 0
for count, (images, labels) in enumerate(dataset, start=1):
    lbd.specshow(images[1].numpy())
    plt.show()
    if count == 32:
        break

In [None]:
def label_smooth(primary):
    """Return a smoothed copy of a hard multi-hot target tensor.

    Entries equal to 1 become 0.995 and entries equal to 0 become 0.0025.
    Any other value (e.g. soft labels produced by mixup) is left as is.
    The input tensor is not modified.
    """
    # Both masks are taken from the untouched input, so they are disjoint and
    # the assignment order below does not matter.
    is_positive = primary == 1
    is_negative = primary == 0

    smoothed = primary.clone()
    smoothed[is_negative] = 0.0025
    smoothed[is_positive] = 0.995
    return smoothed

# mixup and cutmix.

In [None]:
def mixup(x, y):
    """Batch-level mixup: blend each sample with a randomly chosen partner.

    Args:
        x: batch of inputs; dim 0 is the batch axis.
        y: batch of targets with the same batch size.

    Returns:
        ``(x_mixed, y_mixed)``, both blended with the same weight and the
        same permutation.

    A single weight is drawn from Beta(0.4, 0.4) for the whole batch and
    folded to >= 0.5, so the original sample always dominates its partner.
    This is cheaper than per-sample weights and not fully random, which the
    original author judged acceptable.
    """
    beta = 0.4
    # The stdlib `random` module has no `beta`; `betavariate` is its Beta sampler.
    gamma = random.betavariate(beta, beta)
    gamma = max(1 - gamma, gamma)

    shuffle = torch.randperm(x.shape[0]).to(x.device)
    x = gamma * x + (1 - gamma) * x[shuffle]
    y = gamma * y + (1 - gamma) * y[shuffle]
    return x, y
    
def cutmix(batch, y):
    """Placeholder for time-wise CutMix on a batch of mel spectrograms.

    TODO: not implemented yet; it currently does nothing and returns None.
    """
    return None

# collate functions

In [None]:
def train_collate(values):
    """Collate ``(image, label)`` samples into a training batch.

    Images and labels are each stacked along a new leading batch axis. When
    ``ModelConfig.use_mixup`` is set, the stacked batch is passed through
    ``mixup`` before being returned.
    """
    images = torch.stack(tuple(sample[0] for sample in values))
    labels = torch.stack(tuple(sample[1] for sample in values))
    if not ModelConfig.use_mixup:
        return images, labels
    return mixup(images, labels)
def val_collate(values):
    """Collate ``(image, label)`` samples into a validation batch (no augmentation)."""
    stacked_images = torch.stack(tuple(sample[0] for sample in values))
    stacked_labels = torch.stack(tuple(sample[1] for sample in values))
    return stacked_images, stacked_labels
    

# Training the model

In [None]:
# Multi-label objective: per-class sigmoid + binary cross-entropy, averaged over all elements.
# TODO: Mask Secondary Outputs Loss.
CRITERION = nn.BCEWithLogitsLoss(reduction='mean')


def loss_fn(y_pred, primary):
    """BCE-with-logits loss against label-smoothed targets.

    Args:
        y_pred: raw logits, Tensor(B, C); cast to float32 before the loss.
        primary: target tensor, Tensor(B, C); must be 2-D.

    Returns:
        Scalar loss tensor.

    TODO: split the loss into primary / secondary parts.
    """
    _batch, _classes = primary.shape  # doubles as a rank-2 check on the targets
    return CRITERION(y_pred.float(), label_smooth(primary))

# Metrics

In [None]:
class _ThresholdSweepF1(Metric):
    """Shared machinery for the threshold-sweeping F1 metrics.

    For every threshold in ``np.arange(0.0, 1.0, 0.01)`` the row-wise F1 of
    the thresholded sigmoid predictions is averaged over the batch and
    accumulated. ``count`` is the number of batches seen, so the reported
    score is a mean of per-batch means. Subclasses only decide what ``value``
    reports and whether empty prediction rows are filled with "no call".
    """

    # When True, rows with no predicted class get the last class
    # (index NUM_CLASSES - 1, "No Call") switched on before scoring.
    fill_nocall = False

    def __init__(self):
        self.thresholds = np.arange(0.0, 1.0, 0.01)
        self.f1_scores = [0.0 for _ in range(len(self.thresholds))]
        self.count = 0

    def round_pred(self, y_pred, thresh):
        """Binarise predictions: 1 where ``y_pred >= thresh``, else 0."""
        ones = y_pred >= thresh
        return torch.zeros_like(y_pred, device=y_pred.device) + ones.int()

    def round_true(self, y_true):
        """Binarise (possibly smoothed or mixed) targets at 0.5."""
        ones = y_true >= 0.5
        return torch.zeros_like(y_true, device=y_true.device) + ones.int()

    def metric(self, y_pred, target, thresh_idx):
        """Accumulate the batch-mean row-wise F1 for one threshold.

        Args:
            y_pred: Tensor(B, C) of probabilities, thresholded here.
            target: Tensor(B, C) of binary targets.
            thresh_idx: index into ``self.thresholds`` / ``self.f1_scores``.
        """
        predictions = self.round_pred(y_pred, self.thresholds[thresh_idx])
        if self.fill_nocall:
            empty_rows = torch.sum(predictions, axis=-1) == 0
            predictions[empty_rows, NUM_CLASSES - 1] = 1  # No Call

        tp = (predictions * target).sum(1)
        fp = (predictions * (1 - target)).sum(1)
        fn = ((1 - predictions) * target).sum(1)

        eps = 1e-9
        f1 = (tp + eps) / (tp + (fp + fn) / 2 + eps)
        self.f1_scores[thresh_idx] += f1.mean().item()

    def compute_f1_score(self, y_pred, primary):
        """Score one batch at every threshold and bump the batch counter."""
        primary = self.round_true(primary)
        for th_idx in range(len(self.thresholds)):
            # `metric` thresholds y_pred itself; doing it here as well (as
            # the old code did) only repeated the same work.
            self.metric(y_pred, primary, th_idx)
        self.count += 1

    def accumulate(self, learn):
        """fastai hook: read ``learn.pred`` (logits) and ``learn.y`` for the current batch."""
        y_pred = torch.sigmoid(learn.pred).cpu()
        primary = learn.y.cpu()
        self.compute_f1_score(y_pred, primary)

    def reset(self):
        """fastai hook: clear the accumulated sums and the batch counter."""
        self.f1_scores = [0.0 for _ in range(len(self.f1_scores))]
        self.count = 0

    def _best(self):
        """Return ``(best accumulated F1 sum, threshold that achieved it)``; (0, 0) if none is positive."""
        best = 0
        best_th = 0
        for th_idx in range(len(self.f1_scores)):
            if self.f1_scores[th_idx] > best:
                best = self.f1_scores[th_idx]
                best_th = self.thresholds[th_idx]
        return best, best_th


class F1_score(_ThresholdSweepF1):
    """Best row-wise F1 over the threshold sweep, with empty rows predicted as "no call"."""

    fill_nocall = True

    @property
    def value(self):
        eps = 1e-8
        best, _ = self._best()
        best = (best + eps) / (self.count + eps)
        print(f"F1Score: {best}")
        return best


class F1Score_th(_ThresholdSweepF1):
    """Threshold at which the row-wise F1 (without the "no call" fill) peaks."""

    @property
    def value(self):
        _, best_th = self._best()
        return best_th
        
    
class Accuracy(Metric):
    """Element-wise accuracy over the (B, C) label matrix, averaged across batches."""

    def __init__(self):
        self.accuracy = 0
        self.count = 0

    def round_pred(self, y_pred):
        """Sigmoid the logits and binarise them at 0.5."""
        probs = torch.sigmoid(y_pred)
        scores = torch.zeros_like(probs, device=probs.device)
        scores[probs >= 0.5] = 1
        return scores

    def round_true(self, y_true):
        """Binarise the targets at 0.5, returned as an int32 tensor."""
        return (y_true >= 0.5).int()

    def accumulate(self, learn):
        """fastai hook: add the current batch's fraction of matching entries."""
        predicted = self.round_pred(learn.pred)  # Tensor(B, C)
        expected = self.round_true(learn.y)      # Tensor(B, C)
        n_rows, n_cols = predicted.shape
        matches = torch.sum(predicted == expected)
        batch_acc = matches / (n_rows * n_cols)

        self.accuracy += batch_acc.item()
        self.count += 1

    def reset(self):
        """fastai hook: clear the running sum and the batch counter."""
        self.accuracy = 0
        self.count = 0

    @property
    def value(self):
        """Mean per-batch accuracy, rounded to 3 decimals."""
        eps = 1e-8
        mean_acc = (self.accuracy + eps) / (self.count + eps)
        return round(mean_acc, 3)

# Training Config

In [None]:
class TrainingConfig:
    """Optimisation hyper-parameters, read as class attributes.

    The training function in this notebook reads ``start_lr``, ``max_lr``,
    ``num_epochs`` and ``weight_decay``. The remaining fields are not
    referenced in the visible training code — presumably left over from a
    hand-written schedule; TODO confirm before relying on them.
    """
    start_lr = 1e-3  # passed to Learner as `lr`
    max_lr = 4e-3  # passed to fit_one_cycle as `lr_max`
    min_lr = 1e-5  # NOTE(review): not used by the visible training code
    warm_steps = 0.1  # NOTE(review): presumably a fraction of total steps — confirm
    peak_steps = 0.2 # 0.3 ramp up, 0.7 ramp down = super-convergence
    
    num_epochs = 16
    train_steps = len(FOLDS[0][0])  # number of rows in FOLDS[0][0], i.e. samples rather than batches
    
    total_steps = num_epochs * train_steps
    
    weight_decay = 0  # passed as `wd` to both Learner and fit_one_cycle
    

# Training Fn

# DataLoader Config

In [None]:
class DataLoaderConfig:
    """Keyword arguments for building a train or validation DataLoader.

    ``self.config`` is meant to be splatted into the loader constructor.
    Training uses ``train_collate`` and shuffling; validation uses
    ``val_collate`` and no shuffling.
    NOTE(review): ``bs`` and ``create_batch`` look like fastai DataLoader
    keywords rather than torch ones — confirm which DataLoader is in scope.
    """

    def __init__(self, is_train):
        collate = train_collate if is_train else val_collate
        self.config = dict(
            num_workers=4,
            pin_memory=True,
            bs=64,
            shuffle=is_train,
            create_batch=collate,
        )
        


In [None]:
def train_folds_fast_ai(folds):
    """Train one ``FullModel`` per requested fold with fastai's one-cycle schedule.

    For each fold index this builds the train / validation datasets and
    loaders, fits for ``TrainingConfig.num_epochs`` epochs while monitoring
    ``f1_score``, and saves ``model_{fold}`` (via SaveModelCallback) and
    ``final_{fold}`` (after fitting).

    Args:
        folds: iterable of fold indices into ``FOLDS``.

    Returns:
        The model trained for the last fold in ``folds``, or None when
        ``folds`` is empty. (The function used to return None
        unconditionally, although its caller binds the result to ``model``.)
    """
    model = None
    for fold_idx in folds:
        train_ds = BirdClefDataset(audio_image_store, fold_idx, is_train=True)
        val_ds = BirdClefDataset(audio_image_store, fold_idx, is_train=False)

        train_dl = DataLoader(train_ds, **DataLoaderConfig(is_train=True).config)
        val_dl = DataLoader(val_ds, **DataLoaderConfig(is_train=False).config)
        dls = DataLoaders(train_dl, val_dl)

        model = FullModel()
        if torch.cuda.is_available():
            dls.cuda()
            model.cuda()

        learn = Learner(
            dls, model, loss_func=loss_fn,
            # Both metrics are passed as instances for consistency.
            metrics=[F1Score_th(), F1_score()],
            opt_func=Adam, lr=TrainingConfig.start_lr, wd=TrainingConfig.weight_decay,
        ).to_fp16()

        # All callbacks track the validation `f1_score` metric (higher is better).
        cbs = [
            SaveModelCallback(monitor='f1_score', comp=np.greater, fname=f'model_{fold_idx}'),
            EarlyStoppingCallback(monitor='f1_score', comp=np.greater, patience=5),
            ReduceLROnPlateau(monitor='f1_score', comp=np.greater, patience=3, min_lr=1e-7),
        ]
        learn.fit_one_cycle(
            TrainingConfig.num_epochs,
            lr_max=TrainingConfig.max_lr,
            cbs=cbs,
            wd=TrainingConfig.weight_decay,
        )
        learn.save(f"final_{fold_idx}")
    return model

In [None]:
# Folds to train in this run (currently only fold 1).
FOLDS_IDX = [1]
# `model` is bound to whatever train_folds_fast_ai returns; checkpoints are written inside the function via learn.save.
model = train_folds_fast_ai(FOLDS_IDX)