In [1]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import pandas as pd
import random
import time

from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import KFold,StratifiedKFold
import sklearn
from skimage.transform import resize
import os
import gc
import datetime
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from torch.utils.tensorboard import SummaryWriter
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation
from transformers import get_linear_schedule_with_warmup
from IPython.core.debugger import set_trace
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
torch.__version__

'1.7.1'

In [3]:
def setup_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU,
    current CUDA device and all CUDA devices), sets the ``PYTHONHASHSEED``
    environment variable, and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    for seed_torch in (torch.manual_seed,
                       torch.cuda.manual_seed,
                       torch.cuda.manual_seed_all):
        seed_torch(seed)
    torch.backends.cudnn.deterministic = True


GLOBAL_SEED = 42
setup_seed(GLOBAL_SEED)

In [4]:
# Filesystem layout for this run: input data plus the output locations.
# NOTE(review): these are absolute, machine-specific paths; adjust per environment.
data_path = '/dev/shm/data'
feat_path = '/root/s/RFCX/features'
res_path = '/root/s/RFCX/res'
model_path = '/root/s/RFCX/model_save'
tensorboard_path = '/root/s/RFCX/tensorboard'
# Create the output directories up front. exist_ok=True replaces the
# check-then-create pattern (os.path.exists followed by os.makedirs), which
# can fail if the directory appears between the check and the creation.
for _out_dir in (model_path, res_path, tensorboard_path):
    os.makedirs(_out_dir, exist_ok=True)

In [5]:
# Load the two annotation tables from the data directory. Later cells read the
# recording_id, species_id, t_min and t_max columns of data_tp_df.
data_tp_df=pd.read_csv(os.path.join(data_path, 'train_tp.csv'))
# NOTE(review): presumably the false-positive annotations; not used by the cells visible here.
data_fp_df=pd.read_csv(os.path.join(data_path, 'train_fp.csv'))

In [6]:
# count_series = data_tp_df['recording_id'].value_counts()[data_tp_df['recording_id']]
# count_series.index = data_tp_df.index
# data_tp_df['counts'] = count_series
# def process_overlap(df):
#     if df.counts() == 1:
#         return d
#     return d
# data_tp_df.groupby('recording_id').apply(f)
# data_tp_df[data_tp_df['counts'] >= 2]

## Global Parameters

In [7]:
class Config:
    """Global hyper-parameters shared by the data pipeline, the model and the training loop."""
    num_class = 24  # number of output classes (width of the one-hot labels)
    n_fft = 2048  # FFT size used for the spectrogram (also used as the window length in the model)
    hop_length = 512  # hop between successive STFT frames, in samples
    n_mels = 256  # number of mel bins
    sr = 32000  # sample rate requested from librosa.load
    segment_length = 10 * sr  # length of one model input in samples (10 s at `sr`)
    fmin = 80  # lower edge of the mel filter bank
    fmax = 16000  # upper edge of the mel filter bank
    
    resize = False  # when True, SpectToImage resizes the spectrogram to img_shape
    img_shape = (256, 600)  # target shape for that resize
    
    wav_augment = True  # apply sound_augment to training waveforms
    spec_augment = True  # enable SpecAugmentation inside the model while training
    spec_augprob = 0.5  # probability of applying SpecAugmentation on a forward pass
    mixup_proba = 0.0  # per-batch probability of mixup in train(); 0.0 disables it
    mixup_alpha = 5  # alpha handed to mixup_data (Beta(alpha, alpha))
    
    # Extra frames kept on each side of the labelled span in the attention mask.
    # The misspelled name is what AttBlock.forward reads, so it must stay as-is.
    attenion_border = 1

## Prepare Dataset and Dataloader

In [8]:
"https://www.kaggle.com/gopidurgaprasad/audio-augmentation-albumentations/"

import matplotlib.pyplot as plt
import IPython.display as ipd
import albumentations
from albumentations.core.transforms_interface import DualTransform, BasicTransform
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, PolarityInversion, Gain, AddGaussianSNR


class AudioTransform(BasicTransform):
    """Base class for the audio transforms defined in this notebook.

    Registers ``apply`` as the handler for the ``data`` target.
    """

    @property
    def targets(self):
        # Single target: everything passed as ``data`` is routed through ``apply``.
        return dict(data=self.apply)

    def update_params(self, params, **kwargs):
        """Copy ``interpolation`` / ``fill_value`` into ``params`` when the instance defines them."""
        for attr_name in ("interpolation", "fill_value"):
            if hasattr(self, attr_name):
                params[attr_name] = getattr(self, attr_name)
        return params
    
      
class MelSpectrogram(AudioTransform):
    """Convert a waveform into a dB-scaled mel spectrogram."""
    def __init__(self, parameters, always_apply=False, p=0.5):
        """
        Args:
            parameters: keyword arguments forwarded to
                ``librosa.feature.melspectrogram`` (e.g. n_mels, n_fft, hop_length).
            always_apply: forwarded to the base transform.
            p: forwarded to the base transform.
        """
        super(MelSpectrogram, self).__init__(always_apply, p)

        self.parameters = parameters

    def apply(self, data, **params):
        """Return ``(melspec_db, sr)`` for ``data == (sound, sr)``; the spectrogram is float32."""
        sound, sr = data

        # Pass the signal by keyword: newer librosa releases accept the
        # waveform only as ``y=...``, while older ones accept both forms.
        melspec = librosa.feature.melspectrogram(y=sound, sr=sr, **self.parameters)
        melspec = librosa.power_to_db(melspec)
        melspec = melspec.astype(np.float32)
        return melspec, sr
    
    
class SpecAugment(AudioTransform):
    """Randomly blank out frequency bands and time spans of a spectrogram."""
    def __init__(self, num_mask=2, freq_masking=0.15, time_masking=0.20, always_apply=False, p=0.5):
        super(SpecAugment, self).__init__(always_apply, p)

        self.num_mask, self.freq_masking, self.time_masking = num_mask, freq_masking, time_masking

    def apply(self, data, **params):
        """Mask the spectrogram in ``data`` with its own minimum value; ``sr`` passes through."""
        melspec, sr = data
        masked = self.spec_augment(
            melspec,
            num_mask=self.num_mask,
            freq_masking=self.freq_masking,
            time_masking=self.time_masking,
            value=melspec.min(),
        )
        return masked, sr

    # Source: https://www.kaggle.com/davids1992/specaugment-quick-implementation
    def spec_augment(self,
                     spec: np.ndarray,
                     num_mask=2,
                     freq_masking=0.15,
                     time_masking=0.20,
                     value=0):
        """Return a masked copy of the 2-D array ``spec`` (rows = frequencies, columns = frames).

        Between 1 and ``num_mask`` rounds are applied. Each round overwrites one
        band of rows (at most ``freq_masking`` of all rows) and one span of
        columns (at most ``time_masking`` of all columns) with ``value``.
        """
        spec = spec.copy()
        rounds = random.randint(1, num_mask)
        n_freqs, n_frames = spec.shape
        for _ in range(rounds):
            # Frequency band: random height, then random position.
            band = int(random.uniform(0.0, freq_masking) * n_freqs)
            f_start = int(np.random.uniform(low=0.0, high=n_freqs - band))
            spec[f_start:f_start + band, :] = value

            # Time span: random width, then random position.
            span = int(random.uniform(0.0, time_masking) * n_frames)
            t_start = int(np.random.uniform(low=0.0, high=n_frames - span))
            spec[:, t_start:t_start + span] = value

        return spec

    
class SpectToImage(AudioTransform):
    """Turn a (n_mels, time) spectrogram into a normalised 3-channel image."""

    def __init__(self, always_apply=False, p=0.5):
        super(SpectToImage, self).__init__(always_apply, p)

    def mono_to_color(self, X: np.ndarray,
                      mean=None,
                      std=None,
                      norm_max=None,
                      norm_min=None,
                      eps=1e-6):
        """Standardise ``X`` and min-max scale the result to [0, 1].

        Code from https://www.kaggle.com/daisukelab/creating-fat2019-preprocessed-data

        Args:
            X: input array.
            mean, std: statistics used for standardisation; computed from
                ``X`` when left as None.
            norm_max, norm_min: clipping / scaling bounds applied to the
                standardised values; taken from the data when left as None.
            eps: guards the division by ``std`` and the flat-input test.

        Returns:
            Array with the shape of ``X``; all zeros (float32) when the
            standardised input is essentially constant.
        """
        # Use `is None` rather than `value or default`: an explicit 0 / 0.0 is
        # a legitimate statistic and must not be silently replaced.
        if mean is None:
            mean = X.mean()
        X = X - mean
        if std is None:
            std = X.std()
        Xstd = X / (std + eps)
        _min, _max = Xstd.min(), Xstd.max()
        if norm_max is None:
            norm_max = _max
        if norm_min is None:
            norm_min = _min
        if (_max - _min) > eps:
            # Clip to the requested range, then scale to [0, 1].
            V = np.clip(Xstd, norm_min, norm_max)
            V = (V - norm_min) / (norm_max - norm_min)
        else:
            # Flat input: just zero
            V = np.zeros_like(Xstd, dtype=np.float32)
        return V

    def apply(self, data, **params):
        """Return the image for ``data == (melspec, sr)`` with layout (3, time_step, n_mels)."""
        melspec, sr = data
        image = self.mono_to_color(melspec)
        if Config.resize:
            image = resize(image, Config.img_shape)
        # Replicate the single channel three times.
        image = np.stack([image, image, image], axis=-1)
        # (n_mels, time_step, 3) --> (3, time_step, n_mels)
        return image.transpose(2, 1, 0)

    



# Waveform-level augmentation; TrainDataset.__getitem__ applies it to training
# clips when Config.wav_augment is set.
sound_augment = Compose([
    PolarityInversion(p=0.2),
    Gain(min_gain_in_db=-15, max_gain_in_db=15, p=0.2),
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.2),
    AddGaussianSNR(max_SNR=0.5, p=0.2),
#     TimeStretch(min_rate=0.8, max_rate=1.25, p=0.2)
#     Shift(min_fraction=-0.2, max_fraction=0.2, p=0.2)
])


# Keyword arguments for librosa.feature.melspectrogram, taken from Config.
melspectrogram_parameters = {
        "n_mels": Config.n_mels,
        'n_fft': Config.n_fft, 
        'hop_length': Config.hop_length,
        'fmin': Config.fmin, 
        'fmax': Config.fmax 
    }

# CPU pipeline: waveform -> mel spectrogram -> (optional) masking -> image.
# NOTE(review): not referenced by the cells visible here; the model computes its own log-mel features.
spec_augment = albumentations.Compose([
    MelSpectrogram(parameters=melspectrogram_parameters, always_apply=True),
    SpecAugment(p=0.2),
    SpectToImage(always_apply=True)
])

# Same pipeline without the masking step.
# NOTE(review): only referenced from commented-out code in the visible cells.
to_image = albumentations.Compose([
    MelSpectrogram(parameters=melspectrogram_parameters, always_apply=True),
    SpectToImage(always_apply=True)
])


In [9]:
from torchvision import transforms


ONE_HOT = np.eye(Config.num_class)  # row i is the one-hot label vector for class i
class TrainDataset(Dataset):
    """Fixed-length random crops around annotated events.

    Each item is ``(wav, one_hot_label, t_min_ratio, t_max_ratio)``: ``wav``
    is a float32 tensor of ``Config.segment_length`` samples, and the two
    ratios give the event's start / end relative to the crop
    (0 = crop start, 1 = crop end).
    """
    def __init__(self, data_df, is_valid=False):
        """
        Args:
            data_df: frame with recording_id, species_id, t_min and t_max columns.
            is_valid: when True, waveform augmentation is skipped.
        """
        self.data_df = data_df
        self.is_valid = is_valid

    def __len__(self):
        return len(self.data_df)

    def load_audio_clip(self, audio_file_path, t_min, t_max):
        """Load a recording and cut a random segment that contains the event.

        Args:
            audio_file_path: path of the audio file to read.
            t_min, t_max: event start / end in seconds.

        Returns:
            ``(clip, t_min_ratio, t_max_ratio)`` where ``clip`` always has
            ``Config.segment_length`` samples (zero-padded at the end if the
            recording is shorter than one segment).
        """
        # librosa resamples to Config.sr while loading.
        wav, _ = librosa.load(audio_file_path, sr=Config.sr)

        seg_len = Config.segment_length
        t_min = float(t_min) * Config.sr
        t_max = float(t_max) * Config.sr

        # Any start in [t_max - seg_len, t_min) keeps the whole event inside the crop.
        begin = int(max(t_max - seg_len, 0))
        end = int(t_min)
        if end > begin:
            random_begin = int(np.random.randint(begin, end))
        else:
            # Empty range: the event starts at sample 0 or is longer than one
            # segment. np.random.randint would raise here, so anchor the crop
            # at the event start instead.
            random_begin = end
        random_end = random_begin + seg_len
        if random_end > len(wav):
            random_end = len(wav)
            # Never let the start go negative (recording shorter than a segment).
            random_begin = max(random_end - seg_len, 0)

        clip = wav[random_begin:random_end]
        if len(clip) < seg_len:
            # Keep every item the same length so batches can be collated.
            clip = np.pad(clip, (0, seg_len - len(clip)), mode='constant')
        t_min_ratio = (t_min - random_begin) / seg_len
        t_max_ratio = (t_max - random_begin) / seg_len
        return clip, t_min_ratio, t_max_ratio

    def __getitem__(self, idx):
        s = self.data_df.iloc[idx]
        audio_file_path = os.path.join(data_path, 'train', s['recording_id']+'.wav')
        # Note: the crop position is random for validation items as well.
        wav, t_min_ratio, t_max_ratio = self.load_audio_clip(audio_file_path, s['t_min'], s['t_max'])
        if not self.is_valid and Config.wav_augment:
            wav = sound_augment(samples=wav, sample_rate=Config.sr)
        return torch.tensor(wav, dtype=torch.float32), ONE_HOT[s['species_id']], t_min_ratio, t_max_ratio


class TestDataset(Dataset):
    """Splits each test recording into consecutive fixed-length segments.

    Each item is a float32 tensor of shape ``(n_segments, Config.segment_length)``.
    """
    def __init__(self, test_files):
        """
        Args:
            test_files: file names inside the ``test`` folder of ``data_path``.
        """
        self.test_files = test_files

    def __len__(self):
        return len(self.test_files)

    def __getitem__(self, idx):
        audio_file_path = os.path.join(data_path, 'test', self.test_files[idx])
        wav, _ = librosa.load(audio_file_path, sr=Config.sr)
        seg_len = Config.segment_length
        if len(wav) < seg_len:
            # A recording shorter than one segment would otherwise be sliced
            # with a negative start index and come out with the wrong length.
            wav = np.pad(wav, (0, seg_len - len(wav)), mode='constant')
        n_segments = int(np.ceil(len(wav) / seg_len))
        wavs = []
        for i in range(n_segments):
            stop = (i + 1) * seg_len
            if stop > len(wav):
                # Last segment: taken from the end so it is full length
                # (it overlaps the previous segment).
                clip = wav[len(wav) - seg_len:]
            else:
                clip = wav[i * seg_len:stop]
            wavs.append(clip)
        # Stack first: building a tensor from a list of arrays is very slow.
        return torch.tensor(np.stack(wavs), dtype=torch.float32)

In [10]:
# Test-time loader. File names are sorted so the iteration order is stable.
test_files = sorted(os.listdir(os.path.join(data_path, 'test')))
test_dataset = TestDataset(test_files)
# Sequential sampling with shuffle=False keeps batches in file order.
test_dataloader = DataLoader(test_dataset, batch_size=16, sampler=SequentialSampler(test_dataset), shuffle=False, num_workers=4)

In [11]:
# Build one (train, validation) DataLoader pair per fold, stratified on species_id.
batch_size = 32
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=GLOBAL_SEED)
data_folds = []  # list of (train_dataloader, valid_dataloader), one entry per fold
valid_indexs = []  # positional row indices of each fold's validation split

for idx, (train_index, valid_index) in enumerate(kf.split(X=data_tp_df, y=data_tp_df['species_id'])):
    valid_indexs.append(valid_index)
    # is_valid=True switches off waveform augmentation for the validation split.
    train_dataset = TrainDataset(data_tp_df.iloc[train_index], is_valid=False)
    val_dataset = TrainDataset(data_tp_df.iloc[valid_index], is_valid=True)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    valid_dataloader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset), shuffle=False, num_workers=4)
    data_folds.append((train_dataloader, valid_dataloader))

In [12]:
# time_steps = X.shape[2]
# start = (time_steps * t_min_ratio).int()
# end = torch.clamp((time_steps * t_max_ratio).int() + 2, min=0, max=time_steps)
# attn_mask = torch.zeros_like(X) == 0
# for i in range(X.shape[0]):
#     attn_mask[i, :, start[i]:end[i], :] = False
# X.masked_fill_(attn_mask, -float('inf'))

## Build Model and Train

In [13]:
from efficientnet_pytorch import EfficientNet

In [14]:
def init_layer(layer):
    """Xavier-uniform initialise ``layer.weight`` and zero the bias when the layer has one."""
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, "bias", None)
    if bias is not None:
        bias.data.fill_(0.)


def init_bn(bn):
    """Reset a batch-norm layer's affine parameters to the identity (weight 1, bias 0)."""
    bn.weight.data.fill_(1.0)
    bn.bias.data.fill_(0.)


def interpolate(x: torch.Tensor, ratio: int):
    """Upsample along the time axis by repeating every frame ``ratio`` times.

    Used to compensate for the time-resolution loss of a downsampling CNN.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, number of copies of each frame
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    # Unpacking also enforces that x is 3-D.
    batch_size, time_steps, classes_num = x.shape
    return x.repeat_interleave(ratio, dim=1)


def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Extend ``framewise_output`` along time to ``frames_num`` frames.

    The appended frames are copies of the last existing frame.

    Args:
      framewise_output: (batch_size, frames, classes_num)
      frames_num: int, target number of frames
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    missing = frames_num - framewise_output.shape[1]
    last_frame = framewise_output[:, -1:, :]
    # Tensor used for padding: the last frame repeated `missing` times.
    tail = last_frame.repeat(1, missing, 1)
    # (batch_size, frames_num, classes_num)
    return torch.cat((framewise_output, tail), dim=1)


class ConvBlock(nn.Module):
    """Two 3x3 conv -> batch-norm -> ReLU stages followed by 2-D pooling."""
    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()

        # Both convolutions share the same geometry; only the input width differs.
        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, **conv_kwargs)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        """Re-initialise the conv weights and reset both batch-norm layers."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for bn in (self.bn1, self.bn2):
            init_bn(bn)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Apply both conv stages, then pool with 'avg', 'max' or 'avg+max' (sum of the two)."""
        x = F.relu_(self.bn1(self.conv1(input)))
        x = F.relu_(self.bn2(self.conv2(x)))

        if pool_type == 'avg':
            return F.avg_pool2d(x, kernel_size=pool_size)
        if pool_type == 'max':
            return F.max_pool2d(x, kernel_size=pool_size)
        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(x, kernel_size=pool_size)
            peaked = F.max_pool2d(x, kernel_size=pool_size)
            return averaged + peaked
        raise Exception('Incorrect argument!')


class AttBlock(nn.Module):
    """Attention pooling over the time axis.

    Two 1x1 convolutions produce, per class and time step, an attention
    energy (``att``) and a classification value (``cla``). The energies are
    softmax-normalised over time and used to weight-sum the classification
    values into one value per class.
    """
    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear",
                 temperature=1.0):
        """
        Args:
            in_features: channels of the input sequence.
            out_features: number of classes.
            activation: 'linear' or 'sigmoid', applied to the classification branch.
            temperature: stored on the module; not used by ``forward``.
        """
        super().__init__()

        self.activation = activation
        self.temperature = temperature
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        # Not used in forward(); kept so the module's parameters / state_dict stay the same.
        self.bn_att = nn.BatchNorm1d(out_features)
        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)
        init_bn(self.bn_att)

    def forward(self, x, t_min_ratio=None, t_max_ratio=None, attn_mask=False):
        """Pool ``x`` over time.

        Args:
            x: (n_samples, n_in, n_time)
            t_min_ratio, t_max_ratio: event start / end as a fraction of the
                clip, shape (n_samples,) or (n_samples, 2) (two events per
                sample, as produced by mixup). Required when ``attn_mask`` is True.
            attn_mask: restrict the attention to the labelled span(s).

        Returns:
            ``(pooled, norm_att, cla)`` with shapes (n_samples, n_out),
            (n_samples, n_out, n_time) and (n_samples, n_out, n_time).

        Raises:
            ValueError: if ``attn_mask`` is True but a ratio is missing.
        """
        energy = torch.tanh(self.att(x))
        if attn_mask:
            if t_min_ratio is None or t_max_ratio is None:
                raise ValueError('attn_mask=True requires t_min_ratio and t_max_ratio')
            time_steps = energy.shape[2]
            # Frame range of the event, widened by the configured border.
            start = torch.clamp((time_steps * t_min_ratio).int()-Config.attenion_border, min=0, max=time_steps)
            end = torch.clamp((time_steps * t_max_ratio).int() + 2 + Config.attenion_border, min=0, max=time_steps)
            # True = position is masked out; the labelled spans are cleared below.
            mask = torch.ones_like(energy, dtype=torch.bool)
            if len(start.size()) == 1:
                for i in range(energy.shape[0]):
                    mask[i, :, start[i]:end[i]] = False
            else:
                for i in range(energy.shape[0]):
                    mask[i, :, start[i][0]:end[i][0]] = False
                    mask[i, :, start[i][1]:end[i][1]] = False
            # NOTE: a sample whose span is empty (start >= end) is masked
            # everywhere, and its softmax row becomes NaN.
            energy = energy.masked_fill(mask, -float('inf'))
        norm_att = torch.softmax(energy, dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation; raises ValueError for an unknown one."""
        if self.activation == 'linear':
            return x
        elif self.activation == 'sigmoid':
            return torch.sigmoid(x)
        # Previously fell through and returned None, which only failed later
        # with an unrelated-looking error in forward().
        raise ValueError('Unsupported activation: {!r}'.format(self.activation))


class feature_extractor(nn.Module):
    """Wrap a backbone so that calling the module returns its ``extract_features`` output."""
    def __init__(self, original):
        super().__init__()
        self.model = original

    def forward(self, x):
        return self.model.extract_features(x)
        

class PANNsAtt(nn.Module):
    """Waveform classifier: log-mel front end -> EfficientNet-b4 features -> attention pooling over time.

    ``forward`` takes raw waveforms and returns a dict with per-segment and
    per-clip outputs.
    """
    def __init__(self):
        super().__init__()
        
        # Front-end settings handed to the spectrogram / log-mel extractors below.
        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None
        # Only referenced by the commented-out framewise interpolation in forward().
        self.interpolate_ratio = 32  # Downsampled ratio
        self.apply_aug = Config.spec_augment

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=Config.n_fft,
            hop_length=Config.hop_length,
            win_length=Config.n_fft,
            window=window,
            center=center,
            pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=Config.sr,
            n_fft=Config.n_fft,
            n_mels=Config.n_mels,
            fmin=Config.fmin,
            fmax=Config.fmax,
            ref=ref,
            amin=amin,
            top_db=top_db,
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2)
        
        
        # Pretrained backbone; only its extract_features() output is used.
        self.feature_net = feature_extractor(EfficientNet.from_pretrained('efficientnet-b4'))
#         self.out_features = 1280 # b0
        self.out_features = 1792 # b4
        # Batch norm applied across mel bins (see the transposes in preprocess()).
        self.bn0 = nn.BatchNorm2d(Config.n_mels)

        self.fc1 = nn.Linear(self.out_features, 1792, bias=True)
        self.att_block = AttBlock(1792, Config.num_class, activation='sigmoid')

        self.init_weight()

    def init_weight(self):
        """Initialise the layers created here (the backbone keeps its pretrained weights)."""
        init_bn(self.bn0)
        init_layer(self.fc1)
           
            
    def preprocess(self, input_x, mixup_lambda=None):
        """Turn waveforms into a 3-channel log-mel input for the backbone.

        ``mixup_lambda`` is accepted but not used by the active code.

        Returns:
            ``(x, frames_num)`` where ``frames_num`` is the number of time steps.
        """

        x = self.spectrogram_extractor(input_x)  # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)  # (batch_size, 1, time_steps, mel_bins)
        # Replicate the single channel to the 3 channels fed to the backbone.
        x = x.expand(-1, 3, -1, -1)
        frames_num = x.shape[2]
        # Swap channel and mel axes so bn0 normalises per mel bin, then swap back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        # SpecAugmentation is applied to the whole batch with probability Config.spec_augprob.
        if self.training and self.apply_aug:
            if np.random.rand() < Config.spec_augprob:
                x = self.spec_augmenter(x)

        # Mixup on spectrogram
#         if self.training  and self.apply_aug and mixup_lambda is not None:
#             x = do_mixup(x, mixup_lambda)
        return x, frames_num
        

    def forward(self, x, t_min_ratio=None, t_max_ratio=None, attn_mask=False):
        """Classify a batch of waveforms.

        Args:
            x: waveforms passed to the spectrogram extractor.
            t_min_ratio, t_max_ratio: event position as a fraction of the
                clip; forwarded to the attention block when ``attn_mask`` is True.
            attn_mask: restrict attention to the labelled span.

        Returns:
            dict with 'framewise_output' (the attention block's per-segment
            classification output with its last two axes swapped) and
            'clipwise_output' (the attention-pooled output).
        """
#         input_x, mixup_lambda = input_data
#         """
#         Input: (batch_size, data_length)"""
#         b, c, s = input_x.shape
#         input_x = input_x.reshape(b*c, s)
        x, frames_num = self.preprocess(x)
#         if mixup_lambda is not None:
#             b = (b*c)//2
#             c = 1
        # Output shape (batch size, channels, time, frequency)
        x = self.feature_net(x)
        
        # Aggregate in frequency axis
        x = torch.mean(x, dim=3)

        # Smooth along time: sum of a max-pooled and an average-pooled copy (length preserved).
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        # Linear layers act on the last axis, so move channels last for fc1 and back afterwards.
        x = x.transpose(1, 2)
        x = F.relu(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        if attn_mask:
            (clipwise_output, norm_att, segmentwise_output) = self.att_block(x, t_min_ratio, t_max_ratio, attn_mask=True)
        else:
            (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        segmentwise_output = segmentwise_output.transpose(1, 2)
        # No upsampling back to input frames: the segment-level output is returned as-is.
        framewise_output = segmentwise_output
        # Get framewise output
#         framewise_output = interpolate(segmentwise_output,
#                                        self.interpolate_ratio)
#         framewise_output = pad_framewise_output(framewise_output, frames_num)
#         frame_shape =  framewise_output.shape
#         clip_shape = clipwise_output.shape
        output_dict = {
            'framewise_output': framewise_output,
            'clipwise_output': clipwise_output,
        }

        return output_dict


In [15]:
# label-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1
def LWLRAP(preds, labels):
    """Label-weighted label-ranking average precision of one batch.

    Args:
        preds: float tensor (batch, classes) of scores; higher means more likely.
        labels: tensor (batch, classes) of 0/1 ground-truth indicators, on the
            same device as ``preds``.

    Returns:
        Python float. NaN when ``labels`` contains no positive entry (0 / 0).
    """
    # Classes ordered from highest to lowest score, per row.
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # Each row of ranked_classes is a permutation, so its argsort is the
    # inverse permutation: class_ranks[i, c] is the 1-based rank of class c.
    # This replaces an O(batch * classes) Python loop.
    class_ranks = torch.argsort(ranked_classes, dim=-1) + 1
    # Keep the ranks of the true labels; push everything else to a huge rank.
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # After sorting, the ranks of the true labels come first.
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    # 1..classes, created on the labels' device so GPU tensors work too.
    pos_matrix = torch.arange(1, labels.size(-1) + 1, device=labels.device).unsqueeze(0)
    # k-th true label at rank r contributes precision k / r.
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    # Ones in the first num_labels positions of each row, zeros afterwards.
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    score = scores.sum() / labels.sum()
    return score.item()

# # Sample usage
# y_true = torch.tensor(np.array([[1, 1, 0], [1, 0, 1], [0, 0, 1]]))
# y_score = torch.tensor(np.random.randn(3, 3))
# print(LRAP(y_score, y_true), LWLRAP(y_score, y_true))

In [16]:
def mixup_data(x, y, t_min_ratio, t_max_ratio, alpha=5):
    """
    Applies mixup to a batch
    Arguments:
        x {torch tensor} -- Input batch
        y {torch tensor} -- Labels
        t_min_ratio {torch tensor} -- Per-sample event start ratio, shape (batch,)
        t_max_ratio {torch tensor} -- Per-sample event end ratio, shape (batch,)
    Keyword Arguments:
        alpha {float} -- Parameter of the beta distribution (default: {5});
                         a value <= 0 disables mixing (lam = 1)
    Returns:
        torch tensor  -- Mixed input
        torch tensor  -- Labels of the original batch
        torch tensor  -- Labels of the shuffle batch
        torch tensor  -- Start ratios of (original, shuffled) samples, shape (batch, 2)
        torch tensor  -- End ratios of (original, shuffled) samples, shape (batch, 2)
        float  -- Probability samples by the beta distribution
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1
    # Draw the permutation on the CPU, then move a copy to wherever each tensor
    # lives. The previous unconditional .cuda() failed without a GPU and
    # produced an index on a different device than the ratio tensors.
    index = torch.randperm(x.size()[0])
    mixed_x = lam * x + (1 - lam) * x[index.to(x.device), :]
    y_a, y_b = y, y[index.to(y.device)]
    t_min = torch.stack([t_min_ratio, t_min_ratio[index.to(t_min_ratio.device)]], dim=1)
    t_max = torch.stack([t_max_ratio, t_max_ratio[index.to(t_max_ratio.device)]], dim=1)
    return mixed_x, y_a, y_b, t_min, t_max, lam

# for step, (x, y_batch) in enumerate(train_loader):
    
#     if np.random.rand() < mixup_proba:
#         x, y_a, y_b, _ = mixup_data(x.cuda(), y_batch.cuda(), alpha=alpha)
#         y_batch = torch.clamp(y_a + y_b, 0, 1)

In [17]:
def validate(model, val_dataloader, criterion, history, n_iters, writer, fold):
    """Evaluate ``model`` on the validation loader and log the results.

    Args:
        model: network returning a dict with a 'clipwise_output' entry.
        val_dataloader: yields (X, y, t_min_ratio, t_max_ratio) batches.
        criterion: loss taking (model_output_dict, y).
        history: dict whose 'best_roc' / 'best_metrics' lists are updated for ``fold``.
        n_iters: global step used for the TensorBoard scalars.
        writer: TensorBoard SummaryWriter.
        fold: fold index.

    Returns:
        (mean batch loss, ROC AUC over the whole set, mean per-batch LWLRAP)
    """
    model.eval()
    costs = []
    metrics = []
    y_trues = []
    y_probs = []
    with torch.no_grad():
        for batch in val_dataloader:
            X, y, t_min_ratio, t_max_ratio = batch
            X, y = X.cuda(), y.cuda()
            y_output = model(X, t_min_ratio=t_min_ratio, t_max_ratio=t_max_ratio, attn_mask=True)
            loss = criterion(y_output, y)
            costs.append(loss.item())
            # Move to CPU once and reuse for both the ROC buffers and LWLRAP.
            y_true, y_prob = y.cpu(), y_output['clipwise_output'].detach().cpu()
            y_trues.append(y_true.numpy())
            y_probs.append(y_prob.numpy())
            metrics.append(LWLRAP(y_prob, y_true))

    mean_rocs = sklearn.metrics.roc_auc_score(np.concatenate(y_trues), np.concatenate(y_probs))
    mean_costs = np.mean(costs)
    # Unweighted mean over batches (the last batch may be smaller).
    mean_metrics = np.mean(metrics)
    writer.add_scalar('fold_{}/validate_roc'.format(fold), mean_rocs, n_iters)
    writer.add_scalar('fold_{}/validate_loss'.format(fold), mean_costs, n_iters)
    writer.add_scalar('fold_{}/validate_LWLRAP'.format(fold), mean_metrics, n_iters)
    # Despite the 'best_' keys, these hold the most recent validation result;
    # the best-so-far bookkeeping below is disabled.
    history['best_roc'][fold] = mean_rocs
    history['best_metrics'][fold] = mean_metrics
#     if mean_rocs > history['best_roc'][fold]:  
#         history['best_roc'][fold] = mean_rocs
#         history['best_metrics'][fold] = mean_metrics
#         torch.save(model.state_dict(), history['best_model_path'][fold])
    return mean_costs, mean_rocs, mean_metrics


def train(model, train_dataloader, val_dataloader, criterion, optimizer, epoch, history, validate_points, scheduler, writer, fold, step=True):
    """Run one training epoch with TensorBoard logging and in-epoch validation.

    Args:
        model: network returning a dict with a 'clipwise_output' entry.
        train_dataloader: yields (X, y, t_min_ratio, t_max_ratio) batches.
        val_dataloader: loader handed to ``validate``.
        criterion: loss taking (model_output_dict, y).
        optimizer: optimizer stepped once per batch.
        epoch: epoch number; the global step is computed from ``epoch - 1``,
            so numbering presumably starts at 1.
        history: dict updated by ``validate``.
        validate_points: batch indices after which validation is run.
        scheduler: LR scheduler; stepped per batch when ``step`` is True.
        writer: TensorBoard SummaryWriter.
        fold: fold index used in the scalar tags.
        step: whether to step the scheduler after every batch.
    """
    model.train()
    costs = []
    metrics = []
    y_trues = []
    y_probs = []
    val_loss, val_roc = 0, 0
    # Defined up front so the end-of-epoch logging never hits an unbound name
    # when the loader yields no batches.
    n_iters = len(train_dataloader) * (epoch-1)
    optimizer.zero_grad()
    with tqdm(total=len(train_dataloader.dataset), desc='Epoch{}'.format(epoch)) as pbar:
        for idx, batch in enumerate(train_dataloader):
            X, y, t_min_ratio, t_max_ratio = batch
            X, y = X.cuda(), y.cuda()
            if np.random.rand() < Config.mixup_proba:
                X, y_a, y_b, t_min_ratio, t_max_ratio, _ = mixup_data(X, y, t_min_ratio, t_max_ratio, alpha=Config.mixup_alpha)
                # Union of both label sets; the ratios now hold two spans per sample.
                y = torch.clamp(y_a + y_b, 0, 1)
            y_output = model(X, t_min_ratio=t_min_ratio, t_max_ratio=t_max_ratio, attn_mask=True)
            loss = criterion(y_output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step:
                scheduler.step()
            costs.append(loss.item())
            y_true, y_prob = y.cpu(), y_output['clipwise_output'].detach().cpu()
            y_trues.append(y_true.numpy())
            y_probs.append(y_prob.numpy())
            metrics.append(LWLRAP(y_prob, y_true))
            pbar.update(y.size(0))
            n_iters = idx + len(train_dataloader) * (epoch-1)
            if idx in validate_points:
                val_loss, val_roc, _ = validate(model, val_dataloader, criterion, history, n_iters, writer, fold)
                # validate() leaves the model in eval mode.
                model.train()
            writer.add_scalar('fold_{}/train_loss'.format(fold), costs[-1], n_iters)
            writer.add_scalar('fold_{}/train_LWLRAP'.format(fold), metrics[-1], n_iters)
            writer.add_scalar('fold_{}/learning_rate'.format(fold), scheduler.get_last_lr()[0], n_iters)
            pbar.set_postfix_str('loss:{:.3f}, val-loss:{:.3f}, val-roc:{:.4f}'.format(np.mean(costs[-10:]),  val_loss, val_roc))
            # NOTE(review): emptying the CUDA cache every step costs throughput; kept to preserve behaviour.
            torch.cuda.empty_cache()
        if y_trues:
            # Epoch-level ROC AUC over all training batches.
            writer.add_scalar('fold_{}/train_roc'.format(fold), sklearn.metrics.roc_auc_score(np.concatenate(y_trues), np.concatenate(y_probs)), n_iters)

In [18]:
class PANNsLoss(nn.Module):
    """Binary cross-entropy computed on a model's clip-level output.

    The criterion is called with the model's output mapping and reads only
    its ``"clipwise_output"`` entry; every other key is ignored.
    """

    def __init__(self):
        super().__init__()
        self.bce = nn.BCELoss()

    def forward(self, input, target):
        """Return the BCE between ``input["clipwise_output"]`` and ``target``.

        Args:
            input: mapping holding a ``"clipwise_output"`` tensor.
            target: label tensor; it is cast to float before the loss.

        Returns:
            The value produced by ``nn.BCELoss`` (default reduction).
        """
        # Values are only clamped into [0, 1]; NaN/Inf entries are not
        # replaced before the loss is evaluated.
        clip_probs = input["clipwise_output"].clamp(0, 1)
        return self.bce(clip_probs, target.float())

In [19]:
# Per-class weight (N - n_c) / n_c, where n_c is the number of rows of
# data_tp_df with species_id == c and N is the sum of those counts.
# NOTE: pos_weights is not consumed by PANNsLoss; it only matters if the
# criterion is switched to nn.BCEWithLogitsLoss(pos_weight=pos_weights).
pos_weights = torch.ones(Config.num_class)
counts = data_tp_df['species_id'].value_counts()
total_count = sum(counts)  # loop-invariant, computed once
for i in range(Config.num_class):
    pos_weights[i] = (total_count - counts[i]) / counts[i]
criterion = PANNsLoss().cuda()

model_name = 'SED_new_noaugment_RandomCrop_nomixup_attention_effb4'
time_stamp = '{0:%m_%d_%H_%M}'.format(datetime.datetime.now())

# Hyper-parameters shared by every fold.
epochs = 40
warmup_prob = 0.3  # passed to OneCycleLR as pct_start
max_lr = 1e-3      # used as the AdamW lr and as the OneCycleLR max_lr

# One slot per fold; each fold's weights are written to
# history['best_model_path'][fold] at the end of its training.
history = {
    'config': Config,
    'best_roc': [0]*len(data_folds),
    'best_metrics': [0]*len(data_folds),
    'best_model_path': [os.path.join(model_path, '{}_{}_fold_{}.pth'.format(model_name, time_stamp, i)) for i in range(len(data_folds))]
}
writer = SummaryWriter(log_dir=os.path.join(tensorboard_path, '{}_{}'.format(model_name, time_stamp)))
try:
    for idx, (train_dataloader, val_dataloader) in enumerate(data_folds):
        # linspace(0, n-1, 2)[1:] keeps only the last batch index, i.e.
        # validation runs once per epoch, after the final training batch.
        validate_points = list(np.linspace(0, len(train_dataloader)-1, 2).astype(int))[1:]
        model = PANNsAtt().cuda()
        optimizer = torch.optim.AdamW(model.parameters(), lr=max_lr, weight_decay=0.01)
        # The schedule spans epochs * len(train_dataloader) steps; train() is
        # called with step=True so the scheduler advances once per batch.
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=max_lr,
            epochs=epochs,
            steps_per_epoch=len(train_dataloader),
            pct_start=warmup_prob,
            div_factor=25,
            anneal_strategy='cos',
            cycle_momentum=True,
        )
        for epoch in range(1, epochs+1):
            train(model, train_dataloader, val_dataloader, criterion, optimizer, epoch, history, validate_points, scheduler,  writer, fold=idx, step=True)
            gc.collect()
        # NOTE(review): this stores the weights of the LAST epoch under
        # 'best_model_path'. validate() also receives `history` and is not
        # shown here; if it checkpoints the best model to the same path,
        # this save overwrites it — confirm which behaviour is intended.
        torch.save(model.state_dict(), history['best_model_path'][idx])
        # Release the fold's model before building the next one.
        del model
        gc.collect()
        torch.cuda.empty_cache()
finally:
    # Flush buffered TensorBoard events and release the event file even when
    # training is interrupted or fails part-way through.
    writer.close()

with open(os.path.join(model_path, '{}_{}_history.pkl'.format(model_name, time_stamp)), 'wb') as f:
    pickle.dump(history, f)

Loaded pretrained weights for efficientnet-b4


Epoch1: 100%|██████████| 972/972 [00:44<00:00, 21.93it/s, loss:0.276, val-loss:0.346, val-roc:0.5052]
Epoch2: 100%|██████████| 972/972 [00:44<00:00, 21.77it/s, loss:0.198, val-loss:0.208, val-roc:0.5223]
Epoch3: 100%|██████████| 972/972 [00:46<00:00, 20.79it/s, loss:0.177, val-loss:0.195, val-roc:0.5957]
Epoch4: 100%|██████████| 972/972 [00:46<00:00, 21.03it/s, loss:0.156, val-loss:0.174, val-roc:0.7288]
Epoch5: 100%|██████████| 972/972 [00:44<00:00, 21.80it/s, loss:0.133, val-loss:0.141, val-roc:0.8435]
Epoch6: 100%|██████████| 972/972 [00:44<00:00, 21.83it/s, loss:0.108, val-loss:0.132, val-roc:0.8880]
Epoch7: 100%|██████████| 972/972 [00:43<00:00, 22.11it/s, loss:0.089, val-loss:0.122, val-roc:0.9230]
Epoch8: 100%|██████████| 972/972 [00:43<00:00, 22.12it/s, loss:0.070, val-loss:0.093, val-roc:0.9529]
Epoch9: 100%|██████████| 972/972 [00:45<00:00, 21.31it/s, loss:0.064, val-loss:0.105, val-roc:0.9410]
Epoch10: 100%|██████████| 972/972 [00:45<00:00, 21.47it/s, loss:0.053, val-loss:0.

Loaded pretrained weights for efficientnet-b4


Epoch1: 100%|██████████| 973/973 [00:33<00:00, 29.21it/s, loss:0.279, val-loss:0.338, val-roc:0.5163]
Epoch2: 100%|██████████| 973/973 [00:34<00:00, 28.20it/s, loss:0.197, val-loss:0.224, val-roc:0.5397]
Epoch3: 100%|██████████| 973/973 [00:34<00:00, 28.13it/s, loss:0.174, val-loss:0.210, val-roc:0.6336]
Epoch4: 100%|██████████| 973/973 [00:34<00:00, 27.82it/s, loss:0.154, val-loss:0.171, val-roc:0.7434]
Epoch5: 100%|██████████| 973/973 [00:34<00:00, 27.97it/s, loss:0.133, val-loss:0.146, val-roc:0.8345]
Epoch6: 100%|██████████| 973/973 [00:34<00:00, 28.20it/s, loss:0.106, val-loss:0.126, val-roc:0.8764]
Epoch7: 100%|██████████| 973/973 [00:34<00:00, 27.86it/s, loss:0.086, val-loss:0.099, val-roc:0.9384]
Epoch8: 100%|██████████| 973/973 [00:34<00:00, 28.07it/s, loss:0.061, val-loss:0.086, val-roc:0.9533]
Epoch9: 100%|██████████| 973/973 [00:34<00:00, 28.09it/s, loss:0.064, val-loss:0.084, val-roc:0.9500]
Epoch10: 100%|██████████| 973/973 [00:34<00:00, 28.10it/s, loss:0.064, val-loss:0.

Loaded pretrained weights for efficientnet-b4


Epoch1: 100%|██████████| 973/973 [00:33<00:00, 29.26it/s, loss:0.272, val-loss:0.352, val-roc:0.5118]
Epoch2: 100%|██████████| 973/973 [00:33<00:00, 28.64it/s, loss:0.200, val-loss:0.232, val-roc:0.5151]
Epoch3: 100%|██████████| 973/973 [00:34<00:00, 28.52it/s, loss:0.179, val-loss:0.225, val-roc:0.5976]
Epoch4: 100%|██████████| 973/973 [00:34<00:00, 28.48it/s, loss:0.161, val-loss:0.184, val-roc:0.7268]
Epoch5: 100%|██████████| 973/973 [00:33<00:00, 28.63it/s, loss:0.125, val-loss:0.144, val-roc:0.8310]
Epoch6: 100%|██████████| 973/973 [00:33<00:00, 28.65it/s, loss:0.101, val-loss:0.123, val-roc:0.8979]
Epoch7: 100%|██████████| 973/973 [00:34<00:00, 28.61it/s, loss:0.081, val-loss:0.108, val-roc:0.9303]
Epoch8: 100%|██████████| 973/973 [00:34<00:00, 28.45it/s, loss:0.079, val-loss:0.106, val-roc:0.9398]
Epoch9: 100%|██████████| 973/973 [00:33<00:00, 28.66it/s, loss:0.065, val-loss:0.087, val-roc:0.9485]
Epoch10: 100%|██████████| 973/973 [00:34<00:00, 28.44it/s, loss:0.051, val-loss:0.

Loaded pretrained weights for efficientnet-b4


Epoch1: 100%|██████████| 973/973 [00:53<00:00, 18.09it/s, loss:0.265, val-loss:0.320, val-roc:0.5380]
Epoch2: 100%|██████████| 973/973 [00:55<00:00, 17.41it/s, loss:0.197, val-loss:0.211, val-roc:0.5476]
Epoch3: 100%|██████████| 973/973 [00:54<00:00, 17.77it/s, loss:0.172, val-loss:0.196, val-roc:0.6302]
Epoch4: 100%|██████████| 973/973 [00:58<00:00, 16.61it/s, loss:0.153, val-loss:0.168, val-roc:0.7512]
Epoch5: 100%|██████████| 973/973 [00:56<00:00, 17.36it/s, loss:0.129, val-loss:0.150, val-roc:0.8561]
Epoch6: 100%|██████████| 973/973 [00:52<00:00, 18.40it/s, loss:0.108, val-loss:0.128, val-roc:0.8838]
Epoch7: 100%|██████████| 973/973 [00:57<00:00, 17.04it/s, loss:0.088, val-loss:0.112, val-roc:0.9236]
Epoch8: 100%|██████████| 973/973 [00:57<00:00, 16.91it/s, loss:0.082, val-loss:0.112, val-roc:0.9419]
Epoch9: 100%|██████████| 973/973 [00:54<00:00, 17.86it/s, loss:0.071, val-loss:0.080, val-roc:0.9637]
Epoch10: 100%|██████████| 973/973 [00:52<00:00, 18.55it/s, loss:0.051, val-loss:0.

Loaded pretrained weights for efficientnet-b4


Epoch1: 100%|██████████| 973/973 [00:49<00:00, 19.48it/s, loss:0.302, val-loss:0.356, val-roc:0.4995]
Epoch2: 100%|██████████| 973/973 [00:54<00:00, 17.71it/s, loss:0.204, val-loss:0.221, val-roc:0.5173]
Epoch3: 100%|██████████| 973/973 [00:55<00:00, 17.42it/s, loss:0.182, val-loss:0.207, val-roc:0.5690]
Epoch4: 100%|██████████| 973/973 [00:55<00:00, 17.50it/s, loss:0.160, val-loss:0.181, val-roc:0.7059]
Epoch5: 100%|██████████| 973/973 [00:52<00:00, 18.36it/s, loss:0.136, val-loss:0.151, val-roc:0.8187]
Epoch6: 100%|██████████| 973/973 [00:55<00:00, 17.63it/s, loss:0.112, val-loss:0.125, val-roc:0.9095]
Epoch7: 100%|██████████| 973/973 [00:55<00:00, 17.62it/s, loss:0.087, val-loss:0.093, val-roc:0.9379]
Epoch8: 100%|██████████| 973/973 [00:53<00:00, 18.20it/s, loss:0.073, val-loss:0.093, val-roc:0.9424]
Epoch9: 100%|██████████| 973/973 [00:53<00:00, 18.20it/s, loss:0.062, val-loss:0.113, val-roc:0.9388]
Epoch10: 100%|██████████| 973/973 [00:54<00:00, 17.82it/s, loss:0.065, val-loss:0.

## Predict on the test set

In [20]:
# Print every pickle file found directly under model_path.
for file in os.listdir(model_path):
    if not file.endswith('.pkl'):
        continue
    print(file)

SED_augment_RandomCrop_wavmixup_attention_effb4_02_10_14_20_history.pkl
SED_old_augment_RandomCrop_specmixup_attention_effb4_02_10_14_26_history.pkl


In [22]:
# Load the pickled history of the run to predict with; its
# 'best_model_path' entry is read by the inference loop below.
history_file = os.path.join(model_path, 'SED_augment_RandomCrop_wavmixup_attention_effb4_02_10_14_20_history.pkl')
with open(history_file, 'rb') as f:
    history = pickle.load(f)
# Fresh network instance; weights are loaded per checkpoint later.
model = PANNsAtt().cuda()

Loaded pretrained weights for efficientnet-b4


In [23]:
# One prediction array per checkpoint listed in history['best_model_path'].
folds = []
for path in history['best_model_path']:
    state_dict = torch.load(path, map_location= torch.device('cpu'))
    model.load_state_dict(state_dict, strict=True)
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            # batch is 3-D; presumably (recordings, crops per recording,
            # samples) — TODO confirm against the test Dataset.
            n_outer, n_inner, n_feat = batch.size()
            # Merge the first two axes so every row is one model input.
            X = batch.view(n_outer * n_inner, n_feat).cuda()
            output = model(X, attn_mask=False)
            # Restore the two leading axes, then take the maximum over
            # axis 2 followed by the maximum over axis 1, leaving one row
            # of 24 scores per leading-axis item.
            framewise = output['framewise_output'].view(n_outer, n_inner, -1, 24)
            reduced = framewise.max(dim=2).values.max(dim=1).values
            preds.append(reduced.cpu().detach().numpy())
    folds.append(np.concatenate(preds, axis=0))

100%|██████████| 125/125 [04:09<00:00,  2.00s/it]
100%|██████████| 125/125 [04:20<00:00,  2.08s/it]
100%|██████████| 125/125 [04:13<00:00,  2.03s/it]
100%|██████████| 125/125 [04:11<00:00,  2.01s/it]
100%|██████████| 125/125 [04:20<00:00,  2.09s/it]


In [24]:
# Submission layout: 'recording_id' followed by the score columns s0..s23.
score_columns = ['s{}'.format(k) for k in range(24)]
sub = pd.DataFrame(columns=['recording_id'] + score_columns, dtype=np.float32)
sub['recording_id'] = [file.split('.')[0] for file in test_files]
# Element-wise average of the per-checkpoint prediction arrays.
fold_average = sum(folds) / len(folds)
sub.iloc[:, 1:] = fold_average

In [25]:
# Tag the submission with the run whose checkpoints produced the predictions.
# The tag comes from history['best_model_path'], not from a hard-coded
# experiment name, which silently mislabels the CSV whenever a different
# history file is loaded for inference.
time_stamp = '{0:%m_%d_%H_%M}'.format(datetime.datetime.now())
checkpoint_stem = os.path.splitext(os.path.basename(history['best_model_path'][0]))[0]
# Checkpoints are named '<model_name>_<train_time_stamp>_fold_<i>.pth';
# drop the fold suffix (a name without '_fold_' is kept unchanged).
run_tag = checkpoint_stem.rsplit('_fold_', 1)[0]
sub.to_csv(os.path.join(res_path, 'submission_{}_{}.csv'.format(run_tag, time_stamp)), index=None)

In [20]:
# Average of the values stored in history['best_roc'] (the training cell
# allocates one slot per fold); shown as the cell output.
np.mean(history['best_roc'])

0.9774732011686715

In [21]:
# Average of the values stored in history['best_metrics'] (the training
# cell allocates one slot per fold); shown as the cell output.
np.mean(history['best_metrics'])

0.894425490044201