## [BirdCLEF2023](https://www.kaggle.com/competitions/birdclef-2023)


[Inference notebook](https://www.kaggle.com/code/ollypowell/birdclef-2023-pytorch-lightning-inference) 

This one forked from V44.  Refer to training version 17 for a working version to troubleshoot.  
Expect AP score = 0.62 after epoch1

Original Forks: [Generate spectrograms](https://www.kaggle.com/code/nischaydnk/split-creating-melspecs-stage-1), [Training](https://www.kaggle.com/code/nischaydnk/birdclef-2023-pytorch-lightning-training-w-cmap), [Inference](https://www.kaggle.com/code/nischaydnk/birdclef-2023-pytorch-lightning-inference)

### Strategy

Get this working again.  Use to pre-train
Apply the fix to the training notebook, but start that from the weights from this one, with one extra layer
Run this on a larger dataset from the 3 competitions.

Kaggle Strategy: 
- Strong single model, Tweak single-model hyperparameters and augmentation & sampling recipe a little, backbone fine tuning, stronger classifier head, try weighted sampling
- Develop a pre-training model with the full 2021, 2022, 2023 data, to help with generalisation.  Fine-tune the backbone. Then leave that fixed, just keep the weights for a separate notebook
- Fork this notebook and fine-tune with the previous weights, but 2023 data and classes only
- Develop two separate classifier heads for rare and common birds, by working with the data, loss function, weighted sampling
- Ensemble those two heads together and see if I get some improvement within the competition constraints
- Post processing

In [None]:
!pip install -q torchtoolbox timm colorednoise
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
import warnings
import random 
from pathlib import Path
import plotly.express as px
import pandas as pd
import plotly.io as pio


# Torch and PyTorch
import torch
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning import Trainer
import torch.multiprocessing as mp
from pytorch_lightning.callbacks import ModelCheckpoint, BackboneFinetuning, EarlyStopping
import torch.nn as nn
import torch.multiprocessing as mp
from torch.nn.functional import cross_entropy, binary_cross_entropy_with_logits
from torch.utils.data import Dataset, DataLoader 
from torch.utils.data.sampler import Sampler
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau # OneCycleLR
import torchmetrics
import timm

#ML Modules
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics as skm
import torchvision.transforms as transforms
import torchvision.io 
import albumentations as A

#Audio 
import scipy.signal as sps
import torchaudio
import librosa
from PIL import Image
import colorednoise as cn

In [None]:
# Show the installed PyTorch Lightning version in the cell output.
print(pl.__version__)

In [None]:
class Config:
    """Notebook-wide settings: training hyperparameters, CV options and file paths.

    Some attributes are overwritten later in the notebook: PRECISION is reset
    from the detected accelerator and NUM_CLASSES from the labels CSV.
    """
    #Training Parameters
    EXPERIMENT = 23 # Increased LR
    WEIGHTED_SAMPLING = False  # True -> train loader uses InverseSqrtSampler instead of shuffle
    NUM_WORKERS = 8  # DataLoader worker processes
    NUM_CLASSES = 264  # placeholder; replaced by the number of unique primary labels
    BATCH_SIZE = 64
    EPOCHS = 12  # also used as T_0 of the cosine warm-restart scheduler
    PRECISION = 16    
    PATIENCE = 3  # early-stopping patience (validation checks without improvement)
    MIN_DELTA = 0  # early-stopping minimum improvement of val_loss
    SEED = 2023
    MODEL = "tf_efficientnetv2_s_in21k"  # timm model name used for the backbone
    PRETRAINED = True            
    WEIGHT_DECAY = 1e-3
    LR = 1e-4
    USE_MIXUP= True
    MIXUP_ALPHA = 0.2   # Beta(alpha, alpha) parameter for the mixup coefficient
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    LOSS_FUNCTION = nn.CrossEntropyLoss()  #nn.BCEWithLogitsLoss() 
    IMAGE_SIZE = 256  # side length of the square crop fed to the model
    
    #Train and CV Parameters
    N_FOLDS = 10 
    RARE_THRESHOLD = 4 # Classes with this many samples or fewer are kept out of validation

    #Filepaths
    OUT_DIR = '/kaggle/working'
    LABELS_PATH = "/kaggle/input/birdclef23-train-8-sec-wav/train_23_cropped.csv"
    BACKGROUND_NOISE_FLDR = '/kaggle/input/birdclef23-uniform-noise-chunks/birdclef23-backgrounds'

#os.listdir(Config.BACKGROUND_NOISE_FLDR)

In [None]:
  class Audio:
    """Audio loading and mel-spectrogram parameters shared across the notebook.

    A BACKGROUNDS attribute (list of background-noise wav paths) is attached
    to this class later, in the setup cell.
    """
    SR = 32000  # sample rate in Hz
    DURATION = 5  # Duration the loaded wav file will be cropped to.
    CHUNK_LENGTH = 8 # Maximum duration of the wav files
    N_MELS = 128 # Try increasing this to 256 later, once other experiments tried
    FMIN = 20  # lowest mel filter frequency (Hz)
    FMAX = 14000  # highest mel filter frequency (Hz)
    WINDOW = 1024  # NOTE(review): not referenced by the spectrogram helpers in this notebook
    HOP_LENGTH = 312  # presumably chosen so a 5 s clip gives ~513 frames (two ~256-wide halves) - confirm
    N_FFT = 1024
    
    
    # last years 3rd place: two strategies
    # sr: 32000, window_size: 2048, hop_size: 1024, fmin: 200, fmax: 14000, mel_bins: 224
    # sr: 32000, window_size: 1024, hop_size: 512, fmin: 50, fmax: 14000, mel_bins: 128

Setup

In [None]:
# Global, one-off setup: seeding, display options, output folder, device and background files.
pl.seed_everything(Config.SEED, workers=True)
torch.set_flush_denormal(True)
pd.set_option('display.max_colwidth', None)
warnings.filterwarnings('ignore')
save_path = Path(Config.OUT_DIR) / f'Exp{Config.EXPERIMENT}'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
# Mixed precision only makes sense on the GPU
Config.PRECISION = 16 if accelerator == 'gpu' else 32
noise = Path(Config.BACKGROUND_NOISE_FLDR)
# rglob already yields paths rooted at `noise`, so they must not be joined to it again
# (that only worked because the folder is absolute).  Sorting makes the seeded
# random.choice over this list independent of filesystem listing order.
Audio.BACKGROUNDS = sorted(noise.rglob('*.wav'))
f'There are {len(Audio.BACKGROUNDS)} Audio background files (each made up of concatenated 5 second random clips)'

Create the train and validation splits using stratified-k-fold to keep the values representative of their class proportions. Later on when the differences are getting smaller and I'm choosing between options, base the CV scheme on 5 folds mean + one standard deviation.  This is too computationally expensive for now.

In [None]:
# Read the per-clip label table and derive the class count from the data itself.
in_df = pd.read_csv(Config.LABELS_PATH)
Config.NUM_CLASSES = len(pd.unique(in_df['primary_label']))
print(f'Training + Validation with {len(in_df)} audio samples')
print(f'There are {Config.NUM_CLASSES} primary class labels')
in_df.head(3)

Temporarily drop any super rare classes from the dataframe, so they don't end up losing precious samples from training to the validation split.

In [None]:
# Per-row size of the class each sample belongs to
samples_in_class = in_df['primary_label'].map(in_df['primary_label'].value_counts())
# Classes above the threshold take part in the stratified split ...
common_df = in_df.loc[samples_in_class > Config.RARE_THRESHOLD, ['primary_label', 'filepath']]
# ... the remaining (rare) classes are held back and only ever used for training
mask = samples_in_class <= Config.RARE_THRESHOLD
rare_df = in_df.loc[mask, ['primary_label', 'filepath']]
rare_df.primary_label.value_counts()

In [None]:
# Report how many rows were held back from the split because their class is too small
print(
    f'Temporarily removing {len(rare_df)} rare instances from the dataset before splitting because'
    f' they have less than or equal to {Config.RARE_THRESHOLD} audio samples per class'
)

In [None]:
skf = StratifiedKFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.SEED)
target = common_df['primary_label']

# Only one train/validation split is used: enumerate every fold and keep the last one
folds = list(skf.split(common_df, target))
train_index, val_index = folds[-1]
val_df = common_df.iloc[val_index]
# Rare classes never go to validation; they are appended to the training rows
train_df = pd.concat([common_df.iloc[train_index], rare_df])

print(f'The training dataframe has {train_df.shape[0]} rows\n'
      f'The validation dataframe has {val_df.shape[0]} rows')

In [None]:
# Preview the first training rows, specifically to check the `filepath` column
train_df.head()

Create & Fill birds with 0 samples in validation.  This is only needed because of the get-dummies approach with the dataframes.  The validation df misses out on columns for the missing species.  I also considered up-sampling the rare species, and ensuring one of the extra samples from each would go into the validation dataframe, but this would likely result in over-estimating the validation performance, and potentially over-fitting.

In [None]:
# Append one indicator column per species (one-hot of primary_label) to both frames
train_onehot = pd.get_dummies(train_df['primary_label'])
val_onehot = pd.get_dummies(val_df['primary_label'])
train_df = pd.concat([train_df, train_onehot], axis=1)
val_df = pd.concat([val_df, val_onehot], axis=1)
train_df.head(3)

In [None]:
# Species names in the SAME order as the one-hot target columns.  The dataset builds its
# target vector positionally from everything after 'primary_label'/'filepath', and the
# validation hook labels predictions with `birds`, so the two orders must agree.
# (Order-of-appearance from .unique() does not match the get_dummies column order.)
birds = [col for col in train_df.columns if col not in ('primary_label', 'filepath')]
species_in_val = set(val_df.primary_label.unique())
missing_birds = [bird for bird in birds if bird not in species_in_val]
non_missing_birds = [bird for bird in birds if bird in species_in_val]
# Species absent from validation have no indicator column there yet: add all-zero ones
if missing_birds:
    val_df[missing_birds] = 0
val_df = val_df[train_df.columns] ## Fix order
val_df.head(3)

Helper functions and classes

In [None]:
def load_sf(wav_path, sr=Audio.SR):
    """Load an audio file with librosa at sample rate `sr` and return only the samples."""
    samples, _loaded_sr = librosa.load(wav_path, sr=sr)
    return samples


def compute_pcen(y):
    """Return the PCEN-normalised mel spectrogram of waveform `y` as float32.

    The mel spectrogram uses the `Audio` settings.  `hop_length` is passed on
    to `librosa.pcen` as well: PCEN derives its smoothing coefficient from
    `time_constant`, `sr` and `hop_length`, and would otherwise assume its
    default hop of 512 instead of the hop the spectrogram was computed with.
    """
    melspec = librosa.feature.melspectrogram(
        y=y, sr=Audio.SR, n_mels=Audio.N_MELS, n_fft=Audio.N_FFT,
        hop_length=Audio.HOP_LENGTH, fmin=Audio.FMIN, fmax=Audio.FMAX)
    pcen = librosa.pcen(
        melspec, sr=Audio.SR, hop_length=Audio.HOP_LENGTH,
        gain=0.98, bias=2, power=0.5, time_constant=0.4, eps=0.000001)
    return pcen.astype(np.float32)


def compute_melspec(y):
    """Return the mel power spectrogram of waveform `y`, converted to decibels.

    All spectrogram parameters are taken from the `Audio` settings class.
    """
    mel_settings = dict(
        sr=Audio.SR,
        n_mels=Audio.N_MELS,
        n_fft=Audio.N_FFT,
        hop_length=Audio.HOP_LENGTH,
        fmin=Audio.FMIN,
        fmax=Audio.FMAX,
    )
    mel_power = librosa.feature.melspectrogram(y=y, **mel_settings)
    return librosa.power_to_db(mel_power)


def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Min-max scale `X` to [0, 1] and return it as float32.

    When the value range of `X` is not larger than `eps` the result is an
    all-zero array of the same shape.  `mean` and `std` are accepted but unused.
    """
    lowest = X.min()
    value_range = X.max() - lowest
    if not value_range > eps:
        # (Nearly) constant input: nothing to scale
        return np.zeros_like(X, dtype=np.float32)
    return ((X - lowest) / value_range).astype(np.float32)


def crop_or_pad(y, length, train='train'):
    """Return exactly `length` samples taken from `y` repeated three times.

    The signal is first tiled three times.  If that is still not longer than
    `length` it is zero-padded at the end; otherwise a window of `length`
    samples is cut out.

    Args:
        y: 1-D array of samples.
        length: number of samples to return.
        train: training mode selector.  Both the string ``'train'`` and the
            boolean ``True`` select a random window; any other value takes the
            window from the start.  (Callers in this notebook pass a boolean,
            which previously never matched ``'train'`` and so disabled the
            random crop during training.)
    """
    y = np.concatenate([y, y, y])
    if len(y) <= length:
        return np.concatenate([y, np.zeros(length - len(y))])

    random_window = train is True or train == 'train'
    start = np.random.randint(len(y) - length) if random_window else 0
    return y[start: start + length]


def random_crop(arr, length):
    """Return a random contiguous window of `length` items from `arr`.

    If `arr` has `length` items or fewer there is nothing to choose from, so
    it is returned unchanged (previously this raised ``ValueError`` from
    ``np.random.randint``).
    """
    spare = len(arr) - length
    if spare <= 0:
        return arr
    start = np.random.randint(spare)
    return arr[start: start + length]


def reshape_image(arr):
    """Cut `arr` in two along axis 1 and stack the halves vertically, in random order.

    This makes a wide image more square.  For an odd width both halves share
    the middle column, so they always have the same width.
    """
    width = arr.shape[1]
    split_at = width // 2
    first_half = arr[:, :split_at + width % 2]
    second_half = arr[:, split_at:]

    # Coin flip decides which half ends up on top
    first_on_top = np.random.choice([True, False], size=1)[0]
    stacking_order = (first_half, second_half) if first_on_top else (second_half, first_half)
    return np.vstack(stacking_order)

In [None]:
class Compose:
    """Chain waveform transforms: each one receives the previous one's output."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray, sr):
        signal = y
        for step in self.transforms:
            signal = step(signal, sr)
        return signal


class AudioTransform:
    """Base class for waveform augmentations that fire with probability `p`.

    Subclasses implement `apply`.  With `always_apply` set the transform runs
    on every call and no random number is drawn.
    """

    def __init__(self, always_apply=False, p=0.5):
        self.always_apply = always_apply
        self.p = p

    def __call__(self, y: np.ndarray, sr):
        if self.always_apply or np.random.rand() < self.p:
            return self.apply(y, sr=sr)
        # Not selected this time: hand the input back untouched
        return y

    def apply(self, y: np.ndarray, **params):
        raise NotImplementedError


class OneOf(Compose):
    """With probability `p`, apply exactly one of the given transforms.

    The transform is drawn with probability proportional to its own `p`.
    """

    def __init__(self, transforms, p=0.5):
        super().__init__(transforms)
        self.p = p
        raw_weights = [t.p for t in transforms]
        total = sum(raw_weights)
        self.transforms_ps = [weight / total for weight in raw_weights]

    def __call__(self, y: np.ndarray, sr):
        if not self.transforms_ps or not (np.random.random() < self.p):
            return y
        # A fresh RandomState, seeded from the global numpy RNG, makes the pick
        picker = np.random.RandomState(np.random.randint(0, 2 ** 32 - 1))
        chosen = picker.choice(self.transforms, p=self.transforms_ps)
        return chosen(y, sr)


class Normalize(AudioTransform):
    """Peak-normalise a waveform so that its largest absolute sample is 1."""

    def __init__(self, always_apply=True, p=1):
        super().__init__(always_apply, p)

    def apply(self, y: np.ndarray, **params):
        """Return `y` divided by its peak amplitude, as a Fortran-ordered array.

        An empty or completely silent signal has no usable peak; it is returned
        unscaled instead of being divided by zero (which would fill the clip
        with NaN/inf and poison the loss).
        """
        max_vol = np.abs(y).max() if y.size else 0
        if max_vol > 0:
            y = y / max_vol
        return np.asfortranarray(y)


class NoiseInjection(AudioTransform):
    """Add standard-normal noise scaled by a level drawn uniformly from [0, max_noise_level]."""

    def __init__(self, always_apply=False, p=0.5, max_noise_level=0.5):
        super().__init__(always_apply, p)

        self.noise_level = (0.0, max_noise_level)

    def apply(self, y: np.ndarray, **params):
        lowest, highest = self.noise_level
        level = np.random.uniform(lowest, highest)
        noise = np.random.randn(len(y))
        # Cast back so the augmentation does not change the signal dtype
        return (y + noise * level).astype(y.dtype)


class GaussianNoise(AudioTransform):
    """Add white Gaussian noise at a random peak signal-to-noise ratio (in dB).

    The SNR is drawn uniformly from [min_snr, max_snr] and is defined on peak
    amplitudes: the noise is rescaled so its peak is `snr` dB below the
    signal's peak.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr_db = np.random.uniform(self.min_snr, self.max_snr)
        signal_peak = np.sqrt(y ** 2).max()
        target_noise_peak = signal_peak / (10 ** (snr_db / 20))

        white = np.random.randn(len(y))
        white_peak = np.sqrt(white ** 2).max()
        scaled_noise = white * 1 / white_peak * target_noise_peak
        return (y + scaled_noise).astype(y.dtype)

#https://github.com/felixpatzelt/colorednoise
class PinkNoise(AudioTransform):
    """Add pink (power-law exponent 1) noise at a random peak signal-to-noise ratio in dB.

    The SNR is drawn uniformly from [min_snr, max_snr]; the noise is rescaled
    so its peak is `snr` dB below the signal's peak.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr_db = np.random.uniform(self.min_snr, self.max_snr)
        signal_peak = np.sqrt(y ** 2).max()
        target_noise_peak = signal_peak / (10 ** (snr_db / 20))

        pink = cn.powerlaw_psd_gaussian(1, len(y))
        pink_peak = np.sqrt(pink ** 2).max()
        scaled_noise = pink * 1 / pink_peak * target_noise_peak
        return (y + scaled_noise).astype(y.dtype)


class BrownNoise(AudioTransform):
    """Add brown (power-law exponent 2) noise at a random peak signal-to-noise ratio in dB.

    The SNR is drawn uniformly from [min_snr, max_snr]; the noise is rescaled
    so its peak is `snr` dB below the signal's peak.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr_db = np.random.uniform(self.min_snr, self.max_snr)
        signal_peak = np.sqrt(y ** 2).max()
        target_noise_peak = signal_peak / (10 ** (snr_db / 20))

        brown = cn.powerlaw_psd_gaussian(2, len(y))
        brown_peak = np.sqrt(brown ** 2).max()
        scaled_noise = brown * 1 / brown_peak * target_noise_peak
        return (y + scaled_noise).astype(y.dtype)

    
#https://www.kaggle.com/code/hidehisaarai1213/rfcx-audio-data-augmentation-japanese-english
#https://medium.com/@makcedward/data-augmentation-for-audio-76912b01fdf6
class AddBackround(AudioTransform):
    """Mix a random excerpt of a background recording into the signal.

    One background file is kept in memory and swapped for another randomly
    chosen one on roughly every 20th call.  The excerpt is rescaled so that
    its peak is `snr` dB below the signal's peak, with `snr` drawn uniformly
    from [min_snr, max_snr].

    Args:
        always_apply, p: see `AudioTransform`.
        min_snr, max_snr: range of the peak signal-to-noise ratio in dB.
        background_pths: paths of the background recordings to choose from
            (this argument used to be ignored in favour of `Audio.BACKGROUNDS`).
        sr: sample rate the background files are loaded at.
        duration: excerpt length in seconds.
    """

    def __init__(self, always_apply=False, p=0.6, min_snr=1, max_snr=20, background_pths=Audio.BACKGROUNDS, sr=Audio.SR, duration=Audio.DURATION):
        super().__init__(always_apply, p)
        self.min_snr = min_snr
        self.max_snr = max_snr
        self.back_pths = list(background_pths)
        self.sr = sr
        self.background = load_sf(random.choice(self.back_pths), sr=sr)
        self.d_len = duration * sr

    def apply(self, y: np.ndarray, **params):
        snr = np.random.uniform(self.min_snr, self.max_snr)
        if random.random() < 0.05:  #load a new background file roughly every 20th sample
            self.background = load_sf(random.choice(self.back_pths), sr=self.sr)

        background = self.background
        # Only crop when there is something to choose from; a background that is
        # not longer than the excerpt is used whole (and tiled below if needed).
        if len(background) > self.d_len:
            background = random_crop(background, self.d_len)

        l_signal = len(y)
        if l_signal == 0 or len(background) == 0:
            return y

        a_background = np.abs(background).max()
        if a_background == 0:
            # A silent background adds nothing, and rescaling it would divide by zero
            return y
        a_noise = np.abs(y).max() / (10 ** (snr / 20))

        # Repeat the excerpt until it covers the signal, then trim to the exact length
        if len(background) < l_signal:
            repeats = -(-l_signal // len(background))
            background = np.tile(background, repeats)
        background = background[:l_signal]

        augmented = (y + background / a_background * a_noise).astype(y.dtype)
        return augmented
    
class SpecAugment:
    """Zero out fixed-width stripes along both axes of a 2-D spectrogram, in place.

    Axis 0 is treated as "time" and axis 1 as "frequency" (that is how the
    shape is unpacked), so `time_drop_width` rows and `freq_drop_width`
    columns are blanked per stripe.  The input array is modified and returned.

    A stripe that does not fit (axis not longer than the stripe width) is
    skipped; previously that case raised ``ValueError`` from
    ``np.random.randint``.
    """

    def __init__(self, time_drop_width=80, time_stripes_num=2, freq_drop_width=12, freq_stripes_num=2):
        self.time_drop_width = time_drop_width
        self.time_stripes_num = time_stripes_num
        self.freq_drop_width = freq_drop_width
        self.freq_stripes_num = freq_stripes_num

    def __call__(self, spec):
        time_bins, freq_bins = spec.shape

        # Time masking
        if time_bins > self.time_drop_width:
            for _ in range(self.time_stripes_num):
                start = np.random.randint(0, time_bins - self.time_drop_width)
                spec[start:start + self.time_drop_width, :] = 0

        # Frequency masking
        if freq_bins > self.freq_drop_width:
            for _ in range(self.freq_stripes_num):
                start = np.random.randint(0, freq_bins - self.freq_drop_width)
                spec[:, start:start + self.freq_drop_width] = 0

        return spec
    

def spec_augment(spec: np.ndarray, num_mask=2, freq_masking_max_percentage=0.15, time_masking_max_percentage=0.1, p=0.5):
    """Randomly zero out bands of rows and columns of `spec`, in place.

    With probability `1 - p` the input is returned untouched.  Otherwise
    between 1 and `num_mask` row bands (axis 0) and then between 1 and
    `num_mask` column bands (axis 1) are set to 0.  Each band covers a random
    fraction of its axis, up to the respective `*_max_percentage`.

    Returns the same array object that was passed in.
    """
    if random.uniform(0, 1) > p:
        return spec

    # Bands along axis 0 (frequency)
    for _ in range(random.randint(1, num_mask)):
        band = int(random.uniform(0, freq_masking_max_percentage) * spec.shape[0])
        top = random.randint(0, spec.shape[0] - band)
        spec[top:top + band, :] = 0

    # Bands along axis 1 (time)
    for _ in range(random.randint(1, num_mask)):
        band = int(random.uniform(0, time_masking_max_percentage) * spec.shape[1])
        left = random.randint(0, spec.shape[1] - band)
        spec[:, left:left + band] = 0

    return spec



# ImageNet channel statistics, matching the pretrained backbone's preprocessing
mean = (0.485, 0.456, 0.406) # RGB
std = (0.229, 0.224, 0.225) # RGB

# Image-level pipelines keyed by dataset mode.  Both normalise and then pad/crop to a
# square of Config.IMAGE_SIZE.  Training adds random dropout and a random crop; validation
# uses a centre crop so the same clip always produces the same input
# (it previously used RandomCrop, which made validation inputs non-deterministic).
albu_transforms = {
    'train' : A.Compose([
            A.Normalize(mean, std, max_pixel_value=1.0,always_apply=True),
            A.OneOf([
                A.Cutout(max_h_size=5, max_w_size=16),
                A.CoarseDropout(max_holes=4),
                #A.Lambda(image=apply_spec_augment),
            ], p=0.5),
            A.PadIfNeeded(min_height=Config.IMAGE_SIZE, min_width=Config.IMAGE_SIZE),
            A.RandomCrop(width=Config.IMAGE_SIZE, height=Config.IMAGE_SIZE),      
    ]),
    'valid' :  A.Compose([
                A.Normalize(mean, std, max_pixel_value=1.0,always_apply=True),
                A.PadIfNeeded(min_height=Config.IMAGE_SIZE, min_width=Config.IMAGE_SIZE),
                A.CenterCrop(width=Config.IMAGE_SIZE, height=Config.IMAGE_SIZE),
        ])
}

In [None]:
class WaveformDataset(torch.utils.data.Dataset):
    """Dataset that turns rows of the label dataframe into (image, target) pairs.

    Each item is produced on the fly: load the wav, crop/pad it to `duration`
    seconds, apply waveform augmentations (training only), compute a dB mel
    spectrogram, fold it into a squarer image, and run the image pipeline.

    Args:
        df: dataframe whose first two columns are the label and the file path
            and whose remaining columns are the one-hot targets.
        sr: sample rate used to convert durations to sample counts.
        duration: length in seconds of the excerpt fed to the model.
        chunk: maximum length in seconds read from each file.
        train: enables augmentation and random cropping.
    """

    def __init__(self, df, sr = Audio.SR, duration = Audio.DURATION, chunk=Audio.CHUNK_LENGTH, train=True):
        self.df = df
        self.sr = sr
        self.train = train
        self.mode = 'train' if train else 'valid'
        self.d_len = duration * self.sr
        self.c_len = chunk * self.sr

        # With nnAudio n_bins=None, the frequency bins default to n_fft/2 + 1 = 513 with a
        # 1024 window, which is optimal, but with hop length 512 we end up with a 501x513
        # image, which is too big for CUDA memory.  So downsampling is needed in the f axis
        # by specifying n_bins.  513 x 313 images would also be too slow for inference.
        # Could consider 256x313 if that gives a performance gain.  Better still try to get
        # a square 256x256 or 224x224 image.

        if self.train:
            self.wave_transforms = Compose(
                [
                    OneOf(
                        [
                            NoiseInjection(p=1, max_noise_level=0.04),
                            GaussianNoise(p=1, min_snr=5, max_snr=20),
                            PinkNoise(p=1, min_snr=5, max_snr=20),
                            BrownNoise(p=1, min_snr=5, max_snr=20),
                        ],
                        p=0.5,
                    ),
                    AddBackround(p=0.6, min_snr=1, max_snr=20),
                    Normalize(always_apply=True, p=1),
                ]
            )
        else:
            self.wave_transforms = Normalize(always_apply=True, p=1)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return `(image, targets)`: a float32 channels-first image and a float32 target vector."""
        row = self.df.iloc[idx]
        wav_path = row.filepath
        y, _ = torchaudio.load(wav_path)  # might need more arguments
        # torchaudio returns (channels, frames); averaging over channels leaves mono
        # files unchanged and mixes multi-channel files down to one channel.
        y = y.mean(dim=0).numpy()

        if len(y) > 0:
            y = y[:self.c_len]
        # crop_or_pad selects its random-window behaviour with the string 'train',
        # so hand it the mode string rather than the boolean flag.
        y = crop_or_pad(y, self.d_len, train=self.mode)
        y = self.wave_transforms(y, sr=self.sr)

        image = compute_melspec(y)
        #image = compute_pcen(y)
        if self.train:
            image = spec_augment(image, p=0.4, num_mask=2, freq_masking_max_percentage=0.05, time_masking_max_percentage=0.05)
        image = reshape_image(image)
        image = mono_to_color(image)
        # Channels last, like a normal image, for the albumentations pipeline
        image = np.stack([image, image, image], axis=-1)
        image = albu_transforms[self.mode](image=image)['image']
        # Back to channels first for the model
        image = image.transpose(2,0,1).astype(np.float32)
        # Everything after the label and file-path columns is the one-hot target.
        # Convert explicitly: the row is an object-dtype Series (it also holds strings).
        targets = torch.tensor(row.iloc[2:].to_numpy(dtype=np.float32), dtype=torch.float32)
        return image, targets

In [None]:
class InverseSqrtSampler(Sampler):
    """Sampler that draws each sample with probability proportional to 1/sqrt(class size).

    The class of a row is the arg-max over the one-hot target columns of
    `targets_df` (numeric or boolean columns; string columns are ignored).
    One epoch yields `len(targets_df)` indices drawn from the whole dataset.

    The previous implementation sampled uniformly *within* each class and
    emitted exactly the original number of samples per class, one class after
    another, so it neither re-weighted classes nor mixed them within batches.

    Attributes:
        targets: class index per row.
        class_counts: number of rows per class index.
        weights: normalised per-class weight (0 for classes without rows).
        sample_weights: the class weight looked up for every row.
    """

    def __init__(self, targets_df, replacement=True):
        # get_dummies may produce bool columns, which 'number' alone does not select
        targets_numeric = targets_df.select_dtypes(include=['number', 'bool'])
        self.targets = np.argmax(targets_numeric.values, axis=1)
        self.class_counts = np.bincount(self.targets)
        # Empty classes get weight 0 instead of 1/sqrt(0) = inf
        populated = self.class_counts > 0
        self.weights = np.zeros(len(self.class_counts), dtype=np.float64)
        self.weights[populated] = 1.0 / np.sqrt(self.class_counts[populated])
        self.weights = self.weights / self.weights.sum()
        self.sample_weights = self.weights[self.targets]
        self.indices = np.arange(len(targets_df))
        self.replacement = replacement

    def __iter__(self):
        drawn = torch.utils.data.sampler.WeightedRandomSampler(
            self.sample_weights,
            len(self.indices),
            replacement=self.replacement
        )
        return iter([int(self.indices[i]) for i in drawn])

    def __len__(self):
        return len(self.indices)

In [None]:
def get_fold_dataloaders(df_train, df_valid):
    """Build the datasets and dataloaders for one train/validation split.

    The training loader uses `InverseSqrtSampler` when
    `Config.WEIGHTED_SAMPLING` is set and plain shuffling otherwise.  The
    sampler is only constructed when it is actually used, so a problem in its
    set-up cannot break un-weighted runs.

    Returns:
        (dl_train, dl_val, ds_train, ds_val)
    """
    ds_train = WaveformDataset(
        df_train,
        sr=Audio.SR,
        duration=Audio.DURATION,
        train=True)

    ds_val = WaveformDataset(
        df_valid,
        sr=Audio.SR,
        duration=Audio.DURATION,
        train=False)

    dl_val = DataLoader(ds_val, batch_size=Config.BATCH_SIZE, num_workers=Config.NUM_WORKERS)

    if Config.WEIGHTED_SAMPLING:
        sampler = InverseSqrtSampler(df_train, replacement=True)
        dl_train = DataLoader(ds_train, batch_size=Config.BATCH_SIZE, sampler=sampler, num_workers=Config.NUM_WORKERS)
    else:
        dl_train = DataLoader(ds_train, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=Config.NUM_WORKERS)

    return dl_train, dl_val, ds_train, ds_val

In [None]:
def show_batch(img_ds, num_rows, num_cols, predict_arr=None):
    """Plot a `num_rows` x `num_cols` grid of randomly chosen items from `img_ds`.

    Only the first channel of each image is shown, rescaled to [0, 1].
    `predict_arr` is accepted for interface compatibility but not used.
    """
    # Imported here because older librosa releases do not expose the display
    # submodule through a plain `import librosa`.
    import librosa.display

    # squeeze=False keeps `axes` 2-D even for a 1x1 grid
    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, sharex=True, figsize=(15, 4*num_rows), squeeze=False)
    fig.tight_layout(pad=3.0)
    # randint's upper bound is exclusive, so len(img_ds) lets every item be drawn
    img_inds = np.random.randint(0, len(img_ds), num_rows*num_cols)
    for index, ax in zip(img_inds, axes.flatten()):
        img, _ = img_ds[index]
        shape = img.shape
        channel = img[0]
        lowest = np.min(channel)
        value_range = np.max(channel) - lowest
        # A constant image has no range to scale by
        scaled_data = (channel - lowest) / value_range if value_range > 0 else np.zeros_like(channel)
        mesh = librosa.display.specshow(scaled_data, x_axis='time', y_axis='mel', ax=ax)
        ax.set(title= f'Spectrogram scaled to [0,1] {shape}')
        # Values are unitless after rescaling, so the colour bar is not labelled in dB
        fig.colorbar(mesh, ax=ax, format="%.2f")

In [None]:
# Build loaders/datasets for the train/validation split and preview a 4x2 grid
# of randomly chosen training spectrograms.
dl_train, dl_val, ds_train, ds_val = get_fold_dataloaders(train_df, val_df)
show_batch(ds_train, 4, 2)

In [None]:
def get_optimizer(lr, params):
    """Return the Lightning optimizer/scheduler configuration.

    Adam (with `Config.WEIGHT_DECAY`) over the parameters that require
    gradients, plus a cosine-annealing-with-warm-restarts schedule stepped
    once per epoch with a period of `Config.EPOCHS`.
    """
    trainable = filter(lambda param: param.requires_grad, params)
    model_optimizer = torch.optim.Adam(trainable, lr=lr, weight_decay=Config.WEIGHT_DECAY)

    lr_scheduler = CosineAnnealingWarmRestarts(
        model_optimizer,
        T_0=Config.EPOCHS,
        T_mult=1,
        eta_min=1e-6,
        last_epoch=-1,
    )

    scheduler_config = {
        "scheduler": lr_scheduler,
        "interval": "epoch",
        "monitor": "val_loss",
        "frequency": 1,
    }
    return {"optimizer": model_optimizer, "lr_scheduler": scheduler_config}

In [None]:
def padded_cmap(solution, submission, padding_factor=5):
    """Macro-averaged average precision with `padding_factor` all-positive rows appended.

    The padding rows (every class true and predicted as 1) are added to both
    frames so that classes without positives still have a defined score.

    Args:
        solution: dataframe of true labels, one column per class.
        submission: dataframe of predicted scores with the same columns.
        padding_factor: number of all-ones rows to append.

    NaN and +/-inf entries in either frame are replaced by 0 first.
    (`np.inf` is used directly; the `pd.np` alias is not available in newer pandas.)
    """
    solution = solution.fillna(0).replace([np.inf, -np.inf], 0)
    submission = submission.fillna(0).replace([np.inf, -np.inf], 0)
    new_rows = pd.DataFrame(
        np.ones((padding_factor, len(solution.columns)), dtype=int),
        columns=solution.columns)
    padded_solution = pd.concat([solution, new_rows]).reset_index(drop=True).copy()
    padded_submission = pd.concat([submission, new_rows]).reset_index(drop=True).copy()
    score = skm.average_precision_score(
        padded_solution.values,
        padded_submission.values,
        average='macro')
    return score


def map_score(solution, submission):
    """Micro-averaged average precision of `submission` scores against `solution` labels.

    NaN and +/-inf entries in either frame are replaced by 0 first.
    (`np.inf` is used directly; the `pd.np` alias is not available in newer pandas.)
    """
    solution = solution.fillna(0).replace([np.inf, -np.inf], 0)
    submission = submission.fillna(0).replace([np.inf, -np.inf], 0)
    score = skm.average_precision_score(
        solution.values,
        submission.values,
        average='micro')
    return score


def mixup_data(x, y, alpha=1.0, device=None):
    """Return mixed inputs, the two sets of targets, and the mixing coefficient.

    Each sample is blended with a randomly chosen partner from the same batch:
    ``mixed_x = lam * x + (1 - lam) * x[perm]``.

    Args:
        x: batch of inputs (first dimension is the batch).
        y: batch of targets.
        alpha: Beta(alpha, alpha) parameter for `lam`; `alpha <= 0` disables
            mixing (`lam = 1`).
        device: device for the permutation index.  Defaults to the device of
            `x`, so the index always lives next to the tensor it indexes.

    Returns:
        (mixed_x, y_a, y_b, lam) where `y_a` are the original targets, `y_b`
        the partners' targets and `lam` a Python float.
    """
    lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1

    batch_size = x.size()[0]
    index_device = x.device if device is None else device
    index = torch.randperm(batch_size).to(index_device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Blend the losses against both mixup targets, weighting `y_a` by `lam` and `y_b` by `1 - lam`."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

In [None]:
# Testing Validation Strategy: score uniformly random "predictions" against the real
# validation labels to see what the metrics give for an uninformed model.
dummy = val_df[birds].copy()
random_scores = np.random.rand(dummy.shape[0], dummy.shape[1])
dummy[birds] = random_scores
truth = val_df[birds]
pad_5 = padded_cmap(truth, dummy[birds], padding_factor = 5)
pad_1 = padded_cmap(truth, dummy[birds], padding_factor = 1)

print(f'Padded cMAP, with padding=5: {pad_5}\n'
     f'Padded cMAP, with padding=1: {pad_1}\n'
     f'MAP score: {map_score(val_df[birds], dummy[birds])}')

In [None]:
class ClassifierHead(nn.Module):
    """Two-layer classification head: Linear -> ReLU -> Dropout -> Linear.

    Args:
        num_classes: size of the output layer.
        num_features: size of the input and of the hidden layer.
        dropout_rate: dropout probability applied after the ReLU.
    """

    def __init__(self, num_classes, num_features, dropout_rate=0.2):
        super().__init__()

        self.dense_layer = nn.Linear(num_features, num_features)
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(p=dropout_rate)  #removed in experiment 22
        self.output_layer = nn.Linear(num_features, num_classes)

    def forward(self, x):
        hidden = self.dropout(self.relu(self.dense_layer(x)))
        return self.output_layer(hidden)

In [None]:
class BirdClefModel(pl.LightningModule):
    """timm backbone with a linear classification head, trained with optional mixup.

    Validation outputs are buffered per step and turned into metrics in
    `on_validation_epoch_end`, which appends one dict per validation run to
    `self.metrics` (keys: 'train_loss', 'val_loss', 'map', 'map5', all plain
    numbers) and pickles the targets/predictions of that run.

    Args:
        model_name: timm model name; the model must expose a `classifier` layer.
        num_classes: number of output classes.
        pretrained: load pretrained backbone weights.
        loss: loss module applied to (logits, targets).
    """

    def __init__(self, model_name=Config.MODEL, num_classes = Config.NUM_CLASSES, pretrained = Config.PRETRAINED, loss=Config.LOSS_FUNCTION):
        super().__init__()
        self.num_classes = num_classes
        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        self.in_features = self.backbone.classifier.in_features
        self.backbone.classifier = nn.Sequential(nn.Linear(self.in_features, num_classes))
        #self.backbone.classifier = ClassifierHead(num_classes, self.in_features)
        self.loss_function = loss
        self.val_outputs = []
        self.train_outputs = []
        self.metrics = []

    def forward(self,images):
        x = self.backbone(images)
        return x

    def configure_optimizers(self):
        return get_optimizer(lr=Config.LR, params=self.parameters())

    def train_with_mixup(self, X, y):
        """Return the mixup loss for one batch."""
        X, y_a, y_b, lam = mixup_data(X, y, alpha=Config.MIXUP_ALPHA)
        y_pred = self(X)
        loss_mixup = mixup_criterion(self.loss_function, y_pred, y_a, y_b, lam)
        return loss_mixup

    def training_step(self, batch, batch_idx):
        image, target = batch
        if Config.USE_MIXUP:
            loss = self.train_with_mixup(image, target)
        else:
            y_pred = self(image)
            loss = self.loss_function(y_pred,target)
        # Keep a detached copy only: buffering the live loss would keep every
        # step's autograd graph alive until the buffer is cleared.
        self.train_outputs.append({'train_loss': loss.detach()})
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        image, target = batch
        y_pred = self(image)
        val_loss = self.loss_function(y_pred, target)
        self.log("val_loss", val_loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)
        outputs = {"val_loss": val_loss.detach(), "logits": y_pred.detach(), "targets": target.detach()}
        self.val_outputs.append(outputs)
        return outputs

    # NOTE(review): `_train_dataloader` / `_validation_dataloader` are never assigned in
    # this notebook; the loaders are passed to `trainer.fit` instead.
    def train_dataloader(self):
        return self._train_dataloader

    def validation_dataloader(self):
        return self._validation_dataloader

    def on_validation_epoch_end(self):
        """Aggregate the buffered step outputs into metrics, print and persist them."""
        outputs = self.val_outputs
        if not outputs:
            return self.metrics

        train_losses = [x['train_loss'] for x in self.train_outputs]
        avg_train_loss = float(torch.stack(train_losses).mean()) if train_losses else 0.0
        avg_loss = float(torch.stack([x['val_loss'] for x in outputs]).mean())
        # float(): do the probability maths in full precision even under mixed precision
        output_logits = torch.cat([x['logits'] for x in outputs],dim=0).float()

        # Module instances compare by identity, so `loss == nn.CrossEntropyLoss()` is
        # always False; test the type of this model's own loss instead.
        if isinstance(self.loss_function, nn.CrossEntropyLoss):
            output_probs = F.softmax(output_logits, dim=1).cpu().detach().numpy()
        else:
            output_probs = output_logits.sigmoid().cpu().detach().numpy()

        target_val = torch.cat([x['targets'] for x in outputs],dim=0).cpu().detach().numpy()
        pred_df = pd.DataFrame(output_probs, columns = birds)
        val_df = pd.DataFrame(target_val, columns = birds)

        avg_score = padded_cmap(val_df, pred_df, padding_factor = 5)
        avg_score2 = padded_cmap(val_df, pred_df, padding_factor = 3)
        avg_score3 = skm.label_ranking_average_precision_score(target_val,output_probs)
        self.metrics.append({'train_loss':avg_train_loss, 'val_loss': avg_loss, 'map':avg_score3, 'map5':avg_score})

        # competition_metrics(output_val,target_val)
        print(f'epoch {self.current_epoch} train loss {avg_train_loss}')
        print(f'epoch {self.current_epoch} validation loss {avg_loss}')
        print(f'epoch {self.current_epoch} validation C-MAP score pad 5 {avg_score}')
        print(f'epoch {self.current_epoch} validation C-MAP score pad 3 {avg_score2}')
        print(f'epoch {self.current_epoch} validation AP score {avg_score3}')

        val_df.to_pickle('val_df.pkl')
        pred_df.to_pickle('pred_df.pkl')

        # Start the next validation run from empty buffers; otherwise its metrics would
        # also cover every earlier run and the buffers would grow without bound.
        self.val_outputs.clear()
        self.train_outputs.clear()
        return self.metrics

In [None]:
def run_training():
    """Train `BirdClefModel` on the current split and return its per-validation metrics.

    Uses early stopping and checkpointing on `val_loss`, and validates twice
    per epoch.

    Returns:
        The model's `metrics` list (one dict per validation run).  The previous
        lookup of `trainer.callback_metrics['validation_epoch_end']` referred to
        a key that is never logged.
    """
    print(f"Running training...")
    logger = None
    dl_train, dl_val, ds_train, ds_val = get_fold_dataloaders(train_df, val_df)
    audio_model = BirdClefModel()

    early_stop_callback = EarlyStopping(monitor="val_loss",
                                        min_delta=Config.MIN_DELTA,
                                        patience=Config.PATIENCE,
                                        verbose= True,
                                        mode="min")

    # saves top- checkpoints based on "val_loss" metric
    checkpoint_callback = ModelCheckpoint(save_top_k=6,
                                          monitor="val_loss",
                                          mode="min",
                                          dirpath=save_path,
                                          save_last= True,
                                          save_weights_only=True,
                                          verbose= True,
                                          #filename=f'birdCLEF23-{trainer.current_epoch:02d}-{val_loss:.4f}',  #need to figure this out so It can update to a dataset
    )

    callbacks_to_use = [checkpoint_callback, early_stop_callback, ]

    # `accelerator` + `devices` replaces the `gpus=1` argument (which also conflicted
    # with accelerator='cpu'), and `auto_lr_find` is dropped: it was left at its default
    # and newer Lightning releases reject both arguments.
    trainer = Trainer(
        accelerator=accelerator,
        devices=1,
        val_check_interval=0.5,
        deterministic=True,
        max_epochs=Config.EPOCHS,
        logger=logger,
        callbacks=callbacks_to_use,
        precision=Config.PRECISION)

    print("Running trainer.fit")
    trainer.fit(audio_model, train_dataloaders = dl_train, val_dataloaders = dl_val)
    training_results = audio_model.metrics

    gc.collect()
    torch.cuda.empty_cache()
    return training_results

In [None]:
# Run the full training loop and keep whatever run_training returns for the plots below.
metrics = run_training()

In [None]:
# float() also unwraps 0-dim tensors, so the values can be plotted directly
train_losses = [float(x['train_loss']) for x in metrics]
val_losses = [float(x['val_loss']) for x in metrics]
val_map = [float(x['map']) for x in metrics]
val_map5 = [float(x['map5']) for x in metrics]

# Plot the training and validation losses on the left axis
fig, ax1 = plt.subplots()
ax1.plot(train_losses, color='b', label='Train Loss')
ax1.plot(val_losses, color='g', label='Val Loss')

# One entry is recorded per validation run (validation runs more than once per epoch)
ax1.set_xlabel('Validation run')
ax1.set_ylabel('Loss')

# Second y-axis for the validation precision scores
# (previously plotted the undefined `val_accs` with the invalid colour 'o')
ax2 = ax1.twinx()
ax2.plot(val_map, color='r', label='Val mAP')
ax2.plot(val_map5, color='orange', label='Val mAP-5')
ax2.set_ylabel('Average precision')

# One legend covering the lines of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

# Show the plot
plt.show()

In [None]:
# Reload the prediction/target frames pickled by the model's validation hook
# and recompute the padded cMAP from them.
df_pred = pd.read_pickle('/kaggle/working/pred_df.pkl')
df_true = pd.read_pickle('/kaggle/working/val_df.pkl')
padded_cmap(df_true, df_pred, padding_factor = 5)

In [None]:
def padded_cmap_by_class(solution, submission, padding_factor=5):
    """Return a dict mapping each class column to its padded average precision.

    Both frames get `padding_factor` all-ones rows appended (after NaN and
    +/-inf are replaced by 0), then average precision is computed one column
    at a time.
    """
    def _sanitise(frame):
        return frame.fillna(0).replace([np.inf, -np.inf], 0)

    solution = _sanitise(solution)
    submission = _sanitise(submission)

    new_rows = pd.DataFrame([[1] * len(solution.columns) for _ in range(padding_factor)])
    new_rows.columns = solution.columns
    padded_solution = pd.concat([solution, new_rows]).reset_index(drop=True).copy()
    padded_submission = pd.concat([submission, new_rows]).reset_index(drop=True).copy()

    return {
        column: skm.average_precision_score(
            padded_solution[[column]].values,
            padded_submission[[column]].values,
            average='macro')
        for column in solution.columns
    }

In [None]:
# Per-class padded cMAP as a dict {class name: score}
cmap5_by_class = padded_cmap_by_class(df_true, df_pred, padding_factor=5)
#cmap5_by_class  # A dict with name:score
# The mean over classes should reproduce the macro-averaged padded_cmap value
np.mean(list(cmap5_by_class.values())) #checking it's the same thing

In [1]:
# Count the positives of every class in the target dataframe, most frequent first
col_sums = [(col, df_true[col].sum()) for col in df_true.columns]
names_by_frequency = sorted(col_sums, key=lambda pair: pair[1], reverse=True)

# Split into parallel lists and look up each class's score
names, counts = [], []
for name, count in names_by_frequency:
    names.append(name)
    counts.append(count)
scores = list(map(cmap5_by_class.__getitem__, names))

# Assemble the table, making sure both value columns are numeric
df = pd.DataFrame({'names': names, 'counts': counts, 'scores': scores})
df = df.assign(scores=pd.to_numeric(df["scores"]), counts=pd.to_numeric(df["counts"]))

NameError: name 'df_target' is not defined

In [None]:
# Set the default renderer to 'notebook'
pio.renderers.default = 'notebook'
# Horizontal bar per class: bar length is the score, colour is the number of positives.
# The x-axis is limited to [0.5, 1].
fig = px.bar(df, x='scores', y='names', color='counts', orientation='h', hover_data=['counts', 'scores'], range_x=[0.5, 1])
fig.update_layout(height=1200)

# Show the plot
fig.show()