# Data exploration 🦜

In [None]:
# List the attached competition files (quick check that the input dataset is mounted).
!ls -l /kaggle/input/birdclef-2022

# Competition data root: train_metadata.csv, scored_birds.json and test_soundscapes/ are read from here.
PATH_DATASET = "/kaggle/input/birdclef-2022"
# Output of the separate conversion kernel: the spectrogram images under train_images/ are read from here.
PATH_CONVERTED = "/kaggle/input/birdclef-convert-spectrograms-noise-reduce"

## Visualise training meta data

In [None]:
import os
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

import ast  # cell-local import: only needed to parse the list-literal columns below

train_meta = pd.read_csv(os.path.join(PATH_DATASET, "train_metadata.csv"))
# `secondary_labels` and `type` are stored in the CSV as Python list literals (text).
# `ast.literal_eval` only accepts literals, so a malformed or malicious cell cannot
# execute arbitrary code the way plain `eval` would.
for list_column in ("secondary_labels", "type"):
    train_meta[list_column] = train_meta[list_column].map(ast.literal_eval)
display(train_meta.head())

In [None]:
import glob
import json

# Load the list stored in scored_birds.json (later used to build the submission rows).
path_scored_birds = os.path.join(PATH_DATASET, "scored_birds.json")
with open(path_scored_birds) as fp:
    scored_birds = json.load(fp)
print(scored_birds)

# More than one matching soundscape file switches the notebook to the full-length run
# (see the trainer configuration, which reads WITH_SUBMISSION).
test_pattern = os.path.join(PATH_DATASET, "test_soundscapes", "soundscape_*.ogg")
test_sounds = glob.glob(test_pattern)
WITH_SUBMISSION = len(test_sounds) > 1
print(f"WITH_SUBMISSION: {WITH_SUBMISSION}")

In [None]:
# print(len(train_meta))
# train_meta = train_meta[train_meta["primary_label"].isin(scored_birds)]
# print(len(train_meta))
# train_meta["secondary_labels"] = [[lb for lb in lbs if lb in scored_birds] for lbs in train_meta["secondary_labels"]]

In [None]:
# Recordings per primary label; log scale because the counts span orders of magnitude.
primary_label_counts = train_meta["primary_label"].value_counts()
ax = primary_label_counts.plot.bar(figsize=(12, 3), grid=True)
ax.set_yscale("log")

In [None]:
# How many secondary labels each recording carries, shown as a log-scaled histogram.
train_meta["secondary_counts"] = train_meta["secondary_labels"].map(len)
secondary_count_hist = train_meta["secondary_counts"].value_counts().sort_index()
ax = secondary_count_hist.plot.bar(figsize=(4, 3), grid=True)
ax.set_yscale("log")

In [None]:
import plotly.express as px

# World map of the recording locations, one colour per `common_name`.
geo_options = {
    "lat": "latitude",
    "lon": "longitude",
    "color": "common_name",
    "width": 1000,
    "height": 500,
    "title": "BirdCLEF 2022 Training Data",
}
fig = px.scatter_geo(train_meta, **geo_options)
fig.show()

# Data preprocessing 💽

As an optimization, the dataset conversion was moved to a separate kernel: it is a one-off task, so there is no need to spend GPU quota on repeating it here.

The resulting image dataset is attached to this notebook as an input:

**https://www.kaggle.com/jirkaborovec/birdclef-convert-spectrograms-noise-reduce**

## Prepare train 🪡 validation dataset

In [None]:
import glob
from tqdm.auto import tqdm
from pprint import pprint
# from joblib import Parallel, delayed

print(f"dataset size (audio): {len(train_meta)}")

# Soft-target weight assigned to every secondary (background) label.
# TODO: use 0.5 for background bird
SECONDARY_LABEL_WEIGHT = 0.2
path_train_images = os.path.join(PATH_CONVERTED, "train_images")

# One record per spectrogram image: the recording's metadata, one column per labelled
# bird (1 for the primary label, SECONDARY_LABEL_WEIGHT for secondary ones) and the
# image path relative to `train_images`.
train_records = []
for _, meta_row in tqdm(train_meta.iterrows(), total=len(train_meta)):
    imgs = glob.glob(os.path.join(path_train_images, meta_row["filename"] + ".*"))
    primary = meta_row["primary_label"]
    record = {
        **dict(meta_row),
        primary: 1,
        # the primary label is skipped here so that a recording which also lists it
        # among its secondary labels keeps the full target of 1
        **{lb: SECONDARY_LABEL_WEIGHT for lb in meta_row["secondary_labels"] if lb != primary},
    }
    # imgs = [p for p in imgs if _try_image(p)]
    # keep only the last two path components, i.e. "<folder>/<image file>"
    imgs = [os.path.sep.join(p.split(os.path.sep)[-2:]) for p in sorted(imgs)]
    train_records += [{**record, "img_name": img} for img in imgs]

# birds that are not labelled for a record end up as NaN columns -> target 0
df_train = pd.DataFrame(train_records).fillna(0)
display(df_train.head())

# train_meta["img_name"] = [f"{fn}.jpg" for fn in train_meta["filename"]]
# # mask = [_try_image(os.path.join(PATH_CONVERTED, "train_images", n)) for n in tqdm(train_meta["img_name"])]
# mask = Parallel(n_jobs=os.cpu_count())(delayed(_try_image)(os.path.join(PATH_CONVERTED, "train_images", n)) for n in tqdm(train_meta["img_name"]))
# train_meta = train_meta[mask]

print(f"dataset size (image): {len(df_train)}")

**Manual train/validation split: all images cut from the same audio recording go to the same subset, so that no recording leaks between training and validation.**

In [None]:
import random

val_split = 0.05
# Fixed seed so that "Restart & Run All" reproduces the same validation split.
VAL_SPLIT_SEED = 42

split_rng = random.Random(VAL_SPLIT_SEED)
val_fnames = []
# Per primary label, hold out a fraction of the *recordings* (not of the images).
for _, dfg in df_train.groupby("primary_label"):
    # sorted list -> deterministic starting order for the seeded shuffle
    fnames = sorted(dfg["filename"].unique())
    # skip val if there is only one audio
    if len(fnames) < 2:
        continue
    split_rng.shuffle(fnames)
    # at least one recording per label ends up in validation
    val_spls = max(1, int(len(fnames) * val_split))
    val_fnames += fnames[:val_spls]

print(len(val_fnames))

In [None]:
# Every image that belongs to a held-out recording goes to the validation frame.
is_validation = df_train["filename"].isin(val_fnames)
df_valid = df_train[is_validation]
display(df_valid.head(3))
print(len(df_valid))

In [None]:
# Drop the held-out recordings from the training frame.
# Note: this rebinds `df_train`, so the split cells above see the reduced frame if re-run.
is_training = ~df_train["filename"].isin(val_fnames)
df_train = df_train[is_training]
display(df_train.head(3))
print(len(df_train))

# Training with Lightning⚡Flash

**Follow the example:** https://lightning-flash.readthedocs.io/en/stable/reference/audio_classification.html

https://ai.googleblog.com/2019/05/efficientnet-improving-accuracy-and.html

In [None]:
# !pip download -q 'lightning-flash[audio]' noisereduce --dest frozen_packages --prefer-binary
# !pip download -q effdet "icevision[all]" 'lightning-flash[image]' --dest frozen_packages --prefer-binary
# !pip wheel -q "https://github.com/PyTorchLightning/lightning-flash/archive/refs/heads/feature/soft_targets.zip" --wheel-dir frozen_packages
# !rm frozen_packages/torch-*
# !ls -l frozen_packages

In [None]:
# Offline installation: every `pip install` below uses `--no-index --find-links`,
# i.e. packages are resolved only from the local `frozen_packages/` wheel folder.
!pip --version
# Gather the pre-downloaded wheels from both attached datasets into one local folder.
!mkdir frozen_packages
!cp /kaggle/input/birdclef-eda-baseline-flash-efficientnet/frozen_packages/* frozen_packages/
!cp /kaggle/input/birdclef-convert-spectrograms-noise-reduce/frozen_packages/* frozen_packages/
!pip install -q 'lightning-flash[audio]' "datasets<2.2.0" --find-links frozen_packages/ --no-index
# Install the local Flash 0.8.0.dev0 wheel together with its `image` extras.
!pip install -q "frozen_packages/lightning_flash-0.8.0.dev0-py3-none-any.whl[image]" --find-links frozen_packages/ --no-index
!pip install -q timm -U --find-links frozen_packages/ --no-index
# !pip install -q -U "https://github.com/PyTorchLightning/lightning-flash/archive/refs/heads/feature/soft_targets.zip"
# NOTE(review): presumably removed so that no Weights & Biases integration gets picked up — confirm.
!pip uninstall -y wandb

In [None]:
import torch

import flash
import timm
from flash.audio import AudioClassificationData
from flash.image import ImageClassifier

# Record the library versions actually installed from the local wheels.
print(timm.__version__)
print(flash.__version__)

## 1. Create the DataModule 🗄️

In [None]:
from dataclasses import dataclass
from torchvision import transforms as T
from typing import Tuple, Callable, Optional
from flash.core.data.io.input_transform import InputTransform

class AddGaussianNoise:
    """Add Gaussian noise to an image tensor and clamp the result to ``[0, 1]``.

    A single noise map is sampled with the shape of ``img`` without its first
    dimension and is replicated along that first dimension, so for a
    ``(C, H, W)`` tensor every channel receives the same perturbation.

    Args:
        mean: offset added to the sampled noise.
        std: factor applied to the standard-normal samples.
    """

    def __init__(self, mean=0., std=1.):
        self.mean = mean
        self.std = std

    def __call__(self, img):
        """Return a noisy copy of ``img`` limited to the ``[0, 1]`` range."""
        shape = img.shape
        # one noise plane, shared by all entries of the leading dimension
        noise_plane = self.mean + self.std * torch.randn(shape[1:])
        stacked_noise = noise_plane.repeat(shape[0], 1, 1)
        return torch.clamp(img + stacked_noise, min=0., max=1.)

@dataclass
class AudioClassificationInputTransform(InputTransform):
    """Per-sample transforms for the spectrogram images.

    Training samples are augmented (posterize, small translation, Gaussian
    noise); every other stage only gets tensor conversion, resize and
    normalisation. Targets are converted with ``torch.as_tensor``.
    """

    # output (height, width) passed to ``T.Resize``
    spectrogram_size: Tuple[int, int] = (128, 128)
    # normalisation statistics, replicated for 3 channels
    color_mean: float = 0.4173
    color_std: float = 0.15079

    def train_input_per_sample_transform(self) -> Callable:
        """Build the augmenting pipeline used for training inputs."""
        # posterize is applied on a uint8 copy; the image is scaled back to float right after
        as_uint8 = T.Lambda(lambda x: (x * 255).to(torch.uint8))
        as_unit_float = T.Lambda(lambda x: x.to(torch.float32) / 255)
        mean = [self.color_mean] * 3
        std = [self.color_std] * 3
        steps = [
            T.ToTensor(),
            as_uint8,
            T.RandomPosterize(bits=7, p=0.2),
            # T.RandomEqualize(),
            as_unit_float,
            # T.GaussianBlur(kernel_size=3, sigma=(0.5, 10)),
            T.Resize(self.spectrogram_size),
            T.RandomAffine(degrees=0, translate=(0.02, 0.1)),
            AddGaussianNoise(mean=0, std=0.10),
            T.Normalize(mean, std),
        ]
        return T.Compose(steps)

    def input_per_sample_transform(self) -> Callable:
        """Build the plain pipeline: tensor conversion, resize, normalise."""
        mean = [self.color_mean] * 3
        std = [self.color_std] * 3
        steps = [
            T.ToTensor(),
            T.Resize(self.spectrogram_size),
            T.Normalize(mean, std),
        ]
        return T.Compose(steps)

    def target_per_sample_transform(self) -> Callable:
        """Targets are only converted to tensors."""
        return torch.as_tensor

In [None]:
SPECTROGRAM_SIZE = (384, 384)
# One target column per primary label found in the training metadata.
birds = train_meta["primary_label"].unique().tolist()
# print(birds)

# Train and validation images live in the same folder; the two frames select the files.
path_train_images = os.path.join(PATH_CONVERTED, "train_images")

datamodule = AudioClassificationData.from_data_frame(
    "img_name",
    birds,
    train_data_frame=df_train,
    train_images_root=path_train_images,
    val_data_frame=df_valid,
    val_images_root=path_train_images,
    transform=AudioClassificationInputTransform,
    transform_kwargs={"spectrogram_size": SPECTROGRAM_SIZE},
    batch_size=14,
    num_workers=3,
    #val_split=0.1,
)

for attribute in ("num_classes", "labels", "multi_label"):
    print(getattr(datamodule, attribute))

In [None]:
import numpy as np
# datamodule.show_train_batch()

nb_samples = 9
fig, axarr = plt.subplots(ncols=3, nrows=3, figsize=(8, 8))

# Preview the first samples of a single training batch; the grid is filled column by column.
for batch in datamodule.train_dataloader():
    print(batch.keys())
    samples = list(zip(batch["input"], batch["target"]))[:nb_samples]
    for i, (img, lb) in enumerate(samples):
        grid_col, grid_row = divmod(i, 3)
        ax = axarr[grid_row, grid_col]
        # move the leading (channel) axis last, as expected by imshow
        img = img.numpy().transpose(1, 2, 0)
        print(img.min(), img.max())
        ax.imshow(img, vmin=-3., vmax=3.)
        ax.set_title(lb)
    break

## 2. Build the model ⚙️

In [None]:
from torchmetrics import F1

class SoftF1(F1):
    """F1 metric that accepts soft targets.

    Targets are binarised with the metric's own ``threshold`` before they are
    handed to the parent ``update``.
    """

    def update(self, preds, target) -> None:
        """Binarise ``target`` at ``self.threshold`` and update the parent metric."""
        hard_target = target >= self.threshold
        super().update(preds, hard_target)

In [None]:
# https://timm.fast.ai/asymmetric_loss
from timm.loss import AsymmetricLossMultiLabel, SoftTargetCrossEntropy

# Macro-averaged F1 computed on thresholded soft targets (see SoftF1).
macro_f1 = SoftF1(num_classes=datamodule.num_classes, average="macro")

# Randomly initialised backbone (pretrained=False), trained with an asymmetric multi-label loss.
model = ImageClassifier(
    backbone="cait_xxs36_384",
    labels=datamodule.labels,
    multi_label=datamodule.multi_label,
    metrics=macro_f1,
    pretrained=False,
    loss_fn=AsymmetricLossMultiLabel(),
    optimizer="AdamW",
    learning_rate=0.005,
)

## 3. Finetune the model 🛠️

In [None]:
from pytorch_lightning.loggers import CSVLogger

# from pytorch_lightning.callbacks import StochasticWeightAveraging
# swa = StochasticWeightAveraging(swa_epoch_start=0.6)

# Trainer Args
GPUS = 1 if torch.cuda.is_available() else 0  # Set to 1 if GPU is enabled for notebook

# Full budget when several test soundscapes are present, otherwise a short smoke run.
if WITH_SUBMISSION:
    run_budget = {"max_epochs": 10, "limit_train_batches": 1.0, "limit_val_batches": 1.0}
else:
    run_budget = {"max_epochs": 3, "limit_train_batches": 0.1, "limit_val_batches": 0.2}

trainer = flash.Trainer(
    # gradient_clip_val=0.01,
    gpus=GPUS,
    # half precision only when a GPU is used
    precision=16 if GPUS else 32,
    logger=CSVLogger(save_dir='logs/'),
    accumulate_grad_batches=24,
    # validate twice per epoch
    val_check_interval=0.5,
    **run_budget,
)

In [None]:
# Train with the "no_freeze" strategy (no part of the model is kept frozen).
trainer.finetune(model, datamodule=datamodule, strategy="no_freeze")

# Persist the trained model; the inference section reloads this file by the same name.
trainer.save_checkpoint("audio_classification_model.pt")

In [None]:
# Plot every metric written by the CSV logger against the epoch.
metrics_path = f'{trainer.logger.log_dir}/metrics.csv'
metrics = (
    pd.read_csv(metrics_path)
    .drop(columns=["step"])
    .set_index("epoch")
)
# display(metrics.dropna(axis=1, how="all").head())
g = sn.relplot(data=metrics, kind="line")
plt.gcf().set_size_inches(15, 5)
plt.grid()

# inference... 🔥

In [None]:
# Offline install of `noisereduce` from the attached wheel folder (`--no-index`: no PyPI lookup).
!pip install -q noisereduce --find-links /kaggle/input/birdclef-eda-baseline-flash-efficientnet/frozen_packages/ --no-index

In [None]:
import os
import glob
import torch
import librosa
import noisereduce
import numpy as np
from math import ceil

# Settings shared by audio loading and the mel-spectrogram computation.
# NOTE(review): hop_length (640) is larger than win_length (512), so consecutive
# analysis windows do not overlap — presumably matches the training conversion; confirm.
SPECTROGRAM_PARAMS = {
    "sample_rate": 32_000,  # audio is resampled to this rate on load
    "hop_length": 640,
    "n_fft": 800,
    "n_mels": 128,
    "fmin": 20,
    "fmax": 16_000,
    "win_length": 512,
}

def create_spectrogram(
    fname, reduce_noise: bool = False, frame_size: int = 5, spec_params: dict = SPECTROGRAM_PARAMS,
) -> list:
    """Cut an audio file into fixed-length frames and compute a mel spectrogram per frame.

    Args:
        fname: path of the audio file; it is loaded as mono and resampled to
            ``spec_params["sample_rate"]``.
        reduce_noise: when true, the waveform is filtered with
            ``noisereduce.reduce_noise`` before it is split.
        frame_size: frame length in seconds.
        spec_params: spectrogram settings; the keys ``sample_rate``, ``n_fft``,
            ``win_length``, ``hop_length``, ``n_mels``, ``fmin`` and ``fmax`` are read.

    Returns:
        One 2-D array per frame holding the magnitude mel spectrogram in dB relative
        to that frame's own maximum, passed through ``np.nan_to_num``.
        An empty waveform yields an empty list.
    """
    waveform, sample_rate = librosa.core.load(fname, sr=spec_params["sample_rate"], mono=True)
    if reduce_noise:
        waveform = noisereduce.reduce_noise(
            y=waveform,
            sr=sample_rate,
            time_constant_s=float(frame_size),
            time_mask_smooth_ms=250,
            n_fft=spec_params["n_fft"],
            use_tqdm=False,
            n_jobs=2,
        )
    # samples per frame
    nb = int(frame_size * sample_rate)
    count = ceil(len(waveform) / float(nb))
    spectrograms = []
    for i in range(count):
        frame = waveform[i * nb:(i + 1) * nb]
        if len(frame) < nb:
            if i == 0:
                # Recording shorter than one frame: loop the clip until the frame is full.
                # `np.tile` repeats the whole clip; `ndarray.repeat` would duplicate every
                # single sample (a time-stretch) and rounding could leave the frame short.
                # NOTE(review): the offline training-image conversion presumably pads short
                # clips with the same code — keep both in sync.
                frame = np.tile(frame, ceil(nb / len(frame)))[:nb]
            else:
                # Trailing partial frame: use the last full-length window of the recording.
                frame = waveform[-nb:]
        sg = librosa.feature.melspectrogram(
            y=frame,
            sr=sample_rate,
            n_fft=spec_params["n_fft"],
            win_length=spec_params["win_length"],
            hop_length=spec_params["hop_length"],
            n_mels=spec_params["n_mels"],
            fmin=spec_params["fmin"],
            fmax=spec_params["fmax"],
            power=1,  # magnitude (not power) spectrogram, hence amplitude_to_db below
        )
        sg = librosa.amplitude_to_db(sg, ref=np.max)
        spectrograms.append(np.nan_to_num(sg))
    return spectrograms

In [None]:
# Sanity check: show the first (up to five) spectrogram frames of one test soundscape.
path_audio = glob.glob(os.path.join(PATH_DATASET, "test_soundscapes", "soundscape_*.ogg"))[0]
print(path_audio)
sgs = create_spectrogram(path_audio, reduce_noise=False)[:5]

# `squeeze=False` always returns a 2-D array of axes; without it a recording that
# yields a single frame gives back a bare Axes object and indexing it fails.
fig, axarr = plt.subplots(nrows=len(sgs), figsize=(8, 2 * len(sgs)), squeeze=False)
for ax, sg in zip(axarr[:, 0], sgs):
    print(np.min(sg), np.max(sg))
    # the dB values are relative to the frame maximum, so they are <= 0; show [-80, 0] dB
    im = ax.imshow(sg, vmin=-80, vmax=0)
    plt.colorbar(im, ax=ax)
fig.tight_layout()

In [None]:
from tqdm.auto import tqdm
from functools import partial
from joblib import Parallel, delayed
from PIL import Image

img_extension = ".png"

def convert_and_export(
    fn, path_in, path_out, reduce_noise = False, frame_size: int = 5
) -> list:
    """Convert one audio file to spectrogram images and describe every exported frame.

    Args:
        fn: audio file name relative to ``path_in``.
        path_in: folder containing the audio file.
        path_out: folder the images are written to (created on demand).
        reduce_noise: forwarded to ``create_spectrogram``.
        frame_size: frame length in seconds, forwarded to ``create_spectrogram``.

    Returns:
        One dict per frame with ``img_name`` (image file name), ``end_time``
        (frame end in seconds) and ``file_id`` (``fn`` without its extension).
    """
    spectrograms = create_spectrogram(
        os.path.join(path_in, fn), reduce_noise=reduce_noise, frame_size=frame_size
    )
    file_id = os.path.splitext(fn)[0]
    records = []
    for idx, spec_db in enumerate(spectrograms):
        path_img = os.path.join(path_out, f"{fn}.{idx:03}{img_extension}")
        os.makedirs(os.path.dirname(path_img), exist_ok=True)
        # map the [-80, 0] dB range linearly onto [0, 255]
        scaled = (spec_db + 80) / 80.0
        pixels = np.clip(scaled, a_min=0, a_max=1) * 255
        Image.fromarray(pixels.astype(np.uint8)).resize((256,256)).save(path_img)
        records.append({
            "img_name": os.path.basename(path_img),
            "end_time": (idx + 1) * frame_size,
            "file_id": file_id,
        })
    return records

In [None]:
PATH_TEST_IMAGES = os.path.join("/kaggle/temp", "test_images")
path_test_soundscapes = os.path.join(PATH_DATASET, "test_soundscapes")

# Bind the input/output folders so that the workers only receive the file name.
_convert_and_export = partial(
    convert_and_export,
    path_in=path_test_soundscapes,
    path_out=PATH_TEST_IMAGES,
)

soundscapes = [
    os.path.basename(path)
    for path in glob.glob(os.path.join(path_test_soundscapes, "*.ogg"))
]
# Convert the soundscapes in 3 parallel jobs and flatten the per-file record lists.
conversion_jobs = (delayed(_convert_and_export)(fn) for fn in tqdm(soundscapes))
converted = [
    record
    for file_records in Parallel(n_jobs=3)(conversion_jobs)
    for record in file_records
]
# _= list(map(_convert_and_export, tqdm(train_meta["filename"])))

In [None]:
# One row per exported test image, with the columns img_name, end_time and file_id.
df_converted = pd.DataFrame(converted)
display(df_converted.head())

## Run predictions >>

In [None]:
# Reload the checkpoint saved by the training section (the commented path is an
# alternative, externally stored checkpoint). Note: this rebinds `model`.
model = ImageClassifier.load_from_checkpoint(
    "audio_classification_model.pt"
#     "/kaggle/input/birdclef-submissions/birdclef_classification_model_384px.pt"
)
print(model.labels)
# Plain trainer used only for prediction; rebinds `trainer` from the training section.
trainer = flash.Trainer(gpus=GPUS)

In [None]:
# Prediction-only data module over the converted test images; it rebinds `datamodule`
# and reuses the same transform class and spectrogram size as training.
datamodule = AudioClassificationData.from_data_frame(
    input_field="img_name",
    predict_data_frame=df_converted,
    predict_images_root=PATH_TEST_IMAGES,
    transform=AudioClassificationInputTransform,
    transform_kwargs=dict(spectrogram_size=SPECTROGRAM_SIZE),
    batch_size=10,
    num_workers=3,
)

In [None]:
# Flatten the per-batch outputs into one score vector (numpy array) per test image.
# lbs = [torch.argmax(p["preds"].float()).item() for p in preds]
predictions = [
    sample["preds"].cpu().numpy()
    for batch_outputs in trainer.predict(model, datamodule=datamodule)
    for sample in batch_outputs
]

## >> Format submission

Until this is resolved: https://www.kaggle.com/c/birdclef-2022/discussion/309001

In [None]:
# One submission row per (test image, scored bird) pair.
submission = []
for i, row in tqdm(df_converted.iterrows(), total=len(df_converted)):
    scores = predictions[i]
    assert len(model.labels) == len(scores)
    preds = dict(zip(model.labels, scores))
    # birds the model has no output for default to 0, i.e. "not present"
    # NOTE(review): the 0.1 cut-off assumes `preds` are probabilities, not raw logits — confirm.
    submission += [
        {
            "row_id": f"{row['file_id']}_{bird}_{row['end_time']}",
            "target": preds.get(bird, 0) > 0.1,
        }
        for bird in scored_birds
    ]

In [None]:
# Write the submission with `row_id` as the index column and `target` as the only value column.
df_submission = pd.DataFrame(submission).set_index("row_id")
df_submission.to_csv("submission.csv")

# Peek at the first lines of the written file.
! head submission.csv