# Data exploration 🦜

In [None]:
# List the content of the attached competition dataset folder.
!ls -l /kaggle/input/birdclef-2022

# Folder holding the competition files read below
# (train_metadata.csv, eBird_Taxonomy_v2021.csv, scored_birds.json).
PATH_DATASET = "/kaggle/input/birdclef-2022"
# Folder holding the pre-computed images; its "train_images" sub-folder is used further down.
PATH_CONVERTED = "/kaggle/input/birdclef-convert-spectrograms-noise-reduce"

## Visualise training meta data

In [None]:
import ast
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn

# Apply the seaborn default theme to the figures drawn below.
sn.set()

train_meta = pd.read_csv(os.path.join(PATH_DATASET, "train_metadata.csv"))
# Both columns are stored as text that encodes a Python literal. `ast.literal_eval`
# parses literals only, whereas `eval` would execute any expression found in the CSV.
for list_col in ("secondary_labels", "type"):
    train_meta[list_col] = train_meta[list_col].map(ast.literal_eval)
display(train_meta.head())

In [None]:
# Number of recordings per primary label; the log axis keeps rare labels readable.
primary_counts = train_meta["primary_label"].value_counts()
ax = primary_counts.plot.bar(figsize=(24, 3), grid=True)
ax.set_yscale("log")

In [None]:
from itertools import chain

# Flatten the per-recording secondary label collections into one list and count them.
secondary_lbs = list(chain.from_iterable(train_meta["secondary_labels"]))
secondary_freq = pd.Series(secondary_lbs).value_counts()
ax = secondary_freq.plot.bar(figsize=(16, 3), grid=True)
ax.set_yscale("log")

In [None]:
# How many secondary labels each recording carries, shown as a distribution.
train_meta["secondary_counts"] = list(map(len, train_meta["secondary_labels"]))
counts_per_size = train_meta["secondary_counts"].value_counts().sort_index()
ax = counts_per_size.plot.bar(figsize=(4, 3), grid=True)
ax.set_yscale("log")

In [None]:
# Flatten the per-recording "type" collections and plot the 100 most frequent values.
types = list(chain.from_iterable(train_meta["type"]))
type_freq = pd.Series(types).value_counts()
ax = type_freq.head(100).plot.bar(figsize=(18, 3), grid=True)
ax.set_yscale("log")

In [None]:
import plotly.express as px

# Recording locations on a world map, one colour per common name.
geo_options = dict(
    lat="latitude",
    lon="longitude",
    color="common_name",
    width=1000,
    height=500,
    title="BirdCLEF 2022 Training Data",
)
fig = px.scatter_geo(train_meta, **geo_options)
fig.show()

In [None]:
# Distribution of the "rating" column, ordered by rating value.
rating_counts = train_meta["rating"].value_counts().sort_index()
rating_counts.plot.bar(grid=True)

In [None]:
import re

def norm_time(tm):
    """Convert a time-of-day value in ``%H:%M`` format to fractional hours.

    Args:
        tm: time of day, e.g. ``"08:30"``.

    Returns:
        Hours as a float (``"08:30"`` -> ``8.5``), or ``NaN`` when ``tm`` cannot be
        parsed with the ``%H:%M`` format.
    """
    try:
        dt = pd.to_datetime(tm, format='%H:%M')
        return dt.hour + dt.minute / 60.
    except (ValueError, TypeError, AttributeError):
        # Unparsable entries are mapped to an explicit NaN (not an implicit None),
        # so the resulting column stays numeric even if every entry fails to parse.
        return float("nan")

# Recording time expressed in fractional hours; the histogram shows when recordings were made.
hours_of_day = train_meta["time"].apply(norm_time)
train_meta["time_norm"] = hours_of_day
train_meta["time_norm"].hist(bins=50, grid=True)

## eBird Taxonomy 🦚 v2021

In [None]:
# Load the taxonomy table shipped with the dataset and report its number of rows.
ebird_csv = os.path.join(PATH_DATASET, "eBird_Taxonomy_v2021.csv")
ebird = pd.read_csv(ebird_csv)
display(ebird.head())
print(len(ebird))

## Scored birds 🐦

In [None]:
import json

# Read the JSON file with the scored birds from the dataset folder.
scored_birds_path = os.path.join(PATH_DATASET, "scored_birds.json")
with open(scored_birds_path) as fp:
    scored_birds = json.loads(fp.read())

print(scored_birds)

In [None]:
# primary_label = train_meta["primary_label"].unique()
# print(f"Unique primary labels: {primary_label}")
# print(f"missing scored in primary: {[lb for lb in scored_birds if lb not in primary_label]}")

# Data pre-processing 💽

To save resources, the dataset conversion was moved to a separate kernel: it is a one-off task, so there is no need to spend GPU quota on repeating it in every training run.

The resulting image dataset is attached to this notebook:

**https://www.kaggle.com/jirkaborovec/birdclef-convert-spectrograms-noise-reduce**

## Prepare train 🪡 validation dataset

In [None]:
import glob
from tqdm.auto import tqdm
from pprint import pprint
# from joblib import Parallel, delayed

print(f"dataset size (audio): {len(train_meta)}")

# One record per converted image: the audio-level metadata row is repeated for
# every image found for that recording, extended with the image's relative path.
images_dir = os.path.join(PATH_CONVERTED, "train_images")
train_records = []
for _, row in tqdm(train_meta.iterrows(), total=len(train_meta)):
    found = sorted(glob.glob(os.path.join(images_dir, row["filename"] + ".*")))
    meta = dict(row)
    for img_path in found:
        # keep only the last two path components (parent folder + file name)
        rel_name = os.path.sep.join(img_path.split(os.path.sep)[-2:])
        train_records.append({**meta, "img_name": rel_name})

df_train = pd.DataFrame(train_records)
display(df_train.head())

# Disabled alternative kept for reference (relies on a `_try_image` helper not defined in this notebook):
# train_meta["img_name"] = [f"{fn}.jpg" for fn in train_meta["filename"]]
# # mask = [_try_image(os.path.join(PATH_CONVERTED, "train_images", n)) for n in tqdm(train_meta["img_name"])]
# mask = Parallel(n_jobs=os.cpu_count())(delayed(_try_image)(os.path.join(PATH_CONVERTED, "train_images", n)) for n in tqdm(train_meta["img_name"]))
# train_meta = train_meta[mask]

print(f"dataset size (image): {len(df_train)}")

**The train/validation split is done manually at the audio-file level: if images from the same recording ended up in both the training and the validation set, data would leak between them.**

In [None]:
import random
val_split = 0.1
# Dedicated, seeded generator: the split is identical on every run (so validation
# metrics stay comparable) and the global `random` state is left untouched.
split_rng = random.Random(42)

val_fnames = []
for _, dfg in df_train.groupby("primary_label"):
    fnames = list(dfg["filename"].unique())
    # skip val if there is only one audio
    if len(fnames) < 2:
        continue
    split_rng.shuffle(fnames)
    # at least one recording per label goes to validation
    val_spls = max(1, int(len(fnames) * val_split))
    val_fnames += fnames[:val_spls]

print(len(val_fnames))

In [None]:
# Validation set: every image whose source recording was selected for validation.
in_validation = df_train["filename"].isin(val_fnames)
df_valid = df_train[in_validation]
display(df_valid.head(3))
print(len(df_valid))

In [None]:
# Training set: drop every image that belongs to a validation recording.
df_train = df_train.loc[~df_train["filename"].isin(val_fnames)]
display(df_train.head(3))
print(len(df_train))

## Color 🦩 normalizations

In [None]:
import numpy as np
from tqdm.auto import tqdm
from joblib import Parallel, delayed

def _color_means(img_path):
    """Compute intensity mean and standard deviation of a single image.

    Args:
        img_path: path of the image file, read with ``plt.imread``.

    Returns:
        Tuple ``(mean, std)``. For a 2-D image both are scalars; otherwise both are
        dicts mapping channel index (0, 1, 2) to the statistic of that channel.
    """
    img = plt.imread(img_path)
    # values above 1.5 are taken as a 0-255 scale and brought down to 0-1
    if np.max(img) > 1.5:
        img = img / 255.0
    if img.ndim == 2:
        return np.mean(img), np.std(img)
    channels = range(3)
    clr_mean = {ch: np.mean(img[..., ch]) for ch in channels}
    clr_std = {ch: np.std(img[..., ch]) for ch in channels}
    return clr_mean, clr_std

# Colour statistics are computed in parallel on every 10th image only.
images = glob.glob(os.path.join(PATH_CONVERTED, "train_images", "*", "*.png"))
sampled_images = images[::10]
color_jobs = (delayed(_color_means)(fn) for fn in tqdm(sampled_images))
clr_mean_std = Parallel(n_jobs=os.cpu_count())(color_jobs)

In [None]:
# Summarise the per-image statistics, then keep the average of each as the
# dataset-level colour mean / std.
mean_stats = pd.DataFrame([m for m, _ in clr_mean_std]).describe()
std_stats = pd.DataFrame([s for _, s in clr_mean_std]).describe()
display(mean_stats.T)
display(std_stats.T)

img_color_mean = list(mean_stats.loc["mean"])
img_color_std = list(std_stats.loc["mean"])
print(img_color_mean, img_color_std)

# Training with Lightning⚡Flash

**Follow the example:** https://lightning-flash.readthedocs.io/en/stable/reference/audio_classification.html

https://ai.googleblog.com/2019/05/efficientnet-improving-accuracy-and.html

**Later you will need to adjust the image size to match the model used:**

| **Base model** | resolution |
|----------------|------------|
| EfficientNetB0 | 224        |
| EfficientNetB1 | 240        |
| EfficientNetB2 | 260        |
| EfficientNetB3 | 300        |
| EfficientNetB4 | 380        |

In [None]:
# Download the packages (with their dependencies) into ./frozen_packages;
# the install cell below passes this folder to pip via `-f`.
!pip download -q 'lightning-flash[audio]' "datasets==2.1.0" noisereduce --dest frozen_packages --prefer-binary
!pip download -q effdet "icevision[all]" 'lightning-flash[image]' --dest frozen_packages --prefer-binary
# Remove the downloaded torch archives from the folder
# (presumably so the pre-installed torch is used instead - TODO confirm).
!rm frozen_packages/torch-*
# Show what ended up in the folder.
!ls -l frozen_packages

In [None]:
# Print the pip version used by this kernel image.
!pip --version
# Install the packages; `-f frozen_packages` adds the folder filled by the
# download cell as an additional place to look for archives.
!pip install -q 'lightning-flash[audio]' "datasets==2.1.0" -f frozen_packages
!pip install -q effdet "icevision[all]" 'lightning-flash[image]' -f frozen_packages
# !pip uninstall -y wandb

In [None]:
import torch

import flash
from flash.audio import AudioClassificationData
from flash.image import ImageClassifier

## 1. Create the DataModule 🗄️

see discussion about **[Suggested spectrogram augmentation?](https://www.kaggle.com/c/birdclef-2022/discussion/311995)**

In [None]:
from dataclasses import dataclass
from torchvision import transforms as T
from typing import Tuple, Callable, Optional
from flash.core.data.io.input_transform import InputTransform

@dataclass
class AudioClassificationInputTransform(InputTransform):
    """Per-sample transforms for the spectrogram images and their targets.

    The training pipeline adds random augmentations; the default pipeline only
    converts, resizes and normalises. Both normalise three channels with the same
    ``color_mean`` / ``color_std``.
    """

    # target (height, width) passed to `T.Resize`
    spectrogram_size: Tuple[int, int] = (128, 128)
    # normalisation constants, repeated for each of the three channels
    color_mean: float = 0.4173
    color_std: float = 0.15079

    def train_input_per_sample_transform(self) -> Callable:
        """Build the augmenting transform applied to each training input."""
        # posterize/equalize run on uint8 tensors, hence the round trip 0-1 -> 0-255 -> 0-1
        to_uint8 = T.Lambda(lambda x: (x * 255).to(torch.uint8))
        to_float = T.Lambda(lambda x: x.to(torch.float32) / 255)
        steps = [
            T.ToTensor(),
            to_uint8,
            T.RandomPosterize(bits=7, p=0.2),
            T.RandomEqualize(),
            to_float,
            # T.GaussianBlur(kernel_size=5, sigma=(0.5, 4)),
            T.Resize(self.spectrogram_size),
            T.RandomAffine(degrees=0, translate=(0.01, 0.1)),
            T.Normalize([self.color_mean] * 3, [self.color_std] * 3),
        ]
        return T.Compose(steps)

    def input_per_sample_transform(self) -> Callable:
        """Build the augmentation-free transform: to tensor, resize, normalise."""
        steps = [
            T.ToTensor(),
            T.Resize(self.spectrogram_size),
            T.Normalize([self.color_mean] * 3, [self.color_std] * 3),
        ]
        return T.Compose(steps)

    def target_per_sample_transform(self) -> Callable:
        """Return the callable that converts each target to a tensor."""
        return torch.as_tensor

In [None]:
# Training and validation images live in the same folder; the two data frames
# decide which image belongs to which subset.
images_root = os.path.join(PATH_CONVERTED, "train_images")

datamodule = AudioClassificationData.from_data_frame(
    "img_name",
    "primary_label",
    train_data_frame=df_train,
    train_images_root=images_root,
    train_transform=AudioClassificationInputTransform,
    val_data_frame=df_valid,
    val_images_root=images_root,
    val_transform=AudioClassificationInputTransform,
    transform_kwargs=dict(spectrogram_size=(224, 224)),
    batch_size=64,
    num_workers=3,
    #val_split=0.1,
)

print(datamodule.labels)

In [None]:
# datamodule.show_train_batch()

# Preview the first `nb_samples` images of the first training batch on a 3x3 grid.
nb_samples = 9
fig, axarr = plt.subplots(ncols=3, nrows=3, figsize=(8, 8))

for batch in datamodule.train_dataloader():
    print(batch.keys())
    for i, img, lb in zip(range(nb_samples), batch["input"], batch["target"]):
        # move the leading axis to the end so that `imshow` can draw the array
        img = np.moveaxis(img.numpy(), 0, -1)
        print(np.min(img), np.max(img))
        # the grid is filled column by column
        col, row = divmod(i, 3)
        panel = axarr[row, col]
        panel.imshow(img, vmin=-5., vmax=5.)
        panel.set_title(lb)
    # only the first batch is needed
    break

## 2. Build the model ⚙️

In [None]:
from torchmetrics import F1
from timm.loss import LabelSmoothingCrossEntropy

# Metric and loss are built first so the classifier definition stays compact.
macro_f1 = F1(num_classes=datamodule.num_classes, average="macro")
smoothed_ce = LabelSmoothingCrossEntropy(0.02)

model = ImageClassifier(
    backbone="tf_efficientnet_b0_ns",
    labels=datamodule.labels,
    metrics=macro_f1,
    pretrained=True,
    loss_fn=smoothed_ce,
    optimizer="AdamW",
    learning_rate=0.002,
    # lr_scheduler=("cosineannealinglr", {"T_max": 5}),
)

## 3. Finetune the model 🛠️

In [None]:
from pytorch_lightning.loggers import CSVLogger
# from pytorch_lightning.callbacks import StochasticWeightAveraging

# Trainer arguments
# 1 when CUDA is available, 0 otherwise
GPUS = 1 if torch.cuda.is_available() else 0

# swa = StochasticWeightAveraging(swa_epoch_start=0.6)
logger = CSVLogger(save_dir="logs/")

trainer_options = dict(
    max_epochs=5,
    # gradient_clip_val=0.01,
    gpus=GPUS,
    # 16-bit precision is requested only when a GPU is used
    precision=16 if GPUS else 32,
    logger=logger,
    accumulate_grad_batches=32,
)
trainer = flash.Trainer(**trainer_options)

In [None]:
# Fine-tune with the "no_freeze" strategy, then store the resulting checkpoint.
checkpoint_name = "audio_classification_model.pt"
trainer.finetune(model, datamodule=datamodule, strategy="no_freeze")
trainer.save_checkpoint(checkpoint_name)

In [None]:
# Plot every logged metric against the epoch, read from the CSV logger output.
metrics_csv = f'{trainer.logger.log_dir}/metrics.csv'
metrics = (
    pd.read_csv(metrics_csv)
    .drop(columns="step")
    .set_index("epoch")
)
# display(metrics.dropna(axis=1, how="all").head())
g = sn.relplot(data=metrics, kind="line")
plt.gcf().set_size_inches(15, 5)
plt.grid()

# inference... 🚀

see the follow-up kernel: https://www.kaggle.com/jirkaborovec/birdclef-lightning-flash-inference

or **full training & inference**: https://www.kaggle.com/code/jirkaborovec/birdclef-multi-label-flash-transformer