# Inference for 🦜BirdCLEF with Lightning⚡Flash

**This is just the inference version of the original work: https://www.kaggle.com/jirkaborovec/birdclef-eda-baseline-flash-efficientnet**

See our story: [Best Practices to Rank on Kaggle Competition with PyTorch Lightning and Grid.ai Spot Instances](https://devblog.pytorchlightning.ai/best-practices-to-rank-on-kaggle-competition-with-pytorch-lightning-and-grid-ai-spot-instances-54aa5248aa8e)

**Clarification about the submission format: https://www.kaggle.com/c/birdclef-2022/discussion/308009**

In [None]:
!ls -l /kaggle/input/birdclef-2022

# Root folder of the competition data; the test soundscapes and scored_birds.json are read from here.
PATH_DATASET = "/kaggle/input/birdclef-2022"
# Folder of the attached spectrogram-conversion notebook output.
PATH_CONVERTED = "/kaggle/input/birdclef-convert-spectrograms-noise-reduce"

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# df_test = pd.read_csv(os.path.join(PATH_DATASET, "test.csv")).set_index("row_id")
# display(df_test)

# Converting audio to spectrogram

The conversion is done with the following notebook, and its output is attached here:

**https://www.kaggle.com/jirkaborovec/birdclef-convert-spectrograms-noise-reduce**

In [None]:
!pip install -q noisereduce --find-links /kaggle/input/birdclef-eda-baseline-flash-efficientnet/frozen_packages/ --no-index

In [None]:
import os
import glob
import torch
import librosa
import noisereduce
import numpy as np
from math import ceil

# Default settings for audio loading and mel-spectrogram extraction.
# NOTE(review): hop_length (640) is larger than win_length (512) - confirm this is intended.
SPECTROGRAM_PARAMS = {
    "sample_rate": 32_000,  # sampling rate requested when the audio file is loaded
    "hop_length": 640,  # hop between consecutive spectrogram columns
    "n_fft": 800,  # FFT size, also handed to the noise reduction
    "n_mels": 128,  # number of mel bands
    "fmin": 20,  # lowest mel filter frequency
    "fmax": 16_000,  # highest mel filter frequency
    "win_length": 512,  # analysis window length
}

def create_spectrogram(
    fname, reduce_noise: bool = False, frame_size: int = 5, spec_params: dict = SPECTROGRAM_PARAMS,
) -> list:
    """Load an audio file and turn it into a list of dB-scaled mel spectrograms.

    The waveform is loaded as mono at ``spec_params["sample_rate"]`` and cut into
    consecutive frames of ``frame_size`` seconds. A trailing frame that is too short
    is replaced by the last ``frame_size`` seconds of the whole waveform. If the
    whole recording is shorter than one frame, its samples are repeated instead.

    Args:
        fname: path of the audio file to load.
        reduce_noise: when true, run ``noisereduce.reduce_noise`` on the waveform first.
        frame_size: frame duration in seconds.
        spec_params: spectrogram settings; the keys ``sample_rate``, ``n_fft``,
            ``win_length``, ``hop_length``, ``n_mels``, ``fmin`` and ``fmax`` are read.

    Returns:
        One spectrogram array per frame, in temporal order.
    """
    waveform, sample_rate = librosa.core.load(fname, sr=spec_params["sample_rate"], mono=True)
    if reduce_noise:
        waveform = noisereduce.reduce_noise(
            y=waveform,
            sr=sample_rate,
            time_constant_s=float(frame_size),
            time_mask_smooth_ms=250,
            n_fft=spec_params["n_fft"],
            use_tqdm=False,
            n_jobs=2,
        )

    samples_per_frame = int(frame_size * sample_rate)
    nb_frames = ceil(len(waveform) / float(samples_per_frame))
    # Settings that are identical for every frame.
    mel_kwargs = dict(
        sr=sample_rate,
        n_fft=spec_params["n_fft"],
        win_length=spec_params["win_length"],
        hop_length=spec_params["hop_length"],
        n_mels=spec_params["n_mels"],
        fmin=spec_params["fmin"],
        fmax=spec_params["fmax"],
        power=1,
    )

    spectrograms = []
    for idx in range(nb_frames):
        start = idx * samples_per_frame
        chunk = waveform[start:start + samples_per_frame]
        if len(chunk) < samples_per_frame:
            if idx > 0:
                # Short tail: use the final full-length window of the recording instead.
                chunk = waveform[-samples_per_frame:]
            else:
                # Whole recording is shorter than one frame.
                # NOTE(review): ndarray.repeat duplicates each sample rather than tiling
                # the clip, and the rounded factor may still leave the frame short -
                # confirm against the training pipeline before changing.
                factor = round(float(samples_per_frame) / len(chunk))
                chunk = chunk.repeat(int(factor))
        mel = librosa.feature.melspectrogram(y=chunk, **mel_kwargs)
        # Amplitude -> dB relative to the frame's own maximum.
        mel_db = librosa.amplitude_to_db(mel, ref=np.max)
        spectrograms.append(np.nan_to_num(mel_db))
    return spectrograms

In [None]:
# Preview: convert the first test soundscape found and plot every frame.
path_audio = glob.glob("/kaggle/input/birdclef-2022/test_soundscapes/soundscape_*.ogg")[0]
print(path_audio)
sgs = create_spectrogram(path_audio, reduce_noise=False)

# squeeze=False keeps `axarr` two-dimensional even when there is a single frame;
# otherwise plt.subplots returns a bare Axes object that cannot be indexed.
fig, axarr = plt.subplots(nrows=len(sgs), figsize=(8, 2 * len(sgs)), squeeze=False)
for ax, sg in zip(axarr[:, 0], sgs):
    # Print the value range to check it against the fixed colour scale below.
    print(np.min(sg), np.max(sg))
    im = ax.imshow(sg, vmin=-80, vmax=0)
    plt.colorbar(im, ax=ax)
fig.tight_layout()

In [None]:
from tqdm.auto import tqdm
from functools import partial
from joblib import Parallel, delayed
from PIL import Image

# File extension appended to every exported spectrogram image name.
img_extension = ".png"


def convert_and_export(
    fn, path_in, path_out, reduce_noise = False, frame_size: int = 5
) -> list:
    """Convert one audio file to spectrogram images and describe each exported frame.

    Args:
        fn: audio file name, relative to ``path_in``.
        path_in: folder containing the audio file.
        path_out: folder where the images are written (created when missing).
        reduce_noise: forwarded to ``create_spectrogram``.
        frame_size: frame duration in seconds, forwarded to ``create_spectrogram``.

    Returns:
        One dict per frame with the keys ``img_name``, ``end_time`` and ``file_id``.
    """
    frames = create_spectrogram(
        os.path.join(path_in, fn), reduce_noise=reduce_noise, frame_size=frame_size
    )
    file_id = os.path.splitext(fn)[0]
    records = []
    for idx, frame_db in enumerate(frames):
        path_img = os.path.join(path_out, fn + f".{idx:03}" + img_extension)
        os.makedirs(os.path.dirname(path_img), exist_ok=True)
        # Map the [-80, 0] range onto [0, 255]; values outside are clipped.
        pixels = np.clip((frame_db + 80) / 80.0, a_min=0, a_max=1) * 255
        img = Image.fromarray(pixels.astype(np.uint8))
        img.resize((256,256)).save(path_img)
        records.append({
            "img_name": os.path.basename(path_img),
            "end_time": (idx + 1) * frame_size,
            "file_id": file_id,
        })
    return records

In [None]:
# Bind the input/output folders so the workers only need the file name.
_convert_and_export = partial(
    convert_and_export,
    path_in=os.path.join(PATH_DATASET, "test_soundscapes"),
    path_out=os.path.join("/kaggle/temp", "test_images"),
)

# Base names of all test soundscapes.
soundscapes = [
    os.path.basename(path)
    for path in glob.glob(os.path.join(PATH_DATASET, "test_soundscapes", "*.ogg"))
]

# Convert the files in 3 parallel jobs and flatten the per-file record lists.
converted = []
for batch in Parallel(n_jobs=3)(delayed(_convert_and_export)(fn) for fn in tqdm(soundscapes)):
    converted.extend(batch)

In [None]:
# One row per exported spectrogram frame (columns: img_name, end_time, file_id).
df_converted = pd.DataFrame(converted)
display(df_converted.head())

# Inference with Lightning⚡Flash

**Follow the example:** https://lightning-flash.readthedocs.io/en/stable/reference/audio_classification.html

In [None]:
!pip --version
!pip install -q 'lightning-flash[audio,image]' -U --find-links /kaggle/input/birdclef-eda-baseline-flash-efficientnet/frozen_packages/ --no-index
!pip install -q timm -U --find-links /kaggle/input/birdclef-submissions/packages/ --no-index
!pip uninstall -y wandb

In [None]:
import torch

import flash
from flash.audio import AudioClassificationData
from flash.image import ImageClassifier

### 1. Load the task ⚙️

In [None]:
# Load the trained ImageClassifier task from a checkpoint in the Kaggle input directory.
# The commented-out path is an alternative checkpoint.
model = ImageClassifier.load_from_checkpoint(
    "/kaggle/input/birdclef-eda-baseline-flash-efficientnet/audio_classification_model.pt"
#     "/kaggle/input/birdclef-submissions/birdclef_classification_model_384px.pt"
)

# Labels stored on the model; they are paired with the predicted probabilities further below.
print(model.labels)

In [None]:
# Trainer arguments: request one GPU when CUDA is available, otherwise none.
GPUS = 1 if torch.cuda.is_available() else 0

trainer = flash.Trainer(gpus=GPUS)

### 2. Run predictions 🎉

In [None]:
from dataclasses import dataclass
from torchvision import transforms as T
from typing import Tuple, Callable, Optional
from flash.core.data.io.input_transform import InputTransform

@dataclass
class AudioClassificationInputTransform(InputTransform):
    """Input transform that prepares spectrogram images for the classifier."""

    # Size handed to the resize step.
    spectrogram_size: Tuple[int, int] = (128, 128)
    # Normalisation statistics, applied identically to each of the 3 channels.
    color_mean: float = 0.4173
    color_std: float = 0.15079

    def input_per_sample_transform(self) -> Callable:
        """Return the per-sample pipeline: to tensor, resize, then normalise."""
        channel_means = [self.color_mean] * 3
        channel_stds = [self.color_std] * 3
        steps = [
            T.ToTensor(),
            T.Resize(self.spectrogram_size),
            T.Normalize(channel_means, channel_stds),
        ]
        return T.Compose(steps)

    def target_per_sample_transform(self) -> Callable:
        """Return the callable that converts a target into a tensor."""
        return torch.as_tensor

In [None]:
# Build the prediction datamodule from the spectrogram images exported above.
datamodule = AudioClassificationData.from_data_frame(
    input_field="img_name",  # df_converted column holding the image file name
    predict_data_frame=df_converted,
    predict_images_root=os.path.join("/kaggle/temp", "test_images"),  # folder the images were exported to
    predict_transform=AudioClassificationInputTransform,
    # presumably forwarded to the transform, replacing its (128, 128) default - TODO confirm
    transform_kwargs=dict(spectrogram_size=(224, 224)),
    batch_size=24,
    num_workers=3,
)

In [None]:
# Run inference and flatten the per-batch outputs into one list.
predictions = []
for batch_probs in trainer.predict(model, datamodule=datamodule, output="probabilities"):
    predictions.extend(batch_probs)

# Format submission

Until this is resolved: https://www.kaggle.com/c/birdclef-2022/discussion/309001

In [None]:
import json

# Read the birds that need a row in the submission.
path_scored_birds = os.path.join(PATH_DATASET, "scored_birds.json")
with open(path_scored_birds) as fp:
    scored_birds = json.load(fp)

print(scored_birds)

In [None]:
# Expand every spectrogram frame into one submission row per scored bird.
submission = []
for idx, row in tqdm(df_converted.iterrows(), total=len(df_converted)):
    # `idx` is the DataFrame index label and is also used as the position in `predictions`.
    bird_probs = dict(zip(model.labels, predictions[idx]))
    submission.extend(
        {
            "row_id": f"{row['file_id']}_{bird}_{row['end_time']}",
            # A bird counts as present when its probability exceeds 0.5.
            "target": bird_probs[bird] > 0.5,
        }
        for bird in scored_birds
    )

In [None]:
# Index by row_id so the CSV contains exactly the columns row_id and target.
df_submission = pd.DataFrame(submission).set_index("row_id")
df_submission.to_csv("submission.csv")

! head submission.csv