# Converting audio to 🧮 spectrogram

See: **[Is bird presented all the time in training recordings?](https://www.kaggle.com/c/birdclef-2022/discussion/308861)**

This kernel converts the audio dataset into an image dataset, since in many cases a spectrogram is a good representation of an audio recording.

**NOTE: as we create the image dataset in this kernel's home, it can be easily attached to your future training kernel...**

See: [Easy Kaggle Offline Submission With Chaining Kernels](https://towardsdatascience.com/easy-kaggle-offline-submission-with-chaining-kernels-30bba5ea5c4d)

In [None]:
# noisereduce backs the optional noise-reduction step of the spectrogram conversion below.
!pip install -q noisereduce

# Root folder of the BirdCLEF 2022 input data; every audio/CSV path in this notebook is built from it.
PATH_DATASET = "/kaggle/input/birdclef-2022"

In [None]:
# Build wheels for the two GitHub archives into ./frozen_packages.
!pip wheel -q https://github.com/Borda/kaggle_image-classify/archive/refs/heads/main.zip --wheel-dir frozen_packages
!pip wheel -q https://github.com/PyTorchLightning/lightning-flash/archive/refs/heads/feature/soft_targets.zip --wheel-dir frozen_packages
# Remove every torch* file from the wheel folder.
!rm frozen_packages/torch*
# List the folder, keeping only the entries that mention "kaggle" or "flash".
!ls -l frozen_packages | grep -e kaggle -e flash

In [None]:
import os
import pandas as pd

# Read the training metadata, then shuffle it (frac=1 keeps every row, in random order).
path_csv = os.path.join(PATH_DATASET, "train_metadata.csv")
train_meta = pd.read_csv(path_csv)
train_meta = train_meta.sample(frac=1)
display(train_meta.head())

## Explore 🔍 audio lengths

Let us check the length of the individual recordings...

In [None]:
import torchaudio
from tqdm.auto import tqdm
from joblib import Parallel, delayed

def get_length(fn):
    """Return the size of the last dimension of the audio tensor loaded for ``fn``.

    ``fn`` is a path relative to the ``train_audio`` folder of ``PATH_DATASET``.
    """
    audio_path = os.path.join(PATH_DATASET, "train_audio", fn)
    # Only the waveform is needed here; the sample rate is ignored.
    waveform, _ = torchaudio.load(audio_path)
    return waveform.shape[-1]

# Measure every recording in parallel, using as many workers as there are CPU cores.
n_workers = os.cpu_count()
sizes = Parallel(n_jobs=n_workers)(
    delayed(get_length)(fn) for fn in tqdm(train_meta["filename"])
)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the recording lengths, drawn on log-log axes.
plt.hist(sizes, bins=150)
ax = plt.gca()
ax.set_xscale('log')
ax.set_yscale('log')
ax.grid()

## Convert 🗃️ audio to set of spectrograms

In [None]:
import torch
import torchaudio
import noisereduce
import matplotlib.pyplot as plt
import numpy as np
import librosa
from math import ceil
from pprint import pprint
from torch import Tensor
from torch.utils.data import DataLoader

# Settings forwarded to the mel-spectrogram computation in `create_spectrogram`.
SPECTROGRAM_PARAMS = {
    "sample_rate": 32_000,
    "hop_length": 640,
    "n_fft": 800,
    "n_mels": 128,
    "fmin": 20,
    "fmax": 16_000,
    "win_length": 512,
}
# Settings for the PCEN alternative, which is currently commented out in `create_spectrogram`.
PCEN_PARAS = {
    "time_constant": 0.06,
    "eps": 1e-6,
    "gain": 0.8,
    "power": 0.25,
    "bias": 10,
}


def _split_frames(waveform: np.ndarray, size: int, step: int) -> list:
    """Cut a 1-D signal into windows of ``size`` samples taken every ``step`` samples.

    * An empty signal yields no frames.
    * A signal shorter than one window is looped (tiled) and cropped to exactly ``size``.
    * Otherwise windows start at ``0, step, 2 * step, ...``. A trailing window that would
      run past the end is replaced by the last ``size`` samples of the signal, unless
      fewer than a third of a window remains, in which case it is dropped.
    """
    total = len(waveform)
    if total == 0:
        return []
    if total < size:
        # `ndarray.repeat` would repeat every *sample* (stretching the audio);
        # `np.tile` repeats the whole clip, which is what looping a recording means.
        reps = ceil(size / total)
        return [np.tile(waveform, reps)[:size]]

    # "+ 1" makes the loop visit the window that starts at or after `total - size`;
    # without it the end of the recording is never framed.
    count = ceil((total - size) / float(step)) + 1
    frames = []
    for idx in range(count):
        begin = idx * step
        frame = waveform[begin:begin + size]
        if len(frame) < size:
            if len(frame) < size * 0.33:
                # too little new audio left to be worth another window
                continue
            # align the final window with the end of the recording
            frame = waveform[-size:]
        frames.append(frame)
    return frames


# The decorator is kept from the original definition; nothing in the body uses torch.
@torch.no_grad()
def create_spectrogram(
    fname: str,
    reduce_noise: bool = False,
    frame_size: int = 5,
    frame_step: int = 2,
    spec_params: dict = SPECTROGRAM_PARAMS,
) -> list:
    """Load an audio file and convert it into a list of mel-spectrograms, one per window.

    Args:
        fname: path to the audio file.
        reduce_noise: if set, run ``noisereduce.reduce_noise`` on the waveform first.
        frame_size: window length in seconds.
        frame_step: offset between consecutive window starts in seconds.
        spec_params: spectrogram settings; the keys ``sample_rate``, ``n_fft``,
            ``win_length``, ``hop_length``, ``n_mels``, ``fmin`` and ``fmax`` are read.

    Returns:
        A list of 2-D arrays in dB relative to each window's own maximum
        (``ref=np.max``), with NaNs replaced by ``np.nan_to_num``. The list is empty
        when the file holds no samples.
    """
    # Resample to the configured rate and mix down to a single channel.
    waveform, sample_rate = librosa.load(fname, sr=spec_params["sample_rate"], mono=True)
    if reduce_noise:
        waveform = noisereduce.reduce_noise(
            y=waveform,
            sr=sample_rate,
            time_constant_s=float(frame_size),
            time_mask_smooth_ms=250,
            n_fft=spec_params["n_fft"],
            use_tqdm=False,
            n_jobs=2,
        )

    frames = _split_frames(
        waveform,
        size=int(frame_size * sample_rate),
        step=int(frame_step * sample_rate),
    )

    spectrograms = []
    for frm in frames:
        sg = librosa.feature.melspectrogram(
            y=frm,
            sr=sample_rate,
            n_fft=spec_params["n_fft"],
            win_length=spec_params["win_length"],
            hop_length=spec_params["hop_length"],
            n_mels=spec_params["n_mels"],
            fmin=spec_params["fmin"],
            fmax=spec_params["fmax"],
            power=1,  # magnitude (not power) spectrogram, matching amplitude_to_db below
        )
        # Disabled alternative to the dB scaling:
        # sg = librosa.pcen(sg, sr=sample_rate, hop_length=spec_params["hop_length"], **PCEN_PARAS)
        sg = librosa.amplitude_to_db(sg, ref=np.max)
        spectrograms.append(np.nan_to_num(sg))
    return spectrograms

In [None]:
# Convert one sample recording and plot every spectrogram window it produces.
path_audio = os.path.join(PATH_DATASET, "train_audio", "apapan/XC27331.ogg")
# path_audio = os.path.join(PATH_DATASET, "train_audio", "elepai/XC27344.ogg")
# path_audio = os.path.join(PATH_DATASET, "train_audio", "hawgoo/XC210217.ogg")
print(path_audio)
sgs = create_spectrogram(path_audio, reduce_noise=False)


# squeeze=False always returns a 2-D array of axes; with the default, a recording that
# yields a single window gives back a bare Axes object and `axarr[i]` fails.
fig, axarr = plt.subplots(nrows=len(sgs), figsize=(8, 3 * len(sgs)), squeeze=False)
axarr = axarr.ravel()
for i, sg in enumerate(sgs):
    print(np.min(sg), np.max(sg))
    im = axarr[i].imshow(sg)  # librosa
    plt.colorbar(im, ax=axarr[i])

**Export only frames from the beginning and the end of each recording**

In [None]:
from PIL import Image
from tqdm.auto import tqdm
from functools import partial
from joblib import Parallel, delayed


def convert_and_export(
    fn: str, path_in: str, path_out: str,
    reduce_noise: bool = False,
    frame_size: int = 5, frame_step: int = 2,
    img_extension: str = ".png",
) -> None:
    """Convert one recording to spectrogram images and save them under ``path_out``.

    The function is best-effort: any failure is printed (with its cause) and the
    recording or image is skipped, so one bad file does not abort a batch run.

    Args:
        fn: audio file path relative to ``path_in``; also used as the image name stem.
        path_in: folder containing the audio files.
        path_out: folder receiving the images, saved as ``<fn>.<index><img_extension>``.
        reduce_noise: forwarded to ``create_spectrogram``.
        frame_size: window length in seconds, forwarded to ``create_spectrogram``.
        frame_step: window offset in seconds, forwarded to ``create_spectrogram``.
        img_extension: image file suffix, including the dot.
    """
    path_audio = os.path.join(path_in, fn)
    try:
        sgs = create_spectrogram(
            path_audio,
            reduce_noise=reduce_noise,
            frame_size=frame_size,
            frame_step=frame_step,
        )
    except Exception as ex:
        print(f"Failed conversion for audio: {path_audio} ({ex!r})")
        return
    if not sgs:
        print(f"Too short audio for: {path_audio}")
        return
    # see: https://www.kaggle.com/c/birdclef-2022/discussion/308861
    # Keep only `nb` windows from each end of the recording (nb == 4 for the default
    # 5s window / 2s step). Clamped to 1 because a non-positive `nb` would make the
    # slices below overlap and export duplicated frames.
    nb = max(1, ceil((10 - frame_size) / frame_step) + 1)
    if len(sgs) > 2 * nb:
        sgs = sgs[:nb] + sgs[-nb:]
    path_base = os.path.join(path_out, fn)
    out_dir = os.path.dirname(path_base)
    if out_dir:  # os.makedirs("") raises, and there is nothing to create anyway
        os.makedirs(out_dir, exist_ok=True)
    for i, sg in enumerate(sgs):
        path_img = f"{path_base}.{i:03}{img_extension}"
        try:
            # Map [-80, 0] to [0, 255]; values outside that range are clipped.
            sg = np.clip((sg + 80) / 80.0, a_min=0, a_max=1) * 255
            img = Image.fromarray(sg.astype(np.uint8))
            img.resize((256, 256)).save(path_img)
        except Exception as ex:
            print(f"Failed exporting for image: {path_img} ({ex!r})")
            continue

## Perform conversion 🏃 full dataset

Running the conversion in parallel; optionally, you can also use a GPU.

In [None]:
# Pre-bind the input/output folders so each job only needs the file name.
_convert_and_export = partial(
    convert_and_export,
    path_out="train_images",
    path_in=os.path.join(PATH_DATASET, "train_audio"),
)

# Lazy job stream: tqdm advances as joblib pulls tasks from the generator.
conversion_jobs = (delayed(_convert_and_export)(fn) for fn in tqdm(train_meta["filename"]))
_ = Parallel(n_jobs=3)(conversion_jobs)
# _= list(map(_convert_and_export, tqdm(train_meta["filename"])))

In [None]:
import glob
from pprint import pprint

# List the images exported for the recording with index label 1, then show one of them.
sample_fn = train_meta["filename"][1]
print(sample_fn)
imgs = glob.glob(os.path.join("train_images", f"{sample_fn}.*.png"))
pprint(sorted(imgs))

path_img = imgs[0]
print(path_img)
img = plt.imread(path_img)
print(img.shape)
plt.imshow(img)

## Compute 🎟️ color normalizations

In [None]:
def _color_means(img_path):
    """Return ``(mean, std)`` of the pixel values of one image.

    Pixel values are divided by 255 when their maximum exceeds 1.5. For a 2-D
    image both results are scalars; otherwise each is a dict mapping channel
    index 0, 1, 2 to that channel's statistic.
    """
    pixels = plt.imread(img_path)
    if np.max(pixels) > 1.5:
        pixels = pixels / 255.0
    if pixels.ndim == 2:
        return np.mean(pixels), np.std(pixels)
    channel_means = {ch: np.mean(pixels[..., ch]) for ch in range(3)}
    channel_stds = {ch: np.std(pixels[..., ch]) for ch in range(3)}
    return channel_means, channel_stds

# Collect every exported PNG (one sub-folder level deep) and compute its statistics in parallel.
image_pattern = os.path.join("train_images", "*", "*.png")
images = glob.glob(image_pattern)
clr_mean_std = Parallel(n_jobs=os.cpu_count())(
    delayed(_color_means)(fn) for fn in tqdm(images)
)

In [None]:
# Summary statistics over the per-image means (first item of every result pair).
per_image_means = [mean for mean, _ in clr_mean_std]
img_color_mean = pd.DataFrame(per_image_means).describe()
display(img_color_mean.transpose())

In [None]:
# Summary statistics over the per-image standard deviations (second item of every result pair).
per_image_stds = [std for _, std in clr_mean_std]
img_color_std = pd.DataFrame(per_image_stds).describe()
display(img_color_std.transpose())

In [None]:
# Reduce both summaries to their "mean" row: the average per-image mean and std.
# NOTE: this rebinds both names from DataFrame to list, so the cell cannot be re-run on its own.
img_color_mean = list(img_color_mean.loc["mean"])
img_color_std = list(img_color_std.loc["mean"])
print(img_color_mean, img_color_std)