- Reference
    - [RFCX: Audio Data Augmentation(Japanese+English) by: Hidehisa Arai](https://www.kaggle.com/hidehisaarai1213/rfcx-audio-data-augmentation-japanese-english)
    - [Bird 2022 EDA - [Twitch Live Stream] by: Rob Mulla](https://www.kaggle.com/robikscube/bird-2022-eda-twitch-live-stream)
    - [BirdCLEF_2022_Starter by: DrCapa](https://www.kaggle.com/drcapa/birdclef-2022-starter)
    - [🦜BirdCLEF: world🌎map with birds by: Jirka Borovec](https://www.kaggle.com/jirkaborovec/birdclef-world-map-with-birds)
    - [🦜BirdCLEF: EDA 🔎 & more... by: Jirka Borovec](https://www.kaggle.com/jirkaborovec/birdclef-eda-more)

In [None]:
import numpy as np
import pandas as pd
import os
import json
from pathlib import Path
from tqdm import tqdm
from glob import glob
import matplotlib.pyplot as plt
# import japanize_matplotlib
import seaborn as sns
%matplotlib inline
plt.style.use('fivethirtyeight')

import librosa
import librosa.display
import IPython.display as ipd
from IPython.display import Audio

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Root folder of the competition data; every other path is derived from it.
base_dir = Path('../input/birdclef-2022')
# os.listdir returns entries in arbitrary order, so sort for a stable listing.
sorted(os.listdir(base_dir))

## Load Data

In [None]:
# Tabular inputs shipped with the competition, read in a fixed order.
train_meta, test_data, ebird_data, samp_subm = (
    pd.read_csv(base_dir / csv_name)
    for csv_name in (
        'train_metadata.csv',
        'test.csv',
        'eBird_Taxonomy_v2021.csv',
        'sample_submission.csv',
    )
)

# Content of scored_birds.json, parsed as-is.
with (base_dir / 'scored_birds.json').open() as f:
    scored_birds = json.load(f)

In [None]:
# Preview the first rows of the training metadata.
train_meta.head()

In [None]:
# Number of recordings per common name, smallest first, as horizontal bars.
common_name_counts = train_meta['common_name'].value_counts(ascending=True)
common_name_counts.plot.barh(figsize=(3, 24), grid=True)

In [None]:
# Number of recordings per primary label, smallest first, as horizontal bars.
primary_label_counts = train_meta['primary_label'].value_counts(ascending=True)
primary_label_counts.plot.barh(figsize=(3, 24), grid=True)


In [None]:
def plot_count(feature, title, df, size=1):
    '''Plot the ten most frequent values of a column as horizontal bars.

    Every bar is annotated with the share (in percent) of all rows of ``df``
    that carry the corresponding value.

    Param:
        feature : name of the column to analyse
        title : text inserted into the plot title
        df : DataFrame that contains ``feature``
        size : width multiplier; the figure is ``4 * size`` inches wide
               and 4 inches tall (default 1)
    '''
    fig, ax = plt.subplots(1, 1, figsize=(4 * size, 4))
    total = float(len(df))
    # Show at most the 10 most frequent values.
    top_values = df[feature].value_counts().index[:10]
    sns.countplot(y=df[feature], order=top_values, palette='Set3', ax=ax)
    ax.set_title('Number and percentage of {}'.format(title))

    # Leave room on the right so the percentage labels are not clipped.
    ax.margins(x=0.2)
    for bar in ax.patches:
        count = bar.get_width()
        # Offset in points (not data units) so the label sits next to the
        # bar end regardless of how large the counts are.
        ax.annotate('{:1.2f}%'.format(100 * count / total),
                    xy=(count, bar.get_y() + bar.get_height() / 2.),
                    xytext=(3, 0),
                    textcoords='offset points',
                    ha='left',
                    va='center')
    plt.tight_layout()
    plt.show()


In [None]:
# Check the distribution of `rating`.
plot_count('rating', 'rating', train_meta, size=2)

In [None]:
# Check the distribution of `secondary_labels`.
plot_count('secondary_labels', 'secondary_labels', train_meta, size=3)

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Word cloud built from the text of every `secondary_labels` entry.
stopwords = set(STOPWORDS)
# Drop missing entries and force str so that ' '.join cannot fail on NaN.
secondary_label_text = ' '.join(train_meta.secondary_labels.dropna().astype(str))
wordcloud = WordCloud(width = 5000, height = 4000,
                      background_color ='white',
                      stopwords = stopwords,
                      min_font_size = 10).generate(secondary_label_text)

# A single figure: creating an extra one first only leaves an empty plot behind.
fig = plt.figure(figsize=(10,5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
!pip install -q GeoPandas

In [None]:
import geopandas as gpd

# The easiest way to plot data from Pandas on a world map
# https://www.kaggle.com/jirkaborovec/birdclef-world-map-with-birds

# initialize an axis
fig, ax = plt.subplots(figsize=(16,12))
# plot map on axis
# NOTE(review): relies on the dataset bundled with geopandas < 1.0.
countries = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
countries.plot(color='lightgrey', ax=ax)

# plot one point per recording (x = longitude, y = latitude)
train_meta.plot.scatter(x='longitude', y='latitude', s=5, ax=ax) # , c='brightness'

# Pass the on/off flag positionally: the old `b=` keyword was renamed to
# `visible=` and is rejected by current matplotlib.
ax.grid(True, alpha=0.25)

In [None]:
# initialize an axis
fig, ax = plt.subplots(figsize=(24,18))
# plot map on axis
# NOTE(review): relies on the dataset bundled with geopandas < 1.0.
countries = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
countries.plot(color='lightgrey', ax=ax)

# plot points: one colour per species, marker area proportional to the number
# of recordings that fall on the same location after rounding to 0.1 degree
cmap = plt.get_cmap('jet')  # plt.cm.get_cmap is gone from recent matplotlib
birds = train_meta['primary_label'].nunique()
for i, (bird, dfg) in enumerate(train_meta.groupby('primary_label')):
    # assign() works on a copy, so the slice handed out by groupby is not mutated
    dfgg = (
        dfg.assign(longitude=np.around(dfg.longitude, 1),
                   latitude=np.around(dfg.latitude, 1))
        .groupby(['longitude', 'latitude'])
        .size()
        .reset_index(name='counts')
    )
    # A single-row 2D colour is unambiguous; a bare RGBA tuple can be mistaken
    # for four scalar values when the group happens to have four points.
    dfgg.plot(x='longitude', y='latitude', kind='scatter', c=[cmap(float(i) / birds)], s=dfgg['counts'] * 5, ax=ax, label=bird, alpha=0.5)

ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.25), ncol=15, fancybox=True, shadow=True)

# get axes limits
x_lo, x_up = ax.get_xlim()
y_lo, y_up = ax.get_ylim()
# add minor ticks with a specified spacing (deg)
deg = 5
# add grid
ax.set_xticks(np.arange(np.ceil(x_lo), np.ceil(x_up), deg), minor=True)
ax.set_yticks(np.arange(np.ceil(y_lo), np.ceil(y_up), deg), minor=True)
# on/off flag passed positionally: the old `b=` keyword no longer exists
ax.grid(True, which='minor', alpha=0.25)

## Load Audio Data

In [None]:
# List the recordings available for the species `afrsil1`.
# glob yields paths in arbitrary order; sorting makes index 0 reproducible.
afrsil1_ogg_files = sorted(glob(f'{base_dir}/train_audio/afrsil1/*.ogg'))
afrsil1_ogg_files[:5], len(afrsil1_ogg_files)

In [None]:
# Read only the first 10 seconds of the first file.
y, sr = librosa.load(afrsil1_ogg_files[0], duration=10)
# Print the samples, a blank line, then the sampling rate.
print(y, sr, sep='\n\n')

In [None]:
# Play back the loaded clip in the notebook.
Audio(y, rate=sr)

In [None]:
# Plot the waveform (amplitude against sample index).
x = range(len(y))
# Drawn once: the earlier duplicate was fully hidden under this blue line.
plt.plot(x, y, color='blue')
# Explicit flag: a bare plt.grid() toggles, so calling it twice cancels out.
plt.grid(True)

In [None]:
# Compute the mel-scaled spectrogram with 128 mel bands.
mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
# Draw it (values not yet converted to dB) together with a colour bar.
mel_img = librosa.display.specshow(mel_spec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(mel_img)

In [None]:
# Convert the power spectrogram (squared amplitude) to decibels,
# i.e. move it onto a log scale.
melspec = librosa.power_to_db(mel_spec)
# Draw the dB-scaled spectrogram together with a colour bar.
mel_img = librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(mel_img)

## Data Augmentation

1. AddGaussianNoise
2. GaussianNoiseSNR
3. PinkNoiseSNR
4. PitchShift
5. TimeStretch
6. TimeShift
7. VolumeControl

In [None]:
class AudioTransform:
    """Base class for waveform augmentations applied with probability ``p``.

    Subclasses implement :meth:`apply`. Calling the instance either returns
    ``apply(y)`` or hands ``y`` back untouched.
    """

    def __init__(self, always_apply=False, p=0.5):
        self.always_apply, self.p = always_apply, p

    def __call__(self, y: np.ndarray):
        # `or` short-circuits, so no random number is drawn when
        # always_apply is set.
        should_apply = self.always_apply or np.random.rand() < self.p
        return self.apply(y) if should_apply else y

    def apply(self, y: np.ndarray):
        """Transform ``y``; must be provided by subclasses."""
        raise NotImplementedError


class Compose:
    """Run several transforms in sequence, feeding each the previous output."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        result = y
        for step in self.transforms:
            result = step(result)
        return result


class OneOf:
    """Apply exactly one transform, picked uniformly at random per call."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        # np.random.choice(n) draws an index from range(n).
        chosen = self.transforms[np.random.choice(len(self.transforms))]
        return chosen(y)

### 1. AddGaussianNoise


In [None]:
class AddGaussianNoise(AudioTransform):
    """Add standard-normal noise scaled by a randomly drawn amplitude.

    The amplitude is sampled uniformly from ``[0, max_noise_amplitude)`` on
    every call. Extra keyword arguments are accepted and ignored.
    """

    def __init__(self, always_apply=False, p=0.5, max_noise_amplitude=0.5, **kwargs):
        super().__init__(always_apply, p)
        self.noise_amplitude = (0.0, max_noise_amplitude)

    def apply(self, y: np.ndarray, **params):
        low, high = self.noise_amplitude
        # Draw the scale first, then one noise sample per element of y.
        scale = np.random.uniform(low, high)
        gaussian = np.random.randn(len(y))
        noisy = y + gaussian * scale
        # Cast back so the output keeps the input's dtype.
        return noisy.astype(y.dtype)

In [None]:
# Always-applied Gaussian noise with an amplitude of at most 0.05.
transform = AddGaussianNoise(max_noise_amplitude=0.05, always_apply=True)
y_gaussian_added = transform(y)

# Listen to the augmented clip.
Audio(data=y_gaussian_added, rate=sr)

In [None]:
# Plot the waveform of the noise-augmented clip.
x = range(len(y_gaussian_added))
# Drawn once: the earlier duplicate was fully hidden under this blue line.
plt.plot(x, y_gaussian_added, color='blue')
# Explicit flag: a bare plt.grid() toggles, so calling it twice cancels out.
plt.grid(True)

In [None]:
# Compute the mel-scaled spectrogram of the noise-augmented clip.
mel_spec = librosa.feature.melspectrogram(y=y_gaussian_added, sr=sr, n_mels=128)
# Convert the power spectrogram (squared amplitude) to decibels,
# i.e. move it onto a log scale.
melspec = librosa.power_to_db(mel_spec)
# Draw the dB-scaled spectrogram together with a colour bar.
mel_img = librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(mel_img)

### 2. GaussianNoiseSNR

In [None]:
class GaussianNoiseSNR(AudioTransform):
    """Add white Gaussian noise at a random peak signal-to-noise ratio.

    The SNR (in dB) is sampled uniformly between ``min_snr`` and ``max_snr``.
    The noise is rescaled so that its peak absolute value equals the
    signal's peak divided by ``10 ** (snr / 20)``. Extra keyword arguments
    are accepted and ignored.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=20.0, **kwargs):
        super().__init__(always_apply, p)
        self.min_snr, self.max_snr = min_snr, max_snr

    def apply(self, y: np.ndarray, **params):
        snr_db = np.random.uniform(self.min_snr, self.max_snr)
        # sqrt(v ** 2) is |v|, so these are peak absolute amplitudes.
        peak_signal = np.sqrt(y ** 2).max()
        target_noise_peak = peak_signal / (10 ** (snr_db / 20))

        noise = np.random.randn(len(y))
        noise_peak = np.sqrt(noise ** 2).max()
        # Normalise the noise to peak 1, then scale it to the target peak.
        scaled_noise = noise / noise_peak * target_noise_peak
        return (y + scaled_noise).astype(y.dtype)

In [None]:
# Always-applied white noise with an SNR drawn between 5 and 20 dB.
transform = GaussianNoiseSNR(min_snr=5, max_snr=20, always_apply=True)
y_gaussian_snr = transform(y)
Audio(data=y_gaussian_snr, rate=sr)

In [None]:
# Plot the waveform of the SNR-controlled white-noise clip.
x = range(len(y_gaussian_snr))
# Drawn once: the earlier duplicate was fully hidden under this blue line.
plt.plot(x, y_gaussian_snr, color='blue')
# Explicit flag: a bare plt.grid() toggles, so calling it twice cancels out.
plt.grid(True)

In [None]:
# Mel spectrogram of the white-noise clip, converted to dB and drawn
# with a colour bar.
mel_spec = librosa.feature.melspectrogram(y=y_gaussian_snr, sr=sr, n_mels=128)
melspec = librosa.power_to_db(mel_spec)
mel_img = librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(mel_img)

### 3. PinkNoiseSNR

In [None]:
!pip install colorednoise

In [None]:
import colorednoise as cn

class PinkNoiseSNR(AudioTransform):
    """Add power-law (exponent 1) noise at a random peak signal-to-noise ratio.

    The SNR (in dB) is sampled uniformly between ``min_snr`` and ``max_snr``.
    The noise is rescaled so that its peak absolute value equals the
    signal's peak divided by ``10 ** (snr / 20)``. Extra keyword arguments
    are accepted and ignored.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=20.0, **kwargs):
        super().__init__(always_apply, p)
        self.min_snr, self.max_snr = min_snr, max_snr

    def apply(self, y: np.ndarray, **params):
        snr_db = np.random.uniform(self.min_snr, self.max_snr)
        # sqrt(v ** 2) is |v|, so these are peak absolute amplitudes.
        peak_signal = np.sqrt(y ** 2).max()
        target_noise_peak = peak_signal / (10 ** (snr_db / 20))

        noise = cn.powerlaw_psd_gaussian(1, len(y))
        noise_peak = np.sqrt(noise ** 2).max()
        # Normalise the noise to peak 1, then scale it to the target peak.
        scaled_noise = noise / noise_peak * target_noise_peak
        return (y + scaled_noise).astype(y.dtype)

In [None]:
# Always-applied pink noise with an SNR drawn between 5 and 20 dB.
transform = PinkNoiseSNR(min_snr=5.0, max_snr=20.0, always_apply=True)
y_pink_noise = transform(y)
Audio(data=y_pink_noise, rate=sr)

In [None]:
# Plot the waveform of the pink-noise clip.
x = range(len(y_pink_noise))
# Drawn once: the earlier duplicate was fully hidden under this blue line.
plt.plot(x, y_pink_noise, color='blue')
# Explicit flag: a bare plt.grid() toggles, so calling it twice cancels out.
plt.grid(True)

In [None]:
# Mel spectrogram of the pink-noise clip, converted to dB and drawn
# with a colour bar.
mel_spec = librosa.feature.melspectrogram(y=y_pink_noise, sr=sr, n_mels=128)
melspec = librosa.power_to_db(mel_spec)
mel_img = librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(mel_img)

### 4. PitchShift

In [None]:
class PitchShift(AudioTransform):
    """Shift the pitch by a random whole number of steps.

    Args:
        always_apply: apply on every call, ignoring ``p``.
        p: probability of applying the transform.
        max_steps: largest shift in either direction; the step count is
            drawn from the integers ``-max_steps .. +max_steps`` inclusive.
        sr: sampling rate of the signal, passed on to librosa.
    """

    def __init__(self, always_apply=False, p=0.5, max_steps=5, sr=32000):
        super().__init__(always_apply, p)

        self.max_steps = max_steps
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        # randint's upper bound is exclusive: without the +1 an upward shift
        # of max_steps could never be drawn, and max_steps=0 would raise.
        n_steps = np.random.randint(-self.max_steps, self.max_steps + 1)
        augmented = librosa.effects.pitch_shift(y, sr=self.sr, n_steps=n_steps)
        return augmented

In [None]:
# Always-applied pitch shift limited to 2 steps, at the clip's sampling rate.
transform = PitchShift(max_steps=2, sr=sr, always_apply=True)
y_pitch_shift = transform(y)
Audio(data=y_pitch_shift, rate=sr)

In [None]:
# Plot the waveform of the pitch-shifted clip.
x = range(len(y_pitch_shift))
# Drawn once: the earlier duplicate was fully hidden under this blue line.
plt.plot(x, y_pitch_shift, color='blue')
# Explicit flag: a bare plt.grid() toggles, so calling it twice cancels out.
plt.grid(True)

In [None]:
# Mel spectrogram of the pitch-shifted clip, converted to dB and drawn
# with a colour bar.
mel_spec = librosa.feature.melspectrogram(y=y_pitch_shift, sr=sr, n_mels=128)
melspec = librosa.power_to_db(mel_spec)
mel_img = librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(mel_img)

### 5. TimeStretch


In [None]:
class TimeStretch(AudioTransform):
    """Stretch or compress the signal in time by a random rate.

    Args:
        always_apply: apply on every call, ignoring ``p``.
        p: probability of applying the transform.
        max_rate: one end of the rate range; must be positive.
        min_rate: other end of the rate range; must be positive. Defaults
            to ``1 / max_rate`` so the range is symmetric around "no change".

    Raises:
        ValueError: if ``max_rate`` or ``min_rate`` is not positive.
    """

    def __init__(self, always_apply=False, p=0.5, max_rate=1.2, min_rate=None):
        super().__init__(always_apply, p)

        if max_rate <= 0:
            raise ValueError("`max_rate` must be positive")
        if min_rate is None:
            min_rate = 1.0 / max_rate
        if min_rate <= 0:
            raise ValueError("`min_rate` must be positive")
        self.max_rate = max_rate
        self.min_rate = min_rate

    def apply(self, y: np.ndarray, **params):
        # Sampling from 0 upwards could yield a rate of (almost) zero, which
        # is invalid or produces an enormous output; stay inside the range.
        low, high = sorted((self.min_rate, self.max_rate))
        rate = np.random.uniform(low, high)
        augmented = librosa.effects.time_stretch(y, rate=rate)
        return augmented

In [None]:
# Always-applied time stretch with max_rate set to 2.0.
transform = TimeStretch(max_rate=2.0, always_apply=True)
y_time_stretch = transform(y)
Audio(data=y_time_stretch, rate=sr)

In [None]:
# Mel spectrogram of the time-stretched clip, converted to dB and drawn
# with a colour bar.
mel_spec = librosa.feature.melspectrogram(y=y_time_stretch, sr=sr, n_mels=128)
melspec = librosa.power_to_db(mel_spec)
mel_img = librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(mel_img)

### 6. TimeShift

In [None]:
class TimeShift(AudioTransform):
    """Shift the signal in time by a random number of samples.

    Args:
        always_apply: apply on every call, ignoring ``p``.
        p: probability of applying the transform.
        max_shift_second: largest shift, in seconds, in either direction.
        sr: sampling rate used to convert seconds into samples.
        padding_mode: ``"replace"`` keeps the samples that wrap around the
            end of the array (plain ``np.roll``); ``"zero"`` silences them.
    """

    def __init__(self, always_apply=False, p=0.5, max_shift_second=2, sr=32000, padding_mode="replace"):
        super().__init__(always_apply, p)

        assert padding_mode in ["replace", "zero"], "`padding_mode` must be either 'replace' or 'zero'"
        self.max_shift_second = max_shift_second
        self.sr = sr
        self.padding_mode = padding_mode

    def apply(self, y: np.ndarray, **params):
        # randint needs integer bounds and excludes the upper one; the +1
        # keeps the range symmetric and valid even for a maximum shift of 0.
        max_shift = abs(int(self.sr * self.max_shift_second))
        shift = np.random.randint(-max_shift, max_shift + 1)
        # np.roll returns a new array, so the caller's signal is not modified.
        augmented = np.roll(y, shift)
        if self.padding_mode == "zero":
            if shift > 0:
                augmented[:shift] = 0
            elif shift < 0:
                # Must exclude shift == 0: augmented[0:] = 0 would wipe out
                # the whole signal.
                augmented[shift:] = 0
        return augmented

In [None]:
# Always-applied time shift of at most 4 seconds, at the clip's sampling rate.
transform = TimeShift(max_shift_second=4, sr=sr, always_apply=True)
y_time_shifted = transform(y)
Audio(data=y_time_shifted, rate=sr)

In [None]:
# Plot the waveform of the time-shifted clip.
x = range(len(y_time_shifted))
# Drawn once: the earlier duplicate was fully hidden under this blue line.
plt.plot(x, y_time_shifted, color='blue')
# Explicit flag: a bare plt.grid() toggles, so calling it twice cancels out.
plt.grid(True)

In [None]:
# Mel spectrogram of the time-shifted clip, converted to dB and drawn
# with a colour bar.
mel_spec = librosa.feature.melspectrogram(y=y_time_shifted, sr=sr, n_mels=128)
melspec = librosa.power_to_db(mel_spec)
mel_img = librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(mel_img)

### 7. VolumeControl

In [None]:
class VolumeControl(AudioTransform):
    """Change the volume by a random gain of up to ``db_limit`` decibels.

    Args:
        always_apply: apply on every call, ignoring ``p``.
        p: probability of applying the transform.
        db_limit: the gain in dB is drawn uniformly from
            ``[-db_limit, db_limit)``.
        mode: how the gain is spread over the signal:
            ``"uniform"`` - the same gain for every sample;
            ``"fade"``    - full gain on the first sample, ramping
                            linearly to 0 dB on the last one;
            ``"cosine"``  - gain modulated by one cosine period;
            ``"sine"``    - gain modulated by one sine period.
    """

    def __init__(self, always_apply=False, p=0.5, db_limit=10, mode='uniform'):
        super().__init__(always_apply, p)

        assert mode in ["uniform", "fade", "cosine", "sine"], \
            "`mode` must be one of 'uniform', 'fade', 'cosine', 'sine'"

        self.db_limit= db_limit
        self.mode = mode

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.db_limit, self.db_limit)
        n_samples = len(y)
        if self.mode == 'uniform':
            db_translated = 10 ** (db / 20)
        elif self.mode == 'fade':
            # linspace gives the same 1 -> 0 ramp as reversed-arange / (n - 1)
            # but has no 0/0 when the signal is a single sample.
            lin = np.linspace(1.0, 0.0, n_samples)
            db_translated = 10 ** (db * lin / 20)
        elif self.mode == 'cosine':
            cosine = np.cos(np.arange(n_samples) / n_samples * np.pi * 2)
            db_translated = 10 ** (db * cosine / 20)
        else:
            sine = np.sin(np.arange(n_samples) / n_samples * np.pi * 2)
            db_translated = 10 ** (db * sine / 20)
        # Cast back: multiplying by a float64 gain curve would otherwise
        # promote the output, unlike the other transforms in this notebook.
        augmented = (y * db_translated).astype(y.dtype)
        return augmented

In [None]:
# Always-applied volume change using the 'sine' gain curve.
transform = VolumeControl(mode='sine', always_apply=True)
y_volume_controlled = transform(y)
Audio(data=y_volume_controlled, rate=sr)

In [None]:
# Plot the waveform of the volume-controlled clip.
x = range(len(y_volume_controlled))
# Drawn once: the earlier duplicate was fully hidden under this blue line.
plt.plot(x, y_volume_controlled, color='blue')
# Explicit flag: a bare plt.grid() toggles, so calling it twice cancels out.
plt.grid(True)

In [None]:
# Mel spectrogram of the volume-controlled clip, converted to dB and drawn
# with a colour bar.
mel_spec = librosa.feature.melspectrogram(y=y_volume_controlled, sr=sr, n_mels=128)
melspec = librosa.power_to_db(mel_spec)
mel_img = librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(mel_img)