In [None]:
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.notebook import tqdm
import os
import csv
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# --- Paths (relative to the notebook's working directory) ---
AUDIO_PATH = '../input/birdclef-2021/train_short_audio/'   # one sub-folder per primary_label
MEL_DATASET = '../working/melspectrogram_dataset/'         # where the PNG spectrograms are written

# --- Audio / spectrogram parameters ---
SAMPLING_RATE = 32000          # Hz; every recording is resampled to this rate on load
INPUT_LENGTH = 4 * 5           # seconds per spectrogram: four 5 s windows = 20 s
SPEC_SHAPE = (48, 4 * 128)     # (mel bands, time frames) of each spectrogram image
FMIN = 500                     # Hz (~min frequency for birds)
FMAX = 12500                   # Hz (~max frequency for birds)

## Dataset creation

In [None]:
# Read the training metadata table; the loop further down uses its
# `primary_label` and `filename` columns to locate each audio file.
METADATA_CSV = '../input/birdclef-2021/train_metadata.csv'
metadata_df = pd.read_csv(METADATA_CSV)                      # (62874 files)
# Optional filters on the `rating` column -- uncomment one to keep fewer files:
# metadata_df = metadata_df.query('rating>=3.5')             # (48886 files)
# metadata_df = metadata_df.query('rating>=4')               # (38226 files)

In [None]:
def get_spectrograms(audio_path, primary_label, output_dir):
    """Cut one recording into fixed-length lumps and save each as a mel-spectrogram PNG.

    The audio is loaded at SAMPLING_RATE and split into consecutive,
    non-overlapping lumps of INPUT_LENGTH seconds; a trailing remainder shorter
    than one full lump is dropped. Each lump is turned into a dB-scaled mel
    spectrogram, min-max normalised to [0, 1] and written as an 8-bit greyscale
    image to ``output_dir/primary_label/<audio file stem>_<lump index>.png``.

    Side effect: the sum and the sum of squares of every normalised spectrogram
    are added to the module-level ``spec_stats`` array (indices 0 and 1), which
    must exist before the first call.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to process.
    primary_label : str
        Name of the sub-folder of ``output_dir`` the images are written to.
    output_dir : str
        Root folder of the spectrogram dataset.

    Returns
    -------
    list of str
        Paths of the PNG files written, in lump order. Empty when the
        recording is shorter than one lump (nothing is written in that case).
    """
    # `offset` is left at its default (start of file) rather than passing None.
    sig, _ = librosa.load(audio_path, sr=SAMPLING_RATE)

    # Number of audio samples in one lump (INPUT_LENGTH s * SAMPLING_RATE Hz)
    lump_samples = int(INPUT_LENGTH * SAMPLING_RATE)
    n_lumps = len(sig) // lump_samples   # only complete lumps are kept
    if n_lumps == 0:
        return []

    # Loop-invariant: hop length derived from the lump length and the
    # requested number of time frames, SPEC_SHAPE[1].
    hop_length = int((INPUT_LENGTH * SAMPLING_RATE) / (SPEC_SHAPE[1] - 1))

    save_dir = os.path.join(output_dir, primary_label)   # output_dir/primary_label
    os.makedirs(save_dir, exist_ok=True)
    file_stem = os.path.splitext(os.path.basename(audio_path))[0]

    samples = []
    for lump_idx in range(n_lumps):
        start = lump_idx * lump_samples
        lump = sig[start:start + lump_samples]

        mel_spec = librosa.feature.melspectrogram(y=lump,
                                                  sr=SAMPLING_RATE,
                                                  n_fft=1024,
                                                  hop_length=hop_length,
                                                  n_mels=SPEC_SHAPE[0],
                                                  fmin=FMIN,
                                                  fmax=FMAX)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Min-max normalise to [0, 1].
        mel_spec -= mel_spec.min()
        peak = mel_spec.max()
        if peak > 0:
            mel_spec /= peak
        # else: the lump is constant (e.g. digitally silent), so it is already
        # all zeros. Dividing would be 0/0 = NaN, which would turn the
        # dataset-wide sums in spec_stats into NaN as well.

        # Running sums used later to compute the dataset mean and std
        spec_stats[0] += mel_spec.sum()
        spec_stats[1] += (mel_spec ** 2).sum()

        # Scale to 0..255 and store as an 8-bit greyscale (mode "L") image
        save_path = os.path.join(save_dir, '{}_{}.png'.format(file_stem, lump_idx))
        Image.fromarray(mel_spec * 255.0).convert("L").save(save_path)

        samples.append(save_path)

    return samples

In [None]:
# Length of a recording, counted in whole 5 s units
def get_duration(audio_path, unit_seconds=5):
    """Return how many complete ``unit_seconds``-long units fit in a recording.

    The file is loaded at SAMPLING_RATE and its sample count is floor-divided
    by the number of samples in one unit, so a partial trailing unit is not
    counted.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to measure.
    unit_seconds : int or float, optional
        Length of one unit in seconds. Defaults to 5, the value that was
        previously hard-coded.

    Returns
    -------
    int
        Number of complete units in the recording (0 if it is shorter than one).
    """
    # `offset` is left at its default (start of file) rather than passing None.
    sig, _ = librosa.load(audio_path, sr=SAMPLING_RATE)
    unit_samples = int(unit_seconds * SAMPLING_RATE)   # samples per unit (5 s * 32000 Hz by default)
    return len(sig) // unit_samples

In [None]:
# Parse audio files and extract training samples
TRAIN_SPECS = []              # paths of every spectrogram image written
spec_stats = np.zeros(2)      # [sum of pixels, sum of squared pixels]; updated inside get_spectrograms

# duration.csv gets one row per metadata entry: the value returned by get_duration.
# The context manager closes (and flushes) the file even if an audio file fails
# to decode part-way through; newline='' is what the csv module asks for so that
# no extra blank lines appear on platforms that translate line endings.
with open('duration.csv', 'w', newline='') as f, tqdm(total=len(metadata_df)) as pbar:
    writer = csv.writer(f)
    for idx, row in metadata_df.iterrows():
        audio_file_path = os.path.join(AUDIO_PATH, row.primary_label, row.filename)
        TRAIN_SPECS += get_spectrograms(audio_file_path, row.primary_label, MEL_DATASET)
        # NOTE: get_duration loads the same file a second time.
        writer.writerow([get_duration(audio_file_path)])
        pbar.update(1)
        # Quick test run: uncomment to stop after the first few files
        # if idx == 12:
        #     break

print('SUCCESSFULLY EXTRACTED {} SPECTROGRAMS'.format(len(TRAIN_SPECS)))

In [None]:
# Plot the first 12 spectrograms of TRAIN_SPECS (or all of them if fewer exist,
# e.g. after a shortened test run -- indexing 12 blindly would raise IndexError)
n_preview = min(12, len(TRAIN_SPECS))
plt.figure(figsize=(15, 7))
for i in range(n_preview):
    plt.subplot(6, 2, i + 1)
    plt.title(TRAIN_SPECS[i].split(os.sep)[-1])
    # Open inside a context manager so the image file handle is released
    with Image.open(TRAIN_SPECS[i]) as spec:
        # origin='lower' puts row 0 of the image at the bottom of the plot
        plt.imshow(spec, origin='lower')

In [None]:
# Dataset-wide pixel MEAN and STD from the running sums in spec_stats.
# (These cover only the spectrograms extracted here, i.e. audio < 20 s is
#  discarded; I will use mean and std obtained with all spectrograms instead.)

# Total number of pixels = number of images * pixels per image
pixels_per_spec = SPEC_SHAPE[0] * SPEC_SHAPE[1]
count = len(TRAIN_SPECS) * pixels_per_spec

# spec_stats holds [sum(x), sum(x^2)]
pixel_sum, pixel_sq_sum = spec_stats
total_sound_mean = pixel_sum / count
total_sound_var = pixel_sq_sum / count - total_sound_mean ** 2   # Var[x] = E[x^2] - (E[x])^2
total_sound_std = np.sqrt(total_sound_var)

print('mean: ', total_sound_mean)
print('std:  ', total_sound_std)

In [None]:
import shutil

# Zip the spectrogram folder and then delete the folder itself.
# MEL_DATASET and INPUT_LENGTH are reused (instead of repeating the folder name
# and "20s" by hand) so the archive always matches where the images were
# actually written and the lump length they were cut with.
archive_base = './melspec_dataset_{}s'.format(INPUT_LENGTH)   # -> ./melspec_dataset_20s.zip
shutil.make_archive(archive_base, 'zip', MEL_DATASET)         # resulting file is <archive_base>.zip
shutil.rmtree(MEL_DATASET)                                    # remove folder