In [1]:
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import colorednoise as cn
import torch
import torchaudio
import torchaudio.transforms as T

from tqdm import tqdm
from PIL import Image
from tqdm import tqdm
%matplotlib inline

In [2]:
# Root folder of the raw recordings. The processing loop treats every entry
# directly inside it as one bird-species sub-directory of audio files.
audio_dir = "dataset/"

In [3]:
# Entries found directly under `audio_dir`. os.listdir() returns them in
# arbitrary, filesystem-dependent order, so sort for a reproducible listing.
dir_list = sorted(os.listdir(audio_dir))

In [4]:
# Root output folder for the generated spectrogram images.
spectrograms_dir = 'spectrogram/'
# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(spectrograms_dir, exist_ok=True)

In [5]:
# Amplitude factor applied to the additive noise used for augmentation.
noise_scale = 5e-3

In [None]:
# Masking parameters handed to the torchaudio time/frequency masking
# transforms; both axes use the same value.
time_mask_param = freq_mask_param = 10

In [None]:
# Build the two masking transforms from the parameters configured above:
# one masks along the time axis, the other along the frequency axis.
time_masking, freq_masking = (
    T.TimeMasking(time_mask_param),
    T.FrequencyMasking(freq_mask_param),
)

In [6]:
# Convert every recording under `audio_dir` into augmented mel-spectrogram images.
#
# For each sub-directory of `audio_dir` (one per bird species) and each audio
# file inside it:
#   1. load at most the first 45 seconds of audio,
#   2. add pink (1/f) noise scaled by `noise_scale`,
#   3. cut the noisy signal into 10-second chunks,
#   4. turn each chunk into a dB-scaled mel spectrogram (128 bands, up to 8 kHz),
#   5. apply the `time_masking` and `freq_masking` transforms,
#   6. save the result as `<recording>_<chunk number>.png` in
#      `spectrograms_dir/<species>/`.
chunk_seconds = 10
# librosa's default STFT window length (n_fft). A signal shorter than this
# cannot fill a single analysis window, so its spectrogram would be mostly
# zero-padding.
min_chunk_samples = 2048

for directory in tqdm(os.listdir(audio_dir)):

    species_dir = os.path.join(audio_dir, directory)
    if not os.path.isdir(species_dir):
        # A stray file next to the species folders would make os.listdir() raise.
        continue

    # Mirror the species folder inside the spectrogram output directory.
    output_dir = os.path.join(spectrograms_dir, directory)
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(species_dir):

        # Load (at most) the first 45 seconds of the recording.
        file_path = os.path.join(species_dir, filename)
        y, sr = librosa.load(file_path, duration=45)
        if len(y) < min_chunk_samples:
            # Nothing usable in this file: not even one FFT window of audio.
            continue

        # Pink-noise augmentation. np.random.normal would produce *white*
        # noise; colorednoise generates unit-variance 1/f**exponent noise,
        # and exponent 1 is pink noise.
        pink_noise = noise_scale * cn.powerlaw_psd_gaussian(1, len(y))
        y_augmented = y + pink_noise

        # Split the audio into fixed-length chunks and process each one.
        chunk_size = chunk_seconds * sr
        for i, chunk_start in enumerate(range(0, len(y_augmented), chunk_size)):
            chunk = y_augmented[chunk_start:chunk_start + chunk_size]
            if len(chunk) < min_chunk_samples:
                # Only the trailing remainder can be this short; skip it
                # rather than saving a degenerate image.
                continue

            # melspectrogram returns a *power* spectrogram (power=2.0 by
            # default), so power_to_db is the matching conversion;
            # amplitude_to_db would double every dB value.
            S = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=128, fmax=8000)
            S_dB = librosa.power_to_db(S, ref=np.max)

            # Time/frequency masking. With ref=np.max the loudest bin sits at
            # 0 dB, so the transforms' default fill value of 0.0 would paint
            # the masks as the loudest regions; fill with the floor instead.
            mask_value = float(S_dB.min())
            augmented_melspec = torch.as_tensor(S_dB, dtype=torch.float32).unsqueeze(0)  # add leading dim
            augmented_melspec = time_masking(augmented_melspec, mask_value=mask_value)
            augmented_melspec = freq_masking(augmented_melspec, mask_value=mask_value)
            S_dB = augmented_melspec.squeeze(0).numpy()  # back to a 2-D NumPy array

            # Render the spectrogram without axes or padding and save it as PNG.
            output_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_{i+1}.png")
            fig, ax = plt.subplots(figsize=(10.00, 6.00), dpi=100)
            librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=sr, fmax=8000, ax=ax)
            ax.axis('off')
            fig.tight_layout()
            fig.savefig(output_file, bbox_inches='tight', pad_inches=0)
            # Close explicitly so figures do not accumulate across thousands of chunks.
            plt.close(fig)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [29:45<00:00, 178.52s/it]
