In [6]:
import random
import os
from os.path import splitext
from pydub import AudioSegment, utils
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display

# Base directory under which the input/ and output/ folders are looked up.
# Captured once, when this cell runs, from the current working directory.
DIR = os.getcwd()

# Lower-case file extensions treated as audio; names are lower-cased before matching.
VALID_AUDIO_EXTENSIONS = ('.mp3', '.wav', '.flac', '.ogg', '.m4a', '.aac')


def getListOfAudioFilesInFolder(folder_path):
  """
  List the audio files directly inside `folder_path`, ignoring everything else.

  An entry is kept when its name ends with one of VALID_AUDIO_EXTENSIONS
  (compared case-insensitively, so `clip.WAV` is accepted) and it is a regular
  file (a sub-directory that happens to be called `foo.mp3` is skipped).

  Args:
      folder_path: path of the folder to scan (not recursive).

  Returns:
      list[str]: bare file names (not full paths), in `os.listdir` order.
  """
  return [
    name for name in os.listdir(folder_path)
    if name.lower().endswith(VALID_AUDIO_EXTENSIONS)
    and os.path.isfile(os.path.join(folder_path, name))
  ]

def split_audio_into_windows(audio, window_duration=10000):
  """
  Cut one audio clip into consecutive, non-overlapping slices.

  Slices start at offsets 0, `window_duration`, 2 * `window_duration`, ... up to
  `len(audio)`. The final slice holds whatever remains, so it may be shorter
  than `window_duration`.

  Args:
      audio: the clip to cut; it is measured with `len()` and sliced with `[a:b]`.
      window_duration: slice length, in the same unit as `len(audio)`
          (milliseconds per the original documentation; default 10 seconds).

  Returns:
      list[] windows: the slices in order; empty when `audio` has length 0.
  """
  total_length = len(audio)
  return [
    audio[offset:offset + window_duration]
    for offset in range(0, total_length, window_duration)
  ]

# Overlay the gunshot audio onto the background audio at a random position.
def random_insertion(background_audio, gunshot_audio):
  """
  Mix `gunshot_audio` into `background_audio` at a randomly chosen offset.

  The offset is drawn uniformly from the positions at which the whole gunshot
  clip still fits inside the background clip.

  Returns:
      The result of `background_audio.overlay(...)`, or None when the gunshot
      clip is longer than the background clip or a ValueError is raised.
  """
  try:
    # Largest offset at which the gunshot still ends inside the background.
    max_offset = len(background_audio) - len(gunshot_audio)
    if max_offset < 0:
      # The gunshot does not fit; there is nothing to insert.
      return None

    offset = random.randint(0, max_offset)
    return background_audio.overlay(gunshot_audio, position=offset)
  except ValueError as e:
    # Best effort: report the problem and signal "no result" to the caller.
    print(f"Ignoring error: {e}")
    return None

# Generate a spectrogram for the audio segment and save it to the output file.
def generate_spectrogram(audio_segment, output_file):
  """
  Render a log-scaled mel spectrogram of `audio_segment` as an image.

  The segment is exported to a temporary mp3 under `{DIR}/output/audio`, loaded
  back with librosa at the segment's own frame rate, converted to a mel
  spectrogram in decibels (relative to its maximum), and drawn without axes to
  `{DIR}/output/spectrograms/{output_file}`.

  Both output folders are created if missing. The temporary mp3 is deleted and
  the figure is closed even when a step fails, so a bad clip does not leave
  stray files or open figures behind.

  Args:
      audio_segment: clip to render; must provide `export()` and `frame_rate`.
      output_file: image file name (including extension, e.g. "x.png").
  """
  audio_dir = f"{DIR}/output/audio"
  spectrogram_dir = f"{DIR}/output/spectrograms"
  os.makedirs(audio_dir, exist_ok=True)
  os.makedirs(spectrogram_dir, exist_ok=True)

  sample_rate = audio_segment.frame_rate
  temp_audio = f"{audio_dir}/{output_file}.mp3"
  try:
    audio_segment.export(temp_audio, format="mp3")
    samples = librosa.load(temp_audio, sr=sample_rate)[0]
  finally:
    # The samples are fully in memory now (or the export/load failed);
    # either way the temporary file is no longer needed.
    if os.path.exists(temp_audio):
      os.remove(temp_audio)

  # Use librosa to compute a mel spectrogram
  spectrogram = librosa.feature.melspectrogram(y=samples, sr=sample_rate)
  # Convert power to decibels, relative to the loudest bin
  log_spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

  # Plot and save the spectrogram
  figure = plt.figure(figsize=(10, 4))
  try:
    librosa.display.specshow(log_spectrogram, sr=sample_rate, x_axis='time', y_axis='linear')
    plt.axis('off')
    plt.savefig(f"{spectrogram_dir}/{output_file}", bbox_inches='tight', pad_inches=0)
  finally:
    # Always release the figure so repeated calls do not accumulate open figures.
    plt.close(figure)

# Randomly pick and decode the clip that will be inserted into one background window.
def _pick_clip_to_insert(gunshot_folder, gunshot_files, non_gunshot_folder, non_gunshot_files, gunshot_probability):
  """
  Choose a gunshot clip with probability `gunshot_probability`, otherwise a
  non-gunshot clip, and decode it.

  Returns:
      (audio, is_gunshot): the decoded clip and whether it came from the gunshot folder.

  Raises:
      IndexError: the folder selected by the random draw contains no audio files.
  """
  is_gunshot = random.random() <= gunshot_probability
  if is_gunshot:
    folder, files = gunshot_folder, gunshot_files
  else:
    folder, files = non_gunshot_folder, non_gunshot_files

  if not files:
    # random.choice would raise a bare IndexError here; name the folder instead.
    raise IndexError(f"No audio files found in '{folder}'; cannot pick a clip to insert")

  chosen = random.choice(files)
  return AudioSegment.from_file(os.path.join(folder, chosen)), is_gunshot


# Create multiple variations of gunshot-inserted audio files.
def create_variations(window_duration=10000, *, gunshot_probability=0.8, volumes=(0.08, 0.35, 0.70)):
  """
  Build the spectrogram data set from the clips under `{DIR}/input`.

  Every file in `input/background` is split into windows of `window_duration`.
  For each window:
    1. a clip is drawn from `input/gunshot` (with probability
       `gunshot_probability`) or from `input/non-gunshot` (otherwise);
    2. a spectrogram of the untouched, normalized window is written as
       `<background>_window=<i>_gun=0.png`;
    3. for each entry of `volumes`, the clip is scaled to that amplitude ratio,
       overlaid at a random position, and the normalized mix is written as
       `<background>_window=<i>_vol=<pct>%_gun=<0|1>.png`.
  Windows too short to hold the drawn clip get only the clean spectrogram.

  Args:
      window_duration: window length passed to `split_audio_into_windows`
          (default 10000, i.e. 10 seconds in milliseconds).
      gunshot_probability: chance, per window, of drawing a gunshot clip rather
          than a non-gunshot clip (default 0.8).
      volumes: amplitude ratios applied to the inserted clip, one output per
          ratio. Each is converted with `utils.ratio_to_db`, so values should be
          positive (presumably 0 is not convertible to decibels; verify).

  Raises:
      IndexError: a clip must be drawn from a folder that contains no audio files.
  """
  background_folder = f"{DIR}/input/background"
  gunshot_folder = f"{DIR}/input/gunshot"
  non_gunshot_folder = f"{DIR}/input/non-gunshot"
  background_files = getListOfAudioFilesInFolder(background_folder)
  gunshot_files = getListOfAudioFilesInFolder(gunshot_folder)
  non_gunshot_files = getListOfAudioFilesInFolder(non_gunshot_folder)

  for bg_file_name in background_files:
    bg_audio = AudioSegment.from_file(os.path.join(background_folder, bg_file_name))
    bg_stem = splitext(bg_file_name)[0]

    # Windows are numbered from 1 in the output file names.
    for window_number, window in enumerate(split_audio_into_windows(bg_audio, window_duration), start=1):
      insert_audio, include_gunshot = _pick_clip_to_insert(
        gunshot_folder, gunshot_files, non_gunshot_folder, non_gunshot_files, gunshot_probability
      )

      # Normalize the audio segments to prevent clipping
      window = window.normalize()
      insert_audio = insert_audio.normalize()

      window_name = f"{bg_stem}_window={window_number}"

      # Generate a clean spectrogram (the original background window without a gunshot)
      generate_spectrogram(window, f"{window_name}_gun=0.png")

      gun_label = 1 if include_gunshot else 0

      # Insert the clip into the background window once per volume level
      for volume in volumes:
        scaled_clip = insert_audio.apply_gain(utils.ratio_to_db(volume))
        result_audio = random_insertion(window, scaled_clip)
        if result_audio is None:
          # The clip did not fit into this window; nothing to render.
          continue

        result_audio = result_audio.normalize()
        # round(), not int(): float products such as 0.29 * 100 fall just below
        # the intended integer and would be truncated to the wrong percentage.
        output_filename = f"{window_name}_vol={round(volume * 100)}%_gun={gun_label}"
        generate_spectrogram(result_audio, f"{output_filename}.png")


In [None]:
# Create variations: run the whole pipeline with the default 10-second windows.
# Reads clips from the input/ folders under DIR and writes spectrogram images
# to output/spectrograms under DIR.
create_variations()

# Utilities