In [63]:
# Mount Google Drive under /content/drive so the project files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
import librosa
import os
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd
import soundfile as sf
from pydub import AudioSegment
from pydub.utils import make_chunks
from tensorflow.keras.models import load_model

In [66]:
 # Explicit %cd magic: the bare `cd` form only works while IPython automagic is on and no variable is named `cd`.
 %cd '/content/drive/MyDrive/capuuchan_audio_classification'

/content/drive/MyDrive/capuuchan_audio_classification


Extracting the duration of the audios.

In [67]:
# Measure every Capuchinbird training clip, both in seconds and in samples,
# so the average call length can be computed afterwards.

training_path = 'Parsed_Capuchinbird_Clips'

full_path_duration = []   # clip durations in seconds
full_path_length = []     # clip lengths in samples
for clip_name in os.listdir(training_path):
  clip_signal, clip_sr = librosa.load(os.path.join(training_path, clip_name))
  n_samples = len(clip_signal)
  full_path_length.append(n_samples)
  full_path_duration.append(n_samples / clip_sr)

In [68]:
# Average clip duration (seconds) and average clip length (samples).
mean_duration = np.asarray(full_path_duration).mean()
mean_length = np.asarray(full_path_length).mean()
(mean_duration, mean_length)

(3.3322972915874423, 73477.1552795031)

The average call lasts about 3.3 seconds, which corresponds to roughly 73.5k samples (about 22,050 samples per second), so I will build a sliding window of approximately that many samples (75,000).

# Testing the Audio
Here I will feed a merged audio clip into the sliding window.
The recordings of two Capuchinbirds are merged with the Audacity software.

In [69]:
# Load the merged recording: `merge_signal` holds the samples, `sr_merg` the sampling rate.
merged_audio = '/content/drive/MyDrive/capuuchan_audio_classification/cappuchin_merged_audio.wav'
merge_signal, sr_merg = librosa.load(merged_audio)

In [70]:
# Render an inline audio player for the merged recording.
ipd.Audio(merged_audio)

In [71]:
# Shape of the loaded sample array (its length is the number of samples).
merge_signal.shape

(176400,)

# Detecting the Number of Bird Calls
Here the sliding window iterates through the whole audio and detects the Capuchinbird calls.

In [152]:
class audio_preparation:
  """Stateless helpers that cut a 1-D audio signal into windows and pad them to equal length."""

  def add_padding(self, array):
    """Right-pad every 1-D array with zeros up to the length of the longest one.

    Args:
      array: sequence of 1-D NumPy arrays (for example the output of
        `sliding_window`).

    Returns:
      A new list of arrays that all share the length of the longest input.
      An empty input yields an empty list.
    """
    if len(array) == 0:   # max() over an empty sequence would raise ValueError
      return []
    max_value = max(k.shape[0] for k in array)
    return [
        np.concatenate([k, np.zeros(max_value - k.shape[0])], axis=0)
        for k in array
    ]

  def sliding_window(self, arr, window_size=75000, hop=None):
    """Split a signal into consecutive windows.

    Args:
      arr: 1-D NumPy array of samples.
      window_size: number of samples per window (default 75000).
      hop: number of samples between the starts of two consecutive windows.
        Defaults to `window_size`, i.e. non-overlapping windows; a smaller
        value produces overlapping windows.

    Returns:
      A list of slices of `arr`. Trailing windows are shorter than
      `window_size` when the signal does not fill them completely.

    Raises:
      ValueError: if `window_size` or `hop` is not positive.
    """
    if hop is None:
      hop = window_size
    if window_size <= 0 or hop <= 0:
      raise ValueError('window_size and hop must be positive integers')
    return [arr[start:start + window_size]
            for start in range(0, arr.shape[0], hop)]

In [148]:
# Helper object that provides the sliding-window and padding methods.
prep = audio_preparation()

In [151]:
# Cut the merged recording into consecutive windows; `window` is a list of sample arrays.
window = prep.sliding_window(merge_signal)

In [75]:
# The windows differ in size: the last one is shorter when the signal length
# is not a multiple of the window size. Every window is listed, however many there are.
tuple(chunk.shape for chunk in window)

((75000,), (75000,), (26400,))

In [110]:
def mel_spectrogram(arr, sr, n_mels=100):
  """Compute a log-scaled mel spectrogram for every audio segment.

  Args:
    arr: iterable of 1-D sample arrays.
    sr: sampling rate of the segments.
    n_mels: number of mel bands per spectrogram (default 100).

  Returns:
    A list with one dB-scaled spectrogram per segment, in input order.
    An empty input yields an empty list.
  """
  audios = []
  for segment in arr:
    # The audio must be passed as `y=`: recent librosa releases accept it
    # only by keyword and raise TypeError on a positional argument.
    mel = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=n_mels)
    # ref=np.max scales each spectrogram relative to its own peak.
    audios.append(librosa.power_to_db(mel, ref=np.max))
  return audios

In [113]:
# Extract one log-mel spectrogram per window and list every resulting shape
# (no fixed indices, so this works for any number of windows).
mel = mel_spectrogram(window, sr_merg)
tuple(spec.shape for spec in mel)

((100, 147), (100, 147), (100, 52))

In [137]:
# Bring every mel spectrogram to the (100, 216) shape the model takes by
# appending zero-valued frames on the right.
# NOTE(review): the spectrograms are dB-scaled with ref=np.max, so 0 is their
# maximum value, not silence — confirm this matches the training preprocessing.
shape = 216
new_audio = [
    np.hstack([spec, np.zeros((spec.shape[0], shape - spec.shape[1]))])
    for spec in mel
]

In [131]:
# Stack the padded spectrograms into one batch array.
arr = np.asarray(new_audio)
arr.shape

(3, 100, 216)

In [132]:
# Append a trailing axis of size 1 to the batch.
test_arr = arr[..., np.newaxis]
test_arr.shape

(3, 100, 216, 1)

In [133]:
# Load the saved Keras model (.h5 file) from Drive.
model_path = '/content/drive/MyDrive/capuuchan_audio_classification/audio_model.h5'
model = load_model(model_path)

In [138]:
# Run the model on the whole batch; `yp` holds its raw output, one row per window.
yp = model.predict(test_arr)

In [141]:
# Turn each window's score into a 0/1 decision (1 when the score is above 0.5).
# `yp` is only read here, never reassigned, so it keeps the raw model output
# and this cell can be re-run without re-running the predict cell first.
# Assumes the model emits a single score per window — TODO confirm.
prediction = [int(score > 0.5) for score in np.ravel(yp)]

In [144]:
# Each entry of `prediction` is 0 or 1, so the sum is the number of windows flagged as a call.
print(f'The number of times birds have called is: {np.sum(prediction)}')

The number of times birds have called is: 2


# Conclusion
Here I merged two audio clips of the Capuchinbird, and the sliding-window algorithm detected two possible instances of the Capuchinbird call with 98% probability. If we apply this to many recordings from the same area and then divide the total number of bird calls by the area, we obtain an estimate of the population density.