In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import os 
from scipy.signal import butter, filtfilt
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten

# Building a Deep Audio Classifier using original data (no denoising has been applied)
Following: https://www.youtube.com/watch?v=ZLIPkmmDJAc&t=1468s&ab_channel=NicholasRenotte

## Get the data 

In [None]:
# Clips from the Elephant Listening Project.
general_path = os.path.join('data', 'Clips')

# Extra recordings used to increase the number of gunshots and to keep both classes at a
# similar number of samples. Source: https://data.mendeley.com/datasets/x48cwz364j/3
background_path = os.path.join('data', 'Sounds_background')
guns_path = os.path.join('data', 'Sounds_gunshots')

# Glob patterns for each class. A '.' is literal in a glob pattern, so it is written
# unescaped: '\.' is an invalid escape sequence in a Python string literal, and on Windows
# the backslash would be read as a path separator.
# NOTE(review): the spectrogram test cell further down opens '.WAV' (upper-case) files from
# the two extra folders; if glob matching is case-sensitive on the target platform,
# '*.wav' will not pick those up -- confirm the real file extensions.
gunshot_files = [
    os.path.join(general_path, 'pnnn*'),
    os.path.join(general_path, 'ecoguns*'),
    os.path.join(guns_path, '*.wav'),
]
no_gunshot_files = [
    os.path.join(general_path, 'other*'),
    os.path.join(background_path, '*.wav'),
]

gunshot = tf.data.Dataset.list_files(gunshot_files)
no_gunshot = tf.data.Dataset.list_files(no_gunshot_files)

# To see how many files are in each group:
#num_elements = tf.data.experimental.cardinality(no_gunshot).numpy()


## 1. Load data and return wave 

In [None]:
def load_data(file_name):
    """Read a WAV file and return its mean-centred mono waveform.

    Args:
        file_name: path of the WAV file to read.

    Returns:
        (wave, sample_rate): `wave` is the decoded single-channel signal with its mean
        subtracted and the channel axis removed; `sample_rate` is the rate reported by
        the decoder.
    """
    raw_bytes = tf.io.read_file(file_name)  # whole file as one string tensor
    audio, sample_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    centred = audio - tf.reduce_mean(audio)  # remove the mean (DC offset)
    # Drop the trailing channel axis. The samples are left exactly as decoded; an int16-like
    # rescale (wave * 32768) was tried and deliberately left out.
    return tf.squeeze(centred, axis=-1), sample_rate

Testing

In [None]:
# Smoke test: decode one clip of each kind with load_data.
file = 'data/Clips/other2.wav'
wave, rate = load_data(file)
#plt.plot(wave)

# The pnnn* clips live in data/Clips (see the gunshot glob patterns), and the path variable
# in this cell is `file`; `file_name` is only assigned in a later cell.
file = 'data/Clips/pnnn4.wav'
wave, rate = load_data(file)

## 2. Add labels
1: gunshot 
0: no gunshot

In [None]:
# Pair every file path with its class label: 1.0 for gunshots, 0.0 for everything else.
# Each label vector is sized from its OWN dataset; Dataset.zip stops at the shorter input,
# so a label vector of the wrong length would silently drop files.
gunshot = tf.data.Dataset.zip((gunshot, tf.data.Dataset.from_tensor_slices(tf.ones(len(gunshot)))))
no_gunshot = tf.data.Dataset.zip((no_gunshot, tf.data.Dataset.from_tensor_slices(tf.zeros(len(no_gunshot)))))

## 3. Concatenate gunshots and no_gunshots into one data set 

In [None]:
# One dataset holding all gunshot examples followed by all no-gunshot examples.
data = gunshot.concatenate(no_gunshot)

# Peek at a single (file path, label) element to see what it looks like.
next(data.as_numpy_iterator())

## 4. Convert data into spectrograms
Time frequency compromise: 
https://www.tensorflow.org/tutorials/audio/simple_audio
https://www.coursera.org/lecture/audio-signal-processing/stft-2-tjEQe 



In [None]:
def preprocess(file_path, label, max_length=80000, frame_length=256, frame_step=128):
    """Turn one audio file into a fixed-size magnitude spectrogram.

    Args:
        file_path: path of the WAV file; forwarded to load_data.
        label: class label of the file; returned unchanged.
        max_length: number of samples every clip is cut or padded to. The default, 80000,
            is 10 s if the recordings are sampled at 8 kHz (assumed; the sample rate
            returned by load_data is not checked here).
        frame_length: STFT window length in samples.
        frame_step: number of samples between consecutive STFT windows.

    Returns:
        (spectrogram, label): the STFT magnitude with a trailing channel axis of size 1.
        With the defaults this is 624 frames x 129 frequency bins x 1 channel, the input
        shape the model in this notebook is built for.
    """
    wave, _ = load_data(file_path)

    # Fixed length: keep at most max_length samples, then fill what is missing with zeros
    # placed IN FRONT of the signal.
    wave = wave[:max_length]
    zero_padding = tf.zeros(max_length - tf.shape(wave), dtype=tf.float32)
    wave = tf.concat([zero_padding, wave], 0)

    # 1. Short-time Fourier transform. Window settings follow the paper
    #    'Automated detection of gunshots in tropical forests using CNN'.
    #    Time/frequency trade-off: a short window gives good time resolution in exchange
    #    for poor frequency resolution.
    spectrogram = tf.signal.stft(wave, frame_length=frame_length, frame_step=frame_step)

    # 2. Keep only the magnitude of the complex STFT.
    spectrogram = tf.abs(spectrogram)

    # 3. Add the channel dimension (1 channel) expected by the convolutional layers.
    spectrogram = tf.expand_dims(spectrogram, axis=2)
    return spectrogram, label


Testing

In [None]:
# Clips that were tried while checking the spectrograms. Only one can be active at a time;
# uncomment the one to inspect.
#file_name = 'data/Clips/ecoguns1.wav' # bad: pixeled -- and quite a few are the same (1-4)
#file_name = 'data/Clips/ecoguns105.wav' # seems good
#file_name = 'data/Clips/pnnn4.wav'
#file_name = 'data/Clips/other1.wav'
#file_name = 'data/Sounds_gunshots/5B9FE452_38.WAV'
#file_name = 'data/Clips_denoised/gunshots/spectral_gating/ecoguns0.wav'
file_name = 'data/Sounds_background/5B1E8AFA.WAV'

# load_data is the loader defined in this notebook (there is no load_data_2).
waveform, sr = load_data(file_name)
# The label passed here is only used as the figure title further down.
spectrogram, label = preprocess(file_name, '1')

# How to read the plot: pcolormesh draws the spectrogram as a 2D heatmap with time on the
# x-axis, frequency on the y-axis and the log of the STFT magnitude as colour. The y-axis
# is the frequency-BIN index, not Hz, and it is linear -- the log is applied to the
# magnitudes only. That is why the axis stops at the number of bins rather than at a
# frequency in Hz.
# NOTE(review): with frame_length=256 there are 129 bins; if the clips are 8 kHz the top
# bin corresponds to 4000 Hz -- confirm the sample rate.

def plot_spectrogram(spectrogram, ax):
  """Draw a log-magnitude spectrogram on a matplotlib axes.

  Args:
    spectrogram: array of shape (time, frequency) or (time, frequency, 1).
    ax: axes-like object; its pcolormesh(X, Y, C) method is called once.
  """
  if len(spectrogram.shape) > 2:
    assert len(spectrogram.shape) == 3
    spectrogram = np.squeeze(spectrogram, axis=-1)
  # Take the log of the magnitudes and transpose, so that time is represented on the
  # x-axis (columns). An epsilon is added to avoid taking the log of zero.
  log_spec = np.log(spectrogram.T + np.finfo(float).eps)
  height = log_spec.shape[0]
  width = log_spec.shape[1]
  # x runs from 0 to the total number of spectrogram values, one tick per time frame;
  # y is the frequency-bin index.
  X = np.linspace(0, np.size(spectrogram), num=width, dtype=int)
  Y = range(height)
  ax.pcolormesh(X, Y, log_spec)

# Waveform (top panel) and log-magnitude spectrogram (bottom panel) of the test clip.
fig, axes = plt.subplots(2, figsize=(12, 8))
wave_ax, spec_ax = axes

# Time axis in seconds; the 8000 is the assumed sample rate -- TODO confirm against `sr`.
t = np.arange(len(waveform)) / 8000
wave_ax.plot(t, waveform.numpy())
wave_ax.set_title('Waveform')

plot_spectrogram(spectrogram.numpy(), spec_ax)
spec_ax.set_title('Spectrogram')

plt.suptitle(label.title())
plt.show()

# Other quick looks that were tried:
#   TensorFlow website:  fig, ax = plt.subplots(); plot_spectrogram(spectrogram, ax)
#   online:              plt.figure(); plt.imshow(tf.math.log(spectrogram).numpy())
#   YouTube video:       plt.figure(figsize=(30,20)); plt.imshow(tf.transpose(spectrogram)[0]); plt.show()


## 5. Create Training and Testing partitions

In [None]:
batch = 16
data = data.map(preprocess)  # file path -> spectrogram (label is passed through)
data = data.cache()
# Shuffle ONCE over the whole dataset and keep that order on every pass:
# - the buffer covers every sample, because the dataset is all gunshots followed by all
#   no-gunshots and a smaller buffer would leave the two classes poorly mixed;
# - reshuffle_each_iteration=False keeps the order fixed, so a later take()/skip() split
#   yields the same disjoint train/test samples on every epoch instead of a new mix
#   each time the dataset is iterated. The price is that training batches repeat in the
#   same order every epoch.
data = data.shuffle(buffer_size=max(1000, len(data)), reshuffle_each_iteration=False)
data = data.batch(batch)  # train on 16 samples at a time
data = data.prefetch(8)

In [None]:
# Split by batches: the first 70% go to training, the remaining 30% to testing.
n_batches = len(data)
n_train_batches = round(n_batches * .7)

train = data.take(n_train_batches)
test = data.skip(n_train_batches).take(n_batches - n_train_batches)

In [None]:
# Pull one training batch to read off the shape the network has to accept.
# Per-sample input to the neural network: 624, 129, 1
samples, labels = next(train.as_numpy_iterator())
samples.shape

## 6. Build Deep Learning model 

In [None]:
# Two 3x3 convolutional layers followed by a dense head with a single sigmoid output
# (binary decision: gunshot / no gunshot).
model = Sequential([
    Conv2D(16, (3,3), activation='relu', input_shape=(624, 129, 1)),  # matches samples.shape
    Conv2D(16, (3,3), activation='relu'),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam is requested by name, so it runs with its default settings (learning rate = 0.001).
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
    metrics=['accuracy', 'Recall', 'Precision'],
)

In [None]:
# Print the model summary to check the layers defined above.
model.summary()


## 7. Train model 

In [None]:
# Train for 100 epochs on the training batches, using the test batches as validation data.
# `hist` keeps the per-epoch metrics that the plots below read.
hist = model.fit(train, epochs=100, validation_data=test)


In [None]:
# Per-epoch values recorded during training; the plotting cells below index this dict.
hist.history

In [None]:
# Training (red) vs. test (blue) loss per epoch; the figure is also written to a PNG.
fig, ax = plt.subplots()
ax.set_title('Loss: original data')
ax.plot(hist.history['loss'], 'r')
ax.plot(hist.history['val_loss'], 'b')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend(['train', 'test'])
ax.grid()
fig.savefig('loss_original_data.png')
plt.show()

In [None]:
# Training (red) vs. test (blue) precision per epoch; the figure is also written to a PNG.
fig, ax = plt.subplots()
ax.set_title('Precision: original data')
ax.plot(hist.history['precision'], 'r')
ax.plot(hist.history['val_precision'], 'b')
ax.set_xlabel('Epochs')
ax.set_ylabel('Precision')
ax.legend(['train', 'test'])
ax.grid()
fig.savefig('precision_original_data.png')
plt.show()

In [None]:
# Training (red) vs. test (blue) recall per epoch; the figure is also written to a PNG.
fig, ax = plt.subplots()
ax.set_title('Recall: original data')
ax.plot(hist.history['recall'], 'r')
ax.plot(hist.history['val_recall'], 'b')
ax.set_xlabel('Epochs')
ax.set_ylabel('Recall')
ax.legend(['train', 'test'])
ax.grid()
fig.savefig('recall_original_data.png')
plt.show()

In [None]:
# Training (red) vs. test (blue) accuracy per epoch; the figure is also written to a PNG.
# The test curve is 'val_accuracy' and the axis is labelled 'Accuracy', so both curves
# and the label describe the same metric.
plt.title('Accuracy: original data')
plt.plot(hist.history['accuracy'], 'r')
plt.plot(hist.history['val_accuracy'], 'b')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(['train', 'test'])
plt.grid()
plt.savefig('accuracy_original_data.png')
plt.show()

## 8. Make a prediction 

In [None]:
# Take one batch from the test set and show the raw sigmoid outputs for it.
X_test, y_test = next(test.as_numpy_iterator())
yhat = model.predict(X_test)
yhat


In [None]:
# Turn the scores into hard decisions: only scores above 0.9 count as a gunshot (1).
decision_threshold = 0.9
yhat = [1 if score > decision_threshold else 0 for score in yhat]
yhat

In [None]:
# True labels of the same batch, for comparison with the thresholded predictions above.
y_test

In [None]:
# Sum of the labels in this batch; since gunshots are labelled 1 and the rest 0, this is
# the number of gunshot samples in the batch.
tf.math.reduce_sum(y_test)

## 9. See how good the predictions are from the testing set 

In [None]:
# Evaluate the trained model on the test batches.
model.evaluate(test)

In [None]:
# Count the confusion-matrix entries over every batch of the test set, using the same
# 0.9 decision threshold as the single-batch check above.
total = 0
true_positive = 0
true_negative = 0
false_positive = 0
false_negative = 0

# A for-loop stops by itself when the batches run out, so no try/except is needed and a
# genuine failure (e.g. inside model.predict) is reported instead of silently ending the
# count early.
for X_test, y_test in test.as_numpy_iterator():
    yhat = model.predict(X_test)
    yhat = [1 if prediction > 0.9 else 0 for prediction in yhat]
    for prediction, result in zip(yhat, y_test):
        if prediction == result and prediction == 1:
            true_positive += 1
        elif prediction == result and prediction == 0:
            true_negative += 1
        elif prediction != result and prediction == 1:
            false_positive += 1
        else:
            false_negative += 1


In [None]:
# Number of evaluated samples, then the four counts one per line in the order:
# total, true positives, true negatives, false positives, false negatives.
total = sum((true_positive, true_negative, false_positive, false_negative))
for count in (total, true_positive, true_negative, false_positive, false_negative):
    print(count)

In [None]:
# Accuracy at the 0.9 threshold: share of correctly classified samples, shown in percent.
correct = true_positive + true_negative
accuracy = correct / total
accuracy * 100