In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import os 
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten, MaxPool2D
from tensorflow import keras
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from audiomentations import Compose, AddGaussianNoise, PitchShift, Shift, TimeMask, TimeStretch

## Get the data 

In [None]:
# Clips from the Elephant Listening Project.
general_path = os.path.join('data', 'Clips')

# Extra recordings used to give both classes a similar number of samples and to
# increase the number of gunshots, taken from:
# https://data.mendeley.com/datasets/x48cwz364j/3
background_path = os.path.join('data', 'Sounds_background')
guns_path = os.path.join('data', 'Sounds_gunshots')

# These are glob patterns, not regular expressions: '.' is already literal, so it
# is written unescaped. (A backslash before '.' is an invalid Python escape
# sequence and would be read as a path separator on Windows.)
gunshot_files = [
    os.path.join(general_path, 'pnnn*'),
    os.path.join(general_path, 'ecoguns*'),
    os.path.join(guns_path, '*.wav'),
]
no_gunshot_files = [
    os.path.join(general_path, 'other*'),
    os.path.join(background_path, '*.wav'),
]

# One dataset of file paths per class.
gunshot = tf.data.Dataset.list_files(gunshot_files)
no_gunshot = tf.data.Dataset.list_files(no_gunshot_files)

# To see how many files are in each group:
# num_elements = tf.data.experimental.cardinality(no_gunshot).numpy()


## 1. Load data and return wave 

In [120]:
def load_data(file_name):
    """Read a WAV file and return it as a zero-mean, peak-normalised mono waveform.

    Args:
        file_name: Path of the WAV file to read.

    Returns:
        A ``(wave, sample_rate)`` tuple. ``wave`` is the decoded signal with its
        mean removed, divided by its largest absolute value and with the channel
        axis squeezed away; ``sample_rate`` is the value reported by
        ``tf.audio.decode_wav``.
    """
    file_contents = tf.io.read_file(file_name)  # raw file contents as a string tensor
    # Decode the bytes into samples; desired_channels=1 requests a single channel.
    wave, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
    wave = wave - tf.reduce_mean(wave)  # remove the mean (DC offset)
    # Peak-normalise. divide_no_nan returns 0 wherever the denominator is 0, so a
    # silent (or constant) clip becomes all zeros instead of all NaNs.
    wave = tf.math.divide_no_nan(wave, tf.reduce_max(tf.abs(wave)))
    wave = tf.squeeze(wave, axis=-1)  # drop the trailing channel axis

    return wave, sample_rate

## 2. Add labels
- 1: gunshot
- 0: no gunshot

In [None]:
# Pair every file path with its class label: 1.0 for gunshots, 0.0 otherwise.
# Dataset.zip stops at its shortest input, so each label vector is sized from the
# dataset it is zipped with; otherwise files of the larger class would be dropped.
gunshot = tf.data.Dataset.zip(
    (gunshot, tf.data.Dataset.from_tensor_slices(tf.ones(len(gunshot))))
)
no_gunshot = tf.data.Dataset.zip(
    (no_gunshot, tf.data.Dataset.from_tensor_slices(tf.zeros(len(no_gunshot))))
)

In [201]:
# Rebuild `gunshot` (and `data`) from the clips under data/ollie.
# Note that this overwrites `general_path`, `gunshot_files`, `gunshot` and `data`.
general_path = os.path.join('data', 'ollie')
gunshot_files = [
    os.path.join(general_path, pattern) for pattern in ('pnnn*', 'ecoguns*')
]
gunshot = tf.data.Dataset.list_files(gunshot_files)
print(len(gunshot))

# Attach the gunshot label (1.0) to every path.
gunshot = tf.data.Dataset.zip(
    (gunshot, tf.data.Dataset.from_tensor_slices(tf.ones(len(gunshot))))
)
data = gunshot
next(data.as_numpy_iterator())  # peek at the first (path, label) element


1


(b'data/ollie/ecoguns0.wav', 1.0)

## 3. Concatenate gunshots and no_gunshots into one data set 

In [None]:
# All gunshot elements first, followed by all no-gunshot elements.
data = gunshot.concatenate(no_gunshot)
next(data.as_numpy_iterator())  # peek at the first element


## 4. Convert data into spectrograms
Time-frequency compromise:
https://www.tensorflow.org/tutorials/audio/simple_audio <br>
https://www.coursera.org/lecture/audio-signal-processing/stft-2-tjEQe 



In [184]:
def make_spectogram(wave, label, max_length=80000, frame_length=256, frame_step=128):
    """Turn a waveform into a fixed-length magnitude spectrogram.

    The waveform is cropped to its first ``max_length`` samples, left-padded with
    zeros up to exactly ``max_length`` samples, and passed through a short-time
    Fourier transform (STFT).

    Args:
        wave: 1-D float32 tensor of audio samples.
        label: Label of the clip; returned unchanged.
        max_length: Number of samples every clip is cropped/padded to. The
            default, 80000 = 10 * 8000, corresponds to 10 seconds.
        frame_length: STFT window length in samples.
        frame_step: Number of samples to step between consecutive windows.

    Returns:
        A ``(spectrogram, label)`` tuple, where ``spectrogram`` is the STFT
        magnitude with a channel axis of size 1 inserted at position 2.
    """
    # Crop long clips, then pad short ones with leading zeros up to max_length.
    wave = wave[:max_length]
    zero_padding = tf.zeros(max_length - tf.shape(wave), dtype=tf.float32)
    wave = tf.concat([zero_padding, wave], 0)

    # 1. Short-time Fourier transform. Default window settings follow the paper
    #    'Automated detection of gunshots in tropical forests using CNN'.
    #    Time/frequency compromise: a small window gives good time resolution in
    #    exchange for poor frequency resolution.
    spectrogram = tf.signal.stft(wave, frame_length=frame_length, frame_step=frame_step)

    # 2. Keep only the magnitude of the complex STFT.
    spectrogram = tf.abs(spectrogram)

    # 3. Add the channel dimension (1 in this case) expected by the deep learning model.
    spectrogram = tf.expand_dims(spectrogram, axis=2)
    return spectrogram, label


In [165]:
def load_file_into_wave(file_path, label):
    """Load the audio file at ``file_path`` and pair its waveform with ``label``.

    The sample rate returned by ``load_data`` is discarded.
    """
    wave, _sample_rate = load_data(file_path)
    return wave, label

In [200]:
# Waveform-level augmentation pipeline; every transform is applied with probability 0.5.
# Source: https://towardsdatascience.com/audio-augmentations-in-tensorflow-48483260b169
augmentations_pipeline = Compose(
    [
        # Additive Gaussian noise.
        AddGaussianNoise(
            p=0.5,
            min_amplitude=0.001,
            max_amplitude=0.015,
        ),
        # Mask a band of the clip, without fading.
        TimeMask(
            p=0.5,
            min_band_part=0.0,
            max_band_part=0.1,
            fade=False,
        ),
        # Pitch shift by up to 4 semitones in either direction.
        PitchShift(
            p=0.5,
            min_semitones=-4,
            max_semitones=4,
        ),
        # Time stretch with a rate between 0.8 and 1.25.
        TimeStretch(
            p=0.5,
            min_rate=0.8,
            max_rate=1.25,
        ),
    ]
)

def apply_pipeline(y, sr):
    """Run ``augmentations_pipeline`` on a single clip.

    Args:
        y: Audio samples, forwarded to the pipeline as ``samples``.
        sr: Sampling rate, forwarded to the pipeline as ``sample_rate``.

    Returns:
        The pipeline's output for these samples (not converted to a tensor).
    """
    return augmentations_pipeline(samples=y, sample_rate=sr)


@tf.function
def tf_apply_pipeline(feature, label, sample_rate=8000):
    """Apply the audio augmentation pipeline to one ``(feature, label)`` element.

    Args:
        feature: Audio samples of one clip (float32).
        label: Class label of the clip; returned unchanged and never passed to
            the augmentation pipeline.
        sample_rate: Sampling rate of ``feature`` in Hz. The default of 8000
            matches the "10 * 8000 = 10 seconds" convention used when building
            the spectrograms — TODO confirm that every source file is 8 kHz.

    Returns:
        An ``(augmented_feature, label)`` tuple with the augmented samples as a
        float32 tensor.
    """

    def _augment(samples, sr):
        # Runs eagerly on NumPy values. The pipeline needs the real sampling rate
        # as a plain int; pitch shifting and time stretching depend on it.
        augmented = apply_pipeline(samples, int(sr))
        # numpy_function requires the result dtype to match Tout exactly.
        return np.asarray(augmented, dtype=np.float32)

    augmented_feature = tf.numpy_function(
        _augment, inp=[feature, sample_rate], Tout=tf.float32, name="apply_pipeline", stateful = True
    )

    return augmented_feature, label


def augment_audio_dataset(dataset: tf.data.Dataset):
    """Return ``dataset`` with ``tf_apply_pipeline`` mapped over every element."""
    return dataset.map(tf_apply_pipeline)

In [202]:
# HOW TO USE THE DATA AUGMENTATION METHODS

# 1. Load every (path, label) element into a (wave, label) element; keep a handle
#    on this un-augmented dataset.
original_data = data.map(load_file_into_wave)

# 2. Create new data by augmenting the waves, then 3. convert the waves into spectrograms.
data = augment_audio_dataset(original_data).map(make_spectogram)


## 5. Shuffle the data so that gunshot and no-gunshot samples are interleaved instead of appearing as two consecutive runs

In [None]:
# NOTE(review): `preprocess` is not defined in the visible part of this notebook — TODO confirm
# where it comes from. The next cell starts with this same statement, so running both cells
# maps `preprocess` over the data twice.
data = data.map(preprocess) 


In [None]:
# NOTE(review): `preprocess` is not defined in the visible part of this notebook — TODO confirm.
data = (
    data
    .map(preprocess)            # calling preprocess method which generates spectograms
    .cache()                    # cache the mapped elements
    .shuffle(buffer_size=1000)  # mixing training samples 1000 at a time
)

## 6. Extract samples and labels 

In [None]:
# Materialise the dataset into Python lists: `x` collects the samples, `y` the labels.
iterator = data.as_numpy_iterator()
x = []
y = []
# A `for` loop ends cleanly when the iterator is exhausted, while any real failure
# raised during iteration propagates instead of silently truncating `x` and `y`.
for x_temp, y_temp in iterator:
    x.append(x_temp)
    y.append(y_temp)

In [None]:
# Stratified k-fold cross-validation; shuffling uses a fixed random_state.
splits = 10
kfold = StratifiedKFold(
    n_splits=splits,
    shuffle=True,
    random_state=123,
)


In [None]:
counter = 0
from audiomentations import SpecCompose, SpecChannelShuffle, SpecFrequencyMask
# Spectrogram-level augmentation: frequency masking applied with probability 0.5.
augment = SpecCompose(
    [
        SpecFrequencyMask(p=0.5),
    ]
)

# Convert the collected samples/labels once; rebuilding these arrays inside the
# loop would repeat the same conversion for every fold.
x_array = np.array(x)
y_array = np.array(y)

for train, test in kfold.split(x, y):
    # `train` / `test` are integer index arrays into the samples.
    x_train = x_array[train]
    y_train = y_array[train]
    x_test = x_array[test]
    y_test = y_array[test]

    # Only the first fold is augmented and printed, as a visual check.
    if counter == 0:
        # The spectrogram transforms work on ONE spectrogram whose first axis is
        # frequency, so each sample is augmented separately rather than handing
        # over the whole batch (which would mask entire samples instead).
        # Assumes each sample is laid out (time_frames, frequency_bins, channels),
        # as produced by make_spectogram — TODO confirm; hence the axis swap
        # before and after the transform.
        augmented_spectrogram = np.stack(
            [
                np.swapaxes(augment(np.swapaxes(spectrogram, 0, 1)), 0, 1)
                for spectrogram in x_train
            ]
        )
        print('different')
        print(augmented_spectrogram)
    counter = counter + 1

