## Mount Google Drive

In [None]:
# NOTE(review): the extension is loaded but no `%autoreload` mode is selected
# in this cell — confirm whether `%autoreload 2` was intended.
%load_ext autoreload
from google.colab import drive
import os

# Mount Google Drive under /content/gdrive.
drive.mount('/content/gdrive')
# Folder the pre-processing cells below write the training clips to.
path = '/content/gdrive/My Drive/Breath-Data/data/training/'
# Later cells use absolute paths, so this chdir mainly serves the listing below.
os.chdir(path)
!ls

In [None]:
# NOTE(review): this upgrade is unpinned, while a later cell installs
# tensorflow==2.2.0 — confirm which TensorFlow version the notebook targets.
!pip install tensorflow-gpu --upgrade

In [None]:
import tensorflow as tf
# Show which TensorFlow version is loaded in this kernel.
print(tf.__version__)


In [None]:
# Glob pattern matching every entry of the "heavy" training folder.
path_2 = '/content/gdrive/My Drive/Breath-Data/data/training/heavy/*'
# os.chdir(path_2)
import glob
list_file = glob.glob(path_2)
# Number of entries matched by the pattern.
print(len(list_file))

In [None]:
%matplotlib inline
import numpy, scipy, matplotlib.pyplot as plt, IPython.display as ipd
import librosa, librosa.display

# Load one "heavy" breathing clip and its sample rate.
x, sr = librosa.load('/content/gdrive/My Drive/Breath-Data/data/training/heavy/05_male_21_NLinh_78_heavy.wav')
# A bare `ipd.Audio(...)` expression is only rendered when it is the last
# statement of a cell, so the player has to be displayed explicitly.
ipd.display(ipd.Audio(x, rate=sr))
plt.figure(figsize=(15, 5))
plt.ylabel('Amplitude - Heavy')
# NOTE(review): newer librosa releases replace `waveplot` with `waveshow` —
# confirm against the installed version.
librosa.display.waveplot(x, sr, alpha=0.8)

In [None]:
%matplotlib inline
import numpy, scipy, matplotlib.pyplot as plt, IPython.display as ipd
import librosa, librosa.display


def modified_z_score(intensity):
    """Return the modified (median / MAD based) z-score of every sample.

    Arguments:
        intensity -- 1-D numeric array (or sequence) of values to score

    Returns:
        Array with the same shape as `intensity` holding
        0.6745 * (x - median) / MAD, where MAD is the median absolute
        deviation from the median.

    Note: when the MAD is 0 the division produces inf/nan values (NumPy
    emits a runtime warning); callers comparing against a threshold treat
    nan as "not a spike" and inf as "spike".
    """
    # This cell imports `numpy` (not the `np` alias), so use that name: the
    # helper then works on a fresh kernel, before any later cell defines `np`.
    intensity = numpy.asarray(intensity)
    median_int = numpy.median(intensity)
    mad_int = numpy.median(numpy.abs(intensity - median_int))
    return 0.6745 * (intensity - median_int) / mad_int


def fixer(y, m, threshold=7):
    """Replace spikes in `y` by the mean of their non-spike neighbours.

    A position i is a spike when the modified z-score of the first
    difference y[i+1] - y[i] exceeds `threshold` in absolute value.

    Arguments:
        y -- 1-D signal
        m -- half-width of the neighbourhood; up to 2 * m + 1 positions
             around each spike are considered
        threshold -- binarization threshold on the modified z-score
                     (defaults to 7)

    Returns:
        A corrected copy of `y`; the input is never modified.
    """
    y = numpy.asarray(y)
    y_out = y.copy()  # So we don't overwrite y
    if y.size < 2:
        # No first difference exists, hence nothing can be flagged.
        return y_out

    spikes = numpy.abs(modified_z_score(numpy.diff(y))) > threshold
    n_positions = len(spikes)  # one shorter than y because of the diff

    for i in numpy.flatnonzero(spikes):
        # Clip the window to the valid index range: unclipped, a spike near
        # the end raised IndexError and one near the start silently wrapped
        # around to samples from the end of the signal.
        first = max(i - m, 0)
        last = min(i + m + 1, n_positions)
        w = numpy.arange(first, last)
        w2 = w[~spikes[w]]  # neighbours that are not spikes themselves
        if w2.size:
            y_out[i] = numpy.mean(y[w2])
        # else: every neighbour is a spike too; keep the original sample
        # rather than writing the nan produced by an empty mean.

    return y_out

# Reload the same "heavy" clip and remove spikes before plotting it.
x, sr = librosa.load('/content/gdrive/My Drive/Breath-Data/data/training/heavy/05_male_21_NLinh_78_heavy.wav')

x = fixer(x, 3)
# A bare `ipd.Audio(...)` expression is only rendered when it is the last
# statement of a cell, so the player has to be displayed explicitly.
ipd.display(ipd.Audio(x, rate=sr))
plt.figure(figsize=(15, 5))
plt.ylabel('Amplitude - Heavy')
# NOTE(review): newer librosa releases replace `waveplot` with `waveshow` —
# confirm against the installed version.
librosa.display.waveplot(x, sr, alpha=0.8)

In [None]:
# Load one "deep" breathing clip and its sample rate.
x, sr = librosa.load('/content/gdrive/My Drive/Breath-Data/data/training/deep/04_female_21_LAnh_21_deep.wav')

# A bare `ipd.Audio(...)` expression is only rendered when it is the last
# statement of a cell, so the player has to be displayed explicitly.
ipd.display(ipd.Audio(x, rate=sr))
plt.figure(figsize=(15, 5))
plt.ylabel('Amplitude - deep')
# NOTE(review): newer librosa releases replace `waveplot` with `waveshow` —
# confirm against the installed version.
librosa.display.waveplot(x, sr, alpha=0.8)

In [None]:
# Load one "normal" breathing clip and its sample rate.
x, sr = librosa.load('/content/gdrive/My Drive/Breath-Data/data/training/normal/07_male_21_MQuang_6_normal.wav')
# A bare `ipd.Audio(...)` expression is only rendered when it is the last
# statement of a cell, so the player has to be displayed explicitly.
ipd.display(ipd.Audio(x, rate=sr))
plt.figure(figsize=(15, 5))
plt.ylabel('Amplitude - normal')
# NOTE(review): newer librosa releases replace `waveplot` with `waveshow` —
# confirm against the installed version.
librosa.display.waveplot(x, sr, alpha=0.8)

## Pre-processing the breath data

In [None]:
# pydub provides the AudioSegment class used below to cut the recordings.
# NOTE(review): the version is not pinned.
!pip install pydub

In [None]:
import os
import pandas as pd
import random

from pydub import AudioSegment

# Longest labelled breath (in ms, below 5000) seen so far; updated as a
# global by get_audio_segment().
duration_breath = 0

In [None]:
def get_audio_segment(filename, source_path, destination_path, start, end, status, i):
    """Export 4-second windows covering one labelled breath as WAV files.

    For a breath labelled [start, end], every 4000 ms window whose start `j`
    runs from `end - 4000` up to (but excluding) `start` in 100 ms steps is
    cut from the recording and written to
    `<destination_path>/<filename>_<i>_<j>_<status>.wav`. Windows that would
    start before the beginning of the recording are skipped. Breaths longer
    than 4000 ms cannot fit in a window and are skipped with a message.

    As a side effect the global `duration_breath` is raised to the longest
    breath duration (in ms, below 5000) seen so far.

    Arguments:
        filename {str} -- base name (no extension) used for the output files
        source_path {str} -- path of the source WAV recording
        destination_path {str} -- existing folder receiving the clips
        start {float} -- breath start, in seconds
        end {float} -- breath end, in seconds
        status {str} -- type of breath (label), appended to the file name
        i {int} -- index of the label within its file, part of the file name
    """
    global duration_breath

    window_ms = 4000  # length of every exported clip
    hop_ms = 100      # shift between two consecutive clips

    # Pydub works in milliseconds
    start = start * 1000
    end = end * 1000
    offset_time = (end - start)

    print(offset_time)
    print(duration_breath)

    if duration_breath < offset_time < 5000:
        duration_breath = offset_time

    # A breath longer than the window can never be fully contained in a clip.
    # (This used to be signalled by raising and immediately catching an
    # exception; an unused random offset computed right after it also raised
    # for 3-4 s breaths, so those were silently dropped as well.)
    if offset_time > window_ms:
        print("offset_time over 4000")
        return

    # Get the audio file
    src_Audio = AudioSegment.from_wav(source_path)

    try:
        for j in range(int(end - window_ms), int(start), hop_ms):
            if j < 0:
                continue
            # Define name file
            fname = filename + "_" + str(i) + "_" + str(j) + "_" + status + '.wav'
            clip = src_Audio[j:j + window_ms]
            clip.export(os.path.join(destination_path, fname), format="wav")
    except Exception as e:
        # Best effort: report the failure and give up on the remaining
        # windows of this breath without aborting the whole run.
        print(e)

In [None]:
def split_by_label(source_path, destination_path, label_path, output_folder):
    """Cut every labelled recording of a folder into per-class clips.

    For each `<name>.wav` in `source_path` the label file
    `<label_path>/<name>.txt` is read. Every line is expected to hold
    `<start seconds> <end seconds> <label>` separated by whitespace; the
    clips for that breath are written by `get_audio_segment` into
    `<destination_path>/<output_folder>/<label>/`.

    Recordings without a label file are reported and skipped, and so are
    label lines that do not have three fields or whose times are not numbers.

    Arguments:
        source_path {str} -- folder containing the source WAV recordings
        destination_path {str} -- root folder of the generated data set
        label_path {str} -- folder containing the `.txt` label files
        output_folder {str} -- sub-folder of `destination_path` to fill
    """
    # Check the output directory status
    check_directory(destination_path, output_folder)
    output_path = os.path.join(destination_path, output_folder)

    # Known misspellings / synonyms mapped onto the canonical class names.
    label_aliases = {'strong': 'heavy', 'normla': 'normal'}

    # Go through all the recordings (sorted for a reproducible order)
    for entry in sorted(os.listdir(source_path)):
        # Only WAV recordings are inputs; this also ignores sub-folders such
        # as a label directory living next to the recordings.
        filename, extension = os.path.splitext(entry)
        if extension.lower() != ".wav":
            continue

        wav_path = os.path.join(source_path, entry)
        csv_path = os.path.join(label_path, filename + ".txt")

        if not os.path.isfile(csv_path):
            print("Not found:", filename)
            continue

        # `with` closes the label file even if reading fails.
        with open(csv_path, "r") as label:
            lines = label.readlines()

        print(csv_path)

        i = 0  # index of the exported breath within this recording

        for line in lines:
            fields = line.split()
            try:
                breath_start = float(fields[0])
                breath_end = float(fields[1])
                breath = fields[2]
            except (IndexError, ValueError) as e:
                # Blank or malformed line: report it and move on.
                print(e)
                continue

            breath = label_aliases.get(breath, breath)

            output_by_label = os.path.join(output_path, breath)
            os.makedirs(output_by_label, exist_ok=True)

            # Export the file
            get_audio_segment(filename, wav_path, output_by_label, breath_start, breath_end, breath, i)

            i += 1

In [None]:
# Dev set: raw recordings, root of the generated data set, and label files.
DEV_INPUT_PATH = "/content/gdrive/My Drive/Breath-Data/raw_data/training/"
DEV_OUTPUT_PATH = "/content/gdrive/My Drive/Breath-Data/data/"
DEV_LABEL_PATH = "/content/gdrive/My Drive/Breath-Data/raw_data/training/label/"

# Sub-folders of the output root receiving the training / validation clips.
TRAIN_OUTPUT_FOLDER = "training"
TEST_OUTPUT_FOLDER = "validation"


# Test set (shares its output root with the dev set).
TEST_INPUT_PATH = "/content/gdrive/My Drive/Breath-Data/raw_data/validation/"
TEST_OUTPUT_PATH = "/content/gdrive/My Drive/Breath-Data/data/"
TEST_LABEL_PATH = "/content/gdrive/My Drive/Breath-Data/raw_data/validation/label/"

# NOTE(review): CHUNK, OVERLAP, LABEL_FOLDER and BREATH_TYPE are not referenced
# by the cells visible here — confirm whether they are still needed.
CHUNK = 5
OVERLAP = 0.5

LABEL_FOLDER = 'label'
# Type of breath
BREATH_TYPE = ['normal', 'deep', 'heavy', 'other']

In [None]:
# Sanity check: list the validation label files.
os.chdir(TEST_LABEL_PATH)
!ls

In [None]:
def check_directory(origin_path, folder):
    """Make sure the directory `<origin_path>/<folder>` exists.

    Missing intermediate directories are created as well. Calling the
    function for a directory that already exists is a no-op.

    Arguments:
        origin_path {str} -- parent directory
        folder {str} -- name (or relative path) of the directory to create
    """
    # exist_ok avoids the check-then-create race of testing for existence first.
    os.makedirs(os.path.join(origin_path, folder), exist_ok=True)

In [None]:
# Training set: cut the raw training recordings into
# <DEV_OUTPUT_PATH>/<TRAIN_OUTPUT_FOLDER>/<label>/ clips.
split_by_label(DEV_INPUT_PATH, DEV_OUTPUT_PATH, DEV_LABEL_PATH, TRAIN_OUTPUT_FOLDER)

# Validation set: same processing for the raw validation recordings.
split_by_label(TEST_INPUT_PATH, TEST_OUTPUT_PATH, TEST_LABEL_PATH, TEST_OUTPUT_FOLDER)

## SincConv (slow implementation)
Speaker Recognition from Raw Waveform with SincNet
Mirco Ravanelli, Yoshua Bengio
https://arxiv.org/pdf/1808.00158.pdf

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

import numpy as np

In [None]:
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import json
import pickle
from scipy.io import wavfile

# os.environ["CUDA_DEVICE_ORDER"]="0"
import tensorflow as tf

with tf.device('/device:GPU:0'):

  import tensorflow.keras as keras
  import tensorflow as tf

  # Allow memory growth for the GPU
  # physical_devices = tf.config.experimental.list_physical_devices('GPU')
  # tf.config.experimental.set_memory_growth(physical_devices, True)

  # from tensorflow.keras.utils import multi_gpu_model
  # from keras.backend.tensorflow_backend import set_session
  import librosa
  from sklearn.metrics import classification_report, confusion_matrix
  from tensorflow.keras.callbacks import ModelCheckpoint
  from tensorflow.keras.utils import to_categorical
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import Conv2D, MaxPooling2D
  from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, Permute, Reshape, TimeDistributed
  from tensorflow.keras.optimizers import Adam
  from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Flatten
  from tensorflow.compat.v1.keras.layers import CuDNNLSTM as CuLSTM, CuDNNGRU
  from tensorflow.keras.layers import add
  from tensorflow.keras.layers import Input
  from tensorflow.keras.models import Model
  from tensorflow.keras.layers import BatchNormalization

In [None]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all
# up front, via a TF1-compat session created with that option.
# NOTE(review): an earlier cell has the equivalent
# tf.config.experimental.set_memory_growth call commented out — confirm which
# mechanism is meant to be used.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [None]:
class SincConv(Layer):
    '''
    Sinc-based convolution Keras layer (straightforward per-filter version).

    Every kernel is a Hamming-windowed band-pass filter obtained as the
    difference of two sinc low-pass filters. Only the lower cutoff (`b1`)
    and the bandwidth (`band`) of each filter are learned; both are stored
    normalised by `sample_freq`.

    Parameters
    ----------
    nb_filters : `int`
        Number of filters (= number of output channels).
    kernel_size : `int`
        Convolution filter width/length (will be increased by one if even,
        because the filter is built symmetrically around a centre tap).
    sample_freq : `int`
        Sample rate of input audio.
    **kwargs
        Forwarded to `keras.layers.Layer` (e.g. `name`, `input_shape`).

    Reference
    ---------
    Mirco Ravanelli, Yoshua Bengio,
    "Speaker Recognition from raw waveform with SincNet".
    https://arxiv.org/abs/1808.00158
    '''

    # Lower bounds (in Hz) added to the learned cutoff and bandwidth.
    MIN_FREQ_HZ = 50.0
    MIN_BAND_HZ = 50.0

    @staticmethod
    def sinc(band, t_right):
        '''Return the symmetric sinc(2*pi*band*t) sampled on -t_right, 0, +t_right.'''
        y_right = K.sin(2 * np.pi * band * t_right) / (2 * np.pi * band * t_right)
        y_left = K.reverse(y_right, 0)
        # The centre tap sinc(0) = 1 is a plain constant; creating a variable
        # here would allocate a new one on every forward pass.
        y = K.concatenate([y_left, K.constant([1.0]), y_right])
        return y

    @staticmethod
    def hz_to_mel(hz):
        '''Convert a frequency in Hz to the mel scale.'''
        return 2595.0 * np.log10(1.0 + hz / 700.0)

    @staticmethod
    def mel_to_hz(mels):
        '''Convert a mel-scale value back to Hz.'''
        return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)

    def __init__(self, nb_filters, kernel_size, sample_freq, **kwargs):
        super(SincConv, self).__init__(**kwargs)

        self.nb_filters = nb_filters
        self.kernel_size = kernel_size
        self.sample_freq = sample_freq

        # The filter is assembled as left half + centre + right half, so its
        # length is always odd; an even size used to fail at the reshape in
        # call().
        if self.kernel_size % 2 == 0:
            self.kernel_size = self.kernel_size + 1

        # Set trainable parameters
        self.b1 = self.add_weight(
            name='b1',
            shape=(self.nb_filters,),
            initializer="zeros",
            trainable=True)
        self.band = self.add_weight(
            name='band',
            shape=(self.nb_filters,),
            initializer="zeros",
            trainable=True)

        # Initialize weights with cutoff frequencies of the mel-scale filter-bank
        low_freq_mel = self.hz_to_mel(50)
        high_freq_mel = self.hz_to_mel(self.sample_freq / 2)
        mel_points = np.linspace(low_freq_mel, high_freq_mel, num=self.nb_filters)
        hz_points = self.mel_to_hz(mel_points)

        b1 = np.roll(hz_points, 1)
        b1[0] = 30
        b2 = np.roll(hz_points, -1)
        b2[-1] = (self.sample_freq / 2) - 100

        self.set_weights([b1 / self.sample_freq, (b2 - b1) / self.sample_freq])

        # Initialize weights by 0 and the Nyquist frequency
        # low = np.zeros(self.nb_filters)
        # high = np.repeat(self.sample_freq / 2, self.nb_filters)
        # self.set_weights([low / self.sample_freq,
        #                   (high - low) / self.sample_freq])

        # Time axis of the right half of the filter, in seconds. Kept as a
        # variable (so the layer's weight list keeps its layout) but frozen:
        # it is a fixed grid, not something to learn.
        half_size = int((self.kernel_size - 1) / 2)
        t_right_linspace = np.linspace(1, (self.kernel_size - 1) / 2, half_size)
        self.t_right = tf.Variable(
            (t_right_linspace / self.sample_freq).astype("float32"),
            trainable=False)

        # Hamming window, frozen for the same reason.
        n = np.linspace(0, self.kernel_size, num=self.kernel_size)
        window = 0.54 - 0.46 * np.cos(2 * np.pi * n / self.kernel_size)
        self.window = tf.Variable(window.astype("float32"), trainable=False)

    @property
    def beg_freq(self):
        '''Normalised lower cutoff of every filter, derived from `b1`.'''
        # Evaluated on access (not once in __init__) so that it follows the
        # current value of the weight and gradients reach `b1`.
        return K.abs(self.b1) + self.MIN_FREQ_HZ / self.sample_freq

    @property
    def end_freq(self):
        '''Normalised upper cutoff of every filter, derived from `b1` and `band`.'''
        return self.beg_freq + (K.abs(self.band) + self.MIN_BAND_HZ / self.sample_freq)

    def call(self, X):
        '''Convolve `X` (batch, width, 1) with the current band-pass filters.'''
        beg_freq = self.beg_freq
        end_freq = self.end_freq

        filters = []
        for i in range(self.nb_filters):
            low_pass1 = 2 * beg_freq[i] * self.sinc(beg_freq[i] * self.sample_freq, self.t_right)
            low_pass2 = 2 * end_freq[i] * self.sinc(end_freq[i] * self.sample_freq, self.t_right)
            band_pass = low_pass2 - low_pass1
            band_pass = band_pass / K.max(band_pass)

            filters.append(band_pass * self.window)

        filters = K.stack(filters)

        # TF convolution assumes data is stored as NWC
        filters = K.transpose(filters)
        filters = K.reshape(filters, (self.kernel_size, 1, self.nb_filters))

        return K.conv1d(X, filters)

    def compute_output_shape(self, input_shape):
        '''Output shape of the "valid", stride-1 convolution done in call().'''
        width = input_shape[1]
        out_width_size = None if width is None else width - self.kernel_size + 1
        return (input_shape[0], out_width_size, self.nb_filters)

    def get_config(self):
        '''Return the constructor arguments so the layer can be re-created.'''
        config = super(SincConv, self).get_config()
        config.update({
            "nb_filters": self.nb_filters,
            "kernel_size": self.kernel_size,
            "sample_freq": self.sample_freq,
        })
        return config


# Smoke test: run a single 9-tap filter (400 Hz sample rate) over a ramp of
# 63 samples shaped (batch, width, channels).
X = np.arange(63, dtype=np.single).reshape((1, 63, 1))
sinc_layer = SincConv(1, 9, 400)
y = sinc_layer(X)
# Print the result as (batch, filters, width).
print(y.numpy().transpose(0, 2, 1))

## SincConv (fast implementation)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

import numpy as np

class SincConvFast(Layer):
    '''
    Sinc-based convolution Keras layer

    All filters are built at once: the left half of every band-pass filter
    is computed from the learned cutoffs, windowed, and mirrored around the
    centre tap.

    Parameters
    ----------
    nb_filters : `int`
        Number of filters (= number of output channels).
    kernel_size : `int`
        Convolution filter width/length (will be increased by one if even).
    sample_freq : `int`
        Sample rate of input audio.
    stride : `int`
        Convolution stride param. Defaults to 1.
    padding : `string`
        Convolution padding param. Defaults to "VALID".
    min_low_hz : `int`
        Minimum lowest frequency for pass band filter. Defaults to 50.
    min_band_hz : `int`
        Minimum frequency for pass band filter. Defaults to 50.
    **kwargs
        Forwarded to `keras.layers.Layer` (e.g. `name`, `input_shape`).

    Reference
    ---------
    Mirco Ravanelli, Yoshua Bengio,
    "Speaker Recognition from raw waveform with SincNet".
    https://arxiv.org/abs/1808.00158
    '''

    @staticmethod
    def hz_to_mel(hz):
        '''Convert a frequency in Hz to the mel scale.'''
        return 2595.0 * np.log10(1.0 + hz / 700.0)

    @staticmethod
    def mel_to_hz(mels):
        '''Convert a mel-scale value back to Hz.'''
        return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)

    # model.add(SincConvFast(64, 251, sample_frequency, input_shape=(frame_length, 1)))
    def __init__(self, nb_filters=64, kernel_size=251, sample_freq=8000,
                 stride=1, padding="VALID", min_low_hz=50, min_band_hz=50,
                 **kwargs):
        # Forward the Keras arguments: dropping them silently ignored e.g. the
        # `input_shape` of the usage shown above.
        super().__init__(**kwargs)

        self.nb_filters = nb_filters
        self.kernel_size = kernel_size
        self.sample_freq = sample_freq
        self.stride = stride
        self.padding = padding
        self.min_low_hz = min_low_hz
        self.min_band_hz = min_band_hz

        # Force filter size to be odd for later optimizations with symmetry
        if kernel_size % 2 == 0:
            self.kernel_size = self.kernel_size + 1

        # Set trainable parameters
        self.low_hz = self.add_weight(
            name='low_hz',
            shape=(self.nb_filters,),
            initializer="zeros",
            trainable=True)
        self.band_hz = self.add_weight(
            name='band_hz',
            shape=(self.nb_filters,),
            initializer="zeros",
            trainable=True)

        # Initialize weights with frequencies of the mel-scale filter-bank
        low_freq_mel = self.hz_to_mel(30)
        high_freq_mel = self.hz_to_mel(self.sample_freq / 2 - (self.min_low_hz + self.min_band_hz))
        mel_points = np.linspace(low_freq_mel, high_freq_mel, num=self.nb_filters + 1)
        hz_points = self.mel_to_hz(mel_points)
        self.set_weights([hz_points[:-1], np.diff(hz_points)])

        # Determine half of t (left half of the time axis, scaled by 2*pi/fs)
        t_linspace = np.arange(-(self.kernel_size - 1) / 2, 0)
        t = tf.constant(2 * np.pi * t_linspace / self.sample_freq, dtype=tf.float32)
        self.t = tf.reshape(t, (1, -1))

        # Determine half of the hamming window. It stays a variable so the
        # layer's weight list keeps its layout, but it is frozen: the window
        # is a fixed taper, not a parameter to learn.
        n = np.linspace(0, (self.kernel_size / 2) - 1, num=int((self.kernel_size / 2)))
        window = 0.54 - 0.46 * np.cos(2 * np.pi * n / self.kernel_size)
        self.window = tf.Variable(window.astype("float32"), trainable=False)

    def call(self, X):
        '''Convolve `X` (batch, width, 1) with the current band-pass filters.'''
        # Cutoffs in Hz, kept above the configured minima and below Nyquist.
        low = self.min_low_hz + tf.abs(self.low_hz)
        high = tf.clip_by_value(low + self.min_band_hz + tf.abs(self.band_hz), self.min_low_hz, self.sample_freq / 2)
        band = high - low

        # Outer products: one row per filter, one column per time step.
        low_times_t = tf.linalg.matmul(tf.reshape(low, (-1, 1)), self.t)
        high_times_t = tf.linalg.matmul(tf.reshape(high, (-1, 1)), self.t)

        band_pass_left = ((tf.sin(high_times_t) - tf.sin(low_times_t)) / (self.t / 2)) * self.window
        band_pass_center = tf.reshape(2 * band, (-1, 1))
        band_pass_right = tf.reverse(band_pass_left, [1])

        filters = tf.concat([band_pass_left,
                             band_pass_center,
                             band_pass_right], axis=1)
        filters = filters / (2 * band[:, None])

        # TF convolution assumes data is stored as NWC
        filters = tf.transpose(filters)
        filters = tf.reshape(filters, (self.kernel_size, 1, self.nb_filters))

        return tf.nn.conv1d(X, filters, self.stride, self.padding)

    def compute_output_shape(self, input_shape):
        '''Output shape of the convolution for the configured stride/padding.'''
        width = input_shape[1]
        if width is None:
            out_width_size = None
        elif str(self.padding).upper() == "SAME":
            # ceil(width / stride)
            out_width_size = -(-width // self.stride)
        else:
            out_width_size = (width - self.kernel_size) // self.stride + 1
        return (input_shape[0], out_width_size, self.nb_filters)

    def get_config(self):
        '''Return the constructor arguments so the layer can be re-created.'''
        config = super().get_config()
        config.update({
            "nb_filters": self.nb_filters,
            "kernel_size": self.kernel_size,
            "sample_freq": self.sample_freq,
            "stride": self.stride,
            "padding": self.padding,
            "min_low_hz": self.min_low_hz,
            "min_band_hz": self.min_band_hz,
        })
        return config


# Smoke test: run two 9-tap filters (400 Hz sample rate) over a ramp of
# 63 samples shaped (batch, width, channels).
X = np.arange(63, dtype=np.single).reshape((1, 63, 1))
sinc_layer = SincConvFast(2, 9, 400)
y = sinc_layer(X)

print(y.shape)

# Print the result as (batch, filters, width).
print(y.numpy().transpose(0, 2, 1))

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

import numpy as np

class SincConvFast_1(SincConvFast):
    '''
    Sinc-based convolution Keras layer

    Same layer as `SincConvFast`, kept as a separate class name; the only
    difference is that `nb_filters`, `kernel_size` and `sample_freq` have no
    default value here. The implementation is inherited instead of being
    copied, so both classes always behave alike.

    Parameters
    ----------
    nb_filters : `int`
        Number of filters (= number of output channels).
    kernel_size : `int`
        Convolution filter width/length (will be increased by one if even).
    sample_freq : `int`
        Sample rate of input audio.
    stride : `int`
        Convolution stride param. Defaults to 1.
    padding : `string`
        Convolution padding param. Defaults to "VALID".
    min_low_hz : `int`
        Minimum lowest frequency for pass band filter. Defaults to 50.
    min_band_hz : `int`
        Minimum frequency for pass band filter. Defaults to 50.

    Reference
    ---------
    Mirco Ravanelli, Yoshua Bengio,
    "Speaker Recognition from raw waveform with SincNet".
    https://arxiv.org/abs/1808.00158
    '''

    def __init__(self, nb_filters, kernel_size, sample_freq,
                 stride=1, padding="VALID", min_low_hz=50, min_band_hz=50,
                 **kwargs):
        super().__init__(nb_filters, kernel_size, sample_freq,
                         stride=stride, padding=padding,
                         min_low_hz=min_low_hz, min_band_hz=min_band_hz,
                         **kwargs)


# Smoke test of the class defined in this cell (the copy-pasted version
# instantiated SincConvFast instead, leaving SincConvFast_1 untested).
X = np.arange(63, dtype=np.single).reshape((1, 63, 1))
sinc_layer = SincConvFast_1(2, 9, 400)
y = sinc_layer(X)

# Print the result as (batch, filters, width).
print(y.numpy().transpose(0, 2, 1))

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

import numpy as np

class SincConvFast_2(SincConvFast):
    '''
    Sinc-based convolution Keras layer

    Same layer as `SincConvFast`, kept as a separate class name; the only
    difference is that `nb_filters`, `kernel_size` and `sample_freq` have no
    default value here. The implementation is inherited instead of being
    copied, so both classes always behave alike.

    Parameters
    ----------
    nb_filters : `int`
        Number of filters (= number of output channels).
    kernel_size : `int`
        Convolution filter width/length (will be increased by one if even).
    sample_freq : `int`
        Sample rate of input audio.
    stride : `int`
        Convolution stride param. Defaults to 1.
    padding : `string`
        Convolution padding param. Defaults to "VALID".
    min_low_hz : `int`
        Minimum lowest frequency for pass band filter. Defaults to 50.
    min_band_hz : `int`
        Minimum frequency for pass band filter. Defaults to 50.

    Reference
    ---------
    Mirco Ravanelli, Yoshua Bengio,
    "Speaker Recognition from raw waveform with SincNet".
    https://arxiv.org/abs/1808.00158
    '''

    def __init__(self, nb_filters, kernel_size, sample_freq,
                 stride=1, padding="VALID", min_low_hz=50, min_band_hz=50,
                 **kwargs):
        super().__init__(nb_filters, kernel_size, sample_freq,
                         stride=stride, padding=padding,
                         min_low_hz=min_low_hz, min_band_hz=min_band_hz,
                         **kwargs)


# Smoke test of the class defined in this cell (the copy-pasted version
# instantiated SincConvFast instead, leaving SincConvFast_2 untested).
X = np.arange(63, dtype=np.single).reshape((1, 63, 1))
sinc_layer = SincConvFast_2(2, 9, 400)
y = sinc_layer(X)

# Print the result as (batch, filters, width).
print(y.numpy().transpose(0, 2, 1))

In [None]:
import tensorflow as tf
# Show which TensorFlow version is loaded in this kernel.
print(tf.__version__)

In [None]:
# NOTE(review): TensorFlow has already been imported and used by earlier
# cells, and an earlier cell runs an unpinned `tensorflow-gpu --upgrade` —
# confirm the intended version and whether a runtime restart is expected here.
!pip install tensorflow==2.2.0

## Load dataset (BreathSound)

**The following preprocessing steps (described in the original article) are not implemented:**

- Non-speech intervals at the beginning and end of each sentence were removed
- The BreathSound sentences with internal silences lasting more than 125 ms were split into multiple chunks.
- For the BreathSound corpus, the training and test material was randomly selected so that each speaker has 12-15 seconds of training material and the test sentences last 2-6 seconds.

In [None]:
sample_frequency = 8000 # sample rate assumed for the clips, in Hz (8 kHz)
frame_size = 0.5    # duration of one frame, in seconds (500 ms)
frame_stride = 0.05      # hop between consecutive frames, in seconds (50 ms)
max_num_frames = 10  # maximum number of frames taken from one file

# Frame length and hop converted to a number of samples.
frame_length = int(round(frame_size * sample_frequency))
frame_step = int(round(frame_stride * sample_frequency))

batch_size = 64
# Number of classes passed as `max_speakers` when loading the data set.
nb_speakers = 3

# NOTE(review): `frame_length` samples span `frame_size` seconds, while the
# duration printed next to it is frame_size * max_num_frames — confirm which
# one the message is meant to report.
print("Input length: {} ({}s)".format(frame_length, frame_size * max_num_frames))
print("Input shape: {}".format((batch_size, frame_length)))

#### Download BreathSound

In [None]:
#!wget https://www.openslr.org/resources/12/train-clean-100.tar.gz
#!tar xf train-clean-100.tar.gz

In [None]:
# from IPython.display import Audio
# Audio('BreathSound/train-clean-100/1081/128618/1081-128618-0012.flac')
# Audio('breath-deep/data/output/train/01_male_23_BQuyen/normal/01_male_23_BQuyen_0_normal.wav')

#### Create BreathSound generator

In [None]:
from tensorflow.keras.utils import Sequence
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import soundfile as sf
import glob

def get_frames_indices(filename):
    """Return the start offsets (in samples) of the frames taken from a file.

    The file must be longer than `frame_length` samples. Offsets are spaced
    by `frame_step` and at most `max_num_frames` of them are returned.
    """
    audio, _sample_rate = sf.read(filename)

    n_samples = len(audio)
    assert n_samples > frame_length

    # Frames that fit in the file, capped by the configured maximum.
    available = int(np.floor((n_samples - frame_length) / frame_step))
    kept = min(available, max_num_frames)

    return np.arange(0, kept * frame_step, frame_step)

def load_dataset(data_folder, max_speakers=10, max_utterances=10):
    """Index the frames of a data set without loading the audio.

    `data_folder` is a glob pattern; every path it matches is one class.
    Matches are sorted before labels are assigned, so the integer label of a
    class is reproducible between runs and consistent between two folder
    trees with the same layout (e.g. training and testing).

    Arguments:
        data_folder {str} -- glob pattern matching one directory per class
        max_speakers {int} -- number of classes to keep (first matches)
        max_utterances {int} -- maximum number of WAV files used per class

    Returns:
        (X, y) -- X is a list of [filename, frame start offset] pairs and y
        the list of matching integer class labels.
    """
    X = []
    y = []

    # glob returns paths in arbitrary order; sort for deterministic labels.
    files = sorted(glob.glob(data_folder))

    for speaker_id, speaker_pattern in enumerate(files[:max(0, max_speakers)]):
        speaker_files = sorted(glob.glob(speaker_pattern))

        nb_utterances_for_speaker = 0

        for speaker_dir in speaker_files:
            for filename in sorted(glob.glob(speaker_dir + '/*.wav')):
                if nb_utterances_for_speaker >= max_utterances:
                    break

                for frame in get_frames_indices(filename):
                    X.append([filename, frame])
                    y.append(speaker_id)

                nb_utterances_for_speaker += 1

    return X, y

class BreathSoundGenerator(Sequence):
    """Keras Sequence that reads audio frames from disk one batch at a time.

    Args:
        X: list of (path, frame_start) pairs, as produced by load_dataset.
        y: list of integer labels aligned with X.
        batch_size: number of frames per batch.

    Every batch has exactly `batch_size` rows. When the last batch runs past
    the end of the data it is filled by wrapping around to the first samples,
    so repeated passes over the generator yield identical batches.
    Relies on the notebook global `frame_length`.
    """

    def __init__(self, X, y, batch_size):
        self.X = X
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches per epoch (the last one may be padded)."""
        nb_batches = len(self.y) / float(self.batch_size)
        # Plain int: the np.int alias no longer exists in recent NumPy releases.
        return int(np.ceil(nb_batches))

    def __getitem__(self, batch_id):
        """Return (X_batch, y_batch) with shapes (batch_size, frame_length, 1) and (batch_size,)."""
        X_batch = np.zeros((self.batch_size, frame_length, 1))
        y_batch = np.zeros(self.batch_size)
        nb_samples = len(self.y)

        for i in range(self.batch_size):
            sample_id = batch_id * self.batch_size + i
            if sample_id >= nb_samples:
                # Deterministic padding of the final, incomplete batch.
                sample_id %= nb_samples

            path, frame = self.X[sample_id]
            signal, fs = sf.read(path)

            X_batch[i, :, 0] = signal[frame:frame+frame_length]
            y_batch[i] = self.y[sample_id]

        return X_batch, y_batch

In [None]:
# Load and determine audio filenames and their associated speaker
X_, y_ = load_dataset("/content/gdrive/My Drive/Breath-Data/Users-training/*/*",
                    max_speakers=nb_speakers,
                    max_utterances=8)

from sklearn import preprocessing

nb_speakers = 3

# X_, y_ = load_dataset("/content/gdrive/My Drive/Breath-Data/data/training/*",
#                     max_speakers=nb_speakers,
#                     max_utterances=8)

X__, y__ = load_dataset("/content/gdrive/My Drive/Breath-Data/Users-testing/*/*",
                    max_speakers=nb_speakers,
                    max_utterances=8)

!ls /content/gdrive/My Drive/Breath-Data/data/*

# mean = np.mean(np.array(X_), axis=0)
# std = np.std(np.array(X_), axis=0)

# X_ = (X_ - mean)/std



print(len(X_), len(y_))
# Split in train and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.2)
X_train, _, y_train, _ = train_test_split(X_, y_, test_size=0.1)
_, X_val, _, y_val = train_test_split(X__, y__, test_size=0.1)

# normalize the data attributes
# y_train = preprocessing.normalize(y_train, axis=0)
# y_val = preprocessing.normalize(y_val, axis=0)

print(y_train)
print(y_val)

# Instantiate custom generator to load each batch at once
train_gen = BreathSoundGenerator(X_train, y_train, batch_size)
print(train_gen)
val_gen = BreathSoundGenerator(X_val, y_val, batch_size)

In [None]:
# len() of a BreathSoundGenerator is its number of batches
# (number of frames divided by batch_size, rounded up).
print("Number of training batches:", len(train_gen))
print("Number of validation batches:", len(val_gen))

## SincNet

**The model is simplified to avoid overfitting on our subset of BreathSound:**

- No LayerNormalization on the input, as it prevents the model from converging
- The number of filters is reduced
- 2 conv layers instead of 3
- 2 dense layers instead of 3

### Create model

In [None]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, BatchNormalization, LeakyReLU, Flatten, LayerNormalization
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
from keras.layers import Input, Dense, Lambda, Layer
from keras.initializers import Constant
from keras.models import Model
from keras import backend as K

# Custom loss layer
class CustomMultiLossLayer(Layer):
    """Layer that registers a weighted sum of squared errors as the model loss.

    The layer receives `nb_outputs` target tensors followed by `nb_outputs`
    prediction tensors. It owns one trainable scalar `log_var` per output; each
    squared error is scaled by exp(-log_var) and log_var itself is added to it.
    The resulting loss is attached through add_loss(), so the model is compiled
    without an explicit loss.

    Note: the notebook defines a class with this name a second time further
    down; the later definition replaces this one once its cell has run.
    """

    def __init__(self, nb_outputs=2, **kwargs):
        self.nb_outputs = nb_outputs
        self.is_placeholder = True
        super().__init__(**kwargs)

    def build(self, input_shape=None):
        """Create one trainable scalar per output, initialised to 0."""
        self.log_vars = []
        for idx in range(self.nb_outputs):
            weight = self.add_weight(name='log_var' + str(idx),
                                     shape=(1,),
                                     initializer=Constant(0.),
                                     trainable=True)
            self.log_vars.append(weight)
        super().build(input_shape)

    def multi_loss(self, ys_true, ys_pred):
        """Return the mean over the batch of the summed per-output losses."""
        assert len(ys_true) == self.nb_outputs and len(ys_pred) == self.nb_outputs
        total = 0
        for target, estimate, log_var in zip(ys_true, ys_pred, self.log_vars):
            scalar_log_var = log_var[0]
            inv_variance = K.exp(-scalar_log_var)
            squared_error = (target - estimate)**2.
            total += K.sum(inv_variance * squared_error + scalar_log_var, -1)
        return K.mean(total)

    def call(self, inputs):
        """Split inputs into targets/predictions, register the loss, return their concatenation."""
        targets = inputs[:self.nb_outputs]
        estimates = inputs[self.nb_outputs:]
        self.add_loss(self.multi_loss(targets, estimates), inputs=inputs)
        # The returned tensor is only a placeholder output.
        return K.concatenate(inputs, -1)

In [None]:

# %tensorflow_version 2.x

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import BatchNormalization, Conv2D, MaxPooling2D, Activation, Flatten, Dropout, Dense, DepthwiseConv2D, GlobalAveragePooling2D, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Conv1D as conv1_1
from tensorflow.keras.layers import Conv1D as conv1_2
from tensorflow.keras.layers import Conv1D as conv1_3
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Flatten
from keras.layers.merge import add
import tensorflow.keras.backend as K

# Per-sample input shape of the network: frame_length values on a single channel.
INPUT_SHAPE = (frame_length, 1)

class MultiTaskLossFunctionNet(object):
  """Builders for the multi-output SincNet classifier (one loss function per output).

  All members are static helpers; the class is never instantiated. They rely on
  notebook globals defined in other cells: SincConvFast_1, SincConvFast_2,
  CuLSTM, sample_frequency, frame_length, nb_speakers and INPUT_SHAPE.

  NOTE(review): in the visible part of the notebook CuLSTM is only imported in
  a later cell - confirm it is in scope when this cell runs on a fresh kernel.
  """

  @staticmethod
  def _cnn_trunk(x, features_name):
    """Pooling/conv/dense stack applied after the SincConv front end.

    `features_name` is the name given to the last activation layer.
    """
    x = MaxPooling1D(pool_size=3)(x)
    x = LayerNormalization()(x)
    x = LeakyReLU(alpha=0.2)(x)

    x = Conv1D(filters=32, kernel_size=5)(x)
    x = MaxPooling1D(pool_size=3)(x)
    x = LayerNormalization()(x)
    x = LeakyReLU(alpha=0.2)(x)

    x = Flatten()(x)
    x = LayerNormalization()(x)

    x = Dense(64)(x)
    x = BatchNormalization(momentum=0.05)(x)
    x = LeakyReLU(alpha=0.2)(x)

    x = Dense(64)(x)
    x = BatchNormalization(momentum=0.05)(x)
    x = LeakyReLU(name=features_name, alpha=0.2)(x)

    return x

  @staticmethod
  def _residual_bilstm_trunk(x):
    """Four stacked bidirectional LSTMs with two additive skips, then Dense(256) + Dropout."""
    z1 = Bidirectional(CuLSTM(128, return_sequences=True))(x)
    z2 = Bidirectional(CuLSTM(units=128, return_sequences=True))(z1)
    z3 = add([z1, z2])  # residual connection
    z4 = Bidirectional(CuLSTM(units=128, return_sequences=True))(z3)
    z5 = Bidirectional(CuLSTM(units=128, return_sequences=False))(z4)
    # NOTE(review): z5 has no time axis (return_sequences=False) while z4 has
    # one, so this merge relies on Keras broadcasting - confirm it is intended.
    z6 = add([z4, z5])  # residual connection
    z61 = Flatten()(z6)
    z7 = Dense(256, activation='relu')(z61)
    z8 = Dropout(0.5)(z7)

    return z8

  @staticmethod
  def _linear_basenetwork(inputs, classes = nb_speakers, finAct = 'linear'):
    """Stand-alone branch ending in the `finAct` activation named 'output_1'."""
    x = SincConvFast_1(64, 251, sample_frequency)(inputs)
    x = MultiTaskLossFunctionNet._cnn_trunk(x, 'layer_features_1')

    # `classes` sets the width of the output layer (it was previously ignored).
    x = Dense(classes)(x)
    x = Activation(finAct, name='output_1')(x)

    return x

  @staticmethod
  def _relu_basenetwork(inputs, classes = nb_speakers, finAct = 'sigmoid'):
    """Stand-alone branch ending in the `finAct` activation named 'output_2'."""
    x = SincConvFast_1(64, 251, sample_frequency)(inputs)
    x = MultiTaskLossFunctionNet._cnn_trunk(x, 'layer_features_2')

    x = Dense(classes)(x)
    x = Activation(finAct, name='output_2')(x)

    return x

  @staticmethod
  def _softmax_basenetwork(inputs, classes = nb_speakers, finAct = 'softmax'):
    """Stand-alone branch ending in the `finAct` activation named 'output_3'."""
    x = SincConvFast_2(64, 251, sample_frequency, input_shape=(frame_length, 1))(inputs)
    x = MultiTaskLossFunctionNet._cnn_trunk(x, 'layer_features_3')

    x = Dense(classes)(x)
    x = Activation(finAct, name='output_3')(x)

    return x

  @staticmethod
  def basenetwork():
    """Build the CNN feature extractor on a fresh Input and return its feature tensor."""
    inputs = Input(shape=INPUT_SHAPE)
    x = SincConvFast_2(64, 251, sample_frequency, input_shape=(frame_length, 1))(inputs)

    return MultiTaskLossFunctionNet._cnn_trunk(x, 'layer_features_3')

  @staticmethod
  def basenetwork_lstm():
    """Build the BiLSTM feature extractor on a fresh Input and return its feature tensor."""
    inputs = Input(shape=INPUT_SHAPE)
    x = SincConvFast_2(64, 251, sample_frequency, input_shape=(frame_length, 1))(inputs)

    return MultiTaskLossFunctionNet._residual_bilstm_trunk(x)

  @staticmethod
  def build(inputShape=INPUT_SHAPE, numGender = nb_speakers, numRace = nb_speakers):
    """Build the four-output model named "multitask_net".

    A single SincConv front end feeds two trunks: the CNN trunk drives
    'output_1' (softmax), 'output_2' (sigmoid) and 'output_3' (softmax); the
    BiLSTM trunk drives 'output_4' (softmax). Every output has nb_speakers units.

    Args:
        inputShape: per-sample input shape of the model.
        numGender, numRace: unused; kept so existing calls keep working.

    Returns:
        An uncompiled keras Model.
    """
    # Honour the inputShape argument (the default equals INPUT_SHAPE).
    inputs = Input(shape=inputShape)
    x = SincConvFast_2(64, 251, sample_frequency, input_shape=inputShape)(inputs)

    # Both trunks share the same front-end output.
    z8 = MultiTaskLossFunctionNet._residual_bilstm_trunk(x)
    x = MultiTaskLossFunctionNet._cnn_trunk(x, 'layer_features_3')

    # The three CNN heads share one trunk; each is trained with its own loss.
    branch_1 = Dense(nb_speakers, activation='softmax', name='output_1')(x)
    branch_2 = Dense(nb_speakers, activation='sigmoid', name='output_2')(x)
    branch_3 = Dense(nb_speakers, activation='softmax', name='output_3')(x)
    branch_4 = Dense(nb_speakers, activation='softmax', name='output_4')(z8)

    # One input batch goes in; the model then branches into one output per
    # loss function.
    model = Model(
      inputs=inputs,
      outputs=[branch_1,
               branch_2,
               branch_3,
               branch_4,
               ],
      name="multitask_net")

    return model

# Build the multi-output model and print its layer summary.
model_multi = MultiTaskLossFunctionNet.build(inputShape=INPUT_SHAPE)
model_multi.summary()

In [None]:
# Names of the loss/metric values Keras reports for model_multi.
# NOTE(review): this cell sits before model_multi.compile(); the list may be
# empty at this point - confirm.
print(model_multi.metrics_names)

### Train model - Multitask Sincnet CNN

In [None]:
def masked_loss_function(y_true, y_pred):
    """Categorical cross-entropy with entries equal to `mask_value` zeroed out.

    Positions where y_true equals the notebook global `mask_value` are
    multiplied by 0 in both y_true and y_pred before the cross-entropy is
    computed.
    """
    keep = K.not_equal(y_true, mask_value)
    weights = K.cast(keep, K.floatx())
    masked_true = y_true * weights
    masked_pred = y_pred * weights
    return K.categorical_crossentropy(masked_true, masked_pred)

In [None]:
from tensorflow.keras.losses import mean_squared_error, binary_crossentropy, sparse_categorical_crossentropy
from tensorflow.keras.metrics import kl_divergence

def kl_crossentropy(y_true, y_pred):
    """KL divergence plus sparse categorical cross-entropy.

    Args:
        y_true: integer class ids (sparse labels).
        y_pred: predicted class probabilities, one column per class.

    The labels are one-hot encoded for the KL term so that it compares two
    distributions instead of raw class ids against probabilities.
    """
    class_ids = tf.cast(tf.reshape(y_true, [-1]), tf.int32)
    y_true_one_hot = tf.one_hot(class_ids,
                                depth=tf.shape(y_pred)[-1],
                                dtype=y_pred.dtype)
    kl = kl_divergence(y_true_one_hot, y_pred)
    crossentropy = sparse_categorical_crossentropy(y_true, y_pred)
    return kl + crossentropy


def mse_crossentropy(y_true, y_pred):
    """Average of mean squared error and sparse categorical cross-entropy.

    Args:
        y_true: integer class ids (sparse labels).
        y_pred: predicted class probabilities, one column per class.

    The labels are one-hot encoded for the MSE term; without that the class id
    itself (0, 1, 2, ...) would be subtracted from every predicted probability.
    """
    class_ids = tf.cast(tf.reshape(y_true, [-1]), tf.int32)
    y_true_one_hot = tf.one_hot(class_ids,
                                depth=tf.shape(y_pred)[-1],
                                dtype=y_pred.dtype)
    mse = mean_squared_error(y_true_one_hot, y_pred)
    crossentropy = sparse_categorical_crossentropy(y_true, y_pred)
    return ((mse + crossentropy)/2)

In [None]:
# import tensorflow_addons as tfa

# One loss per named model output. Alternatives tried for output_1/output_2:
# "mse", "cosine_similarity", "center_loss", mean_squared_error (low accuracy).
losses = {
    "output_1": "sparse_categorical_crossentropy",
    "output_2": "poisson",
    "output_3": mse_crossentropy,
    "output_4": "sparse_categorical_crossentropy",
}

# Relative weight of each output's loss in the total loss.
lossWeights = {
    "output_1": 3000,
    "output_2": 3000,
    "output_3": 3000,
    "output_4": 1.0,
}

# `learning_rate` is the supported spelling of the deprecated `lr` argument.
model_multi.compile(loss=losses,
                    loss_weights=lossWeights,
                    optimizer=RMSprop(learning_rate=0.001),
                    metrics=['accuracy'])



In [None]:
# Checkpoints are written relative to the current working directory.
if not os.path.exists('checkpoints'):
    os.mkdir('checkpoints')

# Weights only, one file name per epoch, written only when the monitored
# value improves.
checkpoint_path = "./checkpoints/training-{epoch:04d}.ckpt"
save_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_best_only=True,
)

history = model_multi.fit(
    train_gen,
    epochs=100,
    initial_epoch=0,
    validation_data=val_gen,
    callbacks=[save_callback],
)

### Evaluate model

#### Loss and accuracy

In [None]:
# FIXME: Evaluating a generator returns different results at every call
# (check how BreathSoundGenerator fills the final, incomplete batch).

val_accuracy = model_multi.evaluate(val_gen)

# For this multi-output model evaluate() returns a list: total loss, then the
# per-output losses and metrics. Label each value rather than printing the
# whole list as "accuracy".
for metric_name, metric_value in zip(model_multi.metrics_names, val_accuracy):
    print("{} on validation set: {}".format(metric_name, metric_value))

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy of the 'output_3' head across epochs.
fig, ax = plt.subplots()
ax.plot(history.history['output_3_accuracy'])
ax.plot(history.history['val_output_3_accuracy'])
ax.set_title('Model accuracy over epochs')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# !pip install matplotlib

#### Confusion matrix and metrics

In [None]:
# Load all batches of validation generator in memory

# Materialise every validation batch. X_val / y_val are rebound here from the
# (path, frame) lists to dense arrays.
nb_batches = len(val_gen)
X_val = np.empty((nb_batches, batch_size, frame_length, 1))
y_val = np.empty((nb_batches, batch_size))

for i in range(nb_batches):
    batch_inputs, batch_labels = val_gen[i]
    X_val[i] = batch_inputs
    y_val[i] = batch_labels

# Flatten the batch axis: one row per frame.
X_val = X_val.reshape((nb_batches * batch_size, -1))
y_val = y_val.reshape(nb_batches * batch_size)

# predict() returns one array per model output; after argmax, row k of
# y_val_actual holds the predicted classes of output k.
val_predictions = model_multi.predict(X_val)
y_val_actual = np.argmax(val_predictions, axis=-1)
print(y_val)
print(y_val_actual)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# https://en.wikipedia.org/wiki/Precision_and_recall

# Macro-averaged scores for the first model output only.
p, r, f1, _ = precision_recall_fscore_support(y_val,
                                              y_val_actual[0], # first output; y_val_actual has one row per model output
                                              average='macro',
                                              zero_division=0)

# print(y_val)
# print(y_val_actual)

print("Precision:", p)
print("Recall:", r)
print("F1 score:", f1)

In [None]:
from sklearn import metrics

# Print the precision and recall, among other metrics, for the first model
# output (y_val_actual[0]).
print(metrics.classification_report(y_val, y_val_actual[0], digits=3))

In [None]:
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

# Confusion matrix of the first model output against the validation labels.
cm = confusion_matrix(y_val, y_val_actual[0])

# Show row-normalised values instead of absolute counts.
plot_confusion_matrix(cm,
                      show_normed=True,
                      show_absolute=False,
                      figsize=(10, 10),
                      hide_ticks=True)

## Model BiLSTM 

##### Pre-Processing Data

In [None]:
!pip install playsound

In [None]:
import librosa
import os
import glob
import itertools
from playsound import playsound
from random import shuffle
import numpy as np
import matplotlib.pyplot as plt
from librosa.feature import melspectrogram
from librosa.util import normalize
from librosa.display import waveplot

In [None]:
## Data library

# One Google Drive folder per breathing class; each path ends with a slash.
heavyData = '/content/gdrive/My Drive/Breath-Data/Training-Data/heavy/'
otherData = '/content/gdrive/My Drive/Breath-Data/Training-Data/other/'
deepData = '/content/gdrive/My Drive/Breath-Data/Training-Data/deep/'
normalData = '/content/gdrive/My Drive/Breath-Data/Training-Data/normal/'

In [None]:
# os.chdir(heavyData)
# !ls

In [None]:
#Read original data

def readCoughData(file):
    """Load an audio file at its native sampling rate.

    Returns:
        (samples, sampling_rate) as returned by librosa.load with sr=None.
    """
    return librosa.load(file, sr=None)

In [None]:
# Resample the original data to the target sampling rate (e.g. 16 kHz)

def resample(originalData, origSampFreq, targetSampFreq):
    """Resample `originalData` from `origSampFreq` to `targetSampFreq` (both in Hz).

    The sampling rates are passed by keyword: recent librosa releases no
    longer accept them positionally, and older releases accept both forms.
    """
    resampledData = librosa.resample(originalData,
                                     orig_sr=origSampFreq,
                                     target_sr=targetSampFreq)
    return resampledData

In [None]:
# Normalize Sound Data

def normalizeSound(resampledData, axis):
    """Scale the signal with librosa.util.normalize along `axis`.

    Args:
        resampledData: array to normalise.
        axis: axis along which the norm is computed (0 for a 1-D signal).

    `axis` is passed by keyword: the second positional parameter of
    librosa.util.normalize is `norm`, so a positional axis would be taken as
    the norm type. The function is reached through the librosa module because
    the bare name `normalize` is rebound later in this notebook.
    """
    normalizedData = librosa.util.normalize(resampledData, axis=axis)
    return normalizedData

In [None]:
# Calculate Mel-Spectogram

def calculateMelSpectogram(normalizedData, hop_length, win_length, sr):
    """Compute the mel spectrogram of a signal.

    Args:
        normalizedData: audio samples.
        hop_length, win_length: forwarded to librosa.feature.melspectrogram.
        sr: sampling rate of `normalizedData` in Hz.

    The signal is passed as `y=`: recent librosa releases require keyword
    arguments here, and older releases accept them as well.
    """
    S = librosa.feature.melspectrogram(y=normalizedData,
                                       sr=sr,
                                       hop_length=hop_length,
                                       win_length=win_length)
    return S

In [None]:
# plot orginal time domain data

def plotSound(soundData, sr, x_axis_string):
    """Draw the time-domain waveform of `soundData` sampled at `sr`.

    `x_axis_string` is forwarded to waveplot as its x_axis argument.
    """
    waveplot(soundData, sr=sr, x_axis=x_axis_string)

In [None]:
#Plot melspectogram

def plotMelSpectogram(S, sr, ref=np.max):
    """Plot a mel power spectrogram in decibels.

    Args:
        S: mel power spectrogram.
        sr: sampling rate used to label the axes.
        ref: reference passed to librosa.power_to_db (default: the maximum of S).
    """
    plt.figure(figsize=(10, 4))
    # Use the caller's ref and sr; they were previously overridden by
    # hard-coded np.max and 16000.
    S_dB = librosa.power_to_db(S, ref=ref)
    librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=sr)
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel-frequency spectrogram')
    plt.tight_layout()
    plt.show()

In [None]:
def featureExtraction(audioFile, targetSampFreq, axis, hop_length,win_length):
    """Load, resample and normalise an audio file, then return its mel spectrogram.

    Also prints the raw samples and sampling rate, and plots both the
    normalised waveform and the mel spectrogram.

    Args:
        audioFile: path of the audio file.
        targetSampFreq: sampling rate to resample to.
        axis: forwarded to normalizeSound.
        hop_length, win_length: forwarded to calculateMelSpectogram.
    """
    raw_audio, raw_sr = readCoughData(file=audioFile)
    print(raw_audio, raw_sr)

    resampled = resample(originalData=raw_audio,
                         origSampFreq=raw_sr,
                         targetSampFreq=targetSampFreq)
    normalized = normalizeSound(resampled, axis=axis)
    mel = calculateMelSpectogram(normalizedData=normalized,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sr=targetSampFreq)

    plotSound(soundData=normalized, sr=targetSampFreq, x_axis_string='time')
    plotMelSpectogram(mel, sr=targetSampFreq, ref=np.max)
    return mel

In [None]:
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
from tensorflow.python.keras import backend as K
from matplotlib import pyplot as plt
from tensorflow import keras
from tensorflow.keras.layers import MaxPooling2D, Conv2D, Dense, Activation, Flatten, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, RMSprop

In [None]:
def cough_detection_model():
    """Build and compile a small 2-D CNN with a two-unit softmax output.

    Input shape is (432, 228, 1). The network is pool -> conv -> pool ->
    conv -> pool, followed by two Dense(128) + Dropout(0.5) blocks.

    Returns:
        A compiled keras Model (binary cross-entropy loss, Adam, accuracy metric).
    """
    input_layer = Input((432,228,1))
    x = MaxPooling2D(pool_size=(2, 2))(input_layer)
    x = Conv2D(filters=32,kernel_size=(5,5),padding='same')(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Conv2D(filters=32,kernel_size=(5,5),padding='same')(x)
    x = Activation('relu')(x)
    # Pool the convolution output. Pooling `input_layer` here would bypass
    # both convolution layers.
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Flatten()(x)
    x = Dense(128)(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(128)(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    output_layer = Dense(2,activation = 'softmax')(x)
    model = Model(inputs=input_layer,outputs=output_layer)
    # `learning_rate` is the supported spelling of the deprecated `lr` argument.
    adam = Adam(learning_rate=0.0001)
    model.compile(loss='binary_crossentropy',optimizer=adam,metrics=['accuracy'])
    return model

In [None]:
# def one_hot(a):
#     b = np.zeros((a.size, a.max()+1))
#     b[np.arange(a.size),a] = 1
#     return b
# X_train =                         # load your data here shape (80,100,50)
# X_train = X_train.reshape((80,100,50,1))
# y_train =                         # load your labels here shape (80,1)
# y_train = one_hot(y_train)                       # one_hot_encoding
# number_of_epochs = 50 # number of times you fed each data on X_train to the model
# #model = cough_detection_model() # here you have to call the model you want to use, in this case DL_MC
# model = cough_detection_model()
# print('# Fit model on training data')
# history = model.fit(X_train, y_train,
#                     batch_size = 4,
#                     epochs = number_of_epochs, validation_data = (X_train,y_train)) #I have set same data for training and
#                                                 # for validation because we have few instances, later when we have
#                                                 #more data we will make an split train/validation/test
# print('\nhistory dict:', history.history)

##### Import Library 

In [None]:
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import json
import pickle
from scipy.io import wavfile

# os.environ["CUDA_DEVICE_ORDER"]="0"
import tensorflow as tf

# The imports below are executed inside a tf.device('/device:GPU:0') context.
with tf.device('/device:GPU:0'):

    import tensorflow.keras as keras
    import tensorflow as tf

# Allow memory growth for the GPU
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(physical_devices, True)

# from tensorflow.keras.utils import multi_gpu_model
# from keras.backend.tensorflow_backend import set_session
    import librosa
    from sklearn.metrics import classification_report, confusion_matrix
    from tensorflow.keras.callbacks import ModelCheckpoint
    from tensorflow.keras.utils import to_categorical
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Conv2D, MaxPooling2D
    from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, Permute, Reshape, TimeDistributed
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Flatten
    # CuLSTM is the alias used by every LSTM-based model builder in this notebook.
    from tensorflow.compat.v1.keras.layers import CuDNNLSTM as CuLSTM
    from tensorflow.compat.v1.keras.layers import Input, Dense, Lambda, Layer
    from tensorflow.keras.layers import add
    from tensorflow.keras.layers import Input
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import BatchNormalization
    # NOTE: this rebinds `preprocessing`, which an earlier cell imported from sklearn.
    from tensorflow.keras.layers.experimental import preprocessing

    # NOTE: this rebinds `normalize`, which an earlier cell imported from
    # librosa.util; code that runs after this cell sees the Keras layer instead.
    normalize = preprocessing.Normalization()
# from keras.utils import multi_gpu_model

# from LSTM_MODEL import LSTM_MODEL
# from dataset import BreathDataGenerator



##### Init model

In [None]:
# from keras.layers import Input, Dense, Lambda, Layer
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

# Custom loss layer
class CustomMultiLossLayer(tf.compat.v1.keras.layers.Layer):
    """Layer that registers a weighted sum of squared errors as the model loss.

    Expects `nb_outputs` target tensors followed by `nb_outputs` prediction
    tensors. One trainable scalar `log_var` is created per output; each squared
    error is scaled by exp(-log_var) and log_var itself is added to it. The
    loss is attached with add_loss(), so the surrounding model is compiled
    without an explicit loss.

    Note: this redefines the CustomMultiLossLayer declared earlier in the
    notebook, on top of the tf.compat.v1 Layer base class.
    """

    def __init__(self, nb_outputs=2, **kwargs):
        self.nb_outputs = nb_outputs
        self.is_placeholder = True
        super().__init__(**kwargs)

    def build(self, input_shape=None):
        """Create one trainable scalar per output, initialised to 0."""
        self.log_vars = []
        for idx in range(self.nb_outputs):
            weight = self.add_weight(name='log_var' + str(idx),
                                     shape=(1,),
                                     initializer=Constant(0.),
                                     trainable=True)
            self.log_vars.append(weight)
        super().build(input_shape)

    def multi_loss(self, ys_true, ys_pred):
        """Return the mean over the batch of the summed per-output losses."""
        assert len(ys_true) == self.nb_outputs and len(ys_pred) == self.nb_outputs
        total = 0
        for target, estimate, log_var in zip(ys_true, ys_pred, self.log_vars):
            scalar_log_var = log_var[0]
            inv_variance = K.exp(-scalar_log_var)
            squared_error = (target - estimate)**2.
            total += K.sum(inv_variance * squared_error + scalar_log_var, -1)
        return K.mean(total)

    def call(self, inputs):
        """Split inputs into targets/predictions, register the loss, return their concatenation."""
        targets = inputs[:self.nb_outputs]
        estimates = inputs[self.nb_outputs:]
        self.add_loss(self.multi_loss(targets, estimates), inputs=inputs)
        # The returned tensor is only a placeholder output.
        return K.concatenate(inputs, -1)

In [None]:
# from tensorflow.python.framework.ops import disable_eager_execution
# disable_eager_execution()

def get_prediction_model(classes, data_input_shape=(3, 32, 251)):
    """Build the two-headed residual BiLSTM model and print its summary.

    Args:
        classes: number of units of the softmax layer and of both heads.
        data_input_shape: per-sample input shape. The default is the shape
            that used to be hard-coded.
            NOTE(review): the recurrent layers need a (timesteps, features)
            input - confirm the intended default.

    Returns:
        An uncompiled keras Model with outputs 'output_1' and 'output_2'.
    """
    # The tensor has to be called `inp`, the name used by the layers below
    # (it used to be assigned to `input`, leaving `inp` undefined here).
    inp = Input(shape=data_input_shape, name='inp')

    z1 = Bidirectional(CuLSTM(128, return_sequences=True))(inp)
    z2 = Bidirectional(CuLSTM(units=128, return_sequences=True))(z1)
    z3 = add([z1, z2])  # residual connection
    z4 = Bidirectional(CuLSTM(units=128, return_sequences=True))(z3)
    z5 = Bidirectional(CuLSTM(units=128, return_sequences=False))(z4)
    z6 = add([z4, z5])  # residual connection
    z61 = Flatten()(z6)
    z7 = Dense(256, activation='relu')(z61)
    # NOTE(review): the dropout output z8 is not connected to `out` below
    # (build_residual_bilstm feeds z8 into its output layer) - confirm.
    z8 = Dropout(0.5)(z7)

    out = Dense(classes, activation='softmax')(z7)

    y1_pred = Dense(classes, activation='relu', name='output_1')(out)
    y2_pred = Dense(classes, activation='relu', name='output_2')(out)

    model  = Model(inp, [y1_pred, y2_pred])

    model.summary()

    return model
    # return y1_pred, y2_pred

def get_trainable_model(data_input_shape, classes, learning_rate):
    """Build the residual BiLSTM model wrapped with CustomMultiLossLayer.

    The returned model takes three inputs ('inp', 'y1_true', 'y2_true'). Its
    loss is added inside CustomMultiLossLayer, which is why compile() receives
    loss=None.

    Args:
        data_input_shape: per-sample shape of the 'inp' input.
        classes: width of the softmax layer, of both heads and of both label inputs.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        A compiled keras Model whose single output is the loss layer's output.
    """
    inp = Input(shape=data_input_shape, name='inp')

    z1 = Bidirectional(CuLSTM(128, return_sequences=True))(inp)
    z2 = Bidirectional(CuLSTM(units=128, return_sequences=True))(z1)
    z3 = add([z1, z2])  # residual connection
    z4 = Bidirectional(CuLSTM(units=128, return_sequences=True))(z3)
    z5 = Bidirectional(CuLSTM(units=128, return_sequences=False))(z4)
    z6 = add([z4, z5])  # residual connection
    z61 = Flatten()(z6)
    z7 = Dense(256, activation='relu')(z61)
    # NOTE(review): the dropout output z8 is not connected to `out` below - confirm.
    z8 = Dropout(0.5)(z7)

    out = Dense(classes, activation='softmax')(z7)

    y1_pred = Dense(classes, activation='relu', name='output_1')(out)
    y2_pred = Dense(classes, activation='relu', name='output_2')(out)

    model_input  = Model(inp, [y1_pred, y2_pred])

    # Call the prediction sub-model on the input to obtain the two heads.
    y1_pred, y2_pred = model_input(inp)

    y1_true = Input(shape=(classes,), name='y1_true')
    y2_true = Input(shape=(classes,), name='y2_true')
    out = CustomMultiLossLayer(nb_outputs=2, name='output_3')([y1_true, y2_true, y1_pred, y2_pred])
    model = Model([inp, y1_true, y2_true], out)

    # Pass the configured optimizer. The string 'adam' used before silently
    # ignored `learning_rate`.
    opt = Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss=None)

    return model

In [None]:
class LSTM_MODEL(object):
    """Static builders for the LSTM-based classifiers used in this notebook.

    Every builder takes the per-sample input shape, the number of classes and
    the Adam learning rate, and returns a compiled keras model.
    """

    @staticmethod
    def build_simple_lstm(data_input_shape, classes, learning_rate):
        """Two stacked LSTMs, Dense(64, relu) and a softmax output."""
        model = Sequential()
        model.add(CuLSTM(units=128, return_sequences=True, input_shape=data_input_shape))
        model.add(CuLSTM(units=128,  return_sequences=False))
        model.add(Dense(units=64, activation="relu"))
        model.add(Dense(units=classes, activation="softmax"))
        # Keras optimizer defaults:
        # Adam   : lr=0.001, beta_1=0.9,  beta_2=0.999, epsilon=1e-8, decay=0.
        # RMSprop: lr=0.001, rho=0.9,                   epsilon=1e-8, decay=0.
        # SGD    : lr=0.01,  momentum=0.,                             decay=0.
        opt = Adam(learning_rate=learning_rate)
        model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
        return model

    @staticmethod
    def build_bilstm(data_input_shape, classes, learning_rate):
        """One bidirectional LSTM, Dropout(0.5) and a softmax output."""
        model = Sequential()
        model.add(Bidirectional(CuLSTM(128), input_shape=data_input_shape))
        model.add(Dropout(0.5))
        model.add(Dense(classes, activation='softmax'))
        opt = Adam(learning_rate=learning_rate)
        model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

        return model

    @staticmethod
    def build_residual_bilstm(data_input_shape, classes, learning_rate):
        """Four bidirectional LSTMs with two additive skips and a softmax output."""
        inp = Input(shape=data_input_shape)

        z1 = Bidirectional(CuLSTM(128, return_sequences=True))(inp)
        z2 = Bidirectional(CuLSTM(units=128, return_sequences=True))(z1)
        z3 = add([z1, z2])  # residual connection
        z4 = Bidirectional(CuLSTM(units=128, return_sequences=True))(z3)
        z5 = Bidirectional(CuLSTM(units=128, return_sequences=False))(z4)
        z6 = add([z4, z5])  # residual connection
        z61 = Flatten()(z6)
        z7 = Dense(256, activation='relu')(z61)
        z8 = Dropout(0.5)(z7)
        out = Dense(classes, activation='softmax')(z8)
        model = Model(inputs=[inp], outputs=out)
        opt = Adam(learning_rate=learning_rate)
        # Same loss as the other softmax builders. This model adds no loss of
        # its own, so compiling it with loss=None left it without anything to
        # optimise.
        model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
        return model

    @staticmethod
    def get_trainable_model(data_input_shape, classes, learning_rate=0.001):
        """Build the multi-loss residual BiLSTM model.

        Delegates to the module-level get_trainable_model(), which builds the
        same architecture from `data_input_shape` and wires CustomMultiLossLayer
        with its two prediction heads. The previous body could not run: it
        passed the input tensor as `classes` to get_prediction_model, unpacked
        the returned Model as two tensors, and set nb_outputs to `classes`
        instead of 2.
        """
        # Inside a method body this name resolves to the module-level
        # function, not to this static method.
        return get_trainable_model(data_input_shape, classes, learning_rate)



##### Create dataset module

In [None]:
!pip install pydub

In [None]:
import sklearn
from librosa.util import normalize
from sklearn import preprocessing

from pydub import AudioSegment, effects 
def normalize_manual(x, axis=0):
    """Min-max scale `x` along `axis` using sklearn.preprocessing.minmax_scale."""
    scaled = sklearn.preprocessing.minmax_scale(x, axis=axis)
    return scaled

def normalize_fixed(x, current_range =[[0, 100]], normed_range=[[0, 1]]):
    """Linearly map `x` from `current_range` to `normed_range`.

    Args:
        x: values to rescale.
        current_range: rows of [min, max] describing the current value range.
        normed_range: rows of [min, max] describing the target range.

    Returns:
        The rescaled values (a TensorFlow tensor).
    """
    # Plain Python sequences (including the defaults) do not support the
    # [:, 0] indexing used below, so convert them to arrays first.
    if isinstance(current_range, (list, tuple)):
        current_range = np.asarray(current_range, dtype=np.float32)
    if isinstance(normed_range, (list, tuple)):
        normed_range = np.asarray(normed_range, dtype=np.float32)

    current_min, current_max = tf.expand_dims(current_range[:, 0], 1), tf.expand_dims(current_range[:, 1], 1)
    normed_min, normed_max = tf.expand_dims(normed_range[:, 0], 1), tf.expand_dims(normed_range[:, 1], 1)
    x_normed = (x - current_min) / (current_max - current_min)
    x_normed = x_normed * (normed_max - normed_min) + normed_min
    return x_normed

def despike(yi, th=1.e-8):
    '''Remove the largest spike from ``yi`` by bridging it with a quadratic fit.

    The spike is located at the maximum of ``yi``. Its extent is found by
    walking left and right from the maximum until the absolute difference
    between neighbouring points drops below ``th``. The samples inside that
    extent are replaced by a degree-2 polynomial fitted to the points flanking
    the spike (about half the spike width on each side).

    Args:
        yi: 1-D sequence of samples. It is copied, never modified.
        th: Neighbour-difference threshold below which the signal counts as
            quiet.

    Returns:
        A copy of ``yi`` with the spike replaced. The copy is returned unaltered
        when the input has fewer than two samples, when no quiet point exists
        on either side of the maximum (including a maximum at the first or
        last sample), or when no flanking points are available for the fit.
    '''
    y = np.copy(yi)  # work on a copy so the caller's array is left untouched
    n = len(y)
    if n < 2:
        # Nothing to difference against (np.argmax would raise on empty input).
        return y
    x = np.arange(n)
    peak = np.argmax(y)
    d = np.abs(np.diff(y))
    try:
        # d[:peak][::-1] walks left from the peak and is empty when peak == 0;
        # a negative start index would instead wrap to the end of the array.
        left = peak - 1 - np.where(d[:peak][::-1] < th)[0][0]
        right = peak + np.where(d[peak:] < th)[0][0] + 1
    except IndexError:  # no quiet point on one side: treat as "no spike"
        return y
    # For the fit, use an area about twice as wide as the spike.
    if (right - left) <= 3:
        left -= 1
        right += 1
    half = int(round((right - left) / 2.))
    # Clamp at 0: negative slice bounds would wrap around and silently drop or
    # corrupt the left fit window. Upper bounds past n are clipped by slicing.
    fit_lo = max(left - half, 0)
    fit_hi = right + half
    left = max(left, 0)
    # Make a gap at the spike area.
    xgapped = np.concatenate((x[fit_lo:left], x[right:fit_hi]))
    ygapped = np.concatenate((y[fit_lo:left], y[right:fit_hi]))
    if xgapped.size == 0:
        # No flanking samples to fit against; leave the data as is.
        return y
    # Quadratic fit of the gapped array, evaluated across the spike.
    z = np.polyfit(xgapped, ygapped, 2)
    p = np.poly1d(z)
    y[left:right] = p(x[left:right])
    return y

class BreathDataTrainingGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding MFCC batches laid out for a three-input model.

    Audio files are collected from the sub-folders of ``directory`` whose names
    appear in ``list_labels``; a file's label is the index of its folder name in
    that list. Each batch is returned as ``([X, Y, Y], Y)``: the features plus
    the one-hot labels repeated as two extra inputs, and the labels again as
    the target.
    """

    # Recording left out of the training set by the original author
    # (the reason is not recorded in this notebook).
    _SKIPPED_FILES = ('22_male_21_VHung_41_225121_deep.wav',)

    def __init__(self, directory,
                    list_labels=['normal', 'deep', 'heavy', 'other'],
                    batch_size=32,
                    dim=None,
                    classes=None,
                    shuffle=True):
        """Index the files under ``directory`` and prepare the first epoch.

        Args:
            directory: Root folder containing one sub-folder per label.
            list_labels: Folder names to load; list position is the class index.
            batch_size: Number of files per batch.
            dim: Shape every feature matrix is resized to.
            classes: Accepted but unused; the class count is always
                ``len(list_labels)``.
            shuffle: Whether to reshuffle the file order at each epoch end.
        """
        self.directory = directory
        # Copy so a later change to the caller's (or the default) list cannot
        # silently alter the label mapping.
        self.list_labels = list(list_labels)
        self.dim = dim
        self.__flow_from_directory(self.directory)
        self.batch_size = batch_size
        self.classes = len(self.list_labels)
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return len(self.wavs) // self.batch_size

    def __getitem__(self, index):
        """Return batch ``index`` as ``([X, Y, Y], Y)``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        rawX = [self.wavs[k] for k in indexes]
        rawY = [self.labels[k] for k in indexes]

        X, Y = self.__feature_extraction(rawX, rawY)
        return [X, Y, Y], Y

    def __flow_from_directory(self, directory):
        """Fill ``self.wavs`` / ``self.labels`` from the label sub-folders."""
        self.wavs = []
        self.labels = []
        # Sorted so the file order does not depend on the filesystem's
        # arbitrary listing order.
        for entry in sorted(os.listdir(directory)):
            sub_dir = os.path.join(directory, entry)
            if not (os.path.isdir(sub_dir) and entry in self.list_labels):
                continue
            print(sub_dir)
            label = self.list_labels.index(entry)
            try:
                file_names = sorted(os.listdir(sub_dir))
            except OSError as e:
                # Best effort: report the unreadable folder and carry on.
                print("Could not list {}: {}".format(sub_dir, e))
                continue
            for file_name in file_names:
                if file_name in self._SKIPPED_FILES:
                    continue
                self.wavs.append(os.path.join(sub_dir, file_name))
                self.labels.append(label)

    def on_epoch_end(self):
        """Reset the index order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.wavs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __feature_extraction(self, list_wav, list_label):
        """Turn file paths and integer labels into a standardized feature batch.

        Returns:
            ``(X, Y)``: ``X`` is a float32 array of per-file features resized to
            ``self.dim`` and standardized across the batch; ``Y`` is an integer
            array of categorical labels.
        """
        X = []
        Y = []
        # Iterate over what was actually passed in rather than assuming exactly
        # batch_size items.
        for wav_path, label in zip(list_wav, list_label):
            rate, data = wavfile.read(wav_path)

            data = np.array(data, dtype=np.float32)
            # NOTE(review): scaling by 1/32768 presumes 16-bit PCM samples --
            # confirm all recordings use that format.
            data *= 1./32768
            feature = librosa.feature.mfcc(y=data, sr=rate,
                                           n_mfcc=40, fmin=0, fmax=8000,
                                           n_fft=int(16*64), hop_length=int(16*32), power=2.0)

            # NOTE(review): np.resize refills the target shape from the
            # flattened values (repeating or truncating), so rows stop lining
            # up with MFCC coefficients whenever self.dim differs from the
            # MFCC shape -- confirm this is intended.
            feature = np.resize(feature, self.dim)

            category_label = to_categorical(label, num_classes=len(self.list_labels))
            X.append(feature)
            Y.append(category_label)

        X = np.array(X, dtype=np.float32)
        # Standardize each feature position using this batch's own statistics.
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        # A position that is constant across the batch has std == 0; divide by
        # 1 there instead of producing NaN/inf features.
        std = np.where(std == 0, np.float32(1.0), std)

        X = (X - mean)/std

        Y = np.array(Y, dtype=int)
        return X, Y

class BreathDataValidationGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding plain ``(X, Y)`` MFCC batches.

    Audio files are collected from the sub-folders of ``directory`` whose names
    appear in ``list_labels``; a file's label is the index of its folder name in
    that list.
    """

    def __init__(self, directory,
                    list_labels=['normal', 'deep', 'heavy', 'other'],
                    batch_size=32,
                    dim=None,
                    classes=None,
                    shuffle=True):
        """Index the files under ``directory`` and prepare the first epoch.

        Args:
            directory: Root folder containing one sub-folder per label.
            list_labels: Folder names to load; list position is the class index.
            batch_size: Number of files per batch.
            dim: Shape every feature matrix is resized to.
            classes: Accepted but unused; the class count is always
                ``len(list_labels)``.
            shuffle: Whether to reshuffle the file order at each epoch end.
        """
        self.directory = directory
        # Copy so a later change to the caller's (or the default) list cannot
        # silently alter the label mapping.
        self.list_labels = list(list_labels)
        self.dim = dim
        self.__flow_from_directory(self.directory)
        self.batch_size = batch_size
        self.classes = len(self.list_labels)
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return len(self.wavs) // self.batch_size

    def __getitem__(self, index):
        """Return batch ``index`` as ``(X, Y)``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        rawX = [self.wavs[k] for k in indexes]
        rawY = [self.labels[k] for k in indexes]

        X, Y = self.__feature_extraction(rawX, rawY)
        return X, Y

    def __flow_from_directory(self, directory):
        """Fill ``self.wavs`` / ``self.labels`` from the label sub-folders."""
        self.wavs = []
        self.labels = []
        # Sorted so the file order does not depend on the filesystem's
        # arbitrary listing order.
        for entry in sorted(os.listdir(directory)):
            sub_dir = os.path.join(directory, entry)
            if not (os.path.isdir(sub_dir) and entry in self.list_labels):
                continue
            print(sub_dir)
            label = self.list_labels.index(entry)
            for file_name in sorted(os.listdir(sub_dir)):
                self.wavs.append(os.path.join(sub_dir, file_name))
                self.labels.append(label)

    def on_epoch_end(self):
        """Reset the index order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.wavs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __feature_extraction(self, list_wav, list_label):
        """Turn file paths and integer labels into a standardized feature batch.

        Returns:
            ``(X, Y)``: ``X`` is a float32 array of per-file features resized to
            ``self.dim`` and standardized across the batch; ``Y`` is an integer
            array of categorical labels.
        """
        X = []
        Y = []
        # Iterate over what was actually passed in rather than assuming exactly
        # batch_size items.
        for wav_path, label in zip(list_wav, list_label):
            rate, data = wavfile.read(wav_path)

            data = np.array(data, dtype=np.float32)
            # NOTE(review): scaling by 1/32768 presumes 16-bit PCM samples --
            # confirm all recordings use that format.
            data *= 1./32768
            feature = librosa.feature.mfcc(y=data, sr=rate,
                                           n_mfcc=40, fmin=0, fmax=8000,
                                           n_fft=int(16*64), hop_length=int(16*32), power=2.0)

            # NOTE(review): np.resize refills the target shape from the
            # flattened values (repeating or truncating), so rows stop lining
            # up with MFCC coefficients whenever self.dim differs from the
            # MFCC shape -- confirm this is intended.
            feature = np.resize(feature, self.dim)

            category_label = to_categorical(label, num_classes=len(self.list_labels))
            X.append(feature)
            Y.append(category_label)

        X = np.array(X, dtype=np.float32)
        # Standardize each feature position using this batch's own statistics.
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        # A position that is constant across the batch has std == 0; divide by
        # 1 there instead of producing NaN/inf features.
        std = np.where(std == 0, np.float32(1.0), std)

        X = (X - mean)/std

        Y = np.array(Y, dtype=int)
        return X, Y


##### Constants

In [None]:
# Run configuration shared by the training cells below.

# --- Labels -----------------------------------------------------------------
# Folder names that are loaded; list position is the class index.
# Four-class variant: ['normal', 'deep', 'heavy', 'other']
LIST_LABELS = ['normal', 'deep', 'heavy']
N_CLASSES = len(LIST_LABELS)

# --- Optimisation -----------------------------------------------------------
BATCH_SIZE = 32
N_EPOCHS = 100

# --- Feature shape ----------------------------------------------------------
# LSTM input shape; the CNN variant used (40, 126, 1) instead.
INPUT_SIZE = (32, 251)

# --- Locations --------------------------------------------------------------
TRAINING_SOURCE = '/content/gdrive/My Drive/Breath-Data/data/training/'
VALID_SOURCE = '/content/gdrive/My Drive/Breath-Data/data/validation/'
MODEL_OUTPUT = '/content/gdrive/My Drive/Breath-Data/data/model_output'

# --- Run identity -----------------------------------------------------------
MODE = 'TRAINING'
# Prefix of every checkpoint file written for this run.
RUN_TITLE = "20210706-Residual"

##### Training model

In [None]:
# Generate data for training.
# NOTE(review): with shuffle=False the files stay in the order they were
# collected, one label folder after another, so consecutive batches tend to
# hold a single class -- confirm this is intended.
train_generator = BreathDataTrainingGenerator(
    TRAINING_SOURCE,
    list_labels=LIST_LABELS,
    batch_size=BATCH_SIZE,
    dim=INPUT_SIZE,
    shuffle=False)

N_TRAIN_SAMPLES = len(train_generator.wavs)
print("Train samples: {}".format(N_TRAIN_SAMPLES))

# The training generator class is reused for validation, presumably because
# the trainable model expects the ([X, Y, Y], Y) batch layout that only this
# class yields.
validation_generator = BreathDataTrainingGenerator(
    VALID_SOURCE,
    list_labels=LIST_LABELS,
    batch_size=BATCH_SIZE,
    dim=INPUT_SIZE,
    shuffle=False)
N_VALID_SAMPLES = len(validation_generator.wavs)
print("Validation samples: {}".format(N_VALID_SAMPLES))

TRAIN_NEW = True

if TRAIN_NEW:
    # Alternative builders tried earlier: LSTM_MODEL.build_simple_lstm,
    # build_residual_bilstm, build_seld_net and build_bilstm, all called with
    # (data_input_shape=INPUT_SIZE, classes=N_CLASSES, learning_rate=0.001).
    # NOTE(review): get_trainable_model is called as a bare name -- confirm a
    # module-level definition exists in an earlier cell.
    model = get_trainable_model(data_input_shape=INPUT_SIZE, classes=N_CLASSES, learning_rate=0.001)

    model.summary()
else:
    from keras.models import load_model
    # Reload model
    model = load_model('5S-50-ResidualLSTM-weights-improvement_bi_lstm-30-0.73-0.45.hdf5')

# Checkpoint directory; exist_ok avoids the separate exists() check.
os.makedirs(MODEL_OUTPUT, exist_ok=True)

# NOTE(review): these three names are not referenced anywhere in this cell;
# the {epoch}/{accuracy}/{val_accuracy} fields below are filled in by
# ModelCheckpoint, not from these variables. Kept in case later cells read
# them -- candidates for removal.
epoch = 0
accuracy = 0.1
val_accurac = 0.1

filepath = os.path.join(MODEL_OUTPUT, RUN_TITLE + "LSTM-weights-improvement_bi_lstm-{epoch:02d}-{accuracy:.2f}-{val_accuracy:.2f}.hdf5")

# Monitor the same metric key the filename template uses ('val_accuracy').
# With save_best_only=False a checkpoint is still written every epoch.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', save_best_only=False, mode='max')
callbacks_list = [checkpoint]

# Start training. Model.fit accepts Sequence objects directly; fit_generator
# is deprecated in TensorFlow 2.
model.fit(
    train_generator,
    steps_per_epoch=N_TRAIN_SAMPLES // BATCH_SIZE,
    initial_epoch=0,
    epochs=N_EPOCHS,
    validation_data=validation_generator,
    validation_steps=N_VALID_SAMPLES // BATCH_SIZE,
    callbacks=callbacks_list,
    use_multiprocessing=True,
)