# Add title

## Import libraries

In [1]:
import numpy as np
import plaidml.keras
import os
plaidml.keras.install_backend()
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

from keras import backend as K
print(K)

import keras
from keras.preprocessing.image import ImageDataGenerator

print("Keras version %s" %keras.__version__)

<module 'plaidml.keras.backend' from 'E:\\anaconda3\\envs\\plaidML\\lib\\site-packages\\plaidml\\keras\\backend.py'>
Keras version 2.2.4


## Load meta file (onset, offset, labels)

In [2]:
import pandas as pd
df = pd.read_csv('E:\\Xception\\TUT-SED-synthetic-2016\\meta.txt', delimiter = "\t", header=None)
df.columns = ["file", "drop1", "onset", "offset", "label", "drop2"]
# delete not useful columns
df = df.drop(columns=["drop1", "drop2"])

## Extract Features

In [8]:
# Load various imports 
import pandas as pd
#import librosa.display
import librosa

directory = 'E:\\Xception\\TUT-SED-synthetic-2016\\audio\\'

features = []
num_frames = []

#length of splitted subvectors
length = 1024

# Iterate through each sound file and extract the features 
for i in range(100):
    
    if i%10==0:
        print("processing element ",i)
    
    file_name = directory+'TUT-SED-synthetic-2016-mix-'+str(i)+'.wav'
        
    # load single audio file
    audio, sr = librosa.load(file_name, res_type='kaiser_fast')
    
    #mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40, n_fft=440, hop_length=73, n_mels=40)
    #mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    #mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_fft=1024, hop_length=882, n_mels=40)
    
    # extract mfcc features with 20ms frame and 50% overlap at 22050 Hz
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40, n_fft=440, hop_length=220, n_mels=40)
    
    # save audio number of frames for labelling process
    num_frames.append(mfccs.shape[1])
    
    # create subvectors of length and pad the last to length
    for j in range(int(mfccs.shape[1]/length)+1):
        
        # subdivide in 1024 elements vectors
        feat = mfccs[:,j*length:(j+1)*length]
        
        # last vector need padding
        if feat.shape[1] != length:
            feat = np.pad(feat, pad_width=((0, 0), (0, length-feat.shape[1])), mode='constant')
        
        # add new splitted vectors to features list
        features.append(feat)
    
    # add extracted feature to features' list 
    #features.append([mfccs])

print('Extracted a total of', len(features), 'features vectors')

processing element  0
processing element  10
processing element  20
processing element  30
processing element  40
processing element  50
processing element  60
processing element  70
processing element  80
processing element  90
Extracted a total of 3376 features vectors


## Data labelling

In [9]:
#labelling matrix creation, this is done due to poliphony of data

import numpy as np

num_classes = df['label'].nunique()
labels = []

#length of splitted subvectors
length = 1024

# to transform literal label to a numerical value
label_switch = {"alarms_and_sirens" : 0,
                "baby_crying" : 1,
                "bird_singing" : 2,
                "bus" : 3,
                "cat_meowing" : 4,
                "crowd_applause" : 5,
                "crowd_cheering" : 6,
                "dog_barking" : 7,
                "footsteps" : 8,
                "glass_smash" : 9,
                "gun_shot" : 10,
                "horsewalk" : 11,
                "mixer" : 12,
                "motorcycle" : 13,
                "rain" : 14,
                "thunder" : 15,        
}

# iterate over each file and create labels based on meta file
for i in range(100):
    
    if i%10==0:
        print("processing element ",i)
        
    file_name = directory+'TUT-SED-synthetic-2016-mix-'+str(i)+'.wav'

    # initialize matrices with zeros
    #num_frames = np.vstack(features[i]).shape[1]
    single_labels = np.zeros((num_classes, num_frames[i]), dtype=int)

    # extract metadata of single audio file
    query_name = file_name[len('E:\\Xception\\TUT-SED-synthetic-2016\\audio\\'):]
    query_name = 'audio/' + query_name
    subdf = df.query('file==@query_name')
    
    # populate matrix based on meta file
    # more efficient to do it starting from metafile instead of iterating on the matrix since 
    # features are less than the matrix and there are more zeros than ones at the end of 
    # the process
    onset = np.array(subdf.loc[:, 'onset'])
    offset = np.array(subdf.loc[:, 'offset'])
    labeldf = np.array(subdf.loc[:, 'label'])
    
    for j in range(onset.shape[0]):
        # metadata values are in seconds so transform in 10ms intervals
        start = int(onset[j]*100)
        end = int(offset[j]*100)
        # label based on start and end of the single audio
        for k in range(end-start):
            single_labels[label_switch[labeldf[j]]][k+start] = 1
    
    """
    for index, row in subdf.iterrows():
        # metadata values are in seconds so transform in 10ms intervals
        start = int(row['onset']*100)
        end = int(row['offset']*100)
        # label based on start and end of the single audio
        for k in range(end-start):
            single_labels[label_switch[row['label']]][k+start] = 1"""
           
    # create subvectors of length and pad the last to length
    for j in range(int(single_labels.shape[1]/length)+1):
        
        # subdivide in 1024 elements vectors
        lab = single_labels[:,j*length:(j+1)*length]
        
        # last vector need padding
        if lab.shape[1] != length:
            lab = np.pad(lab, pad_width=((0, 0), (0, length-lab.shape[1])), mode='constant')
        
        # add new splitted vectors to labels list
        labels.append(lab)
    
    
    # add extracted labels to labels' list 
    #labels.append([single_labels])


processing element  0
processing element  10
processing element  20
processing element  30
processing element  40
processing element  50
processing element  60
processing element  70
processing element  80
processing element  90


In [15]:
prova = np.array(labels[0])
for i in range(30000):
    print(prova[0][13][i])

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


## Create subvectors for each mixture (CAN BE DELETED)

In [8]:
# create data structures to contain data
splitted_features = []
splitted_labels =[]

#length of splitted subvectors
length = 1024

# iterate on each audio file
for i in range(100):
    
    if i%10==0:
        print("processing element ",i)
    
    # extract working features and labels for current element
    working_f = np.vstack(features[i])
    working_l = np.vstack(labels[i])
    
    # iterate for all the possible subsequencies
    for j in range(int(working_f.shape[1]/length)+1):
        
        # subdivide in 1024 elements vectors
        feat = working_f[:,j*length:(j+1)*length]
        lab = working_l[:,j*length:(j+1)*length]
        
        # last vector need padding
        if feat.shape[1] != length:
            feat = np.pad(feat, pad_width=((0, 0), (0, length-feat.shape[1])), mode='constant')
            lab = np.pad(lab, pad_width=((0, 0), (0, length-lab.shape[1])), mode='constant')
        
        # add new splitted vectors to new lists
        splitted_features.append(feat)
        splitted_labels.append(lab)


processing element  0
processing element  10
processing element  20
processing element  30
processing element  40
processing element  50
processing element  60
processing element  70
processing element  80
processing element  90


## Create dataframe to manipulate features and labels

In [10]:
# create empty data frame in pandas
dataframe = pd.DataFrame()

# add features
dataframe['feature']  = features

# add labels
dataframe['class_label']  = labels

print('Total dataset contains', len(dataframe), ' files')

Total dataset contains 3376  files


In [12]:
dataframe['class_label'][0]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Save processed data to pickle file

In [10]:
dataframe.to_pickle('E:\\Xception\\TUT-SED-synthetic-2016\\processed_data_frame.pkl')

## Delete unused variable to save RAM

In [None]:
reset_selective -f features
reset_selective -f labels
reset_selective -f splitted_features
reset_selective -f splitted_labels

## Load processed data from pickle file

In [2]:
import pandas as pd
dataframe = pd.read_pickle('E:\\Xception\\TUT-SED-synthetic-2016\\processed_data_frame.pkl')
print("imported", dataframe.shape[0],"elements")

imported 3376 elements


## Preprocess data to input the network 

In [3]:
#from sklearn.preprocessing import LabelEncoder
#from keras.utils import to_categorical

# Convert features and corresponding classification labels into numpy arrays
X = np.array(dataframe.feature.tolist())
y = np.array(dataframe.class_label.tolist())

# no need to encode labels since they are already in form of one hot encode

# delete dataframe to save memory
#reset_selective -f dataframe

# split the dataset 
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

# delete X to save memory
#reset_selective -f X

## Delete dataframe to save RAM

In [4]:
reset_selective -f dataframe

## Baseline Model

In [6]:
from keras.models import Model
from keras.layers import SeparableConv2D, ZeroPadding2D, Activation, Dropout, Dense, \
                            Conv2D, MaxPooling2D, Reshape, GRU
from keras.layers.normalization import BatchNormalization
from keras import Input, optimizers

num_rows = X.shape[1]
num_columns = X.shape[2]
num_channels = 1

x_train = x_train.reshape(x_train.shape[0], num_columns, num_rows, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_columns, num_rows, num_channels)

num_labels = y.shape[1]
#num_labels = 16

def Net():
    
    # input layer
    inputs = Input(shape=(num_columns, num_rows, num_channels))
    
    # DWS-CNN layer 1
    #x = ZeroPadding2D(padding=(2))(inputs)
    # use valid padding since padding is introduced before due to its special form (maybe it's equal to use same padding?)
    x = Conv2D(256,kernel_size=(5,5), strides=(1,1), padding='same')(inputs)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,5), padding='same')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    # DWS-CNN layer 2
    #x = ZeroPadding2D(padding=(2))(x)
    # use valid padding since padding is introduced before due to its special form
    x = Conv2D(256,kernel_size=(5,5), strides=(1,1), padding='same')(x)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,4), padding='same')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    # DWS-CNN layer 3
    #x = ZeroPadding2D(padding=(2))(x)
    # use valid padding since padding is introduced before due to its special form
    x = Conv2D(256,kernel_size=(5,5), strides=(1,1), padding='same')(x)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,2), padding='same')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    x = Reshape((1024, 256))(x)
    
    # GRU
    x = GRU(256, return_sequences=True)(x)
        
    # classifier layer
    outputs = Dense(num_labels,activation='sigmoid')(x)
    
    
    # model compilation for training
    adam = optimizers.Adam(lr=0.0001)
    model = Model(inputs, outputs)                            
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=["binary_accuracy"])
    
    return model


# create the model
dilated_kernel = (3,3)
dilation = 10
dilated_padding = 1
model = Net(dilated_kernel,dilation,dilated_padding)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1024, 40, 1)       0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 1024, 40, 256)     6656      
_________________________________________________________________
activation_4 (Activation)    (None, 1024, 40, 256)     0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 1024, 40, 256)     1024      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 1024, 8, 256)      0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024, 8, 256)      0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 1024, 8, 256)      1638656   
__________

## Dessed model

In [7]:
from keras.models import Model
from keras.layers import SeparableConv2D, ZeroPadding2D, Activation, Dropout, Dense, \
                            Conv2D, MaxPooling2D, Reshape, GRU
from keras.layers.normalization import BatchNormalization
from keras import Input, optimizers

num_rows = X.shape[1]
num_columns = X.shape[2]
num_channels = 1

x_train = x_train.reshape(x_train.shape[0], num_columns, num_rows, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_columns, num_rows, num_channels)

num_labels = y.shape[1]
#num_labels = 16

def Net():
    
    # input layer
    inputs = Input(shape=(num_columns, num_rows, num_channels))
    
    # DWS-CNN layer 1
    x = ZeroPadding2D(padding=(2))(inputs)
    # use valid padding since padding is introduced before due to its special form (maybe it's equal to use same padding?)
    x = SeparableConv2D(256,kernel_size=(5,5), strides=(1,1), padding='valid')(x)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,5), strides=(1,5), padding='valid')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    # DWS-CNN layer 2
    x = ZeroPadding2D(padding=(2))(x)
    # use valid padding since padding is introduced before due to its special form
    x = SeparableConv2D(256,kernel_size=(5,5), strides=(1,1), padding='valid')(x)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,4), strides=(1,4), padding='valid')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    # DWS-CNN layer 3
    x = ZeroPadding2D(padding=(2))(x)
    # use valid padding since padding is introduced before due to its special form
    x = SeparableConv2D(256,kernel_size=(5,5), strides=(1,1), padding='valid')(x)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,2), strides=(1,2), padding='valid')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    x = Reshape((1024, 256))(x)
    
    # GRU
    x = GRU(256, return_sequences=True)(x)
        
    # classifier layer
    outputs = Dense(num_labels,activation='sigmoid')(x)
    
    
    # model compilation for training
    adam = optimizers.Adam(lr=0.0001)
    model = Model(inputs, outputs)                            
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=["binary_accuracy"])
    
    return model


# create the model
dilated_kernel = (3,3)
dilation = 10
dilated_padding = 1
model = Net()
model.summary()

INFO:plaidml:Opening device "opencl_amd_gfx1010.0"


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1024, 40, 1)       0         
_________________________________________________________________
zero_padding2d_1 (ZeroPaddin (None, 1028, 44, 1)       0         
_________________________________________________________________
separable_conv2d_1 (Separabl (None, 1024, 40, 256)     537       
_________________________________________________________________
activation_1 (Activation)    (None, 1024, 40, 256)     0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024, 40, 256)     1024      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1024, 8, 256)      0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024, 8, 256)      0         
__________

## Baseline dilated

In [12]:
from keras.models import Model
from keras.layers import SeparableConv2D, ZeroPadding2D, Activation, Dropout, Dense, \
                            Conv2D, MaxPooling2D, Reshape, Permute
from keras.layers.normalization import BatchNormalization
from keras import Input, optimizers

num_rows = X.shape[1]
num_columns = X.shape[2]
num_channels = 1

x_train = x_train.reshape(x_train.shape[0], num_columns, num_rows, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_columns, num_rows, num_channels)

num_labels = y.shape[1]
#num_labels = 16

def Net(dilated_kernel, dilation, dilated_padding):
    
    # input layer
    inputs = Input(shape=(num_columns, num_rows, num_channels))
    
    # DWS-CNN layer 1
    #x = ZeroPadding2D(padding=(2))(inputs)
    # use valid padding since padding is introduced before due to its special form (maybe it's equal to use same padding?)
    x = Conv2D(256,kernel_size=(5,5), strides=(1,1), padding='same')(inputs)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,5), padding='same')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    # DWS-CNN layer 2
    #x = ZeroPadding2D(padding=(2))(x)
    # use valid padding since padding is introduced before due to its special form
    x = Conv2D(256,kernel_size=(5,5), strides=(1,1), padding='same')(x)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,4), padding='same')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    # DWS-CNN layer 3
    #x = ZeroPadding2D(padding=(2))(x)
    # use valid padding since padding is introduced before due to its special form
    x = Conv2D(256,kernel_size=(5,5), strides=(1,1), padding='same')(x)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,2), padding='same')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    #x = Reshape((1024, 256))(x)
    #x = Reshape((1024, 256, 1))(x)
    x = Permute((1,3,2))(x)
    #x = Permute((3,2,1))(x)
    
    # DIL-CNN 
    x = ZeroPadding2D(padding=(dilated_padding*dilation, 0))(x)
    x = Conv2D(256, kernel_size=dilated_kernel, dilation_rate=(dilation, 1))(x)#, strides=(1,3))(x)
    x = Activation('relu')(x)
    x = BatchNormalization()(x)
    x = Conv2D(256, (1,1), strides=(1,3))(x)
    
    #print(type(x))
    #print(x.shape)
    
    #x = Permute((2,1,3))(x)
    
    x = Reshape((1024, 256*85))(x)
    
    # classifier layer
    outputs = Dense(num_labels,activation='sigmoid')(x)
    
    
    # model compilation for training
    adam = optimizers.Adam(lr=0.0001)
    model = Model(inputs, outputs)                            
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=["binary_accuracy"])
    
    return model


# create the model
dilated_kernel = (3,3)
dilation = 10
dilated_padding = 1
model = Net(dilated_kernel,dilation,dilated_padding)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 1024, 40, 1)       0         
_________________________________________________________________
conv2d_26 (Conv2D)           (None, 1024, 40, 256)     6656      
_________________________________________________________________
activation_25 (Activation)   (None, 1024, 40, 256)     0         
_________________________________________________________________
batch_normalization_25 (Batc (None, 1024, 40, 256)     1024      
_________________________________________________________________
max_pooling2d_19 (MaxPooling (None, 1024, 8, 256)      0         
_________________________________________________________________
dropout_19 (Dropout)         (None, 1024, 8, 256)      0         
_________________________________________________________________
conv2d_27 (Conv2D)           (None, 1024, 8, 256)      1638656   
__________

In [6]:
type(x)

NameError: name 'x' is not defined

## Model definition

In [5]:
from keras.models import Model
from keras.layers import SeparableConv2D, ZeroPadding2D, Activation, Dropout, Dense, \
                            Conv2D, MaxPooling2D, Reshape, Permute
from keras.layers.normalization import BatchNormalization
from keras import Input, optimizers

num_rows = X.shape[1]
num_columns = X.shape[2]
num_channels = 1

x_train = x_train.reshape(x_train.shape[0], num_columns, num_rows, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_columns, num_rows, num_channels)

num_labels = y.shape[1]
#num_labels = 16

def Net(dilated_kernel, dilation, dilated_padding):
    
    # input layer
    inputs = Input(shape=(num_columns, num_rows, num_channels))
    
    # DWS-CNN layer 1
    x = ZeroPadding2D(padding=(2))(inputs)
    # use valid padding since padding is introduced before due to its special form (maybe it's equal to use same padding?)
    x = SeparableConv2D(256,kernel_size=(5,5), strides=(1,1), padding='valid')(x)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,5), strides=(1,5), padding='valid')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    # DWS-CNN layer 2
    x = ZeroPadding2D(padding=(2))(x)
    # use valid padding since padding is introduced before due to its special form
    x = SeparableConv2D(256,kernel_size=(5,5), strides=(1,1), padding='valid')(x)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,4), strides=(1,4), padding='valid')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    # DWS-CNN layer 3
    x = ZeroPadding2D(padding=(2))(x)
    # use valid padding since padding is introduced before due to its special form
    x = SeparableConv2D(256,kernel_size=(5,5), strides=(1,1), padding='valid')(x)
    x = Activation('relu')(x)
    # Batch Normalisation before passing it to the next layer
    x = BatchNormalization()(x)
    # Pooling
    x = MaxPooling2D(pool_size=(1,2), strides=(1,2), padding='valid')(x)
    # Dropout
    x = Dropout(0.25)(x)
    
    #x = Reshape((1024, 256))(x)
    #x = Reshape((1024, 256, 1))(x)
    x = Permute((1,3,2))(x)
    #x = Permute((3,2,1))(x)
    
    # DIL-CNN 
    x = ZeroPadding2D(padding=(dilated_padding*dilation, 0))(x)
    x = Conv2D(256, kernel_size=dilated_kernel, dilation_rate=(dilation, 1))(x)#, strides=(1,3))(x)
    x = Activation('relu')(x)
    x = BatchNormalization()(x)
    #initializer = keras.initializers.Ones()
    #x = Conv2D(256, (1,1), strides=(1,3), kernel_initializer=initializer, trainable = False)(x)
    x = MaxPooling2D(pool_size=(1,3), strides=(1,3), padding='valid')(x)
    
    #x = Permute((2,1,3))(x)
    
    x = Reshape((1024, 256*84))(x)
    #x = Reshape((1024, 256*254))(x)
    
    # classifier layer
    outputs = Dense(num_labels,activation='sigmoid')(x)
    
    
    # model compilation for training
    adam = optimizers.Adam(lr=0.0001)
    model = Model(inputs, outputs)                            
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=["binary_accuracy"])
    
    return model


# create the model
dilated_kernel = (3,3)
dilation = 10
dilated_padding = 1
model = Net(dilated_kernel,dilation,dilated_padding)
model.summary()

INFO:plaidml:Opening device "opencl_amd_gfx1010.0"


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1024, 40, 1)       0         
_________________________________________________________________
zero_padding2d_1 (ZeroPaddin (None, 1028, 44, 1)       0         
_________________________________________________________________
separable_conv2d_1 (Separabl (None, 1024, 40, 256)     537       
_________________________________________________________________
activation_1 (Activation)    (None, 1024, 40, 256)     0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024, 40, 256)     1024      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1024, 8, 256)      0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024, 8, 256)      0         
__________

## Train the model

In [8]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
from datetime import datetime 

num_epochs = 250
# low batch size due to memory maximum dimension, modify if using smaller dataset
num_batch_size = 1

callbacks = [ModelCheckpoint(filepath='E:\\Xception\\TUT-SED-synthetic-2016\\model-{val_loss:.2f}.h5', 
                               verbose=1, save_best_only=True, monitor="val_loss"),
                EarlyStopping(monitor='val_loss', patience=30)]

start = datetime.now()

y_train = y_train.reshape(y_train.shape[0], 1024, 16)
y_test = y_test.reshape(y_test.shape[0], 1024, 16)

history = model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_split=0.16, verbose=1, callbacks=callbacks)

duration = datetime.now() - start
print("Training completed in time: ", duration)

Train on 2268 samples, validate on 432 samples
Epoch 1/250

Epoch 00001: val_loss improved from inf to 0.25869, saving model to E:\Xception\TUT-SED-synthetic-2016\model-0.26.h5
Epoch 2/250

Epoch 00002: val_loss did not improve from 0.25869
Epoch 3/250

Epoch 00003: val_loss did not improve from 0.25869
Epoch 4/250

Epoch 00004: val_loss did not improve from 0.25869
Epoch 5/250

Epoch 00005: val_loss did not improve from 0.25869
Epoch 6/250

Epoch 00006: val_loss did not improve from 0.25869
Epoch 7/250

Epoch 00007: val_loss did not improve from 0.25869
Epoch 8/250

Epoch 00008: val_loss did not improve from 0.25869
Epoch 9/250

Epoch 00009: val_loss did not improve from 0.25869
Epoch 10/250

Epoch 00010: val_loss did not improve from 0.25869
Epoch 11/250

Epoch 00011: val_loss did not improve from 0.25869
Epoch 12/250

Epoch 00012: val_loss did not improve from 0.25869
Epoch 13/250

Epoch 00013: val_loss did not improve from 0.25869
Epoch 14/250

Epoch 00014: val_loss did not improve

KeyboardInterrupt: 

## Evaluate model

In [None]:
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(x_test, y_test, batch_size=num_batch_size)
print("test loss, test acc:", results)

## Save model

In [12]:
filename = 'E:\\Xception\\TUT-SED-synthetic-2016\\93epochs.h5'
model.save(filename)

## Import saved model

In [2]:
model2 = keras.models.load_model('E:\\Xception\\TUT-SED-synthetic-2016\\model-0.24.h5')

INFO:plaidml:Opening device "opencl_amd_gfx1010.0"


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

## Plot Results

In [18]:
import matplotlib.pyplot as plt

# summarize history for accuracy
plt.plot(history.history['binary_accuracy'])
plt.plot(history.history['val_binary_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

NameError: name 'history' is not defined

## Evaluate precision as micro averaging

In [9]:
TP = 0
FP = 0
TN = 0
FN = 0

y_pred = model.predict(x_test, batch_size=8, verbose=1)

for i in range(y_test.shape[0]):
    if i%100 == 0:
        print("processing element ", i)
    for j in range(y_test.shape[1]):
        for k in range(y_test.shape[2]):
            
            test = y_test[i][j][k]
            pred = y_pred[i][j][k]
            
            # binarization of predicted output
            if(pred >= 0.5):
                pred = 1
            else:
                pred = 0

            if(test == 1 and pred == 1):
                TP = TP + 1
            elif(test == 0 and pred == 1):
                FP = FP + 1
            elif(test == 0 and pred == 0):
                TN = TN + 1
            elif(test == 1 and pred == 0):
                FN = FN + 1
                
recall = TP/(TP+FN)

precision = TP/(TP+FP)

f1_score = 2*recall*precision/(recall+precision)

print("recall: ", recall)
print("precision: ", precision)
print("f1: ", f1_score)


processing element  0
processing element  100
processing element  200
processing element  300
processing element  400
processing element  500
processing element  600
recall:  0.02151236311109556
precision:  0.08621492635975302
f1:  0.03443299855695815


In [30]:
for i in range(1024):
    print(y_test[2][i])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0

In [47]:
for i in range(1024):
    print(y[0][13][i])

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [20]:
for i in range(1024):
    print(y_pred[0][i])

[0.21831875 0.19079201 0.2323537  0.21826667 0.22935872 0.213474
 0.19853821 0.20665489 0.24988198 0.23628263 0.21988837 0.21437134
 0.2482107  0.21498178 0.2316786  0.20806466]
[0.20044847 0.21184596 0.24877325 0.20329052 0.21287866 0.22244747
 0.19988225 0.2014974  0.24494232 0.22701931 0.21065573 0.21188632
 0.27696618 0.22745173 0.22765787 0.22337013]
[0.19537592 0.2502194  0.25131878 0.18755238 0.21503678 0.23695046
 0.21251997 0.21084681 0.24443118 0.21573895 0.20837866 0.2130931
 0.29128855 0.23857309 0.23581608 0.2362524 ]
[0.1867612  0.22074294 0.23151766 0.17437732 0.19259697 0.21273993
 0.19510803 0.19595632 0.21892859 0.20272207 0.20248394 0.20584413
 0.26863563 0.2214418  0.2202636  0.23329222]
[0.18093604 0.22289339 0.21652482 0.17297673 0.19454937 0.20754953
 0.19475888 0.184376   0.21086808 0.2005726  0.18739617 0.20135629
 0.25741443 0.21447515 0.20507434 0.21405818]
[0.18056604 0.20529212 0.19876201 0.17796038 0.19459808 0.18952987
 0.18538655 0.1652983  0.19364004 0.

In [28]:
TP

4

In [13]:
f1_score

0.0003123403160134381

In [49]:
import pkgutil
list(pkgutil.iter_modules())


[ModuleInfo(module_finder=FileFinder('E:\\Xception'), name='mel_spectrogram_daniele', ispkg=False),
 ModuleInfo(module_finder=FileFinder('E:\\anaconda3\\envs\\plaidML\\DLLs'), name='_asyncio', ispkg=False),
 ModuleInfo(module_finder=FileFinder('E:\\anaconda3\\envs\\plaidML\\DLLs'), name='_bz2', ispkg=False),
 ModuleInfo(module_finder=FileFinder('E:\\anaconda3\\envs\\plaidML\\DLLs'), name='_ctypes', ispkg=False),
 ModuleInfo(module_finder=FileFinder('E:\\anaconda3\\envs\\plaidML\\DLLs'), name='_ctypes_test', ispkg=False),
 ModuleInfo(module_finder=FileFinder('E:\\anaconda3\\envs\\plaidML\\DLLs'), name='_decimal', ispkg=False),
 ModuleInfo(module_finder=FileFinder('E:\\anaconda3\\envs\\plaidML\\DLLs'), name='_elementtree', ispkg=False),
 ModuleInfo(module_finder=FileFinder('E:\\anaconda3\\envs\\plaidML\\DLLs'), name='_hashlib', ispkg=False),
 ModuleInfo(module_finder=FileFinder('E:\\anaconda3\\envs\\plaidML\\DLLs'), name='_lzma', ispkg=False),
 ModuleInfo(module_finder=FileFinder('E:\\an