In [1]:
import os
import datetime
import pandas as pd
import numpy as np
import soundfile as sf
from matplotlib import pyplot as plt

import tensorflow as tf
import tensorflow_io as tfio
from tensorboard import notebook

import keras.models
from keras import regularizers
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Flatten
from keras.layers import Dropout
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler


from tensorflow.keras.layers import Input, Conv2D
from tensorflow.keras.layers import MaxPool2D, Flatten, Dense, Lambda
from tensorflow.keras import Model

from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.xception import Xception

import tensorflow_datasets as tfds

In [2]:
def load_audio(file_name):
    """Read an audio file with ``soundfile`` and return only its samples.

    Args:
        file_name: Path (or anything ``sf.read`` accepts) of the audio file.

    Returns:
        The sample data returned by ``sf.read``. The sample rate that
        ``sf.read`` returns alongside it is discarded.
    """
    samples, _sample_rate = sf.read(file_name)
    return samples

In [3]:
def preprocess(file_path, augment=True):
    """Convert an audio file into a dB-scaled mel spectrogram with a channel axis.

    The clip is truncated to 480000 samples, left-padded with zeros up to that
    length, converted to a spectrogram, mapped to 299 mel bins and put on a dB
    scale. A trailing axis is appended so that callers can treat the result
    as a single-channel image (callers in this notebook reshape it to
    299 x 299 x 1).

    Args:
        file_path: Path of the audio file, passed to ``load_audio``.
        augment: When True (the default, matching the previous behaviour)
            apply ``tfio.audio.freq_mask`` and ``tfio.audio.time_mask`` with
            ``param=5``. Pass False to skip the masking, e.g. when the
            features are used for inference rather than training.

    Returns:
        A float32 tensor: the (optionally masked) dB mel spectrogram with an
        extra last axis of size 1.
    """
    target_len = 480000  # 15 s at the 32 kHz rate that melscale assumes below

    wav = np.asarray(load_audio(file_path))
    if wav.ndim > 1:
        # soundfile returns (frames, channels) for multi-channel files. The
        # padding arithmetic below only makes sense for a 1-D signal, so
        # average the channels down to mono first.
        wav = wav.mean(axis=1)

    # soundfile yields float64 by default; cast explicitly so the concat with
    # the float32 padding does not depend on implicit dtype coercion.
    wav = tf.cast(wav[:target_len], tf.float32)
    pad_len = target_len - tf.shape(wav)[0]
    zero_padding = tf.zeros([pad_len], dtype=tf.float32)
    # Padding is placed in front of the signal, as in the original pipeline.
    wav = tf.concat([zero_padding, wav], 0)

    # The stride is chosen so that target_len samples give 299 time frames.
    spectrogram = tfio.audio.spectrogram(
        wav, nfft=1024, window=1024, stride=int(target_len / 299) + 1)

    mel_spectrogram = tfio.audio.melscale(
        spectrogram, rate=32000, mels=299, fmin=1000, fmax=11000)

    features = tfio.audio.dbscale(mel_spectrogram, top_db=80)

    if augment:
        features = tfio.audio.freq_mask(features, param=5)
        features = tfio.audio.time_mask(features, param=5)

    return tf.expand_dims(features, axis=2)

In [4]:
def create_model(num_labels):
    """Build and compile a classifier on top of a frozen Xception base.

    The ImageNet-pretrained Xception network (without its top) takes
    299 x 299 x 3 inputs. All of its layers are frozen; its output is
    flattened and fed to one softmax ``Dense`` layer.

    Args:
        num_labels: Number of output classes (units of the softmax layer).

    Returns:
        The compiled model (Adam optimizer, categorical cross-entropy loss,
        accuracy metric). The model summary is printed as a side effect.
    """
    cnn = Xception(input_shape=[299, 299, 3], weights='imagenet', include_top=False)

    # Freeze the pretrained base so only the new Dense head is trained.
    for layer in cnn.layers:
        layer.trainable = False

    x = Flatten()(cnn.output)
    output = Dense(units=num_labels, activation='softmax')(x)

    model = Model([cnn.input], [output])
    model.summary()
    # Take the optimizer from tf.keras, the same namespace the layers and
    # Model come from, instead of the standalone `keras` package; the two can
    # be different implementations depending on the installed versions.
    optimizer = tf.keras.optimizers.Adam()
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=["accuracy"])
    return model

In [5]:
def train_model(trainImagesX, trainY, testImagesX, testY, num_labels, batch_amt, epoch_amt, save):
    """Create a model with ``create_model`` and fit it on the given data.

    Args:
        trainImagesX: Training inputs passed to ``model.fit`` as ``x``.
        trainY: Training targets passed to ``model.fit`` as ``y``.
        testImagesX: Validation inputs.
        testY: Validation targets.
        num_labels: Number of classes, forwarded to ``create_model``.
        batch_amt: Batch size.
        epoch_amt: Number of epochs.
        save: When truthy, save the model to ``"model"`` after fitting.

    Returns:
        The fitted model, in its state after the last epoch. The copy with
        the best ``val_accuracy`` is written separately to
        ``"bestest_model"`` by the checkpoint callback.
    """
    model = create_model(num_labels)

    # One TensorBoard log directory per run, named by start time.
    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    # Use tf.keras for this callback as well, so both callbacks and the model
    # come from the same Keras implementation.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath="bestest_model",
                                                    mode='max',
                                                    monitor='val_accuracy',
                                                    save_best_only=True)

    model.fit(x=trainImagesX, y=trainY, validation_data=(testImagesX, testY),
              epochs=epoch_amt, batch_size=batch_amt,
              callbacks=[tensorboard_callback, checkpoint], verbose=True)

    if save:
        model.save("model")

    return model

In [6]:
def predict_file(file_name, model, label_encoder):
    """Run the model on a single audio file and return its raw output.

    Args:
        file_name: Path of the audio file, passed to ``preprocess``.
        model: Object exposing ``predict``; it receives a one-element list
            holding a (1, 299, 299, 3) array.
        label_encoder: Accepted for interface compatibility; not used here.

    Returns:
        Whatever ``model.predict`` returns for the single-item batch.
    """
    spectrogram = preprocess(file_name).numpy()
    # Add the batch axis, then copy the single channel into three channels.
    batch = np.repeat(spectrogram.reshape(1, 299, 299, 1), repeats=3, axis=3)
    return model.predict([batch], verbose=False)

In [7]:
def process_images(df, inputPath):
    """Preprocess every audio file listed in ``df`` into one float32 array.

    Args:
        df: DataFrame with ``primary_label`` and ``filename`` columns.
        inputPath: Prefix prepended to ``<primary_label>/<filename>``.

    Returns:
        A float32 NumPy array stacking the ``preprocess`` result of each row,
        in row order.
    """
    paths = (
        inputPath + label + "/" + name
        for label, name in zip(df["primary_label"], df["filename"])
    )
    return np.asarray([preprocess(path) for path in paths]).astype(np.float32)

In [8]:
def load_attributes(inputPath, max_label="bncfly", random_state=None):
    """Load the metadata CSV, keep a subset of labels and shuffle the rows.

    Args:
        inputPath: Path or buffer accepted by ``pd.read_csv``.
        max_label: Rows are kept when ``primary_label <= max_label`` using
            plain string comparison. Defaults to ``"bncfly"``.
        random_state: Seed forwarded to ``shuffle``. The default ``None``
            keeps the previous unseeded behaviour; pass an int to make the
            row order (and any later split of it) reproducible.

    Returns:
        A DataFrame with the ``filename`` and ``primary_label`` columns,
        shuffled, with a fresh 0..n-1 index.
    """
    cols = ["filename", "primary_label"]
    df = pd.read_csv(inputPath, skipinitialspace=True, usecols=cols)
    df = df.loc[df["primary_label"] <= max_label]
    df = shuffle(df, random_state=random_state)
    return df.reset_index(drop=True)

In [9]:
# Uncomment to hide all GPUs and force CPU execution:
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Let TensorFlow grow GPU memory on demand. Iterating over the device list
# keeps this cell working on machines without a GPU (the list is empty) and
# applies the same setting to every GPU, not just the first one.
physical_devices = tf.config.list_physical_devices("GPU")
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [10]:
# Filtered and shuffled metadata: one row per audio file (filename + label).
df = load_attributes(inputPath="./Data/train_metadata.csv")

In [11]:
# Preprocess every file listed in df; this reads each audio file from disk.
images = process_images(df=df, inputPath="./Data/Audio/")

In [12]:
# The model is built for 3-channel input, so copy the single spectrogram
# channel three times. The guard makes the cell idempotent: re-running it
# does not triple the channel axis a second time.
if images.shape[-1] == 1:
    images = np.repeat(images, repeats=3, axis=3)

In [13]:
split = train_test_split(df, images, test_size=0.2, random_state=0)
(trainAttrX, testAttrX, trainImagesX, testImagesX) = split

# Fit the encoder exactly once, on every label in df, so that the train and
# test targets share one class -> index mapping and one one-hot width.
# Calling fit_transform again on the test labels would re-fit the encoder on
# the test split alone, which can change both the mapping and the width.
label_encoder = LabelEncoder()
label_encoder.fit(df["primary_label"])
num_classes = len(label_encoder.classes_)

trainY = to_categorical(label_encoder.transform(trainAttrX["primary_label"]),
                        num_classes=num_classes)
testY = to_categorical(label_encoder.transform(testAttrX["primary_label"]),
                       num_classes=num_classes)

In [14]:
# One output unit per one-hot column of the training targets.
num_labels = trainY.shape[1]

model = train_model(
    trainImagesX=trainImagesX,
    trainY=trainY,
    testImagesX=testImagesX,
    testY=testY,
    num_labels=num_labels,
    batch_amt=32,
    epoch_amt=50,
    save=True,
)

# To skip training, load a previously saved model instead:
#model = keras.models.load_model("model")

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 299, 299, 3  0           []                               
                                )]                                                                
                                                                                                  
 block1_conv1 (Conv2D)          (None, 149, 149, 32  864         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 block1_conv1_bn (BatchNormaliz  (None, 149, 149, 32  128        ['block1_conv1[0][0]']           
 ation)                         )                                                             

 block4_sepconv1_bn (BatchNorma  (None, 37, 37, 728)  2912       ['block4_sepconv1[0][0]']        
 lization)                                                                                        
                                                                                                  
 block4_sepconv2_act (Activatio  (None, 37, 37, 728)  0          ['block4_sepconv1_bn[0][0]']     
 n)                                                                                               
                                                                                                  
 block4_sepconv2 (SeparableConv  (None, 37, 37, 728)  536536     ['block4_sepconv2_act[0][0]']    
 2D)                                                                                              
                                                                                                  
 block4_sepconv2_bn (BatchNorma  (None, 37, 37, 728)  2912       ['block4_sepconv2[0][0]']        
 lization)

 n)                                                                                               
                                                                                                  
 block7_sepconv1 (SeparableConv  (None, 19, 19, 728)  536536     ['block7_sepconv1_act[0][0]']    
 2D)                                                                                              
                                                                                                  
 block7_sepconv1_bn (BatchNorma  (None, 19, 19, 728)  2912       ['block7_sepconv1[0][0]']        
 lization)                                                                                        
                                                                                                  
 block7_sepconv2_act (Activatio  (None, 19, 19, 728)  0          ['block7_sepconv1_bn[0][0]']     
 n)                                                                                               
          

 block9_sepconv3_bn (BatchNorma  (None, 19, 19, 728)  2912       ['block9_sepconv3[0][0]']        
 lization)                                                                                        
                                                                                                  
 add_7 (Add)                    (None, 19, 19, 728)  0           ['block9_sepconv3_bn[0][0]',     
                                                                  'add_6[0][0]']                  
                                                                                                  
 block10_sepconv1_act (Activati  (None, 19, 19, 728)  0          ['add_7[0][0]']                  
 on)                                                                                              
                                                                                                  
 block10_sepconv1 (SeparableCon  (None, 19, 19, 728)  536536     ['block10_sepconv1_act[0][0]']   
 v2D)     

                                                                                                  
 block12_sepconv3_act (Activati  (None, 19, 19, 728)  0          ['block12_sepconv2_bn[0][0]']    
 on)                                                                                              
                                                                                                  
 block12_sepconv3 (SeparableCon  (None, 19, 19, 728)  536536     ['block12_sepconv3_act[0][0]']   
 v2D)                                                                                             
                                                                                                  
 block12_sepconv3_bn (BatchNorm  (None, 19, 19, 728)  2912       ['block12_sepconv3[0][0]']       
 alization)                                                                                       
                                                                                                  
 add_10 (A



INFO:tensorflow:Assets written to: bestest_model\assets


INFO:tensorflow:Assets written to: bestest_model\assets


Epoch 2/50



INFO:tensorflow:Assets written to: bestest_model\assets


INFO:tensorflow:Assets written to: bestest_model\assets


Epoch 3/50



INFO:tensorflow:Assets written to: bestest_model\assets


INFO:tensorflow:Assets written to: bestest_model\assets


Epoch 4/50



INFO:tensorflow:Assets written to: bestest_model\assets


INFO:tensorflow:Assets written to: bestest_model\assets


Epoch 5/50



INFO:tensorflow:Assets written to: bestest_model\assets


INFO:tensorflow:Assets written to: bestest_model\assets


Epoch 6/50
Epoch 7/50



INFO:tensorflow:Assets written to: bestest_model\assets


INFO:tensorflow:Assets written to: bestest_model\assets


Epoch 8/50



INFO:tensorflow:Assets written to: bestest_model\assets


INFO:tensorflow:Assets written to: bestest_model\assets


Epoch 9/50
Epoch 10/50
  8/223 [>.............................] - ETA: 5:40 - loss: 6.7796 - accuracy: 0.7305

KeyboardInterrupt: 

In [None]:
# Accuracy on testing data: run every held-out file through predict_file and
# count how often the decoded top-scoring class equals the true label.
count = 0
correct = 0
for index_num, row in testAttrX.iterrows():
    true_label = row["primary_label"]
    audio_path = "./Data/Audio/" + true_label + "/" + row["filename"]
    prediction = predict_file(audio_path, model, label_encoder)
    classes_x = np.argmax(prediction, axis=1)
    prediction_class = label_encoder.inverse_transform(classes_x)
    correct += int(prediction_class[0] == true_label)
    count += 1

correct / count

In [None]:
#Accuracy on training data
# NOTE(review): this cell used to end with a bare `wrong` expression, which
# raises NameError unless such a variable exists and hides the accuracy as
# the cell's displayed value. It looked like a stray note and was removed;
# if it flagged a known problem with this metric, record that here as a
# comment instead - TODO confirm with the author.
count = 0
correct = 0
for index_num, row in trainAttrX.iterrows():
    prediction = predict_file("./Data/Audio/" + row["primary_label"] + "/" + row["filename"], model, label_encoder)
    classes_x = np.argmax(prediction, axis=1)
    prediction_class = label_encoder.inverse_transform(classes_x)
    if prediction_class[0] == row["primary_label"]:
        correct += 1
    count += 1

float(correct/count)

In [None]:
# Classify one known recording and report the top class with its score.
prediction = predict_file("./Data/Audio/acafly/XC51408.ogg", model, label_encoder)
classes_x = np.argmax(prediction, axis=1)
prediction_class = label_encoder.inverse_transform(classes_x)
confidence_pct = prediction[0][classes_x][0] * 100
# `!s` applies str() to each value, matching the original str(...) concatenation.
f"{prediction_class[0]!s} with {confidence_pct!s}% confidence"