In [1]:
import os
import datetime
import pandas as pd
import numpy as np
import soundfile as sf
from matplotlib import pyplot as plt

import tensorflow as tf
import tensorflow_io as tfio
from tensorboard import notebook

import keras.models
from keras import regularizers
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Flatten
from keras.layers import Dropout
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler


from tensorflow.keras.layers import Input, Conv2D
from tensorflow.keras.layers import MaxPool2D, Flatten, Dense, Lambda
from tensorflow.keras import Model
from tensorflow.keras.applications.vgg16 import VGG16

In [2]:
def load_audio(file_name):
    """Read an audio file and return its samples as a mono float32 array.

    Args:
        file_name: Path of the audio file, passed straight to ``sf.read``.

    Returns:
        A 1-D numpy array of float32 samples. Multi-channel recordings are
        averaged down to one channel. The sample rate reported by
        ``sf.read`` is discarded.
    """
    # float32 matches the tf.float32 zero padding that preprocess()
    # concatenates with the waveform.
    audio_data, _sample_rate = sf.read(file_name, dtype="float32")

    # Multi-channel files come back as (frames, channels); the rest of the
    # pipeline slices and pads a 1-D signal, so downmix to mono here.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    return audio_data

In [3]:
def preprocess(file_path, augment=True):
    """Convert an audio file into a fixed-size, dB-scaled mel spectrogram.

    The waveform is truncated to 960000 samples, left-padded with zeros up
    to that length, and turned into a 224-band mel spectrogram on a dB
    scale. A trailing channel axis is added to the result.

    Args:
        file_path: Path of the audio file to load via ``load_audio``.
        augment: When True (the default, which keeps the previous
            behaviour) random frequency and time masks are applied. Pass
            False for deterministic features, e.g. at inference time.

    Returns:
        A float32 tensor with a trailing channel axis of size 1.
        NOTE(review): expected to be (448, 224, 1) so that it matches the
        model input after channel repetition - confirm against tfio.
    """
    clip_samples = 960000   # 30 s if the audio really is sampled at 32 kHz
    time_frames = 448       # target number of spectrogram frames

    # Cast explicitly so the concat below never mixes float64 samples with
    # the float32 padding.
    wav = tf.cast(load_audio(file_path)[:clip_samples], tf.float32)

    # Short clips are padded at the front, so the audio ends at the right
    # edge of the spectrogram.
    zero_padding = tf.zeros([clip_samples] - tf.shape(wav), dtype=tf.float32)
    wav = tf.concat([zero_padding, wav], 0)

    spectrogram = tfio.audio.spectrogram(
        wav, nfft=892, window=892, stride=clip_samples // time_frames + 1)

    mel_spectrogram = tfio.audio.melscale(
        spectrogram, rate=32000, mels=224, fmin=0, fmax=12000)

    features = tfio.audio.dbscale(mel_spectrogram, top_db=80)

    if augment:
        # Random masking is a training-time augmentation; it makes the
        # output non-deterministic.
        features = tfio.audio.freq_mask(features, param=4)
        features = tfio.audio.time_mask(features, param=4)

    return tf.expand_dims(features, axis=2)

In [4]:
def create_model(num_labels, input_shape=(448, 224, 3)):
    """Build and compile a VGG16-based classifier with a frozen backbone.

    Args:
        num_labels: Number of output classes (units of the softmax layer).
        input_shape: Shape of a single input image. Defaults to the
            448 x 224 x 3 layout used by the rest of this notebook.

    Returns:
        A compiled Keras model (Adam optimizer, categorical cross-entropy
        loss, accuracy metric). The model summary is printed as a side
        effect.
    """
    vgg = VGG16(input_shape=tuple(input_shape), weights='imagenet', include_top=False)

    # Only the new classification head is trained; the ImageNet weights
    # stay fixed.
    for layer in vgg.layers:
        layer.trainable = False

    # Fully connected head.
    # tf.keras is used for the regularizer and optimizer so that every
    # object in the model comes from the same Keras implementation as the
    # layers and the Model class.
    x = Flatten()(vgg.output)
    x = Dropout(0.5)(x)
    x = Dense(1024, activation='relu',
              kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
    x = Dropout(0.5)(x)
    output = Dense(units=num_labels, activation='softmax')(x)

    model = Model(inputs=vgg.input, outputs=output)
    model.summary()
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='categorical_crossentropy',
                  metrics=["accuracy"])
    return model

In [5]:
def train_model(trainImagesX, trainY, testImagesX, testY, num_labels, batch_amt, epoch_amt, save):
    """Create a fresh model, fit it, and optionally save it to disk.

    Args:
        trainImagesX: Training inputs passed to ``fit`` as ``x``.
        trainY: Training targets passed to ``fit`` as ``y``.
        testImagesX: Inputs used as validation data during fitting.
        testY: Targets used as validation data during fitting.
        num_labels: Number of classes, forwarded to ``create_model``.
        batch_amt: Batch size for ``fit``.
        epoch_amt: Number of epochs for ``fit``.
        save: If truthy, the fitted model is saved under ``"model"``.

    Returns:
        The fitted model.
    """
    network = create_model(num_labels)

    # One TensorBoard log directory per run, named after the start time.
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    callbacks = [
        tf.keras.callbacks.TensorBoard(
            log_dir="logs/fit/" + run_stamp,
            histogram_freq=1,
        )
    ]

    network.fit(
        x=trainImagesX,
        y=trainY,
        validation_data=(testImagesX, testY),
        epochs=epoch_amt,
        batch_size=batch_amt,
        callbacks=callbacks,
        verbose=True,
    )

    if save:
        network.save("model")
    return network

In [6]:
def predict_file(file_name, model, label_encoder, latitude, longitude, date, scale=None):
    """Predict the class label for a single audio file.

    Args:
        file_name: Path of the audio file to classify.
        model: Fitted model exposing ``predict``.
        label_encoder: Fitted encoder whose ``inverse_transform`` maps
            class indices back to labels.
        latitude: Unused; kept so existing callers keep working.
        longitude: Unused; kept so existing callers keep working.
        date: Unused; kept so existing callers keep working.
        scale: Value the spectrogram is divided by. When None, the
            spectrogram's own maximum absolute value is used.
            NOTE(review): the training cells divide by the maximum over the
            whole dataset; pass that value here to reproduce the training
            scaling exactly.

    Returns:
        Array with the predicted label for the file.
    """
    image = np.asarray(preprocess(file_name), dtype=np.float32)

    # Mirror the training pipeline: copy the single spectrogram channel to
    # three channels, then rescale.
    image = np.repeat(image, repeats=3, axis=-1)
    if scale is None:
        scale = float(np.max(np.abs(image)))
    if scale != 0:  # an all-zero spectrogram is left untouched
        image = image / scale

    # Add the batch axis instead of reshaping to a hard-coded size.
    batch = image[np.newaxis, ...]

    predicted_label = model.predict(batch)
    classes_x = np.argmax(predicted_label, axis=1)
    return label_encoder.inverse_transform(classes_x)

In [7]:
def process_images(df, inputPath):
    """Preprocess every audio file listed in ``df`` into one float32 array.

    Args:
        df: DataFrame with ``primary_label`` and ``filename`` columns; each
            file is looked up at ``inputPath/primary_label/filename``.
        inputPath: Root directory of the audio files, with or without a
            trailing path separator.

    Returns:
        A float32 numpy array holding the ``preprocess`` result of every
        row, in row order.
    """
    # os.path.join keeps the path valid whether or not inputPath ends with
    # a separator; iterating the two columns avoids building a Series per row.
    file_paths = (
        os.path.join(inputPath, label, name)
        for label, name in zip(df["primary_label"], df["filename"])
    )
    images = [preprocess(path) for path in file_paths]
    return np.asarray(images).astype(np.float32)

In [8]:
def load_attributes(inputPath, max_label="amewig", random_state=0):
    """Load the metadata CSV, keep a subset of labels, and shuffle the rows.

    Args:
        inputPath: Path of (or file-like object for) the metadata CSV. It
            must contain the ``filename``, ``primary_label`` and
            ``scientific_name`` columns; other columns are ignored.
        max_label: Rows are kept only when ``primary_label`` compares less
            than or equal to this string. Pass None to keep every row.
        random_state: Seed for the shuffle, so repeated runs produce the
            same row order. Pass None for an unseeded shuffle.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index.
    """
    cols = ["filename", "primary_label", "scientific_name"]
    metadata = pd.read_csv(inputPath, skipinitialspace=True, usecols=cols)

    if max_label is not None:
        # String comparison: keeps the labels that sort up to max_label.
        metadata = metadata.loc[metadata["primary_label"] <= max_label]

    # sample(frac=1) returns all rows in random order.
    shuffled = metadata.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

In [9]:
# Ask TensorFlow to enable memory growth on every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

In [10]:
# Metadata for the recordings: file name, label and scientific name.
df = load_attributes(inputPath="./Data/train_metadata.csv")

In [11]:
# The genus is the first space-separated token of the scientific name.
# Mapping over the single column avoids a row-wise DataFrame.apply.
df['genus'] = df['scientific_name'].map(lambda name: str(name).split(' ', 1)[0])

In [12]:
# Turn every recording listed in df into a spectrogram array.
images = process_images(df=df, inputPath="./Data/Audio/")

In [13]:
# The VGG16 backbone takes 3-channel input, so copy the single spectrogram
# channel three times. The guard keeps the cell idempotent: re-running it
# must not multiply the channel count again.
if images.shape[-1] == 1:
    images = np.repeat(images, repeats=3, axis=3)

In [14]:
# Scale the whole dataset by its largest absolute value.
images = np.divide(images, np.abs(images).max())

In [15]:
# Fit the encoder exactly once, on every genus in the dataset. Re-fitting it
# on the test labels would assign class indices independently of the
# training set, so the same index could mean different genera in trainY and
# testY, and the one-hot widths could differ.
label_encoder = LabelEncoder().fit(df["genus"])
n_genera = len(label_encoder.classes_)

split = train_test_split(df, images, test_size=0.2, random_state=0)
(trainAttrX, testAttrX, trainImagesX, testImagesX) = split

# num_classes pins the one-hot width even when a genus is missing from one
# of the two subsets.
trainY = to_categorical(label_encoder.transform(trainAttrX["genus"]), num_classes=n_genera)
testY = to_categorical(label_encoder.transform(testAttrX["genus"]), num_classes=n_genera)

In [16]:
num_labels = trainY.shape[1]

BATCH_SIZE = 16
EPOCHS = 50

# Keep the returned model: the prediction cell further down uses `model`,
# which would otherwise be undefined on a fresh kernel.
model = train_model(trainImagesX, trainY, testImagesX, testY,
                    num_labels, BATCH_SIZE, EPOCHS, True)

# To reuse the saved model instead of retraining:
# model = tf.keras.models.load_model("model")
# preds = model.predict(testImagesX)


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 448, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 448, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 448, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 224, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 224, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 224, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 112, 56, 128)      0     

KeyboardInterrupt: 

In [None]:
# Classify one example recording; the location and date are passed along
# with the file path.
prediction = predict_file(
    file_name="./Data/Audio/aldfly/XC477348.ogg",
    model=model,
    label_encoder=label_encoder,
    latitude=42.0458,
    longitude=-79.441,
    date="2019-05-27",
)
prediction