In [1]:
import os
import datetime
import pandas as pd
import numpy as np
import soundfile as sf
from matplotlib import pyplot as plt

import tensorflow as tf
import tensorflow_io as tfio
from tensorboard import notebook

import keras.models
from keras import regularizers
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Flatten
from keras.layers import Dropout
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler


from tensorflow.keras.layers import Input, Conv2D
from tensorflow.keras.layers import MaxPool2D, Flatten, Dense
from tensorflow.keras import Model

In [2]:
def load_audio(file_name, decimation=4):
    """Read an audio file and return a crudely down-sampled mono signal.

    Args:
        file_name: Path (or file-like object) accepted by ``soundfile.read``.
        decimation: Keep every ``decimation``-th frame. The default of 4
            preserves the original hard-coded behaviour.

    Returns:
        1-D NumPy array containing every ``decimation``-th sample.

    Notes:
        * The file's sample rate is read but not used, and plain slicing
          applies no anti-aliasing filter. ``preprocess`` passes ``rate=8000``
          to the mel scale, which presumably assumes 32 kHz source files --
          TODO confirm against the dataset.
        * ``soundfile.read`` returns a ``(frames, channels)`` array for
          multi-channel files; those are averaged to mono because the
          zero-padding in ``preprocess`` only works on a 1-D signal.
    """
    audio_data, _sample_rate = sf.read(file_name)
    if audio_data.ndim > 1:
        # Mix all channels down to a single one.
        audio_data = audio_data.mean(axis=1)
    return audio_data[::decimation]

In [3]:
def preprocess(file_path, augment=True):
    """Turn an audio file into a dB-scaled mel spectrogram with a channel axis.

    Steps:
        1. Load the decimated signal with ``load_audio`` and keep at most the
           first 120000 samples.
        2. Left-pad with zeros so the signal is exactly 120000 samples long
           (the padding is placed *before* the audio).
        3. Compute a spectrogram (nfft=512, window=512, stride=1875), map it to
           64 mel bins (rate=8000, 0-4000 Hz) and convert to dB (top_db=80).
        4. Optionally apply frequency and time masking (``param=2`` each).
        5. Append a trailing channel axis.

    Args:
        file_path: Path of the audio file to convert.
        augment: When True (the default, matching the original behaviour) the
            random frequency/time masks are applied. Pass False at inference
            time to get an unmasked spectrogram.

    Returns:
        A float32 tensor with a trailing channel axis; presumably of shape
        (64, 64, 1) to match the ``Input`` of ``create_model`` -- TODO confirm.
    """
    target_len = 120000

    # soundfile yields float64 samples by default; cast explicitly so the
    # concat below does not depend on implicit dtype conversion.
    wav = tf.cast(load_audio(file_path)[:target_len], tf.float32)
    zero_padding = tf.zeros([target_len] - tf.shape(wav), dtype=tf.float32)
    wav = tf.concat([zero_padding, wav], 0)

    spectrogram = tfio.audio.spectrogram(
        wav, nfft=512, window=512, stride=1875)

    mel_spectrogram = tfio.audio.melscale(
        spectrogram, rate=8000, mels=64, fmin=0, fmax=4000)

    features = tfio.audio.dbscale(mel_spectrogram, top_db=80)

    if augment:
        # Masking is a training-time augmentation; it is skipped on request.
        features = tfio.audio.freq_mask(features, param=2)
        features = tfio.audio.time_mask(features, param=2)

    return tf.expand_dims(features, axis=2)

In [4]:
def create_model(num_labels, image_shape=(64, 64, 1), info_dim=2):
    """Build and compile the two-input classifier.

    The image branch is a VGG-style stack of 3x3 convolutions followed by
    dense layers; the info branch is a small MLP over the numeric attributes.
    Both branches end in 1024 units, are concatenated, and feed a dense head
    that ends in a softmax over ``num_labels`` classes.

    Args:
        num_labels: Number of output classes (width of the softmax layer).
        image_shape: Shape of one spectrogram input. Defaults to the original
            hard-coded ``(64, 64, 1)``.
        info_dim: Number of numeric attributes per sample. Defaults to the
            original hard-coded ``2``.

    Returns:
        A compiled ``tensorflow.keras`` ``Model`` taking
        ``[image_input, info_input]``.

    Side effects:
        Prints the model summary.
    """
    # (filters, number of stacked convolutions) for each conv block.
    conv_blocks = [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)]

    image_input = Input(shape=image_shape)
    x = image_input
    for filters, depth in conv_blocks:
        for _ in range(depth):
            x = Conv2D(filters=filters, kernel_size=3, padding='same', activation='relu')(x)
        x = MaxPool2D(pool_size=2, strides=2, padding='same')(x)

    # Fully connected layers of the image branch.
    x = Flatten()(x)
    for units in (4096, 4096, 2048, 1024):
        x = Dense(units=units, activation='relu')(x)

    # Attribute branch.
    info_input = Input((info_dim,))
    y = info_input
    for units in (64, 128, 256, 512, 1024):
        y = Dense(units, activation="relu")(y)

    # Joint head.
    output = concatenate([x, y])
    for _ in range(3):
        output = Dense(units=2048, activation='relu')(output)
    output = Dense(units=num_labels, activation='softmax')(output)

    model = Model([image_input, info_input], [output])
    model.summary()

    # The layers and Model come from tensorflow.keras, so take the optimizer
    # from the same namespace instead of the standalone `keras` package.
    optimizer = tf.keras.optimizers.Adam()
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=["accuracy"])
    return model

In [5]:
def train_model(trainAttrX, trainImagesX, trainY, testAttrX, testImagesX, testY, num_labels, batch_amt, epoch_amt, save):
    """Create a fresh model, fit it, and optionally save it.

    Args:
        trainAttrX, testAttrX: Numeric attribute inputs for training/validation.
        trainImagesX, testImagesX: Spectrogram inputs for training/validation.
        trainY, testY: One-hot targets for training/validation.
        num_labels: Number of classes, forwarded to ``create_model``.
        batch_amt: Batch size passed to ``fit``.
        epoch_amt: Number of epochs passed to ``fit``.
        save: When truthy, the fitted model is written to ``"model"``.

    Returns:
        The fitted model.

    Side effects:
        Writes TensorBoard logs to a time-stamped directory under ``logs/fit/``.
        Training itself runs silently (``verbose=False``).
    """
    model = create_model(num_labels)

    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir="logs/fit/" + run_stamp, histogram_freq=1),
    ]

    fit_options = dict(
        validation_data=([testImagesX, testAttrX], testY),
        epochs=epoch_amt,
        batch_size=batch_amt,
        callbacks=callbacks,
        verbose=False,
    )
    model.fit(x=[trainImagesX, trainAttrX], y=trainY, **fit_options)

    if not save:
        return model

    model.save("model")
    return model

In [6]:
def predict_file(file_name, model, label_encoder, latitude, longitude, date,
                 scaler=None, image_scale=None):
    """Predict the class label of a single audio recording.

    Args:
        file_name: Path of the audio file to classify.
        model: Trained two-input model (spectrogram batch, attribute batch).
        label_encoder: Fitted encoder used to map class indices back to labels.
        latitude: Recording latitude.
        longitude: Recording longitude.
        date: Recording date. Accepted for interface compatibility; the model
            has no date input, so it is not used.
        scaler: Optional fitted scaler for ``[latitude, longitude]``. Pass the
            scaler that was fit on the training attributes. When omitted, a
            ``MinMaxScaler`` is fit on this single row (the original
            behaviour), which always scales both values to 0.
        image_scale: Optional divisor applied to the spectrogram, e.g. the
            global peak the training images were normalised by.

    Returns:
        Array with the predicted label for the file.

    Note:
        ``preprocess`` is called with its defaults, so whatever masking it
        applies by default is also applied here.
    """
    image = preprocess(file_name).numpy()
    if image_scale is not None:
        image = image / image_scale
    # Add the batch axis instead of reshaping to a hard-coded size, so the
    # batch always matches whatever shape `preprocess` produced.
    image_batch = np.expand_dims(image, axis=0).astype(np.float32)

    coords = pd.DataFrame([[latitude, longitude]], columns=["latitude", "longitude"])
    if scaler is None:
        # Legacy fallback: fitting on one sample maps every value to 0.
        scaler = MinMaxScaler().fit(coords)
    info_batch = scaler.transform(coords)

    probabilities = model.predict([image_batch, info_batch])
    class_indices = np.argmax(probabilities, axis=1)
    return label_encoder.inverse_transform(class_indices)

In [7]:
# Optional: uncomment to hide all CUDA devices so TensorFlow runs on the CPU only.
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [8]:
def process_images(df, inputPath):
    """Convert every recording listed in ``df`` into a spectrogram array.

    Each file is located at ``inputPath + primary_label + "/" + filename`` and
    passed through ``preprocess``.

    Args:
        df: DataFrame with ``primary_label`` and ``filename`` columns.
        inputPath: Directory prefix; it is concatenated as-is, so it must end
            with a path separator.

    Returns:
        float32 NumPy array stacking one spectrogram per row, in row order.
    """
    spectrograms = [
        preprocess(inputPath + record["primary_label"] + "/" + record["filename"])
        for record in df.to_dict("records")
    ]
    stacked = np.asarray(spectrograms)
    return stacked.astype(np.float32)

In [9]:
def load_attributes(inputPath, max_label="ameavo", random_state=0):
    """Load the metadata CSV, keep a subset of labels, and shuffle the rows.

    Args:
        inputPath: Path of the metadata CSV file.
        max_label: Keep only rows whose ``primary_label`` compares ``<=`` this
            string. Defaults to the original hard-coded ``"ameavo"``; pass
            ``None`` to keep every label.
        random_state: Seed for the shuffle so repeated runs produce the same
            row order (and therefore the same train/test split). Pass ``None``
            for an unseeded shuffle.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index and the columns
        latitude, longitude, date, time, filename and primary_label.
    """
    cols = ["latitude", "longitude", "date", "time", "filename", "primary_label"]
    metadata = pd.read_csv(inputPath, skipinitialspace=True, usecols=cols)

    if max_label is not None:
        metadata = metadata.loc[metadata["primary_label"] <= max_label]

    return shuffle(metadata, random_state=random_state).reset_index(drop=True)

In [10]:
def process_attributes(inputPath, train, test):
    """Scale the continuous attributes of the train and test frames to [0, 1].

    The scaler is fit on ``train`` only and then applied to ``test``.

    Args:
        inputPath: Unused; kept so existing calls keep working. The notebook
            passes the full metadata frame here.
        train: DataFrame with ``latitude`` and ``longitude`` columns.
        test: DataFrame with ``latitude`` and ``longitude`` columns.

    Returns:
        Tuple ``(trainX, testX)`` of scaled ``[latitude, longitude]`` arrays.

    Note:
        An earlier version also one-hot encoded ``date`` using the notebook's
        global ``df``, but never returned the result. That dead code (and the
        hidden dependency on a global) has been removed.
    """
    continuous = ["latitude", "longitude"]

    cs = MinMaxScaler()
    trainX = cs.fit_transform(train[continuous])
    testX = cs.transform(test[continuous])

    return (trainX, testX)

In [11]:
# Load the (filtered, shuffled) recording metadata used throughout the notebook.
df = load_attributes(inputPath="./Data/train_metadata.csv")

In [12]:
# Convert every recording listed in `df` into a spectrogram array (one per row).
images = process_images(df=df, inputPath="./Data/Audio/")

In [13]:
# Scale all spectrograms by the single largest absolute value in the set,
# so every value ends up in [-1, 1].
images = images / np.abs(images).max()

In [14]:
# Hold out 20% of the rows (and their spectrograms) for validation.
split = train_test_split(df, images, test_size=0.2, random_state=0)
(trainAttrX, testAttrX, trainImagesX, testImagesX) = split

# Fit the encoder once on the full label vocabulary and only *transform* each
# split. Re-fitting on the test labels could assign different indices to the
# same class and give testY a different number of columns than trainY.
label_encoder = LabelEncoder().fit(df["primary_label"])
n_classes = len(label_encoder.classes_)
trainY = to_categorical(label_encoder.transform(trainAttrX["primary_label"]), num_classes=n_classes)
testY = to_categorical(label_encoder.transform(testAttrX["primary_label"]), num_classes=n_classes)

# Replace the attribute frames with their scaled numeric arrays.
(trainAttrX, testAttrX) = process_attributes(df,
    trainAttrX, testAttrX)


In [15]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [16]:
# Open TensorBoard on logs/fit, the directory train_model writes its runs under.
%tensorboard --logdir logs/fit

In [19]:
num_labels = trainY.shape[1]

# Keep the returned model: the prediction cell below needs `model` in scope.
# Arguments after num_labels: batch size 32, 100 epochs, save the result.
model = train_model(trainAttrX, trainImagesX, trainY, testAttrX, testImagesX, testY, num_labels, 32, 100, True)

# To reuse a previously saved model instead of retraining, uncomment:
#model = keras.models.load_model("model")

#preds = model.predict([testImagesX, testAttrX])


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 64, 64, 1)]  0           []                               
                                                                                                  
 conv2d_13 (Conv2D)             (None, 64, 64, 64)   640         ['input_3[0][0]']                
                                                                                                  
 conv2d_14 (Conv2D)             (None, 64, 64, 64)   36928       ['conv2d_13[0][0]']              
                                                                                                  
 max_pooling2d_5 (MaxPooling2D)  (None, 32, 32, 64)  0           ['conv2d_14[0][0]']              
                                                                                            

KeyboardInterrupt: 

In [18]:
# Classify one recording using the trained model and the fitted label encoder.
prediction = predict_file(
    file_name="./Data/Audio/aldfly/XC477348.ogg",
    model=model,
    label_encoder=label_encoder,
    latitude=42.0458,
    longitude=-79.441,
    date="2019-05-27",
)
prediction

NameError: name 'model' is not defined