In [1]:
import os

import pandas as pd
import numpy as np
import soundfile as sf
from matplotlib import pyplot as plt

import tensorflow as tf
import tensorflow_io as tfio

import keras.models
from keras import regularizers
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Flatten
from keras.layers import Dropout
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
def load_audio(file_name, decimation=8):
    """Read an audio file and return a decimated copy of its samples.

    Args:
        file_name: Path of the audio file, passed straight to ``sf.read``.
        decimation: Keep every ``decimation``-th sample. Defaults to 8, the
            value that used to be hard-coded here.

    Returns:
        The sample array returned by ``sf.read``, sliced with a step of
        ``decimation`` along its first axis.

    Raises:
        ValueError: If ``decimation`` is smaller than 1.
    """
    if decimation < 1:
        raise ValueError(f"decimation must be >= 1, got {decimation!r}")

    # The file's sample rate is returned by sf.read but is not used here.
    # NOTE(review): plain slicing applies no low-pass filter before
    # decimating, so content above the new Nyquist frequency will alias.
    audio_data, _sample_rate = sf.read(file_name)
    return audio_data[::decimation]

In [3]:
def preprocess(file_path, target_len=80000, augment=True):
    """Turn an audio file into a dB-scaled mel spectrogram with a channel axis.

    Args:
        file_path: Path of the audio file, loaded through ``load_audio``.
        target_len: Number of samples every clip is cropped / zero-padded to.
            Defaults to 80000, the value that used to be hard-coded here.
        augment: When True (the default, matching the previous behaviour),
            apply ``tfio.audio.freq_mask`` and ``tfio.audio.time_mask`` with
            ``param=2``. Pass False to skip the masking step, e.g. when
            computing features for prediction rather than training.

    Returns:
        The (optionally masked) dB-scaled mel spectrogram with a trailing
        axis of size 1 added by ``tf.expand_dims(..., axis=2)``.
        NOTE(review): with the defaults this is presumably (157, 64, 1), the
        image input shape used by ``create_model`` — confirm.
    """
    # NOTE(review): assumes load_audio returns a 1-D (mono) signal; a
    # multi-channel file would make the padding arithmetic below fail.
    wav = load_audio(file_path)[:target_len]

    # Short clips are padded with zeros at the *front* so that every clip
    # has exactly target_len samples.
    zero_padding = tf.zeros([target_len] - tf.shape(wav), dtype=tf.float32)
    wav = tf.concat([zero_padding, wav], 0)

    spectrogram = tfio.audio.spectrogram(
        wav, nfft=512, window=512, stride=512)

    # NOTE(review): rate=8000 must match the sample rate that remains after
    # load_audio's decimation — confirm against the source files.
    mel_spectrogram = tfio.audio.melscale(
        spectrogram, rate=8000, mels=64, fmin=0, fmax=4000)

    features = tfio.audio.dbscale(mel_spectrogram, top_db=80)

    if augment:
        features = tfio.audio.freq_mask(features, param=2)
        features = tfio.audio.time_mask(features, param=2)

    # Add the trailing channel axis expected by 2-D convolution layers.
    return tf.expand_dims(features, axis=2)

In [23]:
def create_model(num_labels, image_shape=(157, 64, 1), info_dim=2):
    """Build and compile the two-input (spectrogram + attributes) classifier.

    The image branch is a stack of six 3x3 convolutions followed by five
    dense layers (512 -> 32). The attribute branch is two dense layers
    (16 -> 32). Both branches are concatenated and passed through three
    dense layers (256 -> 64) and a softmax output. Every conv/dense layer
    except the output uses ReLU and an L2 kernel regularizer of 0.0001.

    Fix over the previous version: the ``Dropout`` layers placed between the
    dense layers of the attribute branch and of the merged head were applied
    to ``x`` instead of ``y`` / ``z``. After the concatenation that left them
    disconnected from the output (they had no effect), and in the attribute
    branch it dropped out the image features instead. They are now wired to
    the branch they sit in.

    Args:
        num_labels: Number of output classes (units of the softmax layer).
        image_shape: Shape of one spectrogram input. Defaults to
            ``(157, 64, 1)``, the value that used to be hard-coded.
        info_dim: Number of scalar attributes per sample. Defaults to 2.

    Returns:
        The compiled model taking ``[image_input, info_input]`` and trained
        with Adam (learning rate 0.001) on categorical cross-entropy, with
        accuracy as metric. ``model.summary()`` is printed as a side effect.
    """
    def _conv(filters):
        # 3x3 ReLU convolution with the shared L2 penalty.
        return Conv2D(filters, (3, 3), activation='relu',
                      kernel_regularizer=regularizers.l2(0.0001))

    def _dense(units):
        # ReLU dense layer with the shared L2 penalty.
        return Dense(units, activation="relu",
                     kernel_regularizer=regularizers.l2(0.0001))

    # --- Image branch -----------------------------------------------------
    image_input = Input(image_shape)
    x = _conv(16)(image_input)
    x = _conv(32)(x)
    x = _conv(64)(x)
    x = Dropout(0.5)(x)
    x = _conv(128)(x)
    x = Dropout(0.5)(x)
    x = _conv(128)(x)
    x = Dropout(0.5)(x)
    x = _conv(128)(x)
    x = Flatten()(x)
    for units in (512, 256, 128, 64, 32):
        x = Dropout(0.5)(x)
        x = _dense(units)(x)

    # --- Attribute branch -------------------------------------------------
    info_input = Input((info_dim,))
    y = _dense(16)(info_input)
    y = Dropout(0.5)(y)
    y = _dense(32)(y)

    # --- Merged head ------------------------------------------------------
    z = concatenate([x, y])
    z = _dense(256)(z)
    z = Dropout(0.5)(z)
    z = _dense(128)(z)
    z = Dropout(0.5)(z)
    z = _dense(64)(z)
    z = Dropout(0.5)(z)
    z = Dense(num_labels, activation="softmax")(z)

    model = Model([image_input, info_input], [z])
    model.summary()
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=["accuracy"])
    return model

In [5]:
def train_model(trainAttrX, trainImagesX, trainY, testAttrX, testImagesX, testY, num_labels, batch_amt, epoch_amt, save):
    """Create a fresh model, fit it, and optionally save it.

    Args:
        trainAttrX, trainImagesX, trainY: Training attributes, spectrograms
            and one-hot labels.
        testAttrX, testImagesX, testY: The same three pieces for the
            held-out set, used as validation data during fitting.
        num_labels: Number of classes, forwarded to ``create_model``.
        batch_amt: Batch size passed to ``fit``.
        epoch_amt: Number of epochs passed to ``fit``.
        save: When truthy, the fitted model is saved to ``"model"``.

    Returns:
        The fitted model.
    """
    network = create_model(num_labels)

    # Inputs are ordered [images, attributes], matching the model's inputs.
    train_inputs = [trainImagesX, trainAttrX]
    holdout = ([testImagesX, testAttrX], testY)

    network.fit(
        x=train_inputs,
        y=trainY,
        validation_data=holdout,
        epochs=epoch_amt,
        batch_size=batch_amt,
    )

    if save:
        network.save("model")
    return network

In [6]:
def predict_file(file_name, model, label_encoder, latitude, longitude, date, scaler=None):
    """Predict the class label of a single audio file.

    Fixes over the previous version:
      * the result was decoded through an undefined name (``labelencoder``)
        instead of the ``label_encoder`` argument, raising ``NameError``;
      * the spectrogram tensor is no longer stored in a DataFrame cell just
        to select the two coordinate columns.

    Args:
        file_name: Path of the audio file, passed to ``preprocess``.
        model: Model exposing ``predict([images, attributes])``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.
        latitude: Recording latitude.
        longitude: Recording longitude.
        date: Recording date. Accepted for interface compatibility; it is
            not fed to the model.
        scaler: Optional fitted scaler exposing ``transform``; it should be
            the one fitted on the training coordinates. When omitted, the
            legacy behaviour is kept: a new ``MinMaxScaler`` is fitted on
            this single row, which scales both coordinates to 0 regardless
            of their value, and a ``RuntimeWarning`` is emitted.

    Returns:
        The result of ``label_encoder.inverse_transform`` for the arg-max
        class of the model output (one entry, for the single input file).
    """
    # NOTE(review): preprocess() applies random frequency/time masking by
    # default; consider disabling augmentation for inference.
    image = preprocess(file_name)

    location = pd.DataFrame([[latitude, longitude]], columns=["latitude", "longitude"])
    if scaler is None:
        import warnings

        warnings.warn(
            "predict_file called without a fitted scaler; fitting MinMaxScaler on a "
            "single row scales latitude/longitude to 0. Pass the scaler fitted on "
            "the training data.",
            RuntimeWarning,
            stacklevel=2,
        )
        location_scaled = MinMaxScaler().fit_transform(location)
    else:
        location_scaled = scaler.transform(location)

    # Add the leading batch axis: (H, W, C) -> (1, H, W, C).
    image_batch = np.expand_dims(image.numpy(), axis=0)

    probabilities = model.predict([image_batch, location_scaled])
    class_indices = np.argmax(probabilities, axis=1)
    return label_encoder.inverse_transform(class_indices)

In [7]:
# Set CUDA_VISIBLE_DEVICES to '-1' for this process — presumably to hide all
# GPUs so that TensorFlow runs on the CPU.
# NOTE(review): this cell runs after TensorFlow has already been imported in
# the first cell; confirm the setting still takes effect in your environment.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [8]:
def process_images(df, inputPath):
    """Compute the spectrogram features for every row of ``df``.

    Each file is located at ``inputPath + primary_label + "/" + filename``
    and converted with ``preprocess``.

    Args:
        df: Frame with ``primary_label`` and ``filename`` columns.
        inputPath: Directory prefix of the audio tree; it is concatenated
            as-is, so it must already end with a path separator.

    Returns:
        A float32 NumPy array stacking the per-file features in row order.
    """
    file_paths = (
        inputPath + label + "/" + name
        for label, name in zip(df["primary_label"], df["filename"])
    )
    features = [preprocess(path) for path in file_paths]
    return np.asarray(features).astype(np.float32)

In [9]:
def load_attributes(inputPath, max_label="clcrob", random_state=None):
    """Load the recording metadata, keep a subset of labels, and shuffle it.

    Args:
        inputPath: Path of the metadata CSV.
        max_label: Keep only rows whose ``primary_label`` compares ``<=``
            this string (plain string comparison). Defaults to ``"clcrob"``,
            the cutoff that used to be hard-coded. Pass ``None`` to keep
            every row.
        random_state: Seed forwarded to ``sklearn.utils.shuffle``. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            integer to get the same row order on every run.

    Returns:
        A DataFrame with the columns latitude, longitude, date, time,
        filename and primary_label, shuffled, with a fresh 0..n-1 index.
    """
    cols = ["latitude", "longitude", "date", "time", "filename", "primary_label"]
    metadata = pd.read_csv(inputPath, skipinitialspace=True, usecols=cols)

    if max_label is not None:
        metadata = metadata.loc[metadata['primary_label'] <= max_label]

    metadata = shuffle(metadata, random_state=random_state)
    # Return a new frame instead of mutating in place.
    return metadata.reset_index(drop=True)

In [10]:
# Load the (filtered, shuffled) recording metadata into the notebook-level `df`.
df = load_attributes("./Data/train_metadata.csv")

In [11]:
def process_attributes(inputPath, train, test):
    """Min-max scale the latitude/longitude columns of the train/test frames.

    The scaler is fitted on ``train`` only and then applied to ``test``.

    Changes over the previous version: the one-hot encoding of the ``date``
    column was computed but never returned, and it was fitted on the
    notebook-global ``df`` rather than on this function's arguments. That
    dead code (and the hidden global dependency) has been removed; the
    returned values are unchanged.

    Args:
        inputPath: Unused; kept so existing call sites keep working.
        train: Training frame with ``latitude`` and ``longitude`` columns.
        test: Test frame with the same columns.

    Returns:
        ``(trainX, testX)``: the scaled coordinate arrays for the two frames.
    """
    continuous = ["latitude", "longitude"]

    scaler = MinMaxScaler()
    trainX = scaler.fit_transform(train[continuous])
    testX = scaler.transform(test[continuous])

    return (trainX, testX)

In [12]:
# Read the metadata CSV into `df`. load_attributes shuffles the rows on each
# call, so running this cell replaces any earlier `df` with a new ordering;
# `images` below must be computed from this same `df` to stay aligned.
df = load_attributes("./Data/train_metadata.csv")

In [13]:
# Compute one spectrogram per metadata row (same order as `df`). This reads
# and transforms every audio file listed in `df`.
images = process_images(df, "./Data/Audio/")

In [14]:
# Split metadata rows and their spectrograms together so they stay aligned.
split = train_test_split(df, images, test_size=0.25, random_state=0)
(trainAttrX, testAttrX, trainImagesX, testImagesX) = split

# Fit the label encoder ONCE, on every label present in `df`, and only
# transform the two subsets. Re-fitting on the test labels (as was done
# before) can assign different integer codes to the same class when the
# subsets do not contain exactly the same set of labels, and leaves
# `label_encoder` out of sync with the encoding the model was trained on.
label_encoder = LabelEncoder()
label_encoder.fit(df["primary_label"])
num_classes = len(label_encoder.classes_)

# Pin the one-hot width so trainY and testY always have the same shape.
trainY = to_categorical(label_encoder.transform(trainAttrX["primary_label"]), num_classes=num_classes)
testY = to_categorical(label_encoder.transform(testAttrX["primary_label"]), num_classes=num_classes)

# The first argument is not used by process_attributes; it is kept as passed
# originally. After this call trainAttrX/testAttrX hold the scaled coordinates.
(trainAttrX, testAttrX) = process_attributes(df,
    trainAttrX, testAttrX)


In [25]:
num_labels = trainY.shape[1]

BATCH_SIZE = 128
EPOCHS = 50

# Keep the returned model: later cells use `model`, which would otherwise be
# undefined after a kernel restart. The final True saves the fitted model
# to "model".
model = train_model(trainAttrX, trainImagesX, trainY, testAttrX, testImagesX, testY, num_labels, BATCH_SIZE, EPOCHS, True)

# Alternative to retraining: reload the model saved by a previous run.
#model = keras.models.load_model("model")

#preds = model.predict([testImagesX, testAttrX])


Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_13 (InputLayer)          [(None, 157, 64, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 conv2d_30 (Conv2D)             (None, 155, 62, 16)  160         ['input_13[0][0]']               
                                                                                                  
 conv2d_31 (Conv2D)             (None, 153, 60, 32)  4640        ['conv2d_30[0][0]']              
                                                                                                  
 conv2d_32 (Conv2D)             (None, 151, 58, 64)  18496       ['conv2d_31[0][0]']        

Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50

KeyboardInterrupt: 

In [None]:
# Classify one example recording. Requires `model` and `label_encoder` to
# already exist in the kernel (from the training cells above).
sample_path = "./Data/Audio/aldfly/XC477348.ogg"
sample_latitude, sample_longitude = 42.0458, -79.441
sample_date = "2019-05-27"

prediction = predict_file(
    sample_path, model, label_encoder,
    sample_latitude, sample_longitude, sample_date,
)
prediction