In [1]:
import os
import datetime
import pandas as pd
import numpy as np
import soundfile as sf
from matplotlib import pyplot as plt

import tensorflow as tf
import tensorflow_io as tfio
from tensorboard import notebook

import keras.models
from keras import regularizers
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Flatten
from keras.layers import Dropout
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler


from tensorflow.keras.layers import Input, Conv2D
from tensorflow.keras.layers import MaxPool2D, Flatten, Dense, Lambda
from tensorflow.keras import Model
from tensorflow.keras.applications.vgg16 import VGG16

2022-11-03 09:20:12.091812: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def load_audio(file_name):
    """Read an audio file and keep every fourth sample of its data.

    The sample rate reported by ``sf.read`` is discarded, so the effective
    rate of the result is a quarter of whatever the file was recorded at.

    Args:
        file_name: Path handed straight to ``sf.read``.

    Returns:
        The decoded audio array sliced with a step of 4 along its first axis.
    """
    samples = sf.read(file_name)[0]
    return samples[::4]

In [3]:
def preprocess(file_path):
    """Turn one audio file into a masked, dB-scaled mel spectrogram.

    The waveform comes from ``load_audio``, is truncated to 120000 samples
    and left-padded with zeros up to that length. It then goes through the
    tensorflow-io chain spectrogram -> mel scale -> dB scale -> frequency
    mask -> time mask, and a trailing channel axis is appended.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        The masked spectrogram with an extra last axis of size 1. The
        notebook's recorded output shows 224 x 224 x 1 per clip for these
        settings.

    NOTE(review): the mel step is configured with ``rate=8000``; presumably
    that assumes 32 kHz source audio given the 4x decimation in
    ``load_audio`` -- confirm against the dataset.
    NOTE(review): the two masking ops run on every call, including calls
    made from ``predict_file`` -- confirm that is intended.
    """
    clip_len = 120000
    samples = load_audio(file_path)[:clip_len]

    # Pad at the front so short clips end up right-aligned in the window.
    front_pad = tf.zeros([clip_len] - tf.shape(samples), dtype=tf.float32)
    samples = tf.concat([front_pad, samples], 0)

    spec = tfio.audio.spectrogram(samples, nfft=1024, window=1024, stride=536)
    mel = tfio.audio.melscale(spec, rate=8000, mels=224, fmin=0, fmax=4000)
    mel_db = tfio.audio.dbscale(mel, top_db=80)

    masked = tfio.audio.freq_mask(mel_db, param=5)
    masked = tfio.audio.time_mask(masked, param=5)

    # Add the channel axis expected by image-style models.
    return tf.expand_dims(masked, axis=2)

In [4]:
def create_model(num_labels):
    """Build and compile the two-input classifier.

    One branch is an ImageNet-initialised VGG16 (without its top) whose
    layers are all frozen, followed by Flatten and a 1024-unit ReLU layer.
    The other branch takes a 2-value vector through 16-, 32- and 64-unit
    ReLU layers. Both branches are concatenated and fed to a softmax layer.

    The model summary is printed as a side effect.

    Args:
        num_labels: Number of units in the softmax output layer.

    Returns:
        The model, compiled with Adam, categorical cross-entropy and an
        accuracy metric. Its inputs are ``[image_batch, info_batch]``.
    """
    backbone = VGG16(input_shape=[224, 224, 3], weights='imagenet', include_top=False)

    # Keep the pretrained convolutional weights fixed during training.
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False

    # Fully connected head on top of the VGG16 feature map.
    image_branch = Flatten()(backbone.output)
    image_branch = Dense(1024, activation="relu")(image_branch)

    # Small MLP over the 2-value side input.
    info_input = Input((2,))
    info_branch = info_input
    for width in (16, 32, 64):
        info_branch = Dense(width, activation="relu")(info_branch)

    merged = concatenate([image_branch, info_branch])
    class_probs = Dense(units=num_labels, activation='softmax')(merged)

    model = Model([backbone.input, info_input], [class_probs])
    model.summary()
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss='categorical_crossentropy',
        metrics=["accuracy"],
    )
    return model

In [5]:
def train_model(trainAttrX, trainImagesX, trainY, testAttrX, testImagesX, testY, num_labels, batch_amt, epoch_amt, save):
    """Create a fresh model, fit it, and optionally save it.

    Args:
        trainAttrX: Attribute inputs for training (second model input).
        trainImagesX: Image inputs for training (first model input).
        trainY: Training targets.
        testAttrX: Attribute inputs used as validation data.
        testImagesX: Image inputs used as validation data.
        testY: Validation targets.
        num_labels: Passed to ``create_model`` as the output width.
        batch_amt: Batch size for ``fit``.
        epoch_amt: Number of epochs for ``fit``.
        save: When truthy, the fitted model is written to ``"model"``.

    Returns:
        The fitted model.
    """
    model = create_model(num_labels)

    # Each run logs to its own timestamped directory under logs/fit/.
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tb_callback = tf.keras.callbacks.TensorBoard(
        log_dir="logs/fit/" + run_stamp, histogram_freq=1)

    model.fit(
        x=[trainImagesX, trainAttrX],
        y=trainY,
        validation_data=([testImagesX, testAttrX], testY),
        epochs=epoch_amt,
        batch_size=batch_amt,
        callbacks=[tb_callback],
    )

    if save:
        model.save("model")
    return model

In [6]:
def predict_file(file_name, model, label_encoder, latitude, longitude, date,
                 scaler=None, image_scale=None):
    """Predict the label of a single audio file.

    Args:
        file_name: Path of the audio file; it is run through ``preprocess``.
        model: Two-input model taking ``[image_batch, coordinate_batch]``,
            as built by ``create_model`` (image input 224 x 224 x 3).
        label_encoder: Fitted encoder used to map the predicted class index
            back to a label.
        latitude: Recording latitude.
        longitude: Recording longitude.
        date: Accepted for interface compatibility; not used.
        scaler: Optional fitted scaler for ``[latitude, longitude]``, e.g.
            the one fitted on the training coordinates. When omitted, a new
            ``MinMaxScaler`` is fitted on this single row, which maps both
            coordinates to 0, so pass the training scaler for meaningful
            coordinate input.
        image_scale: Optional divisor applied to the spectrogram. The
            notebook divides the training images by their global max
            absolute value; pass that same value here to match. When
            omitted, no rescaling is applied.

    Returns:
        Array holding the predicted label for the file.
    """
    spectrogram = preprocess(file_name).numpy().astype(np.float32)

    # preprocess yields one channel; replicate it to the three channels the
    # VGG16 input expects, mirroring what the notebook does for training.
    if spectrogram.shape[-1] == 1:
        spectrogram = np.repeat(spectrogram, repeats=3, axis=-1)
    if image_scale is not None:
        spectrogram = spectrogram / image_scale

    # Add the batch axis instead of reshaping to hard-coded dimensions.
    image_batch = spectrogram[np.newaxis, ...]

    coords = pd.DataFrame([[latitude, longitude]], columns=["latitude", "longitude"])
    if scaler is None:
        coords_scaled = MinMaxScaler().fit_transform(coords)
    else:
        coords_scaled = scaler.transform(coords)

    probabilities = model.predict([image_batch, coords_scaled])
    class_index = np.argmax(probabilities, axis=1)
    return label_encoder.inverse_transform(class_index)

In [7]:
def process_images(df, inputPath):
    """Preprocess every audio file listed in ``df`` into one float32 array.

    Each file path is built as ``inputPath + primary_label + "/" + filename``,
    so ``inputPath`` must already end with a path separator.

    Args:
        df: DataFrame with ``primary_label`` and ``filename`` columns.
        inputPath: Directory prefix holding one sub-folder per label.

    Returns:
        ``np.float32`` array stacking the ``preprocess`` output of each row,
        in row order.
    """
    spectrograms = [
        preprocess("".join((inputPath, record["primary_label"], "/", record["filename"])))
        for _, record in df.iterrows()
    ]
    return np.asarray(spectrograms).astype(np.float32)

In [8]:
def load_attributes(inputPath):
    """Load the metadata CSV, keep a subset of labels, and shuffle the rows.

    Only rows whose ``primary_label`` compares ``<= "amecro"`` as a string
    are kept. The shuffle is not seeded, so row order differs between runs.

    Args:
        inputPath: Path of the metadata CSV file.

    Returns:
        DataFrame with the latitude, longitude, date, time, filename and
        primary_label columns, shuffled, with a fresh 0..n-1 index.
    """
    wanted_columns = ["latitude", "longitude", "date", "time", "filename", "primary_label"]
    metadata = pd.read_csv(inputPath, skipinitialspace=True, usecols=wanted_columns)

    keep_rows = metadata['primary_label'] <= "amecro"
    shuffled = shuffle(metadata.loc[keep_rows])
    return shuffled.reset_index(drop=True)

In [9]:
def process_attributes(inputPath, train, test):
    """Min-max scale latitude and longitude for the train and test splits.

    The scaler is fitted on the training rows only and then applied to the
    test rows, so test statistics do not influence the scaling.

    The function depends only on its arguments. It no longer computes a
    one-hot encoding of ``date`` from a module-level ``df``, because that
    result was never part of the return value.

    Args:
        inputPath: Unused; kept so existing calls keep working. The notebook
            passes the full metadata DataFrame in this position.
        train: DataFrame with ``latitude`` and ``longitude`` columns.
        test: DataFrame with ``latitude`` and ``longitude`` columns.

    Returns:
        Tuple ``(trainX, testX)`` of arrays holding the scaled
        ``[latitude, longitude]`` values for each split.
    """
    continuous = ["latitude", "longitude"]

    cs = MinMaxScaler()
    trainX = cs.fit_transform(train[continuous])
    testX = cs.transform(test[continuous])

    return (trainX, testX)

In [10]:
# Load the filtered, shuffled clip metadata.
df = load_attributes(inputPath="./Data/train_metadata.csv")

In [11]:
# Convert every listed clip into a spectrogram; files live under
# ./Data/Audio/<primary_label>/<filename>.
images = process_images(df=df, inputPath="./Data/Audio/")

2022-11-03 09:20:26.605032: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
# Sanity check: one single-channel spectrogram per metadata row.
images.shape

(822, 224, 224, 1)

In [13]:
# Replicate the single channel three times to match the 3-channel VGG16 input.
# Caution: re-running this cell triples the channel count again.
images = images.repeat(3, axis=3)
images.shape

(822, 224, 224, 3)

In [14]:
# Scale all spectrograms by the largest absolute value in the whole set.
images = images / np.abs(images).max()

In [15]:
# Hold out 20% of the clips; the fixed seed makes the split repeatable for a
# given row order.
split = train_test_split(df, images, test_size=0.2, random_state=0)
(trainAttrX, testAttrX, trainImagesX, testImagesX) = split

# Fit the encoder once on every label so train and test share a single
# label -> index mapping and the same one-hot width, even when a class is
# missing from one of the splits. Re-fitting on the test labels would also
# leave `label_encoder` describing only the test subset.
label_encoder = LabelEncoder().fit(df["primary_label"])
num_classes = len(label_encoder.classes_)
trainY = to_categorical(label_encoder.transform(trainAttrX["primary_label"]), num_classes=num_classes)
testY = to_categorical(label_encoder.transform(testAttrX["primary_label"]), num_classes=num_classes)

# Replace the metadata frames with their scaled latitude/longitude arrays.
(trainAttrX, testAttrX) = process_attributes(df,
    trainAttrX, testAttrX)


In [16]:
# Enable the %tensorboard magic used in the next cell.
%load_ext tensorboard

In [17]:
# Show TensorBoard for logs/fit, the directory train_model writes its runs under.
%tensorboard --logdir=logs/fit --host localhost --port 8088

In [18]:
# Width of the one-hot targets, used as the size of the softmax output layer.
num_labels = trainY.shape[1]

In [None]:
# Keep the returned model: the prediction cell passes `model` to predict_file,
# which would otherwise be undefined on a fresh kernel.
# Arguments: batch size 32, 5 epochs, save the fitted model to "model".
model = train_model(trainAttrX, trainImagesX, trainY, testAttrX, testImagesX, testY, num_labels, 32, 5, True)

#model = keras.models.load_model("model")

#preds = model.predict([testImagesX, testAttrX])

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 block1_conv1 (Conv2D)          (None, 224, 224, 64  1792        ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 block1_conv2 (Conv2D)          (None, 224, 224, 64  36928       ['block1_conv1[0][0]']           
                                )                                                             

In [None]:
# Classify one sample recording with the trained model.
prediction = predict_file(
    file_name="./Data/Audio/aldfly/XC477348.ogg",
    model=model,
    label_encoder=label_encoder,
    latitude=42.0458,
    longitude=-79.441,
    date="2019-05-27",
)
prediction