In [1]:
import os

import pandas as pd
import numpy as np
import soundfile as sf
from matplotlib import pyplot as plt

import tensorflow as tf
import tensorflow_io as tfio

import keras.models
from keras import regularizers
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Flatten
from keras.layers import Dropout
from keras.utils import to_categorical

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
def load_audio(file_name, decimation=8):
    """Read an audio file and keep every ``decimation``-th sample.

    Parameters
    ----------
    file_name : str or path-like
        Passed unchanged to ``sf.read``.
    decimation : int, optional
        Slice step applied along the first axis of the decoded samples.
        Defaults to 8, the value this notebook previously hard-coded.

    Returns
    -------
    The decoded samples sliced as ``samples[::decimation]``.

    Raises
    ------
    ValueError
        If ``decimation`` is smaller than 1.

    Notes
    -----
    This is plain slicing, not resampling: no low-pass filtering happens
    before samples are dropped, and the file's native sample rate is not
    inspected. NOTE(review): the effective rate after slicing therefore
    depends on the input files - confirm it matches the ``rate`` assumed
    by the downstream mel-scale step.
    """
    if decimation < 1:
        raise ValueError(f"decimation must be >= 1, got {decimation!r}")
    # The second value returned by sf.read (the sample rate) is not used here.
    audio_data, _ = sf.read(file_name)
    return audio_data[::decimation]

In [3]:
def preprocess(file_path, augment=True):
    """Turn one audio file into a dB-scaled mel-spectrogram tensor.

    Steps: load the decimated waveform, truncate / left-pad it with zeros
    to exactly 80000 samples, compute a spectrogram, map it to 64 mel bins,
    convert to a dB scale, optionally apply frequency and time masking, and
    append a trailing channel axis.

    Parameters
    ----------
    file_path : str or path-like
        Audio file handed to ``load_audio``.
    augment : bool, optional
        When True (the default, matching the previous behaviour) the
        ``tfio.audio.freq_mask`` / ``tfio.audio.time_mask`` steps are
        applied. Pass False to obtain the un-masked features, e.g. when
        preparing evaluation or prediction inputs.

    Returns
    -------
    tf.Tensor
        The feature map with an extra axis added at position 2. The model
        in this notebook is built for inputs of shape ``(157, 64, 1)``.

    Notes
    -----
    NOTE(review): the mel-scale step is told ``rate=8000`` while
    ``load_audio`` keeps every 8th sample of the file; confirm these two
    agree for the source recordings.
    """
    target_len = 80000

    wav = load_audio(file_path)[:target_len]
    # Clips shorter than target_len are padded with zeros at the FRONT, so
    # the audio ends up right-aligned; longer clips were truncated above.
    zero_padding = tf.zeros([target_len] - tf.shape(wav), dtype=tf.float32)
    wav = tf.concat([zero_padding, wav], 0)

    spectrogram = tfio.audio.spectrogram(
        wav, nfft=512, window=512, stride=512)

    mel_spectrogram = tfio.audio.melscale(
        spectrogram, rate=8000, mels=64, fmin=0, fmax=4000)

    features = tfio.audio.dbscale(mel_spectrogram, top_db=80)

    if augment:
        # Masking is a training-time augmentation; it is kept on by default
        # only so existing callers see unchanged behaviour.
        features = tfio.audio.freq_mask(features, param=5)
        features = tfio.audio.time_mask(features, param=5)

    # Add the single-channel axis expected by the Conv2D input layer.
    return tf.expand_dims(features, axis=2)

In [16]:
def create_model(num_labels):
    """Assemble and compile the convolutional classifier.

    The network is seven 3x3 ReLU convolutions (16, 32, 64, then four times
    128 filters) on a ``(157, 64, 1)`` input, followed by Flatten,
    Dropout(0.2), Dense(512), Dense(128) and a softmax layer with
    ``num_labels`` units. Every convolution and the two hidden Dense layers
    carry an L2(0.01) kernel regularizer. The summary is printed before the
    model is compiled with Adam (learning rate 0.0005), categorical
    cross-entropy loss and the "accuracy" metric.

    Parameters
    ----------
    num_labels : int
        Number of output classes (width of the softmax layer).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    def l2_penalty():
        # Build a new regularizer object for each layer that needs one.
        return regularizers.l2(0.01)

    network = Sequential()

    # Only the first convolution declares the input shape.
    network.add(Conv2D(16, (3, 3), activation='relu', input_shape=(157, 64, 1),
                       kernel_regularizer=l2_penalty()))
    for filters in (32, 64, 128, 128, 128, 128):
        network.add(Conv2D(filters, (3, 3), activation='relu',
                           kernel_regularizer=l2_penalty()))

    network.add(Flatten())
    network.add(Dropout(0.2))
    for units in (512, 128):
        network.add(Dense(units, activation='relu',
                          kernel_regularizer=l2_penalty()))
    network.add(Dense(num_labels, activation='softmax'))

    network.summary()
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.0005),
        loss='categorical_crossentropy',
        metrics=["accuracy"],
    )
    return network

In [5]:
def train_model(X_train, y_train, X_test, y_test, num_labels, batch_amt, epoch_amt, save):
    """Create a new model, fit it, and optionally save it.

    Parameters
    ----------
    X_train, y_train
        Training inputs and targets passed to ``fit``.
    X_test, y_test
        Passed to ``fit`` as ``validation_data``.
    num_labels : int
        Forwarded to ``create_model``.
    batch_amt : int
        Batch size used for fitting.
    epoch_amt : int
        Number of epochs.
    save : bool
        When truthy, the fitted model is written to the path "model".

    Returns
    -------
    The fitted model.
    """
    network = create_model(num_labels)

    fit_options = dict(
        batch_size=batch_amt,
        epochs=epoch_amt,
        validation_data=(X_test, y_test),
        verbose=1,
    )
    # The value returned by fit() is not kept.
    network.fit(X_train, y_train, **fit_options)

    if not save:
        return network
    network.save("model")
    return network

In [6]:
def predict_file(file_name, model, label_encoder):
    """Predict the class label of a single audio file.

    Parameters
    ----------
    file_name : str or path-like
        Audio file to classify; forwarded to ``preprocess``.
    model
        Object exposing ``predict(batch)`` that returns one row of class
        scores per input sample.
    label_encoder
        Object exposing ``inverse_transform(indices)`` that maps class
        indices back to labels.

    Returns
    -------
    The result of ``label_encoder.inverse_transform`` for the single
    highest-scoring class index (one entry, for the one file).
    """
    # Use the caller's file rather than a fixed sample path.
    features = preprocess(file_name)
    # Prepend a batch axis of size 1; predict() works on batches.
    batch = features.numpy()[np.newaxis, ...]
    scores = model.predict(batch)
    class_indices = np.argmax(scores, axis=1)
    # Decode with the encoder that was passed in, not a notebook global,
    # and hand the result back to the caller.
    return label_encoder.inverse_transform(class_indices)

In [7]:
# Restrict which CUDA devices this process may see. "-1" is presumably meant
# to hide every GPU so TensorFlow runs on the CPU - TODO confirm it takes
# effect, since tensorflow is already imported by the time this cell runs.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [8]:
# Root folder of the audio clips; the feature-extraction cell looks files up
# under <audio_path>/<primary_label>/<filename>.
audio_path = "./Data/Audio/"
# Metadata table; the columns read by this notebook are "primary_label" and
# "filename".
metadata = pd.read_csv('./Data/train_metadata.csv')

In [9]:
# Build a list of [feature_tensor, label] pairs, one per metadata row.
extracted_features = []
for _, row in metadata.iterrows():
    final_class_labels = row["primary_label"]
    # Stop at the first "banswa" row: only the rows that come before it in
    # the CSV are processed. NOTE(review): this limits the dataset only if
    # the metadata is ordered by label - confirm that is the intent.
    if final_class_labels == "banswa":
        break
    # Join the path components properly instead of concatenating strings.
    file_name = os.path.join(audio_path, final_class_labels, row["filename"])
    data = preprocess(file_name)
    extracted_features.append([data, final_class_labels])

In [10]:
# Tabulate the (feature, class) pairs, shuffle the rows, and renumber the
# index from 0 so it no longer reflects the pre-shuffle order.
extracted_features_df = shuffle(
    pd.DataFrame(extracted_features, columns=['feature', 'class'])
).reset_index(drop=True)

In [11]:
# Convert each column to a plain list and wrap it in a NumPy array:
# X holds the per-file features, y the matching class labels.
X, y = (
    np.array(extracted_features_df[column].tolist())
    for column in ('feature', 'class')
)

In [12]:
labelencoder = LabelEncoder()
y = to_categorical(labelencoder.fit_transform(y))

In [17]:
# One-hot targets have one column per class.
num_labels = y.shape[1]
# Hold out 20% of the samples; the fixed random_state keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
# Bind the trained model to a name: later cells refer to `model`, and
# without this assignment it would be undefined on a fresh kernel.
model = train_model(X_train, y_train, X_test, y_test, num_labels, 16, 25, True)

# Alternative to retraining: reload the model saved by a previous run.
#model = keras.models.load_model("model")

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_9 (Conv2D)           (None, 155, 62, 16)       160       
                                                                 
 conv2d_10 (Conv2D)          (None, 153, 60, 32)       4640      
                                                                 
 conv2d_11 (Conv2D)          (None, 151, 58, 64)       18496     
                                                                 
 conv2d_12 (Conv2D)          (None, 149, 56, 128)      73856     
                                                                 
 conv2d_13 (Conv2D)          (None, 147, 54, 128)      147584    
                                                                 
 conv2d_14 (Conv2D)          (None, 145, 52, 128)      147584    
                                                                 
 conv2d_15 (Conv2D)          (None, 143, 50, 128)     



INFO:tensorflow:Assets written to: model\assets


INFO:tensorflow:Assets written to: model\assets


<keras.engine.sequential.Sequential at 0x2038ad589d0>

In [None]:
# Score the model on the held-out split. NOTE(review): create_model compiles
# with a loss plus the "accuracy" metric, so evaluate() is expected to return
# both values (loss first) rather than accuracy alone - confirm before use.
test_accuracy = model.evaluate(x=X_test, y=y_test)