In [1]:
import os
import librosa
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

# Root folder that holds one sub-directory of .wav clips per label.
train_audio_path = '../data/'
# Sub-directory names under train_audio_path; each one is used as a class label.
labels = ["house", "five", "six", "tree", "zero"]

# Parallel lists: all_wave[i] is the waveform whose label is all_label[i].
all_wave = []
all_label = []
for label in labels:
    print(label)
    # os.path.join copes with the trailing '/' already present in
    # train_audio_path, so no doubled separator ends up in the path.
    label_dir = os.path.join(train_audio_path, label)
    waves = [f for f in os.listdir(label_dir) if f.endswith('.wav')]
    for wav in waves:
        # Load at 16 kHz, then downsample to 8 kHz.
        samples, sample_rate = librosa.load(os.path.join(label_dir, wav), sr=16000)
        # The sample rates are passed by keyword: recent librosa releases make
        # orig_sr/target_sr keyword-only, and older releases accept keywords too.
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=8000)
        # Keep only clips that are exactly 8000 samples long after resampling
        # (one second at 8 kHz); every other clip is silently skipped.
        if len(samples) == 8000:
            all_wave.append(samples)
            all_label.append(label)

house
five
six
tree
zero


In [2]:
import numpy as np

# Map the string labels to integer ids, then expand the ids to one-hot rows
# with one column per entry in `labels`.
le = LabelEncoder()
y = np_utils.to_categorical(le.fit_transform(all_label), num_classes=len(labels))
# Label names in the order the fitted encoder assigned their integer ids.
classes = list(le.classes_)

# Stack the clips into an array of shape (n_clips, 8000, 1): 8000 samples per
# clip plus a trailing axis of size 1.
all_wave = np.reshape(np.array(all_wave), (-1, 8000, 1))

In [3]:
from keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Load the previously saved network and freeze every layer except its last
# three, so only those three remain trainable.
# NOTE(review): Keras documents that a change to `trainable` is only
# guaranteed to be honoured after the model is compiled again - confirm that
# base_model is recompiled before it is trained.
base_model = load_model('bestModel_set1_80.hdf5')
for layer in base_model.layers[:-3]:
    layer.trainable = False
# summary() prints the table itself and returns None, so it is called directly
# rather than wrapped in print() (which would emit an extra "None" line).
base_model.summary()

# Truncated copy of the base network: same input, output taken from the
# fourth-from-last layer, i.e. the last layer that was frozen above.
model2 = Model(inputs=base_model.input, outputs=base_model.layers[-4].output)
model2.summary()

# Fresh classification head on top of the truncated network:
# Dense(128, relu) -> Dropout(0.3) -> Dense(len(labels), softmax).
x = model2.output
x = Dense(128, activation='relu', name='dense_1')(x)
x = Dropout(0.3, name='dropout_5')(x)
x = Dense(len(labels), activation='softmax', name='dense_2')(x)

# NOTE(review): final_model is built here but not compiled in this cell; it
# must be compiled before it can be trained.
final_model = Model(inputs=model2.input, outputs=x)
final_model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8000, 1)]         0         
_________________________________________________________________
conv1d (Conv1D)              (None, 7988, 8)           112       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 2662, 8)           0         
_________________________________________________________________
dropout (Dropout)            (None, 2662, 8)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2652, 16)          1424      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 884, 16)           0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 884, 16)          

In [5]:
# Stratified split with a fixed seed. test_size=0.8 means only 20 % of the
# clips are used for training and the remaining 80 % serve as validation data.
# NOTE(review): presumably deliberate (small-data fine-tuning) - confirm.
x_train, x_test, y_train, y_test = train_test_split(np.array(all_wave),np.array(y),stratify=y,
                                            test_size = 0.8,random_state=777,shuffle=True)

# Stop when val_loss has not improved by at least 0.0001 for 10 epochs. With
# epochs=5 below, this patience cannot be exhausted, so the callback is
# effectively inactive for this run.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, min_delta=0.0001) 
# Save the model each time val_accuracy reaches a new maximum.
mc = ModelCheckpoint('bestModel_set2_20_tlOn_noReinit.hdf5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# Recompile before fitting: the `trainable` flags of base_model's layers were
# changed after the model was loaded, and Keras only guarantees that such
# changes are taken into account once compile() is called again. Note that
# this also starts training from a fresh Adam optimizer state.
base_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
history = base_model.fit(x_train, y_train, epochs=5, callbacks=[es,mc], batch_size=32, validation_data=(x_test,y_test))

Epoch 1/5
Epoch 00001: val_accuracy improved from -inf to 0.57439, saving model to bestModel_set2_20_tlOn_noReinit.hdf5
Epoch 2/5
Epoch 00002: val_accuracy improved from 0.57439 to 0.71591, saving model to bestModel_set2_20_tlOn_noReinit.hdf5
Epoch 3/5
Epoch 00003: val_accuracy improved from 0.71591 to 0.75894, saving model to bestModel_set2_20_tlOn_noReinit.hdf5
Epoch 4/5
Epoch 00004: val_accuracy improved from 0.75894 to 0.80236, saving model to bestModel_set2_20_tlOn_noReinit.hdf5
Epoch 5/5
Epoch 00005: val_accuracy improved from 0.80236 to 0.82348, saving model to bestModel_set2_20_tlOn_noReinit.hdf5
