In [1]:
from tensorflow.keras.applications import resnet50
from tensorflow import keras
def create_model(input_shape, weights=None):
    """Build a binary classifier on top of a headless ResNet50 backbone.

    The backbone's output feature map is summarised twice -- by global
    average pooling and by global max pooling -- and the two vectors are
    concatenated. The classification head is then
    BatchNorm -> Dropout(0.7) -> Dense(1024, relu) -> BatchNorm ->
    Dropout(0.7) -> Dense(1, sigmoid).

    Args:
        input_shape: Forwarded to ``resnet50.ResNet50`` as ``input_shape``.
        weights: Forwarded to ``resnet50.ResNet50`` as ``weights``
            (``None`` by default).

    Returns:
        A ``keras.models.Model`` from the backbone input to the single
        sigmoid output.
    """
    layers = keras.layers
    backbone = resnet50.ResNet50(
        input_shape=input_shape,
        include_top=False,
        weights=weights,
    )
    feature_map = backbone.output

    # Two complementary summaries of the same feature map.
    avg_pooled = layers.GlobalAveragePooling2D()(feature_map)
    max_pooled = layers.GlobalMaxPooling2D()(feature_map)
    head = layers.concatenate([avg_pooled, max_pooled])

    # Regularised dense head.
    head = layers.BatchNormalization()(head)
    head = layers.Dropout(0.7)(head)
    head = layers.Dense(1024, activation='relu')(head)
    head = layers.BatchNormalization()(head)
    head = layers.Dropout(0.7)(head)
    prediction = layers.Dense(1, activation='sigmoid')(head)

    return keras.models.Model(inputs=[backbone.input], outputs=[prediction])

In [2]:
import librosa
import numpy as np
from functools import lru_cache
def trim_silence(x, pad=0, db_max=50):
    """Cut leading/trailing silence from ``x``, keeping ``pad`` samples of margin.

    The non-silent interval is located with ``librosa.effects.trim``
    (``top_db=db_max``, ``frame_length=256``, ``hop_length=64``); the
    interval is then widened by ``pad`` samples on each side and clamped
    to ``[0, len(x)]``.

    Args:
        x: Audio signal to trim.
        pad: Number of samples to keep around the detected interval; may
            be a float, the resulting bounds are truncated to ``int``.
        db_max: Threshold passed to ``librosa.effects.trim`` as ``top_db``.

    Returns:
        The slice of ``x`` covering the padded non-silent interval.
    """
    # Only the interval is needed; the trimmed signal librosa returns is discarded.
    _, (first, last) = librosa.effects.trim(
        x, top_db=db_max, frame_length=256, hop_length=64
    )
    lower = int(max(first - pad, 0))
    upper = int(min(last + pad, len(x)))
    return x[lower:upper]

def process_file(path, chunk=3):
    """Load an audio file and return a fixed-length, silence-trimmed excerpt.

    The file is loaded with ``librosa.load(path, sr=None)``. Clips shorter
    than 0.3 s or longer than 30 s are rejected. Accepted clips are
    silence-trimmed (keeping 0.25 s of margin), truncated to ``chunk``
    seconds and zero-padded at the end up to exactly ``chunk`` seconds.

    Args:
        path: Path of the audio file to load.
        chunk: Length of the returned excerpt, in seconds.

    Returns:
        A 4-tuple ``(samples, sr, hop_length, win_length)`` where
        ``samples`` is a float array of ``int(chunk * sr)`` values and
        ``hop_length`` / ``win_length`` are the sample counts for 10 ms and
        20 ms at the file's sampling rate. For a rejected clip the tuple is
        ``(None, None, None, None)`` and the clip's details are printed.
    """
    x, sr = librosa.load(path, sr=None)

    duration = len(x) / sr
    if duration < 0.3 or duration > 30:
        print(len(x), sr, duration, path)
        # Same arity as the success path, so callers that unpack four values
        # can test for None instead of crashing on the unpacking itself.
        return None, None, None, None

    x = trim_silence(x, pad=0.25*sr, db_max=50)

    # Truncate to the chunk length, then zero-pad the tail if the clip is shorter.
    n_samples = int(np.floor(chunk * sr))
    x = x[:n_samples]
    x_pad = np.zeros(n_samples)
    x_pad[:len(x)] = x

    hop_length = int(np.floor(0.010 * sr))  # 10 ms expressed in samples
    win_length = int(np.floor(0.020 * sr))  # 20 ms expressed in samples
    return x_pad, sr, hop_length, win_length

@lru_cache(maxsize=4000)
def get_MFCCS(path, final_dim=(300,200)):
    """Return the MFCC matrix of an audio file as a ``(frames, coeffs, 1)`` array.

    Results are memoised per ``(path, final_dim)`` by ``lru_cache``; the
    cached array object itself is returned on a hit, so callers should not
    modify it in place.

    Args:
        path: Path of the audio file, forwarded to ``process_file``.
        final_dim: ``(max_frames, max_coeffs)``; the MFCC matrix is cut down
            to at most this size. It is only truncated, never padded.

    Returns:
        The MFCC array with time on the first axis, coefficients on the
        second, and a trailing axis of length 1.

    Raises:
        ValueError: If ``process_file`` rejects the clip (it signals that by
            returning ``None`` in place of the audio).
    """
    processed = process_file(path)
    # process_file flags a rejected clip with None in the first slot. Check it
    # before unpacking so the failure names the file instead of surfacing as
    # "not enough values to unpack".
    if processed[0] is None:
        raise ValueError(
            f"cannot compute MFCCs for {path!r}: process_file rejected the clip"
        )
    audio, sr, hop_length = processed[:3]

    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mels=200, n_mfcc=200, n_fft=2048,
                                hop_length=hop_length)
    mfcc = np.swapaxes(mfcc, 0, 1)             # (coeffs, frames) -> (frames, coeffs)
    mfcc = mfcc[:final_dim[0], :final_dim[1]]  # truncate to the requested size
    return np.expand_dims(mfcc, -1)            # add the trailing axis

In [3]:
import random
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves random batches of MFCC features.

    Every ``__getitem__`` call ignores the requested index and draws a
    fresh random batch (without replacement inside the batch) from the
    whole dataset.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, X, y, batch_size):
        """Store the dataset.

        Args:
            X: Indexable collection of audio file paths.
            y: Indexable collection of labels, aligned with ``X``.
            batch_size: Number of samples drawn per batch.
        """
        # Harmless on older Keras; newer Keras versions expect the base
        # class initialiser to be called.
        super().__init__()
        self.X = X
        self.y = y
        self.batch_size = batch_size
        # A list rather than a set: random.sample() on a set is deprecated
        # since Python 3.9 and raises TypeError from Python 3.11.
        self.indexes = list(range(len(X)))

    def __len__(self):
        """Return the number of samples.

        NOTE: this is the sample count, not the batch count, so without an
        explicit ``steps_per_epoch`` one epoch consists of ``len(X)`` random
        batches. Left unchanged so existing ``fit`` calls keep working.
        """
        return len(self.indexes)

    def __getitem__(self, _):
        """Return one random batch ``(features, labels)``; the index is ignored."""
        # Cap the draw so a dataset smaller than batch_size does not raise.
        draw = min(self.batch_size, len(self.indexes))
        chosen = random.sample(self.indexes, draw)
        features = [get_MFCCS(self.X[i]) for i in chosen]
        labels = [self.y[i] for i in chosen]
        return np.array(features), np.array(labels, dtype=int)
    

In [4]:
import glob
# Collect the audio files of both classes. glob returns matches in arbitrary
# (filesystem-dependent) order, so sort them: otherwise the seeded
# train/validation split would not be reproducible across machines.
X1 = sorted(glob.glob("audio/with_them/*.wav"))     # positive class (label 1)
X0 = sorted(glob.glob("audio/without_them/*.wav"))  # negative class (label 0)
X = X1 + X0
y = [1] * len(X1) + [0] * len(X0)

In [9]:
from sklearn.model_selection import train_test_split
# 80/20 split, stratified on the labels so both subsets keep the class ratio;
# the fixed random_state makes the shuffle repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=69,
    stratify=y,
)

In [10]:
# One generator per subset, each serving batches of 8 clips.
train_generator = DataGenerator(X_train, y_train, batch_size=8)
val_generator = DataGenerator(X_val, y_val, batch_size=8)

In [11]:
# Input is a single-channel 300x200 MFCC "image"; the model is trained as a
# binary classifier with Adam and binary cross-entropy.
model = create_model(input_shape=(300, 200, 1))
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['acc'],
)

In [12]:
# 50 epochs of 30 training steps each, with 50 validation steps per epoch.
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=50,
    steps_per_epoch=30,
    validation_steps=50,
)
# Optional early stopping, currently disabled:
#   callbacks=[keras.callbacks.EarlyStopping(monitor='val_acc', patience=3)]

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
