In [12]:
import os, glob
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from pyannote.audio import Model, Inference
# from speechbrain.pretrained import SepformerSeparation

In [13]:
# Load the pretrained pyannote speaker-embedding model.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# instead of being committed to the notebook. If the variable is unset, None is
# passed and the hub client falls back to whatever credentials it finds itself.
# NOTE(review): the token that was previously hard-coded here has been exposed
# in the notebook source and should be revoked.
embedding_model = Model.from_pretrained(
                                        "pyannote/embedding", 
                                        use_auth_token=os.environ.get("HF_TOKEN")
                                        )

# denoiser = SepformerSeparation.from_hparams(
#                                         source="speechbrain/sepformer-wham-enhancement", 
#                                         savedir='pretrained_models/sepformer-wham-enhancement'
#                                         )

# window="whole" asks the inference wrapper for a single embedding per audio file
# rather than one per sliding window.
embedding_inference = Inference(
                                embedding_model, 
                                window="whole"
                                )

# Class-name -> integer label mapping (same 0/1 encoding that load_dataset assigns).
class_dict = {
            'autism': 0,
            'non-autism': 1
            }

In [14]:
# wavFile = 'data/pronouncing-evaluation/reference/1.wav'
# denoiser.separate_file(path=wavFile) 
# denoised_wavFile = f"results/denoised/{wavFile.split('/')[-1].split('.')[0]}_denoised.wav"
# denoiser.save_file(denoised_wavFile)

In [15]:
def load_dataset(ref_audio_dir='data/pronouncing-evaluation/reference', embed_fn=None):
    """Build shuffled (reference, candidate) embedding pairs with binary labels.

    For every ``*.wav`` file in ``ref_audio_dir`` the matching files in the
    sibling ``autism`` and ``non-autism`` directories are located by replacing
    the ``/reference/`` path component. Each reference file yields up to two
    rows: (reference, autism) labelled 0 and (reference, non-autism) labelled 1.

    A pair whose audio cannot be embedded is reported on stdout and skipped;
    only that pair is dropped, never a row belonging to another pair.

    Parameters
    ----------
    ref_audio_dir : str
        Directory containing the reference recordings.
    embed_fn : callable, optional
        ``embed_fn(path)`` must return a 512-value embedding for the audio
        file. Defaults to the notebook-level ``embedding_inference`` object.

    Returns
    -------
    embeddings_01 : np.ndarray, shape (n_pairs, 512), float64
        Reference embeddings.
    embeddings_02 : np.ndarray, shape (n_pairs, 512), float64
        Autism / non-autism embeddings, row-aligned with ``embeddings_01``.
    labels : np.ndarray, shape (n_pairs,), float64
        0 for autism pairs, 1 for non-autism pairs.

    Rows are returned in a random order drawn from ``np.random.permutation``.
    """
    embedding_dim = 512

    if embed_fn is None:
        # Resolve lazily so the notebook global is only required when no
        # explicit embedding function is supplied.
        embed_fn = embedding_inference

    def _embed(path):
        # Flatten so both (512,) and (1, 512) outputs are accepted, and reject
        # anything else so a malformed embedding is skipped like an unreadable file.
        vector = np.asarray(embed_fn(path), dtype=np.float64).reshape(-1)
        if vector.shape[0] != embedding_dim:
            raise ValueError(
                f'expected {embedding_dim} embedding values, got {vector.shape[0]}'
            )
        return vector

    # Normalise Windows separators so the '/reference/' replacement below matches;
    # sort because glob order is filesystem-dependent.
    ref_voice_files = sorted(
        voice_file.replace('\\', '/')
        for voice_file in glob.glob(f'{ref_audio_dir}/*.wav')
    )

    first_rows, second_rows, pair_labels = [], [], []

    for ref_file in ref_voice_files:
        # The reference embedding is shared by both pairs, so compute it once.
        try:
            ref_embedding = _embed(ref_file)
        except Exception as exc:
            print('Errorneous reference file: ', ref_file)
            print('Error: ', repr(exc))
            continue

        for label, group in ((0, 'autism'), (1, 'non-autism')):
            other_file = ref_file.replace('/reference/', f'/{group}/')
            try:
                other_embedding = _embed(other_file)
            except Exception as exc:
                print(f'Errorneous {group} file: ', other_file)
                print('Error: ', repr(exc))
                continue

            first_rows.append(ref_embedding)
            second_rows.append(other_embedding)
            pair_labels.append(label)

    if pair_labels:
        embeddings_01 = np.stack(first_rows)
        embeddings_02 = np.stack(second_rows)
    else:
        # Keep the (0, 512) shape for an empty or fully failed dataset.
        embeddings_01 = np.zeros((0, embedding_dim))
        embeddings_02 = np.zeros((0, embedding_dim))
    labels = np.array(pair_labels, dtype=np.float64)

    # Shuffle all three arrays with the same permutation to keep rows aligned.
    random_idxs = np.random.permutation(len(labels))
    return embeddings_01[random_idxs], embeddings_02[random_idxs], labels[random_idxs]

In [16]:
# Build the paired embedding arrays and labels from the default reference directory.
embeddings_01, embeddings_02, labels = load_dataset()


# Sanity check: the three arrays should share the same first dimension.
print("Embedding 01 shape: ", embeddings_01.shape)
print("Embedding 02 shape: ", embeddings_02.shape)
print("Labels shape: ", labels.shape)

Embedding 01 shape:  (14, 512)
Embedding 02 shape:  (14, 512)
Labels shape:  (14,)


In [17]:
def build_model():
    """Create and compile the two-input binary classifier.

    Each of the two 512-dimensional inputs goes through its own stack of
    Dense(256) -> Dropout(0.2) -> Dense(128) -> Dropout(0.2) -> Dense(64)
    layers (ReLU activations). The two 64-unit outputs are concatenated and
    fed through Dense(32, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Returns
    -------
    tf.keras.Model
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        accuracy / precision / recall / AUC metrics.
    """
    def _tower(tensor):
        # A fresh set of layers is created on every call, so the two towers
        # do not share weights.
        for units in (256, 128):
            tensor = tf.keras.layers.Dense(units, activation='relu')(tensor)
            tensor = tf.keras.layers.Dropout(0.2)(tensor)
        return tf.keras.layers.Dense(64, activation='relu')(tensor)

    first_input = tf.keras.Input(shape=(512,))
    second_input = tf.keras.Input(shape=(512,))

    first_branch = _tower(first_input)
    second_branch = _tower(second_input)

    merged = tf.keras.layers.concatenate([first_branch, second_branch])
    merged = tf.keras.layers.Dense(32, activation='relu')(merged)
    merged = tf.keras.layers.Dropout(0.2)(merged)
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

    classifier = tf.keras.Model(inputs=[first_input, second_input], outputs=probability)

    tracked_metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
    ]
    classifier.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=tracked_metrics,
    )
    return classifier

In [18]:
# Instantiate the compiled classifier and print its layer-by-layer summary.
model = build_model()
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 dense_8 (Dense)                (None, 256)          131328      ['input_3[0][0]']                
                                                                                                  
 dense_11 (Dense)               (None, 256)          131328      ['input_4[0][0]']                
                                                                                            

In [19]:
# Train on the paired embeddings. Early stopping watches the training loss
# (no validation data is passed) and restores the best weights after
# 10 epochs without improvement.
model.fit(
    x=[embeddings_01, embeddings_02],
    y=labels,
    batch_size=2,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='loss',
            patience=10,
            restore_best_weights=True,
        ),
    ],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


<keras.callbacks.History at 0x1bb19423790>

In [20]:
# Write the current `model` to an .h5 file under models/.
model.save('models/pronounce-validation.h5')

In [21]:
def inference_pronounce_validation(
                                    audio_file01,
                                    audio_file02,
                                    embed_fn=None,
                                    classifier=None
                                    ):
    """Score a pair of audio files with the trained pair classifier.

    Parameters
    ----------
    audio_file01 : str
        Path of the first audio file (fed to the classifier's first input).
    audio_file02 : str
        Path of the second audio file (fed to the classifier's second input).
    embed_fn : callable, optional
        ``embed_fn(path)`` returning the embedding of an audio file. Defaults
        to the notebook-level ``embedding_inference`` object.
    classifier : object, optional
        Object exposing ``predict([batch01, batch02])``. Defaults to the
        notebook-level ``model``.

    Returns
    -------
    float
        The classifier's sigmoid output for the pair. The value is also
        printed, as before; previously the function returned ``None``.
    """
    if embed_fn is None:
        embed_fn = embedding_inference
    if classifier is None:
        classifier = model

    # reshape(1, -1) adds the batch axis and also tolerates embeddings that
    # already arrive with a leading singleton dimension.
    embedding01 = np.asarray(embed_fn(audio_file01)).reshape(1, -1)
    embedding02 = np.asarray(embed_fn(audio_file02)).reshape(1, -1)

    prediction = classifier.predict([embedding01, embedding02])
    prediction = float(np.asarray(prediction).squeeze())

    print(prediction)
    return prediction

In [22]:
# Score recording 1 from the reference set against recording 1 from the
# non-autism set. `response` holds whatever the function returns and is
# echoed by the bare expression on the last line of the cell.
response = inference_pronounce_validation(
                                        'data/pronouncing-evaluation/reference/1.wav',
                                        'data/pronouncing-evaluation/non-autism/1.wav'
                                        )
response

1.0
