In [1]:
import numpy as np
import fasttext, glob
import tensorflow as tf
from datasets import Dataset, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

ModuleNotFoundError: No module named 'fasttext'

In [None]:
# Speech-to-text: the processor and the generation model are both loaded from
# the same pretrained identifier.
s2t_processor = WhisperProcessor.from_pretrained(
    "Subhaka/whisper-small-Sinhala-Fine_Tune"
)
s2t_model = WhisperForConditionalGeneration.from_pretrained(
    "Subhaka/whisper-small-Sinhala-Fine_Tune"
)

# Decoder prompt ids for language="sinhala" / task="transcribe"; they are
# handed to `s2t_model.generate` when audio is transcribed.
s2t_forced_decoder_ids = s2t_processor.get_decoder_prompt_ids(
    language="sinhala",
    task="transcribe",
)

# fastText model that later cells query for sentence vectors.
embedding_model = fasttext.load_model("models/cc.si.300.bin")



In [None]:
def load_audio(audio_file):
    """Load one audio file and return its sample array at a 16 kHz rate.

    The path is wrapped in a one-row ``Dataset`` whose ``audio`` column is
    cast first to ``Audio()`` and then to ``Audio(sampling_rate=16000)``.
    The value returned is the ``array`` entry of the first row's ``audio``
    field.

    Args:
        audio_file: Path of the audio file to load.
    """
    single_row = Dataset.from_dict({"audio": [audio_file]})
    as_audio = single_row.cast_column("audio", Audio())
    resampled = as_audio.cast_column("audio", Audio(sampling_rate=16000))

    first_row = resampled[0]
    return first_row['audio']['array']

def transcribe(audio_file):
    """Transcribe a single audio file and return the decoded text.

    Steps: load the samples with ``load_audio``, turn them into input
    features with ``s2t_processor`` (16 kHz, ``return_tensors="pt"``), generate
    token ids with ``s2t_model`` using ``s2t_forced_decoder_ids``, and decode
    them with special tokens skipped.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The first element of the decoded batch.
    """
    waveform = load_audio(audio_file)

    encoded = s2t_processor(waveform, sampling_rate=16000, return_tensors="pt")
    token_ids = s2t_model.generate(
        encoded.input_features,
        forced_decoder_ids=s2t_forced_decoder_ids,
    )

    decoded_batch = s2t_processor.batch_decode(token_ids, skip_special_tokens=True)
    # Only one file is processed per call, so the batch has a single entry of interest.
    return decoded_batch[0]

In [None]:
# Smoke-test the speech-to-text pipeline on a single reference recording.
# The bare name on the last line makes the notebook display the returned text.
transcription = transcribe('data/pronouncing-evaluation/reference/1.wav')
transcription



'අක්යෙ කාවෙ'

In [None]:
def load_dataset(ref_audio_dir='data/answering-evaluation/reference', seed=None):
    """Build shuffled (reference, response) sentence-embedding pairs with labels.

    Every ``*.wav`` file in ``ref_audio_dir`` is paired with the files of the
    same name found by replacing ``/reference/`` in its path with ``/autism/``
    (label 0) and with ``/non-autism/`` (label 1). Both files of a pair are
    transcribed with ``transcribe`` and embedded with
    ``embedding_model.get_sentence_vector``.

    A pair whose transcription or embedding raises is reported on stdout and
    left out of the result. Only that pair is dropped; the other pair built
    from the same reference file is kept if it succeeded.

    Args:
        ref_audio_dir: Directory holding the reference ``.wav`` files.
        seed: Optional seed for the final shuffle. ``None`` (the default) uses
            NumPy's global random state; any other value makes the row order
            reproducible.

    Returns:
        Tuple ``(embeddings_01, embeddings_02, labels)``: reference embeddings
        and response embeddings of shape ``(n_pairs, 300)``, and labels of
        shape ``(n_pairs,)``, all shuffled with the same permutation.
    """
    embedding_dim = 300

    # Sorted so the pre-shuffle row order does not depend on the order in
    # which the filesystem happens to list the files.
    ref_voice_files = sorted(
        voice_file.replace('\\', '/')
        for voice_file in glob.glob(f'{ref_audio_dir}/*.wav')
    )
    aut_voice_files = [
        voice_file.replace('/reference/', '/autism/')
        for voice_file in ref_voice_files
    ]
    non_aut_voice_files = [
        voice_file.replace('/reference/', '/non-autism/')
        for voice_file in ref_voice_files
    ]

    n_refs = len(ref_voice_files)
    embeddings_01 = np.zeros((n_refs * 2, embedding_dim))
    embeddings_02 = np.zeros((n_refs * 2, embedding_dim))
    labels = np.zeros(n_refs * 2)
    # A row is kept only if its pair was processed without error. Tracking the
    # actual row (not the file index) is what keeps failures in the second
    # half of the arrays from deleting rows of the first half.
    valid_rows = np.zeros(n_refs * 2, dtype=bool)

    # (row offset, paired files, label, name used in the error message)
    groups = (
        (0, aut_voice_files, 0, 'autism'),
        (n_refs, non_aut_voice_files, 1, 'non-autism'),
    )

    for idx, ref_file in enumerate(ref_voice_files):
        # The reference embedding is shared by both pairs of this index; it is
        # computed once and retried for the second pair only if it failed.
        ref_vector = None
        for offset, paired_files, label, group_name in groups:
            row = idx + offset
            try:
                if ref_vector is None:
                    ref_vector = embedding_model.get_sentence_vector(
                        transcribe(ref_file)
                    )
                paired_vector = embedding_model.get_sentence_vector(
                    transcribe(paired_files[idx])
                )
                embeddings_01[row] = ref_vector
                embeddings_02[row] = paired_vector
                labels[row] = label
                valid_rows[row] = True
            except Exception as exc:
                # Best effort: report the pair and carry on with the rest.
                print('Errorneous reference file: ', ref_file)
                print(f'Errorneous {group_name} file: ', paired_files[idx])
                print('Reason: ', repr(exc))

    embeddings_01 = embeddings_01[valid_rows]
    embeddings_02 = embeddings_02[valid_rows]
    labels = labels[valid_rows]

    if seed is None:
        random_idxs = np.random.permutation(len(labels))
    else:
        random_idxs = np.random.default_rng(seed).permutation(len(labels))

    return embeddings_01[random_idxs], embeddings_02[random_idxs], labels[random_idxs]

In [None]:
# Build the paired embeddings from the default reference directory.
embeddings_01, embeddings_02, labels = load_dataset()

# Report the shape of each returned array as a quick sanity check.
for caption, array in (
    ("Embedding 01 shape: ", embeddings_01),
    ("Embedding 02 shape: ", embeddings_02),
    ("Labels shape: ", labels),
):
    print(caption, array.shape)

Embedding 01 shape:  (14, 300)
Embedding 02 shape:  (14, 300)
Labels shape:  (14,)


In [None]:
def build_model():
    """Create and compile the two-input binary classifier.

    Each 300-dimensional input goes through its own stack of
    Dense(300) -> Dropout(0.2) -> Dense(150) -> Dropout(0.2) -> Dense(30);
    the two stacks do not share layers. Their outputs are concatenated and
    passed through Dense(30) -> Dropout(0.2) -> Dense(1, sigmoid).

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer, binary
        cross-entropy loss, and accuracy / precision / recall / AUC metrics.
    """
    layers = tf.keras.layers

    def encode(branch_input):
        # A fresh set of layers is created on every call, so each input gets
        # its own weights.
        hidden = layers.Dense(300, activation='relu')(branch_input)
        hidden = layers.Dropout(0.2)(hidden)
        hidden = layers.Dense(150, activation='relu')(hidden)
        hidden = layers.Dropout(0.2)(hidden)
        return layers.Dense(30, activation='relu')(hidden)

    inputs01 = tf.keras.Input(shape=(300,))
    inputs02 = tf.keras.Input(shape=(300,))

    # Branch for inputs01 is built first, then the one for inputs02.
    encoded01 = encode(inputs01)
    encoded02 = encode(inputs02)

    merged = layers.concatenate([encoded01, encoded02])
    merged = layers.Dense(30, activation='relu')(merged)
    merged = layers.Dropout(0.2)(merged)
    outputs = layers.Dense(1, activation='sigmoid')(merged)

    model = tf.keras.Model(inputs=[inputs01, inputs02], outputs=outputs)

    tracked_metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
    ]
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=tracked_metrics,
    )
    return model

In [None]:
# Instantiate the classifier and print its layer-by-layer summary.
model = build_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 300)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 300)]        0           []                               
                                                                                                  
 dense (Dense)                  (None, 300)          90300       ['input_1[0][0]']                
                                                                                                  
 dense_3 (Dense)                (None, 300)          90300       ['input_2[0][0]']                
                                                                                              

In [None]:
# Early stopping watches the *training* loss (no validation data is passed to
# fit): training halts after 10 epochs without improvement and the best
# weights seen are restored.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=10,
    restore_best_weights=True,
)

model.fit(
    [embeddings_01, embeddings_02],
    labels,
    epochs=100,
    batch_size=2,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100


<keras.callbacks.History at 0x2098985e7f0>

In [None]:
# Persist the in-memory `model` to the given .h5 path under models/.
model.save('models/answering-evaluation.h5')

In [None]:
def inference_pronounce_validation(audio_file01, audio_file02):
    """Score a pair of recordings with the global ``model``.

    Both files are transcribed with ``transcribe``, embedded with
    ``embedding_model.get_sentence_vector``, given a leading batch axis and
    passed to ``model.predict`` as the two model inputs.

    Args:
        audio_file01: Path of the first recording (fed to the first input).
        audio_file02: Path of the second recording (fed to the second input).

    Returns:
        The model output for the pair as a Python ``float``. The score is also
        printed, as before. Previously nothing was returned, so assigning the
        call's result always produced ``None``.
    """
    transcription_01 = transcribe(audio_file01)
    transcription_02 = transcribe(audio_file02)

    embedding01 = embedding_model.get_sentence_vector(transcription_01)
    embedding02 = embedding_model.get_sentence_vector(transcription_02)

    # predict() is given batches, so add a batch axis of size one.
    embedding01 = np.expand_dims(embedding01, axis=0)
    embedding02 = np.expand_dims(embedding02, axis=0)

    prediction = model.predict([embedding01, embedding02])
    prediction = prediction.squeeze()

    print(prediction)
    return float(prediction)

In [None]:
# Score a reference answer against the non-autism recording of the same answer.
# The bare name on the last line displays whatever the call returned.
response = inference_pronounce_validation(
    'data/answering-evaluation/reference/Answer2.wav',
    'data/answering-evaluation/non-autism/Answer2.wav',
)
response



0.99976903
