In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('../scripts')))
import pandas as pd
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile #for audio processing
import os
import pickle
import pandas as pd
from collections import Counter
import librosa
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import * 
from tensorflow.keras.callbacks import Callback, ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras import backend as K
from jiwer import wer
import random


import mlflow

In [2]:
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
import helper
from data_gen import DataGenerator
from tokenizer import Tokenizer
from logspectrogram import LogMelSpectrogram
from ctc_loss import CTC_loss
from model_implementation import simple_rnn_model, CNN_net, BidirectionalRNN2, cnn_rnn_model

Initialize the CTC loss function

In [4]:
# Value handed to the project's CTC_loss helper. The same name is assigned the same
# value again in the hyperparameter cell, where it is also used as the spectrogram hop size.
frame_step = 256
# Kernel-global helper; train() reads this `ctc` object directly instead of taking it as an argument.
ctc = CTC_loss(frame_step)

Load the data prepared for modeling

In [5]:
# Load the prepared artefacts. Both objects are indexed by utterance label in the cells below.
# NOTE(review): helper.read_obj presumably unpickles these .pkl files — only load files you trust.
translation_obj = helper.read_obj("../data/translation_obj.pkl")
audio_obj = helper.read_obj("../data/audio_dict.pkl")
# One row per utterance; the `label` and `duration` columns are used by the next cell.
metadata = pd.read_csv("../data/meta_data.csv")

In [6]:
# Order utterances from shortest to longest clip (sort_values is ascending by default).
sorted_metadata = metadata.sort_values(by="duration")
# Labels in duration order; they fix the order of `audios` and `translations` built later.
labels = sorted_metadata['label'].to_list()
# Last expression of the cell: renders the sorted frame.
sorted_metadata

Unnamed: 0,translation,label,channel,sample_rate,duration
1233,እኔ ጥሩ ኢትዮጵያዊ ነኝ,tr_7742_tr78043,1,32000,1.792
631,ይሄ ትክክል ነው,tr_6930_tr70031,1,32000,1.792
730,የ ኢትዮጵያ ን ሰራዊት ወነጀለ,tr_8006_tr81007,1,32000,1.920
3680,ደንቡ ም እንዲ ህ የሚ ል ነው,tr_7783_tr78084,1,32000,1.920
2336,ኢነጋማ ህጋዊ እውቅና አገኘ,tr_8030_tr81031,1,32000,2.048
...,...,...,...,...,...
4611,ከ ግዛታቸው ዋ ና ከተማ ጋሪ ስ ሆነው በ ስልክ ሚስተር ሞሪስ ከ ስደተኞ...,tr_2212_tr23013,1,32000,20.992
2773,የተ ለቀቁት ምርኮኞች በ አካባቢያቸው ሰላማዊ ኑሮ እንዲ ኖሩ የ ትራንስፖ...,tr_2560_tr26061,1,32000,21.120
2628,የ ትምህርት ደረጃቸው ንና የ አገልግሎት ሁኔታ ቸውን ስን መረምር የሚ ደ...,tr_2565_tr26066,1,32000,22.784
1408,ማጋነን ባይሆን ብኝ ለ ፕሮፌሰሩ የተሰጠ ውን አክብሮት ሳስበው የ አሜሪካ...,tr_6166_tr62067,1,32000,22.912


In [7]:
# Build two parallel lists in the duration-sorted `labels` order, so that
# audios[i] and translations[i] always describe the same utterance.
# Only element 0 of each audio_obj entry is kept.
# NOTE(review): element 0 is presumably the waveform samples — confirm against the data preparation step.
audios = [audio_obj[label][0] for label in labels]

translations = [translation_obj[label] for label in labels]

Tokenize each character into int

In [8]:
tokenizer = Tokenizer(translations)
# Build the character mapping
int_to_char, char_to_int = tokenizer.build_dict()
# Round-trip one sentence (text -> ints -> text) as a sanity check of the mapping.
sample = translations[0]
encoded = tokenizer.encode(sample, char_to_int)
decoded = tokenizer.decode_text(encoded, int_to_char)

print(f"sample snt: {sample}")
print(f"encoded snt: {encoded}")
print(f"decoed snt: {decoded}")

sample snt: እኔ ጥሩ ኢትዮጵያዊ ነኝ
encoded snt: [11, 103, 1, 44, 52, 1, 36, 3, 38, 43, 6, 63, 1, 20, 100]
decoed snt: እኔ ጥሩ ኢትዮጵያዊ ነኝ


In [9]:
# Store both lookup tables next to the data so they can be reloaded without rebuilding the tokenizer.
# NOTE(review): helper.serialize_obj presumably pickles the object to the given path — confirm in helper.
helper.serialize_obj('../data/int_to_char.pkl', int_to_char)
helper.serialize_obj('../data/char_to_int.pkl', char_to_int)

In [10]:
# Audio front-end settings, passed to preprocessing_model().
sample_rate = 32000  # matches the sample_rate column shown in the metadata table
fft_size = 512
frame_step = 256     # used as the spectrogram hop size; same value given to CTC_loss earlier
n_mels = 100

# Training settings.
batch_size = 100
epochs = 20          # NOTE(review): the train(...) calls below pass epochs=20 literally instead of this name
data_len = len(translations)
# NOTE(review): the +2 presumably reserves ids beyond the character vocabulary (e.g. CTC blank) — confirm.
output_dim = len(char_to_int) + 2


In [11]:
def build_model(output_dim, custom_model, preprocess_model, mfcc=False, calc=None):
    """Stack the preprocessing network and an acoustic model into one Keras model.

    Args:
        output_dim: Not read by this function; kept for call-site compatibility.
        custom_model: Callable Keras model applied to the squeezed features.
        preprocess_model: Callable Keras model applied to the raw audio input.
        mfcc: Not read by this function; kept for call-site compatibility.
        calc: Stored unchanged on the returned model as ``output_length``.

    Returns:
        A ``Model`` named "model_builder" mapping the raw audio input to the
        output of ``custom_model``.
    """
    raw_audio = Input(name='the_input', shape=(None,))

    # Drop axis 3 of the preprocessing output before handing it to the acoustic model
    # (the printed preprocessing summary shows a trailing dimension of size 1).
    features = preprocess_model(raw_audio)
    features = tf.squeeze(features, [3])

    outputs = custom_model(features)
    speech_model = Model(inputs=raw_audio, outputs=outputs, name="model_builder")
    speech_model.output_length = calc
    return speech_model

In [12]:
def predict(model, audio, tokenizer, int_to_char, actual=None):
    """Transcribe a single audio clip and optionally score it against a reference.

    Args:
        model: Keras model whose ``predict`` output is fed to CTC decoding.
        audio: One audio sample; it is wrapped in a list to form a batch of one.
        tokenizer: Object providing ``decode_text(ids, int_to_char)``.
        int_to_char: Lookup table passed through to ``tokenizer.decode_text``.
        actual: Optional reference transcription. When given, the word error
            rate between it and the prediction is computed with ``jiwer.wer``.

    Returns:
        ``(predicted_text, error)`` where ``error`` is ``None`` if ``actual``
        was not supplied.
    """
    pred_audios = tf.convert_to_tensor([audio])

    y_pred = model.predict(pred_audios)

    # Every item in the batch is decoded over the full time axis of y_pred.
    pred_shape = tf.keras.backend.shape(y_pred)
    input_length = tf.ones(shape=pred_shape[0]) * tf.keras.backend.cast(pred_shape[1], 'float32')

    # greedy=False selects beam-search decoding; [0][0] takes the first decoded path.
    decoded = tf.keras.backend.ctc_decode(y_pred, input_length, greedy=False)[0][0]

    # ctc_decode marks blank/padding positions with -1; remove them before mapping ids to text.
    token_ids = [i for i in K.eval(decoded).flatten().tolist() if i != -1]

    predicted_text = tokenizer.decode_text(token_ids, int_to_char)

    error = None
    # Identity comparison: `!= None` would dispatch to a custom/elementwise __ne__.
    if actual is not None:
        error = wer(actual, predicted_text)

    return predicted_text, error

In [13]:
def preprocessing_model(sample_rate, fft_size, frame_step, n_mels, mfcc=False):
    """Build the feature-extraction front end as a Keras model.

    The model takes a float32 input of shape ``(None,)`` per sample, applies
    the project's ``LogMelSpectrogram`` layer and then batch normalisation
    over axis 2.

    Args:
        sample_rate: Forwarded to ``LogMelSpectrogram``; half of it (as int) is used as ``f_max``.
        fft_size: Forwarded to ``LogMelSpectrogram`` as ``fft_size``.
        frame_step: Forwarded to ``LogMelSpectrogram`` as ``hop_size``.
        n_mels: Forwarded to ``LogMelSpectrogram`` as ``n_mels``.
        mfcc: Not read by this function; kept for call-site compatibility.

    Returns:
        A ``Model`` named "preprocessing_model".
    """
    waveform = Input(name='input', shape=(None,), dtype="float32")

    spectrogram_layer = LogMelSpectrogram(
        fft_size=fft_size,
        hop_size=frame_step,
        n_mels=n_mels,
        sample_rate=sample_rate,
        f_min=0.0,
        f_max=int(sample_rate / 2),
    )
    features = spectrogram_layer(waveform)

    normalized = BatchNormalization(axis=2)(features)
    return Model(inputs=waveform, outputs=normalized, name="preprocessing_model")

In [14]:
def train(model_builder,
          data_gen,
          batch_size=32,
          epochs=20,
          verbose=1,
          save_path="../models/model.h5",
          optimizer=None,
          ):
    """Wrap a model with the CTC loss, compile it and fit it on a generator.

    Relies on the notebook-global ``ctc`` object (a ``CTC_loss`` instance)
    being defined before this function is called.

    Args:
        model_builder: Base Keras model passed to ``ctc.add_ctc_loss``.
        data_gen: Batch generator handed to ``Model.fit``.
        batch_size: Not read by this function; kept for call-site
            compatibility. In this notebook the generator is constructed with
            its own batch size.
        epochs: Number of training epochs.
        verbose: Keras verbosity level for ``fit``.
        save_path: File path given to ``ModelCheckpoint``.
        optimizer: Keras optimizer to compile with. When ``None`` a fresh
            ``RMSprop(learning_rate=0.0001, decay=1e-6, clipnorm=5)`` is
            created for this call.

    Returns:
        The compiled and trained model returned by ``ctc.add_ctc_loss``.
    """
    # A default optimizer written in the signature is constructed once, at
    # definition time, and would then be shared (state included) by every model
    # trained through this function. Build a new one per call instead.
    if optimizer is None:
        optimizer = RMSprop(learning_rate=0.0001, decay=1e-6, clipnorm=5)

    model = ctc.add_ctc_loss(model_builder)

    checkpointer = ModelCheckpoint(filepath=save_path, verbose=0)
    # The loss callable returns y_pred unchanged for the output named 'ctc'.
    # NOTE(review): presumably add_ctc_loss attaches an output called 'ctc' that
    # already holds the CTC loss value — confirm in ctc_loss.py.
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)
    # summary() prints by itself and returns None; wrapping it in print() adds a stray "None".
    model.summary()

    # Model.fit accepts generators directly; fit_generator is deprecated.
    # use_multiprocessing is left at its default (False), matching the previous call.
    model.fit(data_gen,
              callbacks=[checkpointer],
              epochs=epochs,
              verbose=verbose)
    return model

In [15]:
# Batch generator over the parallel translation/audio lists; used as the training data source for train().
dg = DataGenerator(translations, audios, batch_size, shuffle=True)
# Feature-extraction front end; the same instance is passed to every build_model/build_model2 call below.
preprocess_model = preprocessing_model(sample_rate, fft_size, frame_step, n_mels)
preprocess_model.summary()

2022-06-09 18:42:36.932036: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-09 18:42:36.977257: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-09 18:42:36.977540: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-09 18:42:36.978849: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "preprocessing_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, None)]            0         
                                                                 
 log_mel_spectrogram (LogMel  (None, None, 100, 1)     0         
 Spectrogram)                                                    
                                                                 
 batch_normalization (BatchN  (None, None, 100, 1)     400       
 ormalization)                                                   
                                                                 
Total params: 400
Trainable params: 200
Non-trainable params: 200
_________________________________________________________________


1. Simple RNN

In [16]:
# Baseline acoustic model from model_implementation, built from the mel-bin count and the output vocabulary size.
speech_simple_rnn = simple_rnn_model(n_mels, output_dim)
speech_simple_rnn.summary()

Model: "simple_rnn_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 the_input (InputLayer)      [(None, None, 100)]       0         
                                                                 
 rnn (GRU)                   (None, None, 224)         219072    
                                                                 
 batch_normalization_1 (Batc  (None, None, 224)        896       
 hNormalization)                                                 
                                                                 
 time_distributed (TimeDistr  (None, None, 224)        50400     
 ibuted)                                                         
                                                                 
 softmax (Activation)        (None, None, 224)         0         
                                                                 
Total params: 270,368
Trainable params: 269,920
No

In [17]:
# End-to-end model: raw audio -> preprocessing -> simple RNN.
simple_rnn_speech_model = build_model(output_dim, speech_simple_rnn, preprocess_model)
simple_rnn_speech_model.summary()

Model: "model_builder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 the_input (InputLayer)      [(None, None)]            0         
                                                                 
 preprocessing_model (Functi  (None, None, 100, 1)     400       
 onal)                                                           
                                                                 
 tf.compat.v1.squeeze (TFOpL  (None, None, 100)        0         
 ambda)                                                          
                                                                 
 simple_rnn_model (Functiona  (None, None, 224)        270368    
 l)                                                              
                                                                 
Total params: 270,768
Trainable params: 270,120
Non-trainable params: 648
_____________________________________________

In [18]:
# Track this run under its own MLflow experiment; autolog() is enabled before training starts.
mlflow.set_experiment('Speech Model-RNN-baseline')
mlflow.tensorflow.autolog()
# NOTE(review): train() accepts batch_size but does not read it; the generator `dg` was built with its own batch size.
# train() returns the model, so as the last expression of the cell its repr is displayed.
train(simple_rnn_speech_model, dg, epochs=20, save_path="../models/simple_rnn_model.h5",  batch_size=batch_size)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 the_input (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 preprocessing_model (Functiona  (None, None, 100, 1  400        ['the_input[0][0]']              
 l)                             )                                                                 
                                                                                                  
 tf.compat.v1.squeeze (TFOpLamb  (None, None, 100)   0           ['preprocessing_model[0][0]']    
 da)                                                                                              
                                                                                              

2022/06/09 18:42:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b6424de76e2d4ca28e51da82422595b7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow
2022-06-09 18:42:42.623405: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 101580800 exceeds 10% of free system memory.
2022-06-09 18:42:42.809911: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 101580800 exceeds 10% of free system memory.


Epoch 1/20


2022-06-09 18:42:46.339741: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 42598400 exceeds 10% of free system memory.
2022-06-09 18:42:46.378995: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 42598400 exceeds 10% of free system memory.
2022-06-09 18:42:46.439298: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 34406400 exceeds 10% of free system memory.
2022-06-09 18:42:48.713053: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8400


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


2022-06-09 18:52:05.836943: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /tmp/tmp0v72zlj3/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmp0v72zlj3/model/data/model/assets


<keras.engine.functional.Functional at 0x7fb45831e6d0>

In [20]:

# Restore the weights checkpointed during training.
simple_rnn_speech_model.load_weights("../models/simple_rnn_model.h5")

# Use ONE index for both the reference text and the audio clip. Previously the
# reference came from index 1401 while the audio came from index 0, so the WER
# compared the prediction against an unrelated sentence.
sample_index = 1401
actual_translation = translations[sample_index]
sample_test_audio = audios[sample_index]
predicted, error = predict(simple_rnn_speech_model, sample_test_audio , tokenizer, int_to_char, actual=actual_translation)

print("actual", actual_translation)
print("predicted", predicted)
print("WER: ", error)


actual ለ ወይዘሪት አዲስ ና ለ ሌሎች ተወዳዳሪ ዎች ስጦታ ተዘጋጅ ቷል
predicted  
WER:  1.0


2. CNN + RNN

In [16]:
# NOTE(review): the positional arguments appear to be filters=250, kernel_size=4, stride=1,
# padding='same', rnn units=400 (inferred from the printed summary) — confirm against
# model_implementation.cnn_rnn_model and consider passing them by keyword.
speech_cnn_rnn = cnn_rnn_model(n_mels, 250, 4, 1, 'same', 400, output_dim)
speech_cnn_rnn.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 the_input (InputLayer)      [(None, None, 100)]       0         
                                                                 
 conv1d (Conv1D)             (None, None, 250)         100250    
                                                                 
 bn_conv_1d (BatchNormalizat  (None, None, 250)        1000      
 ion)                                                            
                                                                 
 rnn (SimpleRNN)             (None, None, 400)         260400    
                                                                 
 batch_normalization_1 (Batc  (None, None, 400)        1600      
 hNormalization)                                                 
                                                                 
 time_distributed (TimeDistr  (None, None, 224)        89824 

In [17]:
# End-to-end model: raw audio -> preprocessing -> CNN + RNN.
speech_cnn_rnn_model = build_model(output_dim, speech_cnn_rnn, preprocess_model)
speech_cnn_rnn_model.summary()

Model: "model_builder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 the_input (InputLayer)      [(None, None)]            0         
                                                                 
 preprocessing_model (Functi  (None, None, 100, 1)     400       
 onal)                                                           
                                                                 
 tf.compat.v1.squeeze (TFOpL  (None, None, 100)        0         
 ambda)                                                          
                                                                 
 model (Functional)          (None, None, 224)         453074    
                                                                 
Total params: 453,474
Trainable params: 451,974
Non-trainable params: 1,500
_________________________________________________________________


In [18]:
# Separate MLflow experiment for the CNN + RNN run; autolog() is enabled before training starts.
mlflow.set_experiment('Speech Model-CNN + RNN-baseline')
mlflow.tensorflow.autolog()
# NOTE(review): train() accepts batch_size but does not read it; the generator `dg` was built with its own batch size.
train(speech_cnn_rnn_model, dg, epochs=20, save_path="../models/cnn_rnn_model.h5",  batch_size=batch_size)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 the_input (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 preprocessing_model (Functiona  (None, None, 100, 1  400        ['the_input[0][0]']              
 l)                             )                                                                 
                                                                                                  
 tf.compat.v1.squeeze (TFOpLamb  (None, None, 100)   0           ['preprocessing_model[0][0]']    
 da)                                                                                              
                                                                                            

2022/06/09 16:25:05 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8da17ce1c0c5457c841174f26b6b600a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow
2022-06-09 16:25:05.493214: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 93388800 exceeds 10% of free system memory.
2022-06-09 16:25:05.605814: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 93388800 exceeds 10% of free system memory.


Epoch 1/20


2022-06-09 16:25:08.016503: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 67174400 exceeds 10% of free system memory.
2022-06-09 16:25:08.056170: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 67174400 exceeds 10% of free system memory.
2022-06-09 16:25:08.125866: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 70451200 exceeds 10% of free system memory.
2022-06-09 16:25:10.042784: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8400
2022-06-09 16:25:11.721993: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


2022-06-09 16:44:40.550351: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /tmp/tmpt7xz28hc/model/data/model/assets




<keras.engine.functional.Functional at 0x7f5dac579940>

In [20]:
# Restore the weights checkpointed during training.
speech_cnn_rnn_model.load_weights("../models/cnn_rnn_model.h5")

# Spot-check ten randomly chosen utterances and print reference, prediction and WER for each.
# NOTE(review): no random seed is set, so the chosen samples differ between runs.
# NOTE(review): the samples are drawn from the same lists the generator trains on, not from a held-out set.
for k in range(10):
    

    # randint is inclusive on both ends: indices 0..1000 of the duration-sorted lists.
    i = random.randint(0, 1000)
    
    actual_translation = translations[i]
    sample_test_audio = audios[i]

    predicted, error = predict(speech_cnn_rnn_model, sample_test_audio,
                               tokenizer, int_to_char, actual=actual_translation)
   
    print("actual", actual_translation)
    print("predicted", predicted)
    print(f"WER: {error:.2f}")

    print()

actual በ ፍራቻ ነው እንዳን ጫወት የ ተደረገው
predicted 
WER: 1.00

actual አንዱ በ ፔናልቲ የተገኘ ነው
predicted 
WER: 1.00

actual እንዲ ህ ና በ እንዲ ህም እለቱ ታ ደረ
predicted 
WER: 1.00

actual ኢትዮጵያውያ ን ከ አስመራ እየ ተባረሩ ነው
predicted 
WER: 1.00

actual በ አደባባይ ም ባሪያዬ ነው እያ ልክ አት ሟገት
predicted 
WER: 1.00

actual አሁን ግን ህክምና የማ ገኘው ከ እናት ተፈጥሮ ነው አሉ
predicted 
WER: 1.00

actual ኢትዮጵያ ሀገራችን መኩሪያ ችን ና ት
predicted 
WER: 1.00

actual ታፈሰ ልብሱ ን ሲያ ጥብ ቆየ
predicted 
WER: 1.00

actual ሀይሌ ን እንደሚ ፈሩ ማን ንም አይፈሩ ም
predicted 
WER: 1.00

actual አዲሱ ቦታው እንዳል ተስማማ ው ይናገራል
predicted 
WER: 1.00



3. CNN + BRNN

In [29]:
# This model is expensive to train, so the batch size is reduced to 16.
# NOTE(review): this rebinds the notebook-wide `batch_size` and `dg`; any cell run after this one sees the new values.

batch_size = 16
dg = DataGenerator(translations, audios, batch_size, shuffle=True)

In [30]:
# CNN_net returns the convolutional model together with a second value, shown in the output as a TensorShape.
cnn_model, cnn_shape = CNN_net(n_mels)
# The cell's last expression is a tuple: summary() prints the table and returns None, so (None, cnn_shape) is displayed.
cnn_model.summary(), cnn_shape

Model: "cnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 the_input (InputLayer)      [(None, None, 128, 1)]    0         
                                                                 
 conv2d (Conv2D)             (None, None, 128, 128)    6400      
                                                                 
 activation (Activation)     (None, None, 128, 128)    0         
                                                                 
 batch_normalization_3 (Batc  (None, None, 128, 128)   512       
 hNormalization)                                                 
                                                                 
 max_pooling2d (MaxPooling2D  (None, None, 64, 128)    0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, None, 64, 64)      204864  

(None, TensorShape([None, None, 1024]))

In [31]:
# NOTE(review): 1024 matches the last dimension of cnn_shape printed above; consider deriving it from cnn_shape instead of hard-coding.
BI_RNN_2 = BidirectionalRNN2(1024, batch_size=batch_size, output_dim=output_dim)
BI_RNN_2.summary()

Model: "BidirectionalRNN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 the_input (InputLayer)      [(None, None, 1024)]      0         
                                                                 
 bidirectional (Bidirectiona  (None, None, 800)        4560000   
 l)                                                              
                                                                 
 batch_normalization_6 (Batc  (None, None, 800)        3200      
 hNormalization)                                                 
                                                                 
 dropout (Dropout)           (None, None, 800)         0         
                                                                 
 bidirectional_1 (Bidirectio  (None, None, 800)        3843200   
 nal)                                                            
                                                  

In [32]:
def build_model2(output_dim, cnn_model, custom_model, preprocess_model, mfcc=False, calc=None):
    """Stack preprocessing, a CNN feature extractor and an acoustic model into one Keras model.

    Same pipeline as ``build_model`` with ``cnn_model`` inserted between the
    squeezed preprocessing output and ``custom_model``.

    Args:
        output_dim: Not read by this function; kept for call-site compatibility.
        cnn_model: Callable Keras model applied to the squeezed features.
        custom_model: Callable Keras model applied to the CNN output.
        preprocess_model: Callable Keras model applied to the raw audio input.
        mfcc: Not read by this function; kept for call-site compatibility.
        calc: Stored unchanged on the returned model as ``output_length``.

    Returns:
        A ``Model`` named "model_builder" mapping the raw audio input to the
        output of ``custom_model``.
    """
    raw_audio = Input(name='the_input', shape=(None,))

    # Drop axis 3 of the preprocessing output before the CNN is applied.
    features = preprocess_model(raw_audio)
    features = tf.squeeze(features, [3])

    conv_features = cnn_model(features)
    outputs = custom_model(conv_features)

    speech_model = Model(inputs=raw_audio, outputs=outputs, name="model_builder")
    speech_model.output_length = calc
    return speech_model

In [33]:
# End-to-end model: raw audio -> preprocessing -> CNN -> bidirectional RNN.
cnn_bi_rnn_model = build_model2(output_dim, cnn_model, BI_RNN_2, preprocess_model)
cnn_bi_rnn_model.summary()

Model: "model_builder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 the_input (InputLayer)      [(None, None)]            0         
                                                                 
 preprocessing_model (Functi  (None, None, 128, 1)     512       
 onal)                                                           
                                                                 
 tf.compat.v1.squeeze_2 (TFO  (None, None, 128)        0         
 pLambda)                                                        
                                                                 
 cnn (Functional)            (None, None, 1024)        249216    
                                                                 
 BidirectionalRNN (Functiona  (None, None, 222)        16280222  
 l)                                                              
                                                     

In [None]:
# Separate MLflow experiment for the CNN + BRNN run; autolog() is enabled before training starts.
mlflow.set_experiment('Speech Model-CNN + BRNN-baseline')
mlflow.tensorflow.autolog()
# Checkpoints are written to ../models/cnn_bi_rnn_model.h5.
# NOTE(review): train() accepts batch_size but does not read it; the generator `dg` was built with its own batch size.
train(cnn_bi_rnn_model, dg, epochs=20, save_path="../models/cnn_bi_rnn_model.h5",  batch_size=batch_size)


In [None]:

# Load the checkpoint written by the training cell. The previous path
# ("../models/cnn-bi-rnn.h5") did not match the save_path passed to train().
cnn_bi_rnn_model.load_weights("../models/cnn_bi_rnn_model.h5")

# Spot-check ten randomly chosen utterances and print reference, prediction and WER for each.
# NOTE(review): no random seed is set, so the chosen samples differ between runs.
for k in range(10):
    # randint is inclusive on both ends: indices 0..3000 of the duration-sorted lists.
    i = random.randint(0, 3000)

    actual_translation = translations[i]
    sample_test_audio = audios[i]

    predicted, error = predict(cnn_bi_rnn_model, sample_test_audio,
                               tokenizer, int_to_char, actual=actual_translation)

    print("actual", actual_translation)
    print("predicted", predicted)
    print(f"WER: {error:.2f}")

    print()