In [1]:
import io
import os
import sys
import cv2
import pickle
import librosa
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.io as pio

import plotly.express as px
import matplotlib.pyplot as plt
from IPython.display import Image
import plotly.graph_objects as go
from keras.utils.vis_utils import plot_model
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer

import pyspark
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
import pyspark.sql.types as tp
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row

2021-09-14 06:52:50.519302: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-14 06:52:50.519331: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [1]:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("Collinear Points")
sc = SparkContext('local',conf=conf)    
from pyspark.rdd import RDD

21/09/14 13:56:26 WARN Utils: Your hostname, michael resolves to a loopback address: 127.0.1.1; using 192.168.43.177 instead (on interface wlo1)
21/09/14 13:56:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/09/14 13:56:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Create a binary rdd file from the audio files
binary_wave_rdd = sc.binaryFiles('../data/wav/*.wav')

In [3]:
# Transfomer binary_wave_rdd to a tuple rdd with location of file and numpy array
rdd = binary_wave_rdd.map(lambda x : (x[0].split('/')[-1].split('.')[0], librosa.load(io.BytesIO(x[1]))))

In [24]:
class CleanAudio():
    """Clean audio data by removing dead spaces, ...
    """

    def __init__(self):
        pass

    def normalize_audio(self, signal):
        feats_mean = np.mean(signal, axis=0)
        feats_std = np.std(signal, axis=0)
        signal = (signal - feats_mean) / (feats_std + 1e-14)
        return signal

    def trim_audio(self, signal, trim_db=None):
        signal, index = librosa.effects.trim(signal, top_db=trim_db)
        return signal

    def split_audio(self, signal, clean_db=None):
        yt = librosa.effects.split(signal, top_db=clean_db)
        clean_signal = []
        for start_i, end_i in yt:
            clean_signal.append(signal[start_i: end_i])
        signal = np.concatenate(np.array(clean_signal), axis=0)
        return signal

In [25]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential

class LogMelgramLayer(Layer):
    def __init__(self, num_fft, hop_length, num_mels, sample_rate, f_min, f_max, eps, **kwargs):
        super(LogMelgramLayer, self).__init__(**kwargs)

        self.num_fft = num_fft
        self.hop_length = hop_length
        self.num_mels = num_mels
        self.sample_rate = sample_rate
        self.f_min = f_min
        self.f_max = f_max
        self.eps = eps
        self.num_freqs = num_fft // 2 + 1
        lin_to_mel_matrix = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mels,
            num_spectrogram_bins=self.num_freqs,
            sample_rate=self.sample_rate,
            lower_edge_hertz=self.f_min,
            upper_edge_hertz=self.f_max,
        )

        self.lin_to_mel_matrix = lin_to_mel_matrix

    
    def call(self, input):

        def _tf_log10(x):
            numerator = tf.math.log(x)
            denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
            return numerator / denominator

        stfts = tf.signal.stft(
            input,
            frame_length=self.num_fft,
            frame_step=self.hop_length,
            pad_end=False,  # librosa test compatibility
        )
        mag_stfts = tf.abs(stfts)

        melgrams = tf.tensordot(  # assuming channel_first, so (b, c, f, t)
            tf.square(mag_stfts), self.lin_to_mel_matrix, axes=[2, 0]
        )
        log_melgrams = _tf_log10(melgrams + self.eps)
        return tf.expand_dims(log_melgrams, 3)

In [26]:
import os
import sys
import pickle

import warnings

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential

sr = 8000
fft_size = 256
hop_size = 128
n_mels = 128

def preprocessing_model(fft_size, hop_size, n_mels):
    
    input_data = Input(name='input', shape=(None,), dtype="float32")
    spec = LogMelgramLayer(
        num_fft=fft_size,
        hop_length=hop_size,
        num_mels=n_mels,
        sample_rate=sr,
        f_min=0.0,
        f_max=sr // 2,
        eps=1e-6)(input_data)
    x = BatchNormalization(axis=2)(spec)
    # x = Permute((2, 1, 3), name='permute', dtype="float32")(x)
    model = Model(inputs=input_data, outputs=x, name="preprocessin_model")
    
    return model

def build_model(melspecModel, output_dim, custom_model, calc=None):

    input_audios = Input(name='the_input', shape=(None,))
    pre = melspecModel(input_audios)
    pre.trainable = False  # Freeze the layer
    pre = tf.squeeze(pre, [3])

    y_pred = custom_model(pre)
    model = Model(inputs=input_audios, outputs=y_pred, name="model_builder")
    model.output_length = calc

    return model


def cnn_output_length(input_length, kernel_list, pool_sizes, cnn_stride, mx_stride, padding='same'):

    if padding == 'same':
        output_length = input_length
        for i, j in zip(cnn_stride, pool_sizes):
            output_length = (output_length)/i
            if j != 0:
                output_length = (output_length - j)/mx_stride + 1

        return tf.math.ceil(output_length)

    elif padding == 'valid':

        output_length = input_length
        for i, j in zip(kernel_list, pool_sizes):
            output_length = (output_length - i)/cnn_stride + 1
            if j != 0:
                output_length = (output_length - j)/mx_stride + 1

        return tf.math.floor(output_length)


def block(filters, inp):
    x = BatchNormalization()(inp)
    x = LeakyReLU(.1)(x)
    x = Dropout(.4)(x)
    x = Conv2D(filters, (3, 3), padding='same')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(.1)(x)
    x = Dropout(.4)(x)
    x = Conv2D(filters, (3, 3), padding='same')(x)
    return(x)


def resnet(input_dim, output_dim=224, units=256,  num_birnn=2):

    filters = [32, 32, 32]
    kernels = [3, 3, 3]
    pool_sizes = [0, 0, 2]
    cnn_stride = [1, 1, 1]
    mx_stride = 2

    input_data = Input(name='the_input', shape=(None, input_dim))
    x = Reshape((-1, input_dim, 1), dtype="float32")(input_data)

    x = Conv2D(filters[0], (3, 3), padding='same')(x)
    x = MaxPooling2D((1, 2), strides=(1, 2), padding='same')(x)

    x = Add()([block(filters[0], x), x])
    x = Add()([block(filters[0], x), x])
    x = Add()([block(filters[0], x), x])

    x = Conv2D(filters[1], (3, 3), padding='same')(x)
    x = MaxPooling2D((1, 2), strides=(1, 2), padding='same')(x)

    x = Add()([block(filters[1], x), x])
    x = Add()([block(filters[1], x), x])
    x = Add()([block(filters[1], x), x])

    x = Conv2D(filters[2], (3, 3), padding='same')(x)
    x = MaxPooling2D((1, 2), strides=(1, 2), padding='same')(x)

    x = Add()([block(filters[2], x), x])
    x = Add()([block(filters[2], x), x])
    x = Add()([block(filters[2], x), x])

    # x = MaxPooling2D((2,2), strides=2, padding = 'same')(x)
    x = AveragePooling2D((2, 2), strides=2, padding='same')(x)
    x = Reshape((-1, x.shape[-1] * x.shape[-2]))(x)

    # GRULayer
    for i in range(num_birnn):
        x = Bidirectional(GRU(units=units, return_sequences=True,
                          implementation=2, name='rnn_{}'.format(i)))(x)
        x = Dropout(.4)(x)
        x = LeakyReLU(.1)(x)
        x = BatchNormalization(name='bn_rnn_{}'.format(i))(x)

    x = TimeDistributed(Dense(output_dim))(x)
    y_pred = Activation('softmax', name='softmax')(x)

    model = Model(inputs=input_data, outputs=y_pred, name="custom_model")

    def output_length_calculater(x): return cnn_output_length(
        x, kernels, pool_sizes, cnn_stride, mx_stride)

    return model, output_length_calculater


In [27]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


class TokenizerWrap(Tokenizer):

    def tokens_to_string(self, tokens):
        words = [self.index_to_word[token] for token in tokens if token != 0]
        text = "".join(words)
        return text

In [28]:
handle = open('../models/char_tokenizer_amharic.pickle', 'rb')
tokenizer = pickle.load(handle)

In [29]:
import pickle
import librosa
import warnings
from jiwer import wer
import matplotlib.pyplot as plt
from IPython.display import Image
import plotly.graph_objects as go
from keras.utils.vis_utils import plot_model
from sklearn.utils import shuffle
from copy import copy

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import *

import warnings
warnings.filterwarnings("ignore")


class Predict():
    def __init__(self):

        self.clean_audio = CleanAudio()

    def get_audio(self, audio_file):
        sr = 8000
        wav, rate = audio_file
        y = librosa.resample(wav, rate, sr)
        return y

    def get_clean_audio(self, wav):
        y = self.clean_audio.normalize_audio(wav)
        y = self.clean_audio.split_audio(y, 30)
        return y

    def predict(self, audio_signal):
        y = audio_signal.reshape(1, -1)
        fft_size = 256
        hop_size = 128
        n_mels = 128
        melspecModel = preprocessing_model(fft_size, hop_size, n_mels)
        resnet_, calc = resnet(n_mels, 224, 512, 4)
        model = build_model(melspecModel, 224, resnet_, calc)
        model.load_weights('../models/resnet_v3.h5')
#         models =tf.keras.models.clone_model(model)
#         models.load_weights('../models/resnet_v3.h5')
        y_pred = model.predict(y)

        input_shape = tf.keras.backend.shape(y_pred)
        input_length = tf.ones(
            shape=input_shape[0]) * tf.keras.backend.cast(input_shape[1], 'float32')
        prediction = tf.keras.backend.ctc_decode(
            y_pred, input_length, greedy=False)[0][0]

        pred = K.eval(prediction).flatten().tolist()
        pred = list(filter(lambda a: a != -1, pred))
#         handle = open('../models/char_tokenizer_amharic.pickle', 'rb', encoding='UTF-8')
#         tokenizer = pickle.load(handle)
        return ''.join(tokenizer.tokens_to_string(pred))

def validate(rdd):
    
    predict = Predict()
    audio_file_rdd = rdd.map(lambda x : (x[0], predict.get_audio(x[1])))
#     coll_aud = audio_file_rdd.collect()
#     print(coll_aud[0])
    clean_audio_file_rdd = audio_file_rdd.map(lambda x : (x[0], predict.get_clean_audio(x[1])))
#     coll_file = clean_audio_file_rdd.collect()
#     print(coll_file[0])
    predicted_txt_rdd = clean_audio_file_rdd.map(lambda x: (x[0], predict.predict(x[1])))
#     coll_pred = predicted_txt_rdd.collect()
    return predicted_txt_rdd, clean_audio_file_rdd

predicted_rdd, clean_audio_file_rdd = validate(rdd)

In [30]:
# get collection of audio wave file and turn it to dictionary
coll_clean = clean_audio_file_rdd.collect()
dct_clean = dict((y, x) for y, x in coll_clean)

2021-09-14 10:07:21.776073: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-14 10:07:21.776115: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
                                                                                

In [32]:
# overwrite clean audio to file
from scipy.io.wavfile import write
import scipy.io.wavfile
for i,j in dct_clean.items():
    scipy.io.wavfile.write('../data/test_data/audio_test/'+i+'.wav', 8000,j)

In [33]:
# Get collection of predicted amharic txt with it's audio name
coll_pred = predicted_rdd. collect()

2021-09-14 10:07:55.831901: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-09-14 10:07:55.831933: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-09-14 10:07:55.831956: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-172-31-38-53): /proc/driver/nvidia/version does not exist
2021-09-14 10:07:55.832212: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, 

In [34]:
# turn collection of prediction to collection of dictionary
dct_pred = dict((y, x) for y, x in coll_pred)

In [35]:
dct_pred

{'1_tr_10000_tr097082': 'የተላይ የትግራይ አወራጃ ተወላጆች ገንዘባቸውን አዋድተው የልማት ተቋማትን እንዲመሰሩቱ ትልማ አይፈግድ ም',
 '2_tr_10001_tr097083': 'የጠመንጃ ተኩስ ተከፈተና አራት የኤርትራ ወታደሮች ተገደሉ',
 '3_tr_10002_tr097084': 'ላነሷቸው ጥያቄዎች የሰጡትን መልስ አቅርበ ነዋል',
 '4_tr_10003_tr097085': 'እብዱ አስፋልቱ ላይየ ኰለኰ ለው ድንጋይ መኪና አላሳልፍ አለ',
 '5_tr_10004_tr097086': 'ጠጁን ኰ መኰ መ ኰ መ ኰ መና ሚስቱን ሲ ያሰቃያት አደረ',
 '6_tr_10005_tr097087': 'ድንቹ በድንብ ስለተኰተቆተ በጥሩ ሁኔታ ኰረተ',
 '7_tr_10006_tr097088': 'በድህነቱ ላይ ይህ ክፉ በሽታ ስለያዘው ሰውነቱ በጣም ኰ ሰ',
 '8_tr_10007_tr097089': 'በሩን እንዲህ በሀይል አታንኳኲ ብዬ አልነበረም እንዴ'}

In [42]:
# load original text file .json
import json
f = open('../data/test_data/data.json')
text = json.load(f)

In [47]:
# Get invalid audios
bad_aud = []
from jiwer import wer
for j,k in dct_pred.items():
    for l,m in text.items():
        ids = j.split('_')[0]
        if l == ids:
            error = wer(m,k)
            print(error)
            if error < 0.6:
                bad_aud.append(j)
print(bad_aud)

0.7777777777777778
0.4444444444444444
0.375
0.6153846153846154
0.5333333333333333
0.625
0.4166666666666667
0.6
[]


In [19]:
# Remove audios that invalid
import os
for i in bad_aud:
    try:
        os.remove('../data/test_data/audio_test/'+i+'.wav')
    except FileNotFoundError:
        continue