In [37]:
import tensorflow as tf
import keras
from utils import constants
from utils.preprocessor import Preprocessor
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from modules.data_loader import DataLoader
from modules.losses import masked_loss
from modules.diacritizer import Diacritizer

In [3]:
ca_train_df, ca_val_df, ca_test_df = pd.read_csv('./dataset/Tashkeela-clean/CA/CA_train.csv'), pd.read_csv('./dataset/Tashkeela-clean/CA/CA_val.csv'), pd.read_csv('./dataset/Tashkeela-clean/CA/CA_test.csv')
msa_train_df, msa_val_df, msa_test_df = pd.read_csv('./dataset/Tashkeela-clean/MSA/MSA_train.csv'), pd.read_csv('./dataset/Tashkeela-clean/MSA/MSA_val.csv'), pd.read_csv('./dataset/Tashkeela-clean/MSA/MSA_test.csv')

In [None]:
ca_train_df

Unnamed: 0,text,words,chars,chars_no_diac
0,وَفِي بَعْضِ النُّسَخِ بِالْإِضَافَةِ فَصَالَح...,60,479,285
1,غَيْرِ وَلِيِّ مَنْ ذُكِرَ دَفْعُ سِنٍّ أَعْلَ...,14,100,60
2,وَمِمَّنْ حَكَى أَنْ يُعَلِّمُ بِمَعْنَى أَعْل...,9,90,51
3,وَيَخْرُجُ بِتَعْبِيرِ الْمَالِ الْمَنْفَعَةُ ...,15,156,91
4,زَوْجَانِ كَافِرَانِ أَسْلَمَتْ الْمَرْأَةُ وَ...,28,248,145
...,...,...,...,...
2365073,إنْ ظَهَرَتْ قَرِينَةٌ تُقَوِّي صِدْقَ السَّيّ...,25,202,117
2365074,لِأَنَّهُ يُعَارِضُ ظَاهِرَ الزَّوْجِ بِالْيَد...,8,70,41
2365075,وَإِلَّا لَمْ يَقَعْ فَرْضًا وَلَا نَفْلًا قَا...,9,73,43
2365076,رَوَاهُ مُسْلِمٌ فِى الصَّحِيحِ عَنْ عَمْرٍو ا...,9,70,43


In [2]:
letters_tok = keras.layers.TextVectorization(
    ragged=True,
    standardize=lambda x:tf.concat([["s"], x, ["e"]], axis=-1),
    split=None,
    vocabulary=constants.get_letters_vocabulary(),
    )
diac_tok = keras.layers.TextVectorization(
    standardize=lambda x:tf.concat([[" "], x, [" "]], axis=-1),
    ragged=True,
    split=None,
    vocabulary=constants.get_diac_vocabulary()
    )

In [3]:
print(letters_tok.vocabulary_size(), letters_tok.get_vocabulary())

41 ['', '[UNK]', ' ', 's', 'e', 'ء', 'آ', 'أ', 'ؤ', 'إ', 'ئ', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي']


In [4]:
print(diac_tok.vocabulary_size(), diac_tok.get_vocabulary())

17 ['', '[UNK]', ' ', 'ٌ', 'ِ', 'َ', 'ٍ', 'ْ', 'ّ', 'ً', 'ُ', 'ٌّ', 'ِّ', 'َّ', 'ٍّ', 'ًّ', 'ُّ']


In [7]:
data_loader = DataLoader(letters_tok, diac_tok)

In [10]:
ca_train_raw = data_loader.from_csv('./dataset/Tashkeela-clean/CA/CA_train.csv')
ca_val_raw = data_loader.from_csv('./dataset/Tashkeela-clean/CA/CA_val.csv')
ca_test_raw = data_loader.from_csv('./dataset/Tashkeela-clean/CA/CA_test.csv')

msa_train_raw = data_loader.from_csv('./dataset/Tashkeela-clean/MSA/MSA_train.csv')
msa_val_raw = data_loader.from_csv('./dataset/Tashkeela-clean/MSA/MSA_val.csv')
msa_test_raw = data_loader.from_csv('./dataset/Tashkeela-clean/MSA/MSA_test.csv')

In [28]:
sample = next(iter(ca_train_raw))
sample

<tf.Tensor: shape=(), dtype=string, numpy=b'\xd9\x88\xd9\x8e\xd9\x81\xd9\x90\xd9\x8a \xd8\xa7\xd9\x84\xd9\x92\xd8\xa3\xd9\x8e\xd9\x88\xd9\x91\xd9\x8e\xd9\x84\xd9\x90 \xd8\xa7\xd9\x84\xd9\x92\xd9\x85\xd9\x8e\xd9\x82\xd9\x92\xd8\xaa\xd9\x8f\xd9\x88\xd9\x84\xd9\x8f \xd9\x83\xd9\x8e\xd8\xa7\xd9\x86\xd9\x8e \xd9\x82\xd9\x8e\xd8\xa7\xd8\xb5\xd9\x90\xd8\xaf\xd9\x8b\xd8\xa7 \xd8\xa5\xd9\x84\xd9\x8e\xd9\x89 \xd9\x82\xd9\x8e\xd8\xaa\xd9\x92\xd9\x84\xd9\x90 \xd8\xb5\xd9\x8e\xd8\xa7\xd8\xad\xd9\x90\xd8\xa8\xd9\x90\xd9\x87\xd9\x90'>

In [30]:
sample.numpy().decode('utf-8')

'وَفِي الْأَوَّلِ الْمَقْتُولُ كَانَ قَاصِدًا إلَى قَتْلِ صَاحِبِهِ'

In [31]:
ca_train = data_loader.process_ds(ca_train_raw)
ca_val = data_loader.process_ds(ca_val_raw)
ca_test = data_loader.process_ds(ca_test_raw)

msa_train = data_loader.process_ds(msa_train_raw)
msa_val = data_loader.process_ds(msa_val_raw)
msa_test = data_loader.process_ds(msa_test_raw)

In [34]:
sen, diac = next(iter(msa_train))

In [35]:
sen

<tf.Tensor: shape=(32, 132), dtype=int64, numpy=
array([[ 3, 33, 11, ...,  0,  0,  0],
       [ 3, 31, 11, ...,  0,  0,  0],
       [ 3, 31, 40, ...,  0,  0,  0],
       ...,
       [ 3, 38, 34, ..., 11, 36,  4],
       [ 3, 18, 34, ...,  0,  0,  0],
       [ 3, 11, 34, ...,  0,  0,  0]], dtype=int64)>

In [36]:
diac

<tf.Tensor: shape=(32, 132), dtype=int64, numpy=
array([[2, 5, 2, ..., 0, 0, 0],
       [2, 5, 2, ..., 0, 0, 0],
       [2, 4, 2, ..., 0, 0, 0],
       ...,
       [2, 5, 5, ..., 2, 4, 2],
       [2, 4, 5, ..., 0, 0, 0],
       [2, 2, 2, ..., 0, 0, 0]], dtype=int64)>

In [38]:
diacritizer = Diacritizer(letters_tok, diac_tok)

In [39]:
dec_sen = diacritizer.decode_sentences(sen)
dec_diac = diacritizer.decode_diacritics(diac)

In [47]:
"".join(dec_sen[0])[1:-1]

'كان العمل السائد القائم عملا سلبيا هداما يرتد على الجميع ضررا وإنهاكا وتبديدا للقوى'

In [48]:
"".join(dec_diac[0])[1:-1]

'َ َ  ََُْ   َّ ُِ  َْ ُِ ًََ  ًَِّْ  ََّ ً  ََُّْ ََ   َِْ ِ ًََ  ََِْ ً  ََِْ ً  َُِْ '

In [42]:
eof_idx = dec_sen[0].index('e')
Preprocessor.combine_tashkeel(dec_sen[0][1:eof_idx], dec_diac[0][1:eof_idx])

'كَانَ الْعَمَلُ السَّائِدُ الْقَائِمُ عَمَلًا سَلْبِيًّا هَدَّامًا يَرْتَدُّ عَلَى الْجَمِيعِ ضَرَرًا وَإِنْهَاكًا وَتَبْدِيدًا لِلْقُوَى'

In [21]:
def create_model():
    inputs = keras.layers.Input(shape=(None,), dtype=tf.int64)
    x = keras.layers.Embedding(len(letters_tok.get_vocabulary())+1, 128, mask_zero=True)(inputs)
    x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True, dropout=0.4))(x)
    x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True, dropout=0.4))(x)
    x = keras.layers.Dense(512, activation='relu')(x)
    x = keras.layers.Dense(len(diac_tok.get_vocabulary())+1, activation='relu')(x)
    model = keras.models.Model(inputs, x)
    return model

In [22]:
model = create_model()

In [23]:
model.compile(optimizer=keras.optimizers.Adam(0.001), loss=masked_loss)

In [24]:
model.summary()

In [25]:
model.fit(
    msa_train.repeat(),
    steps_per_epoch=200,
    validation_data=msa_val,
    epochs=2,
    )

Epoch 1/2
[1m 15/200[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m54s[0m 296ms/step - loss: 2.6516

KeyboardInterrupt: 

In [190]:
model.evaluate(msa_test)

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 105ms/step - loss: 0.3005


0.29762324690818787

In [191]:
smp = next(iter(msa_test))

In [240]:
smp

(<tf.Tensor: shape=(32, 143), dtype=int64, numpy=
 array([[19, 22,  5, ...,  0,  0,  0],
        [19,  8,  6, ...,  0,  0,  0],
        [19, 10, 11, ...,  0,  0,  0],
        ...,
        [19,  8,  3, ...,  0,  0,  0],
        [19, 11,  4, ...,  0,  0,  0],
        [19,  8, 15, ...,  0,  0,  0]], dtype=int64)>,
 <tf.Tensor: shape=(32, 143), dtype=int64, numpy=
 array([[2, 3, 5, ..., 0, 0, 0],
        [2, 3, 4, ..., 0, 0, 0],
        [2, 6, 3, ..., 0, 0, 0],
        ...,
        [2, 3, 2, ..., 0, 0, 0],
        [2, 3, 3, ..., 0, 0, 0],
        [2, 3, 3, ..., 0, 0, 0]], dtype=int64)>)

In [192]:
res = model(smp[0])

In [193]:
decoded_sentences = decode_sentences(smp[0])
decoded_diacritics = decode_diacritics(smp[1])
decoded_diacritics_p = decode_diacritics(tf.argmax(res, -1))

In [220]:
idx = 4

In [221]:
sen = tf.strings.reduce_join(decoded_sentences[idx], axis=-1).numpy()[1:-1]
sen.decode('utf-8'), len(sen)

('وإن غلب جوهر ظلمته على جوهر نوره وظهرت جسمانيته على روحانيته فقد فضل على الشيطان',
 146)

In [222]:
a = Preprocessor.combine_tashkeel(decoded_sentences[idx][1:decoded_sentences[idx].index('e')], decoded_diacritics[idx][1:decoded_sentences[idx].index('e')])
a

'وَإِنْ غَلَبَ جَوْهَرُ ظُلْمَتِهِ عَلَى جَوْهَرِ نُورِهِ وَظَهَرَتْ جِسْمَانِيَّتُهُ عَلَى رُوحَانِيَّتِهِ فَقَدْ فَضَلَ عَلَى الشَّيْطَانِ'

In [223]:
b = Preprocessor.combine_tashkeel(decoded_sentences[idx][1:decoded_sentences[idx].index('e')], decoded_diacritics_p[idx][1:decoded_sentences[idx].index('e')])
b

'وَإِنْ غَلَبَ جَوهَرَ ظَلَمَتَهُ عَلَى جُوهَرِ نُوْرِهِ وَظَهَرَتْ جُسْمَانِيَّتَهُ عَلَى رُوحَانِيَّتِهِ فَقَدْ فَضَلَ عَلَى الشَّيْطَانِ'

In [224]:
from diacritization_evaluation import wer, der

In [277]:
def calculate_metrics(model, ds):
    # Works on batched datasets only
    def combine_per_sen(sen, diac):
        eof_idx = sen.index('e')
        res = Preprocessor.combine_tashkeel(sen[1:eof_idx], diac[1:eof_idx])
        return res
    
    ds_len = (len(ds) * ds._batch_size.numpy())
    res = {"wer":0, "wer*":0, "der":0, "der*":0}
    for sen, diac in tqdm(ds):
        preds = model(sen)
        decoded_sentences = decode_sentences(sen)
        decoded_diacritics_t = decode_diacritics(diac)
        decoded_diacritics_p = decode_diacritics(tf.argmax(preds, -1))
        idx = 0
        while idx < len(decoded_sentences):
            true_diac = combine_per_sen(decoded_sentences[idx], decoded_diacritics_t[idx])
            pred_diac = combine_per_sen(decoded_sentences[idx], decoded_diacritics_p[idx])
            res["wer"] += wer.calculate_wer(true_diac, pred_diac) / ds_len
            res["wer*"] += wer.calculate_wer(true_diac, pred_diac, case_ending=False) / ds_len
            res["der"] += der.calculate_der(true_diac, pred_diac) / ds_len
            res["der*"] += der.calculate_der(true_diac, pred_diac, case_ending=False) / ds_len
            idx+=1
    
    return res

In [278]:
res=calculate_metrics(model, msa_test)

100%|██████████| 47/47 [02:10<00:00,  2.78s/it]


In [279]:
res

{'wer': 37.763051861702166,
 'wer*': 25.14950797872337,
 'der': 10.22831117021276,
 'der*': 8.052779255319125}

: 