In [1]:
import tensorflow as tf
import keras
from utils import constants
from utils.preprocessor import Preprocessor
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from modules.data_loader import DataLoader
from modules.losses import masked_loss
from modules.diacritizer import Diacritizer

In [2]:
# Load each Tashkeela-clean split into its own DataFrame: CA subset first, then MSA.
ca_train_df = pd.read_csv('./dataset/Tashkeela-clean/CA/CA_train.csv')
ca_val_df = pd.read_csv('./dataset/Tashkeela-clean/CA/CA_val.csv')
ca_test_df = pd.read_csv('./dataset/Tashkeela-clean/CA/CA_test.csv')

msa_train_df = pd.read_csv('./dataset/Tashkeela-clean/MSA/MSA_train.csv')
msa_val_df = pd.read_csv('./dataset/Tashkeela-clean/MSA/MSA_val.csv')
msa_test_df = pd.read_csv('./dataset/Tashkeela-clean/MSA/MSA_test.csv')

In [3]:
ca_train_df

Unnamed: 0,text,words,chars,chars_no_diac
0,وَفِي بَعْضِ النُّسَخِ بِالْإِضَافَةِ فَصَالَح...,60,479,285
1,غَيْرِ وَلِيِّ مَنْ ذُكِرَ دَفْعُ سِنٍّ أَعْلَ...,14,100,60
2,وَمِمَّنْ حَكَى أَنْ يُعَلِّمُ بِمَعْنَى أَعْل...,9,90,51
3,وَيَخْرُجُ بِتَعْبِيرِ الْمَالِ الْمَنْفَعَةُ ...,15,156,91
4,زَوْجَانِ كَافِرَانِ أَسْلَمَتْ الْمَرْأَةُ وَ...,28,248,145
...,...,...,...,...
2365073,إنْ ظَهَرَتْ قَرِينَةٌ تُقَوِّي صِدْقَ السَّيّ...,25,202,117
2365074,لِأَنَّهُ يُعَارِضُ ظَاهِرَ الزَّوْجِ بِالْيَد...,8,70,41
2365075,وَإِلَّا لَمْ يَقَعْ فَرْضًا وَلَا نَفْلًا قَا...,9,73,43
2365076,رَوَاهُ مُسْلِمٌ فِى الصَّحِيحِ عَنْ عَمْرٍو ا...,9,70,43


In [4]:
def _wrap_with(start, end):
    """Return a `standardize` callable that concatenates `start` and `end` around its input."""
    return lambda x: tf.concat([[start], x, [end]], axis=-1)


# Letter tokenizer: every sequence is wrapped in the "s" / "e" boundary tokens.
# split=None means the layer does no splitting of its own
# (presumably inputs arrive already split into characters -- confirm in DataLoader).
letters_tok = keras.layers.TextVectorization(
    standardize=_wrap_with("s", "e"),
    split=None,
    ragged=True,
    vocabulary=constants.get_letters_vocabulary(),
)

# Diacritic tokenizer: the boundary positions are filled with a single space.
diac_tok = keras.layers.TextVectorization(
    standardize=_wrap_with(" ", " "),
    split=None,
    ragged=True,
    vocabulary=constants.get_diac_vocabulary(),
)

In [5]:
print(letters_tok.vocabulary_size(), letters_tok.get_vocabulary())

41 ['', '[UNK]', ' ', 's', 'e', 'ء', 'آ', 'أ', 'ؤ', 'إ', 'ئ', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي']


In [6]:
print(diac_tok.vocabulary_size(), diac_tok.get_vocabulary())

17 ['', '[UNK]', ' ', 'ْ', 'ٌ', 'ً', 'ٍ', 'ِ', 'َ', 'ّ', 'ُ', 'ٌّ', 'ٍّ', 'ُّ', 'ِّ', 'َّ', 'ًّ']


In [7]:
data_loader = DataLoader(letters_tok, diac_tok)

In [8]:
# Build the raw (unprocessed) datasets for every split, CA subset first, then MSA.
ca_train_raw, ca_val_raw, ca_test_raw = (
    data_loader.from_csv(csv_path)
    for csv_path in (
        './dataset/Tashkeela-clean/CA/CA_train.csv',
        './dataset/Tashkeela-clean/CA/CA_val.csv',
        './dataset/Tashkeela-clean/CA/CA_test.csv',
    )
)

msa_train_raw, msa_val_raw, msa_test_raw = (
    data_loader.from_csv(csv_path)
    for csv_path in (
        './dataset/Tashkeela-clean/MSA/MSA_train.csv',
        './dataset/Tashkeela-clean/MSA/MSA_val.csv',
        './dataset/Tashkeela-clean/MSA/MSA_test.csv',
    )
)

In [9]:
sample = next(iter(ca_train_raw))
sample

<tf.Tensor: shape=(), dtype=string, numpy=b'\xd9\x88\xd9\x8e\xd9\x81\xd9\x90\xd9\x8a \xd8\xa8\xd9\x8e\xd8\xb9\xd9\x92\xd8\xb6\xd9\x90 \xd8\xa7\xd9\x84\xd9\x86\xd9\x91\xd9\x8f\xd8\xb3\xd9\x8e\xd8\xae\xd9\x90 \xd8\xa8\xd9\x90\xd8\xa7\xd9\x84\xd9\x92\xd8\xa5\xd9\x90\xd8\xb6\xd9\x8e\xd8\xa7\xd9\x81\xd9\x8e\xd8\xa9\xd9\x90 \xd9\x81\xd9\x8e\xd8\xb5\xd9\x8e\xd8\xa7\xd9\x84\xd9\x8e\xd8\xad\xd9\x8e \xd8\xa7\xd9\x84\xd8\xb1\xd9\x91\xd9\x8e\xd8\xac\xd9\x8f\xd9\x84\xd9\x8f \xd8\xa7\xd9\x84\xd9\x92\xd9\x85\xd9\x8e\xd8\xb1\xd9\x90\xd9\x8a\xd8\xb6\xd9\x8e \xd8\xb9\xd9\x8e\xd9\x84\xd9\x8e\xd9\x89 \xd8\xac\xd9\x8f\xd8\xb1\xd9\x92\xd8\xad\xd9\x90\xd9\x87\xd9\x90 \xd9\x81\xd9\x90\xd9\x8a \xd8\xad\xd9\x8e\xd8\xa7\xd9\x84\xd9\x90 \xd9\x85\xd9\x8e\xd8\xb1\xd9\x8e\xd8\xb6\xd9\x90\xd9\x87\xd9\x90 \xd9\x85\xd9\x90\xd9\x86\xd9\x92 \xd8\xa7\xd9\x84\xd9\x92\xd8\xac\xd9\x8f\xd8\xb1\xd9\x92\xd8\xad\xd9\x90 \xd8\xa8\xd9\x90 \xd9\x85\xd9\x8e\xd8\xa7\xd9\x84\xd9\x8d \xd9\x82\xd9\x8e\xd8\xaf\xd9\x92\xd8\xb1\xd9\x90 \xd

In [10]:
sample.numpy().decode('utf-8')

'وَفِي بَعْضِ النُّسَخِ بِالْإِضَافَةِ فَصَالَحَ الرَّجُلُ الْمَرِيضَ عَلَى جُرْحِهِ فِي حَالِ مَرَضِهِ مِنْ الْجُرْحِ بِ مَالٍ قَدْرِ أَرْشِهِ أَيْ دِيَةِ الْجُرْحِ أَوْ غَيْرِهِ أَيْ الْأَرْشِ صَادِقٌ بِأَقَلَّ وَأَكْثَرَ مِنْهُ ثُمَّ مَاتَ الْمَرِيضُ مِنْ مَرَضِهِ مِنْ ذَلِكَ الْجُرْحِ جَازَ صُلْحُهُ ابْتِدَاءً وَلَزِمَ صُلْحُهُ بَعْدَ وُقُوعِهِ فَلَيْسَ لِوَارِثِهِ نَقْضُهُ إذْ لِلْمَرِيضِ الْعَفْوُ عَنْ جَارِحِهِ عَمْدًا عُدْوَانًا مَجَّانًا وَإِنْ لَمْ يَكُنْ لَهُ مَالٌ'

In [11]:
# Run every raw split through DataLoader.process_ds (same order as they were loaded).
ca_train, ca_val, ca_test = map(
    data_loader.process_ds, (ca_train_raw, ca_val_raw, ca_test_raw)
)

msa_train, msa_val, msa_test = map(
    data_loader.process_ds, (msa_train_raw, msa_val_raw, msa_test_raw)
)

In [14]:
msa_train

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [15]:
sen, diac = next(iter(msa_train))

In [16]:
sen

<tf.Tensor: shape=(32, 262), dtype=int64, numpy=
array([[ 3, 36, 23, ...,  0,  0,  0],
       [ 3, 31, 24, ...,  0,  0,  0],
       [ 3, 38, 32, ...,  0,  0,  0],
       ...,
       [ 3, 11, 34, ...,  0,  0,  0],
       [ 3,  9, 34, ...,  0,  0,  0],
       [ 3, 38, 16, ...,  0,  0,  0]], dtype=int64)>

In [17]:
diac

<tf.Tensor: shape=(32, 262), dtype=int64, numpy=
array([[ 2,  7,  3, ...,  0,  0,  0],
       [ 2,  8,  7, ...,  0,  0,  0],
       [ 2,  8,  8, ...,  0,  0,  0],
       ...,
       [ 2,  2, 15, ...,  0,  0,  0],
       [ 2,  7,  8, ...,  0,  0,  0],
       [ 2,  8,  8, ...,  0,  0,  0]], dtype=int64)>

In [19]:
diacritizer = Diacritizer(None, letters_tok, diac_tok)

In [20]:
dec_sen = diacritizer.decode_sentences(sen)
dec_diac = diacritizer.decode_diacritics(diac)

In [21]:
"".join(dec_sen[0])[1:-1]

'نسبة إلى الكعك الذي تملك هذا الحيز منه'

In [22]:
"".join(dec_diac[0])[1:-1]

'ًَِْ  َ   َِْْ  َِّ  َُِْ ََ   ََِّْ ُِْ'

In [23]:
# Re-attach the diacritics to the first decoded sentence, keeping only the
# positions between the leading boundary token and the 'e' end marker.
first_letters, first_diacs = dec_sen[0], dec_diac[0]
eof_idx = first_letters.index('e')
Preprocessor.combine_tashkeel(first_letters[1:eof_idx], first_diacs[1:eof_idx])

'نِسْبَةً إلَى الْكَعْكِ الَّذِي تَمْلِكُ هَذَا الْحَيِّزَ مِنْهُ'

In [24]:
from models.rnn_model import RNNModel

In [25]:
model = RNNModel(embedding_dims=128, rnn_type='lstm', rnn_layers=2, rnn_units=128, dense_layers=1, dense_units=512)

In [None]:
def create_model(embedding_dims=128, rnn_layers=2, rnn_units=128, dropout=0.4,
                 dense_layers=1, dense_units=512, output_activation='relu'):
    """Build a functional BiLSTM tagger that maps letter ids to diacritic scores.

    Calling it with no arguments builds exactly the architecture that used to be
    hard-coded here; the keyword names mirror the `RNNModel(...)` call used
    elsewhere in this notebook.

    Args:
        embedding_dims: Size of the letter embedding vectors.
        rnn_layers: Number of stacked bidirectional LSTM layers.
        rnn_units: Units per LSTM direction.
        dropout: Input dropout rate applied inside every LSTM.
        dense_layers: Number of hidden ReLU dense layers after the RNN stack.
        dense_units: Width of each hidden dense layer.
        output_activation: Activation of the final per-position layer.

    Returns:
        An uncompiled `keras.Model` taking an int64 tensor of shape
        (batch, time) and returning one score vector per position.
    """
    # NOTE(review): a ReLU on the output layer clamps every class score to >= 0,
    # which is unusual for a classifier head. Depending on whether `masked_loss`
    # expects logits or probabilities this should probably be None or 'softmax'
    # -- confirm against modules/losses.py before relying on this model. The
    # default is left at 'relu' so existing behaviour does not change silently.

    inputs = keras.layers.Input(shape=(None,), dtype=tf.int64)

    # vocabulary_size() already counts the padding ('') and '[UNK]' entries, so
    # the extra +1 only adds one unused row; it is kept so that layer shapes
    # stay identical to the original definition.
    x = keras.layers.Embedding(
        letters_tok.vocabulary_size() + 1, embedding_dims, mask_zero=True
    )(inputs)

    for _ in range(rnn_layers):
        x = keras.layers.Bidirectional(
            keras.layers.LSTM(rnn_units, return_sequences=True, dropout=dropout)
        )(x)

    for _ in range(dense_layers):
        x = keras.layers.Dense(dense_units, activation='relu')(x)

    outputs = keras.layers.Dense(
        diac_tok.vocabulary_size() + 1, activation=output_activation
    )(x)
    return keras.models.Model(inputs, outputs)

In [None]:
model = create_model()

In [26]:
model.compile(optimizer=keras.optimizers.Adam(0.001), loss=masked_loss)

In [27]:
model.build(input_shape=(None, None))

In [28]:
model.summary()

In [30]:
# The training set is repeated indefinitely, so `steps_per_epoch` is what ends
# an epoch: each of the 2 epochs consumes 200 batches before validating on msa_val.
model.fit(
    msa_train.repeat(),
    epochs=2,
    steps_per_epoch=200,
    validation_data=msa_val,
)

Epoch 1/2


[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 389ms/step - loss: 1.6180 - val_loss: 0.7215
Epoch 2/2
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 361ms/step - loss: 0.6398 - val_loss: 0.4669


<keras.src.callbacks.history.History at 0x26ad9d6be60>

In [31]:
model.evaluate(msa_test)

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 119ms/step - loss: 0.4700


0.4669398069381714

In [32]:
smp = next(iter(msa_test))

In [33]:
smp

(<tf.Tensor: shape=(32, 186), dtype=int64, numpy=
 array([[ 3, 38, 33, ...,  0,  0,  0],
        [ 3, 40, 37, ...,  0,  0,  0],
        [ 3, 34, 40, ...,  0,  0,  0],
        ...,
        [ 3, 35, 29, ...,  0,  0,  0],
        [ 3, 38, 12, ...,  0,  0,  0],
        [ 3,  7, 40, ...,  0,  0,  0]], dtype=int64)>,
 <tf.Tensor: shape=(32, 186), dtype=int64, numpy=
 array([[2, 8, 8, ..., 0, 0, 0],
        [2, 8, 3, ..., 0, 0, 0],
        [2, 7, 8, ..., 0, 0, 0],
        ...,
        [2, 8, 8, ..., 0, 0, 0],
        [2, 8, 7, ..., 0, 0, 0],
        [2, 8, 7, ..., 0, 0, 0]], dtype=int64)>)

In [34]:
res = model(smp[0])

In [36]:
# Turn the sampled batch back into characters: the input letters, the reference
# diacritics, and the model's highest-scoring diacritic at every position.
batch_letters, batch_diacs = smp
predicted_ids = tf.argmax(res, axis=-1)

decoded_sentences = diacritizer.decode_sentences(batch_letters)
decoded_diacritics = diacritizer.decode_diacritics(batch_diacs)
decoded_diacritics_p = diacritizer.decode_diacritics(predicted_ids)

In [37]:
idx = 4

In [38]:
# Join the decoded tokens of sentence `idx` into a single byte string, then drop
# its first and last byte (presumably the 's' / 'e' boundary markers -- both
# are single-byte ASCII).
joined_bytes = tf.strings.reduce_join(decoded_sentences[idx], axis=-1).numpy()
sen = joined_bytes[1:-1]
# `sen` is a bytes object, so len() below is the UTF-8 byte length, not the
# number of characters in the decoded text.
sen.decode('utf-8'), len(sen)

('والأحمر يمينه والأسود على الطريق من المدينة إلى بريدة والأحمر يناظره من الجنوب',
 144)

In [39]:
# Reference diacritization of sentence `idx`: keep the positions between the
# leading boundary token and the 'e' end marker, then merge letters and marks.
true_letters = decoded_sentences[idx]
true_end = true_letters.index('e')
a = Preprocessor.combine_tashkeel(
    true_letters[1:true_end],
    decoded_diacritics[idx][1:true_end],
)
a

'وَالْأَحْمَرَ يَمِينَهُ وَالْأَسْوَدُ عَلَى الطَّرِيقِ مِنْ الْمَدِينَةِ إلَى بُرَيْدَةَ وَالْأَحْمَرُ يُنَاظِرُهُ مِنْ الْجُنُوبِ'

In [40]:
# Predicted diacritization of sentence `idx`, cut at the same 'e' end marker
# of the input letters so it lines up with the reference string.
pred_letters = decoded_sentences[idx]
pred_end = pred_letters.index('e')
b = Preprocessor.combine_tashkeel(
    pred_letters[1:pred_end],
    decoded_diacritics_p[idx][1:pred_end],
)
b

'وَالْأَحْمَرُ يَمِينَهُ وَالْأَسُودُ عَلَى الطَّرِيقِ مِنَ الْمَدِينَةِ إِلَى بَرِيدَةً وَالْأَحْمَرِ يَنَاظَرُهُ مِنَ الْجُنُوبِ'

In [None]:
from diacritization_evaluation import wer, der

In [None]:
def calculate_metrics(model, ds):
    """Average WER / DER of `model` over a batched dataset.

    Every sentence in `ds` is decoded back to text twice -- once with the
    reference diacritics and once with the model's arg-max predictions -- and
    the two strings are scored with `diacritization_evaluation`.

    Decoding goes through the notebook-level `diacritizer` object, so that
    object must exist before this function is called.

    Args:
        model: Callable mapping a batch of letter ids to per-position scores
            over the diacritic vocabulary.
        ds: Batched dataset yielding `(letter_ids, diacritic_ids)` pairs.
            Works on batched datasets only.

    Returns:
        Dict with the per-sentence mean of each metric: "wer" and "der" with
        case endings, "wer*" and "der*" with `case_ending=False`. All values
        are 0.0 if `ds` yields no sentences.
    """
    def combine_per_sen(sen, diac):
        # Keep only the positions between the leading boundary token and the
        # 'e' end marker, then merge letters with their diacritics.
        eof_idx = sen.index('e')
        return Preprocessor.combine_tashkeel(sen[1:eof_idx], diac[1:eof_idx])

    totals = {"wer": 0.0, "wer*": 0.0, "der": 0.0, "der*": 0.0}
    # Count the sentences actually scored instead of assuming
    # len(ds) * batch_size: that relied on the private `_batch_size` attribute
    # and over-counted whenever the last batch was smaller than the others.
    n_sentences = 0

    for sen, diac in tqdm(ds):
        preds = model(sen)
        # The decode helpers are methods of `diacritizer`; calling them as bare
        # names raised NameError on a fresh kernel.
        decoded_sentences = diacritizer.decode_sentences(sen)
        decoded_diacritics_t = diacritizer.decode_diacritics(diac)
        decoded_diacritics_p = diacritizer.decode_diacritics(tf.argmax(preds, -1))

        for letters, diac_true, diac_pred in zip(
            decoded_sentences, decoded_diacritics_t, decoded_diacritics_p
        ):
            true_diac = combine_per_sen(letters, diac_true)
            pred_diac = combine_per_sen(letters, diac_pred)
            totals["wer"] += wer.calculate_wer(true_diac, pred_diac)
            totals["wer*"] += wer.calculate_wer(true_diac, pred_diac, case_ending=False)
            totals["der"] += der.calculate_der(true_diac, pred_diac)
            totals["der*"] += der.calculate_der(true_diac, pred_diac, case_ending=False)
            n_sentences += 1

    if n_sentences == 0:
        return totals
    return {name: total / n_sentences for name, total in totals.items()}

In [None]:
res=calculate_metrics(model, msa_test)

100%|██████████| 47/47 [02:10<00:00,  2.78s/it]


In [None]:
res

{'wer': 37.763051861702166,
 'wer*': 25.14950797872337,
 'der': 10.22831117021276,
 'der*': 8.052779255319125}

: 