In [79]:
import tensorflow as tf
import keras
from utils import constants
from utils.preprocessor import Preprocessor
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
ca_train_df, ca_val_df, ca_test_df = pd.read_csv('./dataset/Tashkeela-clean-V2.0/CA/CA_train.csv'), pd.read_csv('./dataset/Tashkeela-clean-V2.0/CA/CA_val.csv'), pd.read_csv('./dataset/Tashkeela-clean-V2.0/CA/CA_test.csv')
msa_train_df, msa_val_df, msa_test_df = pd.read_csv('./dataset/Tashkeela-clean-V2.0/MSA/MSA_train.csv'), pd.read_csv('./dataset/Tashkeela-clean-V2.0/MSA/MSA_val.csv'), pd.read_csv('./dataset/Tashkeela-clean-V2.0/MSA/MSA_test.csv')

In [3]:
ca_train_df

Unnamed: 0,text,words,chars,chars_no_diac
0,وَفِي بَعْضِ النُّسَخِ بِالْإِضَافَةِ فَصَالَح...,60,479,285
1,غَيْرِ وَلِيِّ مَنْ ذُكِرَ دَفْعُ سِنٍّ أَعْلَ...,14,100,60
2,وَمِمَّنْ حَكَى أَنْ يُعَلِّمُ بِمَعْنَى أَعْل...,9,90,51
3,وَيَخْرُجُ بِتَعْبِيرِ الْمَالِ الْمَنْفَعَةُ ...,15,156,91
4,زَوْجَانِ كَافِرَانِ أَسْلَمَتْ الْمَرْأَةُ وَ...,28,248,145
...,...,...,...,...
2365073,إنْ ظَهَرَتْ قَرِينَةٌ تُقَوِّي صِدْقَ السَّيّ...,25,202,117
2365074,لِأَنَّهُ يُعَارِضُ ظَاهِرَ الزَّوْجِ بِالْيَد...,8,70,41
2365075,وَإِلَّا لَمْ يَقَعْ فَرْضًا وَلَا نَفْلًا قَا...,9,73,43
2365076,رَوَاهُ مُسْلِمٌ فِى الصَّحِيحِ عَنْ عَمْرٍو ا...,9,70,43


In [4]:
ca_train_raw = tf.data.Dataset.from_tensor_slices(ca_train_df['text'])
ca_val_raw = tf.data.Dataset.from_tensor_slices(ca_val_df['text'])
ca_test_raw = tf.data.Dataset.from_tensor_slices(ca_test_df['text'])

msa_train_raw = tf.data.Dataset.from_tensor_slices(msa_train_df['text'])
msa_val_raw = tf.data.Dataset.from_tensor_slices(msa_val_df['text'])
msa_test_raw = tf.data.Dataset.from_tensor_slices(msa_test_df['text'])

In [5]:
def tf_strip_tashkeel(inputs):
    @tf.py_function(Tout=(tf.string, tf.string))
    def strip_tashkeel(inputs):
        text = inputs.numpy().decode('utf-8')
        text, tashkeel = Preprocessor.strip_tashkeel(text)
        text = tf.convert_to_tensor(text, dtype=tf.string)
        tashkeel = tf.convert_to_tensor(tashkeel, dtype=tf.string)
        return text, tashkeel
    text, tashkeel = strip_tashkeel(inputs)
    text.set_shape((None, ))
    tashkeel.set_shape((None, ))
    return text, tashkeel

In [6]:
letters_tok = keras.layers.TextVectorization(
    ragged=True,
    standardize=lambda x:tf.concat([["s"], x, ["e"]], axis=-1),
    split=None,
    )
diac_tok = keras.layers.TextVectorization(
    standardize=lambda x:tf.concat([[" "], x, [" "]], axis=-1),
    ragged=True,
    split=None
    )

In [7]:
letters_tok.adapt(msa_train_raw.take(200).map(tf_strip_tashkeel).map(lambda x, y: x))
diac_tok.adapt(msa_train_raw.take(200).map(tf_strip_tashkeel).map(lambda x, y: y))

In [8]:
lt_word_to_idx = keras.layers.StringLookup(vocabulary=letters_tok.get_vocabulary(), mask_token='')
lt_idx_to_word = keras.layers.StringLookup(vocabulary=letters_tok.get_vocabulary(), mask_token='', invert=True)

In [9]:
diac_word_to_idx = keras.layers.StringLookup(vocabulary=diac_tok.get_vocabulary(), mask_token='')
diac_idx_to_word = keras.layers.StringLookup(vocabulary=diac_tok.get_vocabulary(), mask_token='', invert=True)

In [10]:
def process_ds(ds, batch_size=32, shuffle_buffer=1000):
    ds = (
        ds.map(tf_strip_tashkeel, tf.data.AUTOTUNE)
        .map(lambda x, y: (letters_tok(x), diac_tok(y)), tf.data.AUTOTUNE)
        .shuffle(shuffle_buffer)
        .padded_batch(batch_size)
        )
    
    return ds

In [11]:
msa_train = process_ds(msa_train_raw)
msa_val = process_ds(msa_val_raw)
msa_test = process_ds(msa_test_raw)

ca_train = process_ds(ca_train_raw)
ca_val = process_ds(ca_val_raw)
ca_test = process_ds(ca_test_raw)

In [33]:
sample = next(iter(msa_train))

In [159]:
def decode_sentences(sentences):
    # work on batch size
    return [[char.decode('utf-8') for char in l.numpy()] for l in lt_idx_to_word(sentences)]

def decode_diacritics(diacritics):
    # work on batch size
    return [[char.decode('utf-8') for char in d.numpy()] for d in diac_idx_to_word(diacritics)]

In [160]:
dec_sen = decode_sentences(sample[0])
dec_diac = decode_diacritics(sample[1])

In [162]:
sen = tf.strings.reduce_join(dec_sen[0], axis=-1).numpy()[1:-1]
sen.decode('utf-8'), len(sen)

('غير أن الإسلام زحف على بلادهم من زمن بعيد فأسلمت أطرافها من كل اتجاه', 123)

In [163]:
eof_idx = dec_sen[0].index('e')
Preprocessor.combine_tashkeel(dec_sen[0][1:eof_idx], dec_diac[0][1:eof_idx])

'غَيْرَ أَنَّ الْإِسْلَامَ زَحَفَ عَلَى بِلَادِهِمْ مِنْ زَمَنٍ بَعِيدٍ فَأَسْلَمَتْ أَطْرَافُهَا مِنْ كُلِّ اتِّجَاهٍ'

In [42]:
next(iter(ca_train))

(<tf.Tensor: shape=(32, 293), dtype=int64, numpy=
 array([[19,  8, 23, ...,  0,  0,  0],
        [19,  8, 12, ...,  0,  0,  0],
        [19,  8, 12, ...,  0,  0,  0],
        ...,
        [19, 33,  6, ...,  0,  0,  0],
        [19,  8, 24, ...,  0,  0,  0],
        [19,  8, 28, ...,  0,  0,  0]], dtype=int64)>,
 <tf.Tensor: shape=(32, 293), dtype=int64, numpy=
 array([[2, 3, 3, ..., 0, 0, 0],
        [2, 3, 4, ..., 0, 0, 0],
        [2, 3, 6, ..., 0, 0, 0],
        ...,
        [2, 6, 7, ..., 0, 0, 0],
        [2, 3, 4, ..., 0, 0, 0],
        [2, 3, 3, ..., 0, 0, 0]], dtype=int64)>)

In [184]:
def create_model():
    inputs = keras.layers.Input(shape=(None,), dtype=tf.int64)
    x = keras.layers.Embedding(len(letters_tok.get_vocabulary())+1, 128, mask_zero=True)(inputs)
    x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True, dropout=0.4))(x)
    x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True, dropout=0.4))(x)
    x = keras.layers.Dense(512, activation='relu')(x)
    x = keras.layers.Dense(len(diac_tok.get_vocabulary())+1, activation='relu')(x)
    model = keras.models.Model(inputs, x)
    return model

In [185]:
model = create_model()

In [186]:
def masked_loss(labels, preds):
    loss_fn = keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=None
    )
    loss = loss_fn(labels, preds)

    mask = labels != 0
    mask = tf.cast(mask, loss.dtype)

    loss = loss * mask
    loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
    return loss

In [237]:
model.compile(optimizer=keras.optimizers.Adam(0.001), loss=masked_loss)

In [234]:
model.summary()

In [238]:
model.fit(
    msa_train.repeat(),
    steps_per_epoch=200,
    validation_data=msa_val,
    epochs=1,
    )

AttributeError: in user code:

    File "C:\Users\PrinceEGY\AppData\Local\Temp\ipykernel_28220\2738802094.py", line 3, in custom_met  *
        labels = labels.numpy()

    AttributeError: 'SymbolicTensor' object has no attribute 'numpy'


In [190]:
model.evaluate(msa_test)

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 105ms/step - loss: 0.3005


0.29762324690818787

In [191]:
smp = next(iter(msa_test))

In [240]:
smp

(<tf.Tensor: shape=(32, 143), dtype=int64, numpy=
 array([[19, 22,  5, ...,  0,  0,  0],
        [19,  8,  6, ...,  0,  0,  0],
        [19, 10, 11, ...,  0,  0,  0],
        ...,
        [19,  8,  3, ...,  0,  0,  0],
        [19, 11,  4, ...,  0,  0,  0],
        [19,  8, 15, ...,  0,  0,  0]], dtype=int64)>,
 <tf.Tensor: shape=(32, 143), dtype=int64, numpy=
 array([[2, 3, 5, ..., 0, 0, 0],
        [2, 3, 4, ..., 0, 0, 0],
        [2, 6, 3, ..., 0, 0, 0],
        ...,
        [2, 3, 2, ..., 0, 0, 0],
        [2, 3, 3, ..., 0, 0, 0],
        [2, 3, 3, ..., 0, 0, 0]], dtype=int64)>)

In [192]:
res = model(smp[0])

In [193]:
decoded_sentences = decode_sentences(smp[0])
decoded_diacritics = decode_diacritics(smp[1])
decoded_diacritics_p = decode_diacritics(tf.argmax(res, -1))

In [220]:
idx = 4

In [221]:
sen = tf.strings.reduce_join(decoded_sentences[idx], axis=-1).numpy()[1:-1]
sen.decode('utf-8'), len(sen)

('وإن غلب جوهر ظلمته على جوهر نوره وظهرت جسمانيته على روحانيته فقد فضل على الشيطان',
 146)

In [222]:
a = Preprocessor.combine_tashkeel(decoded_sentences[idx][1:decoded_sentences[idx].index('e')], decoded_diacritics[idx][1:decoded_sentences[idx].index('e')])
a

'وَإِنْ غَلَبَ جَوْهَرُ ظُلْمَتِهِ عَلَى جَوْهَرِ نُورِهِ وَظَهَرَتْ جِسْمَانِيَّتُهُ عَلَى رُوحَانِيَّتِهِ فَقَدْ فَضَلَ عَلَى الشَّيْطَانِ'

In [223]:
b = Preprocessor.combine_tashkeel(decoded_sentences[idx][1:decoded_sentences[idx].index('e')], decoded_diacritics_p[idx][1:decoded_sentences[idx].index('e')])
b

'وَإِنْ غَلَبَ جَوهَرَ ظَلَمَتَهُ عَلَى جُوهَرِ نُوْرِهِ وَظَهَرَتْ جُسْمَانِيَّتَهُ عَلَى رُوحَانِيَّتِهِ فَقَدْ فَضَلَ عَلَى الشَّيْطَانِ'

In [224]:
from diacritization_evaluation import wer, der

In [277]:
def calculate_metrics(model, ds):
    # Works on batched datasets only
    def combine_per_sen(sen, diac):
        eof_idx = sen.index('e')
        res = Preprocessor.combine_tashkeel(sen[1:eof_idx], diac[1:eof_idx])
        return res
    
    ds_len = (len(ds) * ds._batch_size.numpy())
    res = {"wer":0, "wer*":0, "der":0, "der*":0}
    for sen, diac in tqdm(ds):
        preds = model(sen)
        decoded_sentences = decode_sentences(sen)
        decoded_diacritics_t = decode_diacritics(diac)
        decoded_diacritics_p = decode_diacritics(tf.argmax(preds, -1))
        idx = 0
        while idx < len(decoded_sentences):
            true_diac = combine_per_sen(decoded_sentences[idx], decoded_diacritics_t[idx])
            pred_diac = combine_per_sen(decoded_sentences[idx], decoded_diacritics_p[idx])
            res["wer"] += wer.calculate_wer(true_diac, pred_diac) / ds_len
            res["wer*"] += wer.calculate_wer(true_diac, pred_diac, case_ending=False) / ds_len
            res["der"] += der.calculate_der(true_diac, pred_diac) / ds_len
            res["der*"] += der.calculate_der(true_diac, pred_diac, case_ending=False) / ds_len
            idx+=1
    
    return res

In [278]:
res=calculate_metrics(model, msa_test)

100%|██████████| 47/47 [02:10<00:00,  2.78s/it]


In [279]:
res

{'wer': 37.763051861702166,
 'wer*': 25.14950797872337,
 'der': 10.22831117021276,
 'der*': 8.052779255319125}