In [1]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Bidirectional, SimpleRNNCell, RNN, Dense,Layer

In [2]:
# import tensorflow as tf
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, LSTM, Bidirectional, SimpleRNNCell, RNN, Dense,Layer
class SelectHiddenState(Layer):
    def __init__(self, **kwargs):
        super(SelectHiddenState, self).__init__(**kwargs)

    def call(self, lstm_output, scalar_input):
        # Ensure scalar_input is an integer for indexing
        timestep_index = tf.cast(tf.squeeze(scalar_input, axis=-1), tf.int32)
        # Gather the specific hidden state for each batch
        selected_state = tf.gather(lstm_output, timestep_index, batch_dims=1, axis=1)
        return selected_state

# Example usage with your model
# lstm_output is from your BiLSTM layer
# scalar_input is your additional input
max_sentence_length = 100  # Maximum length of sentence embeddings
embedding_dim = 100        # Dimension of sentence embeddings
max_char_length = 15       # Maximum length of a word in characters
char_vocab_size = 36       # Number of unique characters
num_diacritics = 8         # Number of possible diacritics for each character, including no diacritic

# Parameters
lstm_units = 32

# Character input
char_input = Input(shape=(max_char_length, char_vocab_size))

# Inputs
sentence_input = Input(shape=(max_sentence_length, embedding_dim))
scalar_input = Input(shape=(1,), name='scalar_input')

# BiLSTM layer with return_state
bi_lstm = Bidirectional(LSTM(lstm_units, return_sequences=True, return_state=True))
bi_lstm_output, forward_h, forward_c, backward_h, backward_c = bi_lstm(sentence_input)
print( forward_h.shape, forward_c.shape, backward_h.shape, backward_c.shape)
#, forward_h, forward_c, backward_h, backward_c
# Average the forward and backward states (or choose another method to combine them)
select_state_layer = SelectHiddenState()
hidden_state_nth_timestep = select_state_layer(bi_lstm_output, scalar_input)

#hidden_state_nth_timestep = bi_lstm_output[:, scalar_input[1], :]
print(bi_lstm_output.shape)

# RNN layer with initial state from BiLSTM
rnn_cell = SimpleRNNCell(64)
rnn_layer = RNN(rnn_cell, return_sequences=True)
rnn_output = rnn_layer(char_input,initial_state=hidden_state_nth_timestep)

# Output layer
output_layer = Dense(num_diacritics, activation='softmax')(rnn_output)

# Build and compile the model
# Assuming sentence_input and scalar_input are defined as Input layers
model = Model(inputs=[sentence_input, char_input,scalar_input], outputs=output_layer)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

(None, 32) (None, 32) (None, 32) (None, 32)
(None, 100, 64)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 100, 100)]   0           []                               
                                                                                                  
 bidirectional (Bidirectional)  [(None, 100, 64),    34048       ['input_2[0][0]']                
                                 (None, 32),                                                      
                                 (None, 32),                                                      
                                 (None, 32),                                                      
                                 (None, 32)]                                                      
                                  

In [3]:
# X = 
# Y = 
from preprocessing import *

trainSet = readFile('dataset/train.txt')

In [4]:
sentences_without_diacritics, diacritics = get_sentences(trainSet[:300])

In [5]:
print(sentences_without_diacritics)

[['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي'], ['ابن', 'عرفة', 'قوله', 'بلفظ', 'يقتضيه', 'كإنكار', 'غير', 'حديث', 'بالإسلام', 'وجوب', 'ما', 'علم', 'وجوبه', 'من', 'الدين', 'ضرورة', 'كإلقاء', 'مصحف', 'بقذر', 'وشد', 'زنار', 'ابن', 'عرفة', 'قول']]


In [6]:
print(diacritics)

[[['َ', 'ْ', 'ُ', 'ُ'], ['َ', 'ْ'], ['َ', 'َ', 'َ'], ['_', 'ْ', 'َ', 'َّ', 'ُ'], ['َ', 'َ', 'ُ'], ['_', 'َ', 'ْ'], ['َ', '_', 'َ'], ['_', '_', 'َّ', 'ْ', 'َ', 'ِ', 'ُّ']], [['_', 'ْ', 'ُ'], ['َ', 'َ', 'َ', 'َ'], ['َ', 'ْ', 'ُ', 'ُ'], ['ِ', 'َ', 'ْ', 'ٍ'], ['َ', 'ْ', 'َ', 'ِ', '_', '_'], ['َ', 'ِ', 'ْ', 'َ', '_', 'ِ'], ['َ', 'ْ', 'ِ'], ['َ', 'ِ', '_', 'ٍ'], ['ِ', '_', 'ْ', 'ِ', 'ْ', 'َ', '_', 'ِ'], ['ُ', 'ُ', '_', 'َ'], ['َ', '_'], ['ُ', 'ِ', 'َ'], ['ُ', 'ُ', '_', 'ُ', 'ُ'], ['ِ', 'ْ'], ['_', '_', 'ِّ', '_', 'ِ'], ['َ', 'ُ', '_', 'َ', 'ً'], ['َ', 'ِ', 'ْ', 'َ', '_', 'ِ'], ['ُ', 'ْ', 'َ', 'ٍ'], ['ِ', 'َ', 'َ', 'ٍ'], ['َ', 'َ', 'ِّ'], ['ُ', 'َّ', '_', 'ٍ'], ['_', 'ْ', 'ُ'], ['َ', 'َ', 'َ', 'َ'], ['َ', 'ْ', '_']]]


In [7]:
word2vecmodel = makeWord2VecModel(sentences_without_diacritics)
keys = word2vecmodel.wv.key_to_index
print(keys)

{'قوله': 0, 'ابن': 1, 'عرفة': 2, 'غير': 3, 'أو': 4, 'قطع': 5, 'الأول': 6, 'يده': 7, 'إلخ': 8, 'قال': 9, 'الزركشي': 10, 'بلفظ': 11, 'يقتضيه': 12, 'كإنكار': 13, 'قول': 14, 'زنار': 15, 'بالإسلام': 16, 'وجوب': 17, 'ما': 18, 'علم': 19, 'وجوبه': 20, 'من': 21, 'الدين': 22, 'ضرورة': 23, 'كإلقاء': 24, 'مصحف': 25, 'بقذر': 26, 'وشد': 27, 'حديث': 28}


In [8]:
def getEmbeddingsSentences(sentences, word2vecmodel):
    embeddingSentences = [] # list of all sentences
    keys = word2vecmodel.wv.key_to_index
    for s in sentences:
        embeddingTemp = []  # list for one sentence
        for w in s:
            if w in keys:
                embeddingTemp.append(word2vecmodel.wv[w])
            ### unknown OOV till now
        embeddingSentences.append(embeddingTemp)
    return embeddingSentences
embeddingsSentences = getEmbeddingsSentences(sentences_without_diacritics, word2vecmodel)

In [23]:
harakat   = {1614:1,1615:2,1616:3,1618:4,1617:5,1611:6,1612:7,1613:8, 95:9}

def get_diacritic_hot_vector(haraka):
    vector = [0 for _ in range(9)]
    print("haraka:" + haraka)
    vector[harakat[ord(haraka)] - 1] = 1
    return vector

In [32]:
def getDiacriticEncoding(wordDi):
    word_embedding = list()
    for w in wordDi:
        if (len(w) > 1):
            print(ord(w[0]))
            print(ord(w[1]))
        word_embedding.append(get_diacritic_hot_vector(w))
    return word_embedding
# getDiacriticEncoding('ًُ')

In [34]:
X  = list() # input
Y  = list() # output
for i in range(len(embeddingsSentences)):
    for j in range(len(embeddingsSentences[i])):
        x = list()
        x.append(embeddingsSentences[i])
        x.append(getCharacterEncoding(sentences_without_diacritics[i][j]))
        x.append(j)
        X.append(x)
        ### y ###
        y = getDiacriticEncoding(diacritics[i][j])
        Y.append(y)

haraka:َ
haraka:ْ
haraka:ُ
haraka:ُ
haraka:َ
haraka:ْ
haraka:َ
haraka:َ
haraka:َ
haraka:_
haraka:ْ
haraka:َ
1614
1617
haraka:َّ


TypeError: ord() expected a character, but string of length 2 found

In [10]:
print(x)

[[array([-8.6113103e-03,  3.6643303e-03,  5.1892251e-03,  5.7416325e-03,
        7.4641667e-03, -6.1670588e-03,  1.1047244e-03,  6.0457336e-03,
       -2.8388158e-03, -6.1731804e-03, -4.0743896e-04, -8.3690602e-03,
       -5.5919746e-03,  7.1075466e-03,  3.3517345e-03,  7.2327172e-03,
        6.8065235e-03,  7.5300415e-03, -3.7867699e-03, -5.6608295e-04,
        2.3527492e-03, -4.5200596e-03,  8.3913244e-03, -9.8580075e-03,
        6.7662243e-03,  2.9147717e-03, -4.9349191e-03,  4.3984656e-03,
       -1.7367189e-03,  6.7129792e-03,  9.9656507e-03, -4.3575829e-03,
       -5.9142435e-04, -5.6945621e-03,  3.8516282e-03,  2.7857861e-03,
        6.8904776e-03,  6.0996446e-03,  9.5332973e-03,  9.2715397e-03,
        7.8911986e-03, -6.9845901e-03, -9.1541968e-03, -3.5555818e-04,
       -3.1015477e-03,  7.8925528e-03,  5.9359735e-03, -1.5494652e-03,
        1.5088364e-03,  1.7942366e-03,  7.8116776e-03, -9.5081180e-03,
       -2.1337373e-04,  3.4670706e-03, -9.3401561e-04,  8.3788102e-03,
    