## Read data from train.txt and filter it from unwanted patterns


In [1]:
import numpy as np
from keras.utils import to_categorical
from Embeddings import Word2Vec
from Preprocessing import utils, character_encoding
from Models import rnn
import config as conf

config = conf.ConfigLoader().load_config()

Configurations Read Successfully


## Preprocessing


In [2]:
training_set = utils.read_data(f"./Dataset/train.txt")
print("training_set", training_set[0:500])
filtered_training_set = utils.filter_data(training_set)
print("filtered_training_set", filtered_training_set[0:500])


training_set قَوْلُهُ : ( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ ) قَالَ الزَّرْكَشِيُّ( 14 / 123 )
ابْنُ عَرَفَةَ : قَوْلُهُ : بِلَفْظٍ يَقْتَضِيه كَإِنْكَارِ غَيْرِ حَدِيثٍ بِالْإِسْلَامِ وُجُوبَ مَا عُلِمَ وُجُوبُهُ مِنْ الدِّينِ ضَرُورَةً ( كَإِلْقَاءِ مُصْحَفٍ بِقَذَرٍ وَشَدِّ زُنَّارٍ ) ابْنُ عَرَفَةَ : قَوْلُ ابْنِ شَاسٍ : أَوْ بِفِعْلٍ يَتَضَمَّنُهُ هُوَ كَلُبْسِ الزُّنَّارِ وَإِلْقَاءِ الْمُصْحَفِ فِي صَرِيحِ النَّجَاسَةِ وَالسُّجُودِ لِلصَّنَمِ وَنَحْوِ ذَلِكَ ( وَسِحْرٍ ) مُحَمَّدٌ : قَوْلُ مَالِكٍ و
filtered_training_set قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ قَالَ الزَّرْكَشِيُّ 
ابْنُ عَرَفَةَ قَوْلُهُ بِلَفْظٍ يَقْتَضِيه كَإِنْكَارِ غَيْرِ حَدِيثٍ بِالْإِسْلَامِ وُجُوبَ مَا عُلِمَ وُجُوبُهُ مِنْ الدِّينِ ضَرُورَةً كَإِلْقَاءِ مُصْحَفٍ بِقَذَرٍ وَشَدِّ زُنَّارٍ ابْنُ عَرَفَةَ قَوْلُ ابْنِ شَاسٍ أَوْ بِفِعْلٍ يَتَضَمَّنُهُ هُوَ كَلُبْسِ الزُّنَّارِ وَإِلْقَاءِ الْمُصْحَفِ فِي صَرِيحِ النَّجَاسَةِ وَالسُّجُودِ لِلصَّنَمِ وَنَحْوِ ذَلِكَ وَسِحْرٍ مُحَمَّدٌ قَوْلُ مَالِكٍ 

In [3]:
words_set = utils.split_data_to_words(filtered_training_set)
text_without_diacritics = []
diacritic_list = []
# Preparing Training Set
text_without_diacritics, diacritic_list = character_encoding.PrepareData(words_set[0:10000])

# Assume this is a test set
text_without_diacritics_test, diacritic_list_test = character_encoding.PrepareData(words_set[10000:15000])

In [4]:
# print("size of text without diacritics", len(text_without_diacritics))
# print("size of diacritic list", len(diacritic_list))
# print("size of original words", len(words_set))
# for i in range(len(diacritic_list)):
#     print(i,"- original : ",words_set[i] ,"   - without : ",text_without_diacritics[i] , "   - diacritic : ", character_encoding.map_text_to_diacritic(diacritic_list[i]))


## Feature Extraction


In [5]:
# Split data to sentences and remove diacritics from each sentence  
sentences = utils.split_data_to_sentences(filtered_training_set)
list_of_sentences = character_encoding.RemoveDiacriticFromSentence(sentences)

In [6]:
file_path = './Embeddings/word2vec_model.bin'
# Create Word Embedding model
embedding_model = Word2Vec.W2V(list_of_sentences, vector_size = config["embedding_vector_size"])

In [7]:

is_training = config["is_training"]    # Change this to False if you want to load the model and not train it again

if embedding_model.is_model_saved(file_path) and is_training == False:
    embedding_model.load_model(file_path)
else:
    embedding_model.train()
    embedding_model.save_model(file_path)

Word2Vec model saved to :  ./Embeddings/word2vec_model.bin


In [8]:

concatinated_vector_train, diacritic_list = utils.concatinate_word_char_embeddings(text_without_diacritics, diacritic_list, embedding_model = embedding_model)
concatinated_vector_test, diacritic_list_test = utils.concatinate_word_char_embeddings(text_without_diacritics_test, diacritic_list_test, embedding_model = embedding_model)


# calculate total character diacritic list for the assert
count_train = 0
for d in diacritic_list:
    count_train += len(d)
count_test = 0
for d in diacritic_list_test:
    count_test += len(d)   
assert (len(concatinated_vector_train) == count_train), f"Error : Train Set Len ({len(concatinated_vector_train)}) != Len diacritic ({count_train}) list have different sizes, "
assert (len(concatinated_vector_test) == count_test), f"Error : Test Set Len ({len(concatinated_vector_test)}) != Len diacritic ({count_test}) list have different sizes, "

## Building The Model


In [9]:
input_size = len(concatinated_vector_train[0])
output_size = len(character_encoding.DIACRITICS)
print("input size : ", input_size)
print("output size : ", output_size)

input size :  86
output size :  15


In [10]:
# Create an instance of the LSTM class
model = rnn.RNN(input_shape=(None, 1), output_shape = output_size)

  super().__init__(**kwargs)


In [11]:
# Convert the training data to the required format
X_train = concatinated_vector_train # np.array([[[character_encoding.CharToOneHOt(char)]] for word in text_without_diacritics for char in word])

y_train = []
for word_diacritic in diacritic_list:
    for diacritic in word_diacritic:
        #print(utils.map_text_to_diacritic(diacritic))
        index = character_encoding.DIACRITICS.index(diacritic)
        y_train.append(to_categorical(index, num_classes=output_size))
y_train = np.array(y_train)
X_train = np.array(X_train)
print("X_train size : ", X_train.shape)
print("y_train size : ", y_train.shape)

X_train size :  (39493, 86)
y_train size :  (39493, 15)


In [12]:
# Train the model
model.train(X_train, y_train, epochs = config["num_epochs"], batch_size = config["batch_size"])

Epoch 1/5
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 34ms/step - accuracy: 0.3568 - loss: 1.8937
Epoch 2/5
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 42ms/step - accuracy: 0.3567 - loss: 1.7723
Epoch 3/5
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 49ms/step - accuracy: 0.4068 - loss: 1.6606
Epoch 4/5
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 40ms/step - accuracy: 0.4911 - loss: 1.4324
Epoch 5/5
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 42ms/step - accuracy: 0.5026 - loss: 1.3901


In [13]:
# Convert the test data to the required format
X_test = concatinated_vector_test # np.array([[[character_encoding.CharToOneHOt(char)]] for word in text_without_diacritics_test for char in word])

y_test = []
for word_diacritic in diacritic_list:
    for diacritic in word_diacritic:
        index = character_encoding.DIACRITICS.index(diacritic)
        y_test.append(to_categorical(index, num_classes=output_size))

y_test = np.array(y_train)
X_test = np.array(X_train)
print("X_test size : ", X_test.shape)
print("y_test size : ", y_test.shape)

X_test size :  (39493, 86)
y_test size :  (39493, 15)


In [14]:
# Predict the diacritics of the test data
y_pred = model.predict(X_test)

[1m1235/1235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 14ms/step


## Model Evaluation


In [15]:
# Evaluate the model
lost , accuracy = model.evaluate(X_test, y_test)

[1m1235/1235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 14ms/step - accuracy: 0.5169 - loss: 1.3566
