## Read data from train.txt and filter it from unwanted patterns


In [46]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from Preprocessing import utils, character_encoding
from Models import rnn_pytorch
import config as conf

# config = conf.ConfigLoader().load_config()

In [47]:
VECTOR_SIZE = 10

NUM_TRAIN_LINES = 10
NUM_TEST_LINES = 2
PADDING_SIZE = 150

MODEL = 'lstm'
NUM_EPOCHS = 200
HIDDEN_SIZE = 1000
LEARNING_RATE = 0.001
BATCH_SIZE = 1

## Preprocessing


## Feature Extraction


Split training data to sentences and remove diacritics from each sentence


In [48]:
class CustomDataset(Dataset):
    def __init__(self, test = False, eval = False, testdata = None):
        if test:
            dataset = testdata
            self.filtered_dataset = utils.filter_data(dataset)
            self.data = utils.split_data_to_sentences(self.filtered_dataset)[0:NUM_TEST_LINES]
        elif eval:
            dataset = utils.read_data("./dataset/val.txt")
            self.filtered_dataset = utils.filter_data(dataset)
            self.data = utils.split_data_to_sentences(self.filtered_dataset)[0:NUM_TRAIN_LINES]
        else:
            dataset = utils.read_data("./dataset/train.txt")
            self.filtered_dataset = utils.filter_data(dataset)
            self.data = utils.split_data_to_sentences(self.filtered_dataset)[0:NUM_TRAIN_LINES]
        self.max_length = PADDING_SIZE

    def __getitem__(self, index):
        sentence = self.data[index]
        # separate data (sentence) and label (diacritic of each character)
        sentence, diactritic = character_encoding.remove_diacritics(sentence, True)
        # get sentence vector
        sentence = character_encoding.getSentenceVector(sentence)
        # get diacritic vector
        diactritic = character_encoding.getDiacriticVector(diactritic)
        # add padding to sentence vector or clip it
        sentence,original_length = character_encoding.padding(sentence, len(character_encoding.ARABIC_ALPHABIT) +1,max_length=self.max_length)
        diactritic,_ = character_encoding.padding(diactritic, len(character_encoding.DIACRITICS),max_length=self.max_length)
        # convert to tensor
        sentence = torch.tensor(sentence, dtype=(torch.float32))
        diactritic = torch.tensor(diactritic, dtype=(torch.float32))
        return sentence, diactritic, original_length 

    def __len__(self):
        return len(self.data)

## Building The Model


#### connect to GPU if available


In [49]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device = " ,device)
print("Cuda : ",torch.cuda.is_available())
print("Number of Cuda devices :", torch.cuda.device_count())

Device =  cuda:0
Cuda :  True
Number of Cuda devices : 1


In [50]:
input_size = len(character_encoding.ARABIC_ALPHABIT) + 1
hidden_size = HIDDEN_SIZE
output_size = len(character_encoding.DIACRITICS)

In [51]:
# Create an instance of the RNN classifier
if MODEL == 'rnn':
    model = rnn_pytorch.RNNClassifier(input_size, hidden_size, output_size)

# Creare an instance of the LSTM classifier
elif MODEL == 'lstm':
    model = rnn_pytorch.LSTMClassifier(input_size, hidden_size, output_size)
model.to(device)

LSTMClassifier(
  (lstm): LSTM(38, 1000, batch_first=True)
  (out): Sequential(
    (0): Linear(in_features=1000, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=15, bias=True)
  )
)

In [52]:
batch_size = BATCH_SIZE
dataset = CustomDataset()
# Create a dataloader to handle batching and shuffling
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [53]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [54]:
num_epochs = NUM_EPOCHS
# Training loop
for epoch in range(num_epochs):
    for i, (sentences, labels, _) in enumerate(train_dataloader): 
        # Reshape input and labels to (batch_size, seq_length, input_size)
        sentences = sentences.view(batch_size, -1,input_size).to(device)
        labels = labels.view(batch_size, -1, output_size).to(device)
        # RNN
        if MODEL == 'rnn':
            hidden_state = model.init_hidden(batch_size=batch_size).to(device) # RNN has one hidden state
            optimizer.zero_grad()
            outputs = model(sentences, hidden_state)
        
        # LSTM
        elif MODEL == 'lstm':
            hidden_state, cell_state = model.init_hidden(batch_size=batch_size)  # LSTM has two hidden states
            hidden_state = hidden_state.to(device)
            cell_state = cell_state.to(device)
            optimizer.zero_grad()
            outputs = model(sentences, (hidden_state, cell_state))

        
        
        # calculate the loss
        loss = criterion(outputs, labels)
        # backward pass
        loss.backward()
        # update the weights
        optimizer.step()
        

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
# torch.save(model.state_dict(), 'model.pth')

Epoch 1/200, Loss: 13.977930068969727
Epoch 2/200, Loss: 22.512304306030273
Epoch 3/200, Loss: 49.167633056640625
Epoch 4/200, Loss: 48.43729019165039
Epoch 5/200, Loss: 45.93044662475586
Epoch 6/200, Loss: 11.65626049041748
Epoch 7/200, Loss: 28.40334129333496
Epoch 8/200, Loss: 43.9199333190918
Epoch 9/200, Loss: 37.273799896240234
Epoch 10/200, Loss: 17.44894790649414
Epoch 11/200, Loss: 44.824790954589844
Epoch 12/200, Loss: 26.138437271118164
Epoch 13/200, Loss: 9.391480445861816
Epoch 14/200, Loss: 43.589820861816406
Epoch 15/200, Loss: 25.496326446533203
Epoch 16/200, Loss: 9.021190643310547
Epoch 17/200, Loss: 3.015533685684204
Epoch 18/200, Loss: 42.85832595825195
Epoch 19/200, Loss: 7.821408271789551
Epoch 20/200, Loss: 2.494396686553955
Epoch 21/200, Loss: 24.895715713500977
Epoch 22/200, Loss: 7.977907657623291
Epoch 23/200, Loss: 24.187211990356445
Epoch 24/200, Loss: 23.618595123291016
Epoch 25/200, Loss: 23.2801513671875
Epoch 26/200, Loss: 23.004425048828125
Epoch 27/20

## Model Evaluation


Preparing Validation data to be passed into the `model.evaluate()`


# Testing


testing on a given sentence


In [55]:
# Assuming you have a test dataset prepared in the same format as your training dataset
eval_dataset = CustomDataset(eval=True)  # You'll need to modify your CustomDataset class to accept this parameter and load the eval data
eval_dataloader = DataLoader(eval_dataset, batch_size = batch_size)

# Switch the model to evaluation mode
model.eval()

# Initialize the test loss
test_loss = 0

# We don't need to compute gradients during evaluation, so we wrap this in torch.no_grad()
with torch.no_grad():
    for sentences, labels, _ in eval_dataloader:
        sentences = sentences.view(batch_size, -1, input_size).to(device)
        labels = labels.view(batch_size, -1, output_size).to(device)
        # RNN
        if MODEL == 'rnn':
            hidden_state = model.init_hidden(batch_size=batch_size).to(device) # RNN has one hidden state
            outputs = model(sentences, hidden_state)
        
        # LSTM
        elif MODEL == 'lstm':
            hidden_state, cell_state = model.init_hidden(batch_size=batch_size)  # LSTM has two hidden states
            hidden_state = hidden_state.to(device)
            cell_state = cell_state.to(device)
            outputs = model(sentences, (hidden_state, cell_state))

        # Compute the loss
        loss = criterion(outputs, labels)

        # Accumulate the test loss
        test_loss += loss.item()

# Compute the average test loss
avg_test_loss = test_loss / len(eval_dataloader)

print(f'Average Evaluation Loss: {avg_test_loss}')


Average Evaluation Loss: 43.768162155151366


In [56]:
def index_to_char(indeces_list):
    char_list = []
    for index in indeces_list:
        char_list.append(character_encoding.DIACRITICS[index])
    return char_list

In [57]:
training_set = utils.read_data(f"./Dataset/train.txt")
filtered_training_set = utils.filter_data(training_set)
test_sentences = utils.split_data_to_sentences(filtered_training_set)[0:1]

In [58]:
# Assuming you have a test dataset prepared in the same format as your training dataset
test_dataset = CustomDataset(test=True, testdata = training_set) 
test_dataloader = DataLoader(test_dataset, batch_size = batch_size)

# Switch the model to evaluation mode
model.eval()

# We don't need to compute gradients during evaluation, so we wrap this in torch.no_grad()
sentences_diacritics_prediction = []
sentences_without_diacritics = []
original_diacritics = []
original_sentences_len = []
with torch.no_grad():
    for sentences, labels, sentence_length in test_dataloader:
        original_sentences_len.append(sentence_length)
        sentences_without_diacritics.extend(sentences)
        original_diacritics.extend(labels)
        sentences = sentences.view(batch_size, -1, input_size).to(device)
        labels = labels.view(batch_size, -1, output_size).to(device)
        # RNN
        if MODEL == 'rnn':
            hidden_state = model.init_hidden(batch_size=batch_size).to(device) # RNN has one hidden state
            outputs = model(sentences, hidden_state)
        
        
        # LSTM
        elif MODEL == 'lstm':
            hidden_state, cell_state = model.init_hidden(batch_size=batch_size)  # LSTM has two hidden states
            hidden_state = hidden_state.to(device)
            cell_state = cell_state.to(device)
            outputs = model(sentences, (hidden_state, cell_state))

        sentences_diacritics_prediction.extend(outputs.argmax(dim=2).cpu()) 

sentences_diacritics_prediction = np.array(sentences_diacritics_prediction)
sentences_without_diacritics = np.array(sentences_without_diacritics)
original_diacritics = np.array(original_diacritics)

print("Sentences diacritics prediction : ",sentences_diacritics_prediction.shape)
print("Sentences Without diacritics    : ",sentences_without_diacritics.shape)
print("Original diacritics             : ",original_diacritics.shape)

Sentences diacritics prediction :  (2, 150)
Sentences Without diacritics    :  (2, 150, 38)
Original diacritics             :  (2, 150, 15)


In [59]:
# # Switch the model to evaluation mode
# model.eval()
# # Assume 'input_sentence' is your input sentence
# input_sentence = test_sentences[0]
# print("Input sentence : ",input_sentence)

# # Process the input_sentence in the same way as you did for your training data
# sentence_without_diacritics, original_diacritics = character_encoding.remove_diacritics(input_sentence, True)
# sentence = character_encoding.getSentenceVector(sentence_without_diacritics)
# sentence,_ = character_encoding.padding(sentence, len(character_encoding.ARABIC_ALPHABIT) + 2, max_length=PADDING_SIZE)
# diacritic = character_encoding.getDiacriticVector(original_diacritics)
# diacritic,_ = character_encoding.padding(diacritic, len(character_encoding.DIACRITICS), max_length=PADDING_SIZE)
# sentence = torch.tensor(sentence, dtype=(torch.float32)).unsqueeze(0).to(device)  # Add an extra dimension for batch and move to device

# # We don't need to compute gradients during evaluation, so we wrap this in torch.no_grad()
# with torch.no_grad():
#     hidden = model.init_hidden(batch_size=1).to(device)  # Batch size is 1 for inference
#     # Forward pass
#     output = model(sentence, hidden)
# print(sentence.shape)

# # The output is the model's prediction, you might want to post-process this output to convert it back into a readable format
# prediction = output.argmax(dim=2)  # This gives you the index of the highest value in the output tensor


In [60]:
# predicted_diacritics = index_to_char(prediction[0])
# der, miss = diacritics_error_rate(original_diacritics, predicted_diacritics)
# print("Diacritics error rate : ", der, "%")
# print("Correct diacritics rate : ", 100 - der, "%")
# print("Number of miss : ", miss, "out of ", len(original_diacritics))
# print("Original Sentence : ", input_sentence)
# restored_sentence = character_encoding.restore_diacritics(sentence_without_diacritics,predicted_diacritics)
# print("Restored Sentence : ", restored_sentence)
# print("Original diacritics : ", character_encoding.map_text_to_diacritic(original_diacritics))
# print("Predicted diacritics : ",character_encoding.map_text_to_diacritic( predicted_diacritics))

In [61]:
# Convert the prediction tensor to the corresponding diacritics
predicted_diacritics = []
diacritic_error_rate = 0
number_of_mis_classified = 0
number_of_char_to_classify = 0
for i, p in enumerate(sentences_diacritics_prediction):
    pred = index_to_char(p)
    predicted_diacritics.append(pred)
    s = character_encoding.oneHot_to_sentence(sentences_without_diacritics[i][0:original_sentences_len[i]]) # sentence without diacritics
    d = character_encoding.oneHot_to_diacritic(original_diacritics[i][0:original_sentences_len[i]])         # original diacritics of the sentence
    original_text = character_encoding.restore_diacritics(s, d)
    print("Original Sentence : ", original_text)
    restored_text = character_encoding.restore_diacritics(s, pred)
    print("Restored Sentence : ", restored_text)
    diac, miss = character_encoding.diacritics_error_rate(d, pred)
    print(diac,"%")
    diacritic_error_rate += diac
    number_of_mis_classified += miss
    number_of_char_to_classify += int(original_sentences_len[i][0]) if len(pred) > original_sentences_len[i] else len(pred)


diacritic_error_rate /= len(sentences_diacritics_prediction)
print("Diacritic Error Rate = ", diacritic_error_rate, "%")
print("Diacritic Correct Rate = ", 100 - diacritic_error_rate, "%")
print("Number of Misclassified = ", number_of_mis_classified, "out of", number_of_char_to_classify)

Original Sentence :  قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ قَالَ الزَّرْكَشِيُّ 
Restored Sentence :  قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ قَالَ الزَّرْكَشِيُّ 
0.0 %
Original Sentence :  ابْنُ عَرَفَةَ قَوْلُهُ بِلَفْظٍ يَقْتَضِيه كَإِنْكَارِ غَيْرِ حَدِيثٍ بِالْإِسْلَامِ وُجُوبَ مَا عُلِمَ وُجُوبُهُ مِنْ الدِّينِ ضَرُورَةً كَإِلْقَاءِ مُصْحَفٍ بِقَذَرٍ وَشَدِّ زُنَّارٍ ابْنُ عَرَفَةَ قَوْلُ ابْنِ شَاسٍ أَوْ بِفِعْلٍ يَتَضَمَّنُهُ هُوَ كَ
Restored Sentence :  ابْنُ عَرَفَةَ قَوْلُهُ بِلَفْظٍ يَقْتَضِيه كَإِنْكَارِ غَيْرِ حَدِيثٍ بِالْإِسْلَامِ وُجُوبَ مَا عُلِمَ وُجُوبُهُ مِنْ الدِّينِ ضَرُورَةً كَإِلْقَاءِ مُصْحَفٍ بِقَذَرٍ وَشَدِّ زُنَّارٍ ابْنُ عَرَفَةَ قَوْلُ ابْنِ شَاسٍ أَوْ بِفِعْلٍ يَتَضَمَّنُهُ هُوَ كَ
0.0 %
Diacritic Error Rate =  0.0 %
Diacritic Correct Rate =  100.0 %
Number of Misclassified =  0 out of 188
