## Read data from train.txt and filter it from unwanted patterns


In [37]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from Preprocessing import utils, character_encoding
from Models import rnn_pytorch
import config as conf

# config = conf.ConfigLoader().load_config()

In [38]:
VECTOR_SIZE = 10
NUM_TRAIN_LINES = 100
PADDING_SIZE = 150
HIDDEN_SIZE = 300
BATCH_SIZE = 10

## Preprocessing


## Feature Extraction


Split training data to sentences and remove diacritics from each sentence


In [39]:
class CustomDataset(Dataset):
    def __init__(self, test = False, eval = False, testdata = None):
        if test:
            dataset = testdata
        elif eval:
            dataset = utils.read_data("./dataset/val.txt")
        else:
            dataset = utils.read_data("./dataset/train.txt")
        self.filtered_dataset = utils.filter_data(dataset)
        self.data = utils.split_data_to_sentences(self.filtered_dataset)[0:NUM_TRAIN_LINES]
        self.max_length = PADDING_SIZE

    def __getitem__(self, index):
        sentence = self.data[index]
        # separate data (sentence) and label (diacritic of each character)
        sentence, diactritic = character_encoding.remove_diacritics(sentence, True)
        # get sentence vector
        sentence = character_encoding.getSentenceVector(sentence)
        # get diacritic vector
        diactritic = character_encoding.getDiacriticVector(diactritic)
        # add padding to sentence vector or clip it
        sentence = character_encoding.padding(sentence, len(character_encoding.ARABIC_ALPHABIT) +2,max_length=self.max_length)
        diactritic = character_encoding.padding(diactritic, len(character_encoding.DIACRITICS),max_length=self.max_length)
        # convert to tensor
        sentence = torch.tensor(sentence, dtype=(torch.float32))
        diactritic = torch.tensor(diactritic, dtype=(torch.float32))
        return sentence, diactritic

    def __len__(self):
        return len(self.data)

## Building The Model


#### connect to GPU if available


In [40]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device = " ,device)
print("Cuda : ",torch.cuda.is_available())
print("Number of Cuda devices :", torch.cuda.device_count())

Device =  cuda:0
Cuda :  True
Number of Cuda devices : 1


In [41]:
input_size = len(character_encoding.ARABIC_ALPHABIT) + 2
hidden_size = HIDDEN_SIZE
output_size = len(character_encoding.DIACRITICS)

In [42]:
# Create an instance of the RNN classifier
# model = rnn_pytorch.RNNClassifier(input_size, hidden_size, output_size)

# Creare an instance of the LSTM classifier
model = rnn_pytorch.LSTMClassifier(input_size, hidden_size, output_size)
model.to(device)

LSTMClassifier(
  (lstm): LSTM(38, 300, batch_first=True)
  (out): Sequential(
    (0): Linear(in_features=300, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=15, bias=True)
  )
)

In [43]:
batch_size = BATCH_SIZE
dataset = CustomDataset()
# Create a dataloader to handle batching and shuffling
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [44]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [45]:
num_epochs = 100
# Training loop
for epoch in range(num_epochs):
    for i, (sentences, labels) in enumerate(train_dataloader): 
        # Reshape input and labels to (batch_size, seq_length, input_size)
        input = sentences.view(batch_size, -1,input_size).to(device)
        labels = labels.view(batch_size, -1, output_size).to(device)
        # RNN
        # hidden_state = model.init_hidden(batch_size=batch_size).to(device) # RNN has one hidden state
        # optimizer.zero_grad()
        # outputs = model(input, hidden_state)
        
        # LSTM
        hidden_state, cell_state = model.init_hidden(batch_size=batch_size)  # LSTM has two hidden states
        hidden_state = hidden_state.to(device)
        cell_state = cell_state.to(device)
        optimizer.zero_grad()
        outputs = model(input, (hidden_state, cell_state))

        
        
        # calculate the loss
        loss = criterion(outputs, labels)
        # backward pass
        loss.backward()
        # update the weights
        optimizer.step()
        

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
# torch.save(model.state_dict(), 'model.pth')

Epoch 1/100, Loss: 44.87592697143555
Epoch 2/100, Loss: 42.99579620361328
Epoch 3/100, Loss: 41.67938995361328
Epoch 4/100, Loss: 8.14149284362793
Epoch 5/100, Loss: 41.47943878173828
Epoch 6/100, Loss: 8.693621635437012
Epoch 7/100, Loss: 7.073116779327393
Epoch 8/100, Loss: 6.125248432159424
Epoch 9/100, Loss: 41.2519645690918
Epoch 10/100, Loss: 25.252731323242188
Epoch 11/100, Loss: 2.217221260070801
Epoch 12/100, Loss: 8.881248474121094
Epoch 13/100, Loss: 40.20249557495117
Epoch 14/100, Loss: 9.963299751281738
Epoch 15/100, Loss: 39.79909133911133
Epoch 16/100, Loss: 39.410491943359375
Epoch 17/100, Loss: 39.420345306396484
Epoch 18/100, Loss: 13.74252986907959
Epoch 19/100, Loss: 38.41596603393555
Epoch 20/100, Loss: 38.73453140258789
Epoch 21/100, Loss: 3.5453364849090576
Epoch 22/100, Loss: 9.334505081176758
Epoch 23/100, Loss: 37.98280334472656
Epoch 24/100, Loss: 37.21031188964844
Epoch 25/100, Loss: 24.920244216918945
Epoch 26/100, Loss: 15.8903169631958
Epoch 27/100, Loss:

## Model Evaluation


Preparing Validation data to be passed into the `model.evaluate()`


# Testing


testing on a given sentence


In [46]:
# Assuming you have a test dataset prepared in the same format as your training dataset
eval_dataset = CustomDataset(eval=True)  # You'll need to modify your CustomDataset class to accept this parameter and load the eval data
eval_dataloader = DataLoader(eval_dataset, batch_size = batch_size)

# Switch the model to evaluation mode
model.eval()

# Initialize the test loss
test_loss = 0

# We don't need to compute gradients during evaluation, so we wrap this in torch.no_grad()
with torch.no_grad():
    for sentences, labels in eval_dataloader:
        sentences = sentences.view(batch_size, -1, input_size).to(device)
        labels = labels.view(batch_size, -1, output_size).to(device)
        # RNN
        # hidden_state = model.init_hidden(batch_size=batch_size).to(device) # RNN has one hidden state
        # outputs = model(input, hidden_state)
        
        # LSTM
        hidden_state, cell_state = model.init_hidden(batch_size=batch_size)  # LSTM has two hidden states
        hidden_state = hidden_state.to(device)
        cell_state = cell_state.to(device)
        outputs = model(input, (hidden_state, cell_state))

        # Compute the loss
        loss = criterion(outputs, labels)

        # Accumulate the test loss
        test_loss += loss.item()

# Compute the average test loss
avg_test_loss = test_loss / len(eval_dataloader)

print(f'Average Evaluation Loss: {avg_test_loss}')


Average Evaluation Loss: 34.41014338582754


In [47]:
def index_to_char(indeces_list):
    char_list = []
    for index in indeces_list:
        char_list.append(character_encoding.DIACRITICS[index])
    return char_list

In [48]:
training_set = utils.read_data(f"./Dataset/train.txt")
filtered_training_set = utils.filter_data(training_set)
test_sentences = utils.split_data_to_sentences(filtered_training_set)[0:10]

In [49]:
# Assuming you have a test dataset prepared in the same format as your training dataset
test_dataset = CustomDataset(test=True, testdata = training_set) 
test_dataloader = DataLoader(test_dataset, batch_size = batch_size)

# Switch the model to evaluation mode
model.eval()

# We don't need to compute gradients during evaluation, so we wrap this in torch.no_grad()
sentences_diacritics_prediction = []
sentences_without_diacritics = []
original_diacritics = []
with torch.no_grad():
    for sentences, labels in test_dataloader:
        sentences_without_diacritics.extend(sentences)
        original_diacritics.extend(labels)
        sentences = sentences.view(batch_size, -1, input_size).to(device)
        labels = labels.view(batch_size, -1, output_size).to(device)
        # RNN
        # hidden_state = model.init_hidden(batch_size=batch_size).to(device) # RNN has one hidden state
        # outputs = model(input, hidden_state)
        
        # LSTM
        hidden_state, cell_state = model.init_hidden(batch_size=batch_size)  # LSTM has two hidden states
        hidden_state = hidden_state.to(device)
        cell_state = cell_state.to(device)
        outputs = model(input, (hidden_state, cell_state))

        sentences_diacritics_prediction.extend(outputs.argmax(dim=2).cpu()) 

sentences_diacritics_prediction = np.array(sentences_diacritics_prediction)
sentences_without_diacritics = np.array(sentences_without_diacritics)
original_diacritics = np.array(original_diacritics)

print("Sentences diacritics prediction : ",sentences_diacritics_prediction.shape)
print("Sentences Without diacritics    : ",sentences_without_diacritics.shape)
print("Original diacritics             : ",original_diacritics.shape)

Sentences diacritics prediction :  (100, 150)
Sentences Without diacritics    :  (100, 150, 38)
Original diacritics             :  (100, 150, 15)


In [50]:
# # Switch the model to evaluation mode
# model.eval()
# # Assume 'input_sentence' is your input sentence
# input_sentence = test_sentences[5]

# # Process the input_sentence in the same way as you did for your training data
# sentence_without_diacritics, original_diacritics = character_encoding.remove_diacritics(input_sentence, True)
# sentence = character_encoding.getSentenceVector(sentence_without_diacritics)
# sentence = character_encoding.padding(sentence, len(character_encoding.ARABIC_ALPHABIT) + 2, max_length=PADDING_SIZE)
# diacritic = character_encoding.getDiacriticVector(original_diacritics)
# diacritic = character_encoding.padding(diacritic, len(character_encoding.DIACRITICS), max_length=PADDING_SIZE)
# sentence = torch.tensor(sentence, dtype=(torch.float32)).unsqueeze(0).to(device)  # Add an extra dimension for batch and move to device

# # We don't need to compute gradients during evaluation, so we wrap this in torch.no_grad()
# with torch.no_grad():
#     hidden = model.init_hidden(batch_size=1)  # Batch size is 1 for inference
#     # Forward pass
#     output = model(sentence, hidden)
# print(sentence.shape)

# # The output is the model's prediction, you might want to post-process this output to convert it back into a readable format
# prediction = output.argmax(dim=2)  # This gives you the index of the highest value in the output tensor


In [51]:
def oneHot_to_sentence(list_of_oneHot):
    sentence = ""
    l = character_encoding.ARABIC_ALPHABIT + " " + "\n"
    for oneHot in list_of_oneHot:
        sentence += l[oneHot.argmax()]
    return sentence

In [52]:
def oneHot_to_diacritic(list_of_oneHot):
    sentence = []
    l = character_encoding.DIACRITICS
    for oneHot in list_of_oneHot:
        sentence.append(l[oneHot.argmax()])
    return sentence

In [53]:
def diacritics_error_rate(original_diacritics, predicted_diacritics):
    error = 0
    for i in range(len(original_diacritics)):
        if original_diacritics[i] != predicted_diacritics[i]:
            error += 1
    return error / len(original_diacritics) * 100, error

In [54]:
# Convert the prediction tensor to the corresponding diacritics
predicted_diacritics = []
diacritic_error_rate = 0
number_of_mis_classified = 0
number_of_char_to_classify = 0
for i, p in enumerate(sentences_diacritics_prediction):
    pred = index_to_char(p)
    predicted_diacritics.append(pred)
    s = oneHot_to_sentence(sentences_without_diacritics[i])
    restored_text = character_encoding.restore_diacritics(s[0:len(pred)], pred)
    d = oneHot_to_diacritic(original_diacritics[i])
    diac, miss = diacritics_error_rate(d[0:len(predicted_diacritics)], pred)
    diacritic_error_rate += diac
    number_of_mis_classified += miss
    number_of_char_to_classify += len(predicted_diacritics)
    # print("Restored", restored_text)

diacritic_error_rate /= len(sentences_diacritics_prediction)



In [55]:
print("Diacritic Error Rate = ", diacritic_error_rate, "%")
print("Diacritic Correct Rate = ", 100 - diacritic_error_rate, "%")
print("Number of Misclassified = ", number_of_mis_classified, "out of", number_of_char_to_classify)

Diacritic Error Rate =  24.854964124893126 %
Diacritic Correct Rate =  75.14503587510687 %
Number of Misclassified =  1368 out of 5050
