## Read data from train.txt and filter it from unwanted patterns


In [154]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from Preprocessing import utils, character_encoding
from Models import rnn_pytorch
# import config as conf

# config = conf.ConfigLoader().load_config()

In [155]:
VECTOR_SIZE = 10

NUM_TRAIN_LINES = 10
NUM_TEST_LINES = 3
PADDING_SIZE = 150

MODEL = 'lstm'
TRAIN_MODEL = True # True if you want to train the model else load an existing model
NUM_EPOCHS = 200
HIDDEN_SIZE = 1000
LEARNING_RATE = 0.001
BATCH_SIZE = 1
MODEL_NAME = f'model_{MODEL}_{NUM_TRAIN_LINES}L_{NUM_EPOCHS}epoch_{HIDDEN_SIZE}Hidden.pth'

## Preprocessing

Clean data and save it (uncomment the following lines if you need to re-clean the data)


In [156]:
# def save_data(path: str, data: str):
#     with open(path, "w", encoding="utf-8") as f:
#         f.write(data)

# dataset = utils.read_data("./dataset/val.txt")
# filtered_dataset = utils.filter_data(dataset)
# save_data("./dataset/val_filtered.txt", filtered_dataset)

# dataset = utils.read_data("./dataset/train.txt")
# filtered_dataset = utils.filter_data(dataset)
# save_data("./dataset/train_filtered.txt", filtered_dataset)

## Feature Extraction


Split training data to sentences and remove diacritics from each sentence


In [157]:
class CustomDataset(Dataset):
    def __init__(self, test = False, eval = False, testdata = None):
        if test:
            dataset = testdata
            # self.filtered_dataset = utils.filter_data(dataset)
            self.data = utils.split_data_to_sentences(dataset)[0:NUM_TEST_LINES]
        elif eval:
            dataset = utils.read_data("./dataset/val_filtered.txt")
            # self.filtered_dataset = utils.filter_data(dataset)
            self.data = utils.split_data_to_sentences(dataset)[0:NUM_TRAIN_LINES]
        else:
            dataset = utils.read_data("./dataset/train_filtered.txt")
            # self.filtered_dataset = utils.filter_data(dataset)
            self.data = utils.split_data_to_sentences(dataset)[0:NUM_TRAIN_LINES]
        self.max_length = PADDING_SIZE

    def __getitem__(self, index):
        sentence = self.data[index]
        # separate data (sentence) and label (diacritic of each character)
        sentence, diactritic = character_encoding.remove_diacritics(sentence, True)
        # get sentence vector
        sentence = character_encoding.getSentenceVector(sentence)
        # get diacritic vector
        diactritic = character_encoding.getDiacriticVector(diactritic)
        # add padding to sentence vector or clip it
        sentence,original_length = character_encoding.padding(sentence, len(character_encoding.ARABIC_ALPHABIT) +1,max_length=self.max_length)
        diactritic,_ = character_encoding.padding(diactritic, len(character_encoding.DIACRITICS),max_length=self.max_length)
        # convert to tensor
        sentence = torch.tensor(sentence, dtype=(torch.float32))
        diactritic = torch.tensor(diactritic, dtype=(torch.float32))
        return sentence, diactritic, original_length 

    def __len__(self):
        return len(self.data)

## Building The Model


#### connect to GPU if available


In [158]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device = " ,device)
print("Cuda : ",torch.cuda.is_available())
print("Number of Cuda devices :", torch.cuda.device_count())

Device =  cuda:0
Cuda :  True
Number of Cuda devices : 1


In [159]:
input_size = len(character_encoding.ARABIC_ALPHABIT) + 1
hidden_size = HIDDEN_SIZE
output_size = len(character_encoding.DIACRITICS)

In [160]:
# Create an instance of the RNN classifier
if MODEL == 'rnn':
    model = rnn_pytorch.RNNClassifier(input_size, hidden_size, output_size)

# Creare an instance of the LSTM classifier
elif MODEL == 'lstm':
    model = rnn_pytorch.LSTMClassifier(input_size, hidden_size, output_size)
model.to(device)

LSTMClassifier(
  (lstm): LSTM(38, 1000, batch_first=True)
  (out): Sequential(
    (0): Linear(in_features=1000, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=15, bias=True)
  )
)

In [161]:
batch_size = BATCH_SIZE
dataset = CustomDataset()
# Create a dataloader to handle batching and shuffling
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [162]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

### Training Model


In [163]:
num_epochs = NUM_EPOCHS


# Training loop
if TRAIN_MODEL:
    for epoch in range(num_epochs):
        for i, (sentences, labels, _) in enumerate(train_dataloader): 
            # Reshape input and labels to (batch_size, seq_length, input_size)
            sentences = sentences.view(batch_size, -1,input_size).to(device)
            labels = labels.view(batch_size, -1, output_size).to(device)
            # RNN
            if MODEL == 'rnn':
                hidden_state = model.init_hidden(batch_size=batch_size).to(device) # RNN has one hidden state
                optimizer.zero_grad()
                outputs = model(sentences, hidden_state)
            
            # LSTM
            elif MODEL == 'lstm':
                hidden_state, cell_state = model.init_hidden(batch_size=batch_size)  # LSTM has two hidden states
                hidden_state = hidden_state.to(device)
                cell_state = cell_state.to(device)
                optimizer.zero_grad()
                outputs = model(sentences, (hidden_state, cell_state))

            
            
            # calculate the loss
            loss = criterion(outputs, labels)
            # backward pass
            loss.backward()
            # update the weights
            optimizer.step()
            

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
    torch.save(model.state_dict(), f'./SavedModels/{MODEL_NAME}')
else:
    model.load_state_dict(torch.load( f'./SavedModels/{MODEL_NAME}'))
    print("Model loaded successfully")

Epoch 1/100, Loss: 50.88290023803711
Epoch 2/100, Loss: 42.7320671081543
Epoch 3/100, Loss: 49.835208892822266
Epoch 4/100, Loss: 49.17815017700195
Epoch 5/100, Loss: 47.977882385253906
Epoch 6/100, Loss: 46.13722610473633
Epoch 7/100, Loss: 28.772645950317383
Epoch 8/100, Loss: 27.94643783569336
Epoch 9/100, Loss: 18.302244186401367
Epoch 10/100, Loss: 27.17530632019043
Epoch 11/100, Loss: 45.189945220947266
Epoch 12/100, Loss: 8.86462688446045
Epoch 13/100, Loss: 36.65714645385742
Epoch 14/100, Loss: 8.553558349609375
Epoch 15/100, Loss: 44.411048889160156
Epoch 16/100, Loss: 36.02238082885742
Epoch 17/100, Loss: 3.0328292846679688
Epoch 18/100, Loss: 43.29643630981445
Epoch 19/100, Loss: 43.0249137878418
Epoch 20/100, Loss: 2.920105218887329
Epoch 21/100, Loss: 35.76701736450195
Epoch 22/100, Loss: 35.126495361328125
Epoch 23/100, Loss: 15.661724090576172
Epoch 24/100, Loss: 25.01476287841797
Epoch 25/100, Loss: 15.462905883789062
Epoch 26/100, Loss: 23.664382934570312
Epoch 27/100,

## Model Evaluation


Preparing Validation data to be passed into the `model.evaluate()`


In [164]:
# Assuming you have a test dataset prepared in the same format as your training dataset
eval_dataset = CustomDataset(eval=True)  # You'll need to modify your CustomDataset class to accept this parameter and load the eval data
eval_dataloader = DataLoader(eval_dataset, batch_size = batch_size)

# Switch the model to evaluation mode
model.eval()

# Initialize the test loss
test_loss = 0

# We don't need to compute gradients during evaluation, so we wrap this in torch.no_grad()
with torch.no_grad():
    for sentences, labels, _ in eval_dataloader:
        sentences = sentences.view(batch_size, -1, input_size).to(device)
        labels = labels.view(batch_size, -1, output_size).to(device)
        # RNN
        if MODEL == 'rnn':
            hidden_state = model.init_hidden(batch_size=batch_size).to(device) # RNN has one hidden state
            outputs = model(sentences, hidden_state)
        
        # LSTM
        elif MODEL == 'lstm':
            hidden_state, cell_state = model.init_hidden(batch_size=batch_size)  # LSTM has two hidden states
            hidden_state = hidden_state.to(device)
            cell_state = cell_state.to(device)
            outputs = model(sentences, (hidden_state, cell_state))

        # Compute the loss
        loss = criterion(outputs, labels)

        # Accumulate the test loss
        test_loss += loss.item()

# Compute the average test loss
avg_test_loss = test_loss / len(eval_dataloader)

print(f'Average Evaluation Loss: {avg_test_loss}')


Average Evaluation Loss: 41.80366373062134


# Testing


testing on a given sentence


In [166]:
test_set = utils.read_data(f"./Dataset/train_filtered.txt")
# test_set = utils.read_data(f"./Dataset/val_filtered.txt")
# filtered_training_set = utils.filter_data(training_set)
# test_sentences = utils.split_data_to_sentences(filtered_training_set)[0:1]

In [167]:
# Assuming you have a test dataset prepared in the same format as your training dataset
test_dataset = CustomDataset(test=True, testdata = test_set) 
test_dataloader = DataLoader(test_dataset, batch_size = batch_size)

# Switch the model to evaluation mode
model.eval()

# We don't need to compute gradients during evaluation, so we wrap this in torch.no_grad()
sentences_diacritics_prediction = []
sentences_without_diacritics = []
original_diacritics = []
original_sentences_len = []
with torch.no_grad():
    for sentences, labels, sentence_length in test_dataloader:
        original_sentences_len.append(sentence_length)
        sentences_without_diacritics.extend(sentences)
        original_diacritics.extend(labels)
        sentences = sentences.view(batch_size, -1, input_size).to(device)
        labels = labels.view(batch_size, -1, output_size).to(device)
        # RNN
        if MODEL == 'rnn':
            hidden_state = model.init_hidden(batch_size=batch_size).to(device) # RNN has one hidden state
            outputs = model(sentences, hidden_state)
        
        
        # LSTM
        elif MODEL == 'lstm':
            hidden_state, cell_state = model.init_hidden(batch_size=batch_size)  # LSTM has two hidden states
            hidden_state = hidden_state.to(device)
            cell_state = cell_state.to(device)
            outputs = model(sentences, (hidden_state, cell_state))

        sentences_diacritics_prediction.extend(outputs.argmax(dim=2).cpu()) 

sentences_diacritics_prediction = np.array(sentences_diacritics_prediction)
sentences_without_diacritics = np.array(sentences_without_diacritics)
original_diacritics = np.array(original_diacritics)

print("Sentences diacritics prediction : ",sentences_diacritics_prediction.shape)
print("Sentences Without diacritics    : ",sentences_without_diacritics.shape)
print("Original diacritics             : ",original_diacritics.shape)

Sentences diacritics prediction :  (3, 150)
Sentences Without diacritics    :  (3, 150, 38)
Original diacritics             :  (3, 150, 15)


In [170]:
def AverageDER(sentences_diacritics_prediction,sentences_without_diacritics,original_sentences_len):
    diacritic_error_rate = 0
    number_of_mis_classified = 0
    number_of_chars_to_classify = 0
    for i, p in enumerate(sentences_diacritics_prediction):
        pred = character_encoding.index_to_char(p)
        s = character_encoding.oneHot_to_sentence(sentences_without_diacritics[i][0:original_sentences_len[i]]) # sentence without diacritics
        d = character_encoding.oneHot_to_diacritic(original_diacritics[i][0:original_sentences_len[i]])         # original diacritics of the sentence
        original_text = character_encoding.restore_diacritics(s, d)
        restored_text = character_encoding.restore_diacritics(s, pred)
        diac, miss = character_encoding.diacritics_error_rate(d, pred)
        # print("Original Sentence : ", original_text)
        # print("Restored Sentence : ", restored_text)
        # print(f"DER sentence [{i}] = {diac} %")
        diacritic_error_rate += diac
        number_of_mis_classified += miss
        number_of_chars_to_classify += int(original_sentences_len[i][0]) if len(pred) > original_sentences_len[i] else len(pred)

    # diacritic_error_rate /= len(sentences_diacritics_prediction)
    diacritic_error_rate = number_of_mis_classified / number_of_chars_to_classify * 100
    print("Diacritic Error Rate = ", diacritic_error_rate, "%")
    print("Diacritic Correct Rate = ", 100 - diacritic_error_rate, "%")
    print("Number of Misclassified = ", number_of_mis_classified, "out of", number_of_chars_to_classify)
    return diacritic_error_rate

In [171]:
avg_der = AverageDER(sentences_diacritics_prediction,sentences_without_diacritics,original_sentences_len)

Diacritic Error Rate =  1.7751479289940828 %
Diacritic Correct Rate =  98.22485207100591 %
Number of Misclassified =  6 out of 338


In [168]:
# # Switch the model to evaluation mode
# model.eval()
# # Assume 'input_sentence' is your input sentence
# input_sentence = test_sentences[0]
# print("Input sentence : ",input_sentence)

# # Process the input_sentence in the same way as you did for your training data
# sentence_without_diacritics, original_diacritics = character_encoding.remove_diacritics(input_sentence, True)
# sentence = character_encoding.getSentenceVector(sentence_without_diacritics)
# sentence,_ = character_encoding.padding(sentence, len(character_encoding.ARABIC_ALPHABIT) + 2, max_length=PADDING_SIZE)
# diacritic = character_encoding.getDiacriticVector(original_diacritics)
# diacritic,_ = character_encoding.padding(diacritic, len(character_encoding.DIACRITICS), max_length=PADDING_SIZE)
# sentence = torch.tensor(sentence, dtype=(torch.float32)).unsqueeze(0).to(device)  # Add an extra dimension for batch and move to device

# # We don't need to compute gradients during evaluation, so we wrap this in torch.no_grad()
# with torch.no_grad():
#     hidden = model.init_hidden(batch_size=1).to(device)  # Batch size is 1 for inference
#     # Forward pass
#     output = model(sentence, hidden)
# print(sentence.shape)

# # The output is the model's prediction, you might want to post-process this output to convert it back into a readable format
# prediction = output.argmax(dim=2)  # This gives you the index of the highest value in the output tensor


In [169]:
# predicted_diacritics = character_encoding.index_to_char(prediction[0])
# der, miss = diacritics_error_rate(original_diacritics, predicted_diacritics)
# print("Diacritics error rate : ", der, "%")
# print("Correct diacritics rate : ", 100 - der, "%")
# print("Number of miss : ", miss, "out of ", len(original_diacritics))
# print("Original Sentence : ", input_sentence)
# restored_sentence = character_encoding.restore_diacritics(sentence_without_diacritics,predicted_diacritics)
# print("Restored Sentence : ", restored_sentence)
# print("Original diacritics : ", character_encoding.map_text_to_diacritic(original_diacritics))
# print("Predicted diacritics : ",character_encoding.map_text_to_diacritic( predicted_diacritics))