In [11]:
import torch 
from torch.utils.data import Dataset
from tqdm import tqdm
import re
import pickle as pkl

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TashkeelDataset(Dataset):
    def __init__(self, name, path):
        self.name = name
        with open(path, 'r', encoding='utf-8') as file:
            self.lines = list(tqdm(file, f"Reading {self.name} Lines"))
        self._load_dicts()
        self.tokenized_lines = self._tokenize_lines()
        self.embedded_data = self._embedd_lines()

    def __len__(self):
        return len(self.embedded_data)

    def __getitem__(self, idx):
        x, y = self.embedded_data[idx]
        return torch.tensor(x).to(device), torch.tensor(y).to(device)
    
    def _remove_tashkeel(self,data):
        #double damma, double fatha, double kasera, damma, fatha, kasera, sukoon, shadd
        TASHKEEL_SET = {'ٌ', 'ً', 'ٍ', 'ُ', 'َ', 'ِ', 'ْ', 'ٌّ', 'ّ'}
        DIACRITICS_REGEX = re.compile('|'.join(TASHKEEL_SET))
        return re.sub(DIACRITICS_REGEX, '', data)
    
    def _one_hot_encode(self, indices, size):
        return [[1 if i == elem else 0 for i in range(size)] for elem in indices]
    
    def _chunk_text(self, text, chunk_size):
        chunks = []
        words = re.findall(r'\S+', text)

        current_chunk = ""
        for word in words:
            if len(current_chunk) + len(word) + 1 <= chunk_size:
                current_chunk += f"{word} "
            else:
                chunks.append(current_chunk.strip())
                current_chunk = f"{word} "

        if current_chunk:
            chunks.append(current_chunk.strip())

        return list(filter(None, chunks))
    
    def _tokenize_lines(self):
        # Define a pattern to match specific punctuation marks
        punctuation_pattern1 = r'([.,:;؛)\]}»،])'
        punctuation_pattern2 = r'([(\[{«])'
        tokenized_lines = []

        for line in tqdm(self.lines, f"Tokenizing {self.name} Lines"):
            # Replace matched punctuation marks with the same followed by a line break
            splitted_line = re.sub(punctuation_pattern1, r'\1\n', line)
            splitted_line = re.sub(punctuation_pattern2, r'\n\1', splitted_line)

            # Further split the splitted line into substrings based on line breaks
            for sub_line in splitted_line.split('\n'):
                cleaned_sub_line = self._remove_tashkeel(sub_line).strip()
                if 0 < len(cleaned_sub_line) <= 500:
                    tokenized_lines.append(sub_line.strip())

                elif len(cleaned_sub_line) > 500:
                    tokenized_lines.extend(self._chunk_text(sub_line.strip(), 500))
    
        return tokenized_lines

    def _load_dicts(self):
        with open( '../utilities/pickle_files/LETTERS.pickle', 'rb') as file:
            self.LETTERS = pkl.load(file)
        with open( '../utilities/pickle_files/DIACRITICS.pickle', 'rb') as file:
            self.DIACRITICS = pkl.load(file)
        with open( '../utilities/pickle_files/CHAR_TO_ID.pickle', 'rb') as file:
            self.CHAR_TO_ID = pkl.load(file)
        with open( '../utilities/pickle_files/DIACRITIC_TO_ID.pickle', 'rb') as file:
            self.DIACRITIC_TO_ID = pkl.load(file)
        
    def _embedd_lines(self):
        inputs_embeddings=[]
        for line in tqdm(self.tokenized_lines, f"Embedding {self.name} Lines"):
            x = [self.CHAR_TO_ID['<SOS>']]
            y = [self.DIACRITIC_TO_ID['<SOS>']]

            for index, char in enumerate(line):
                if char in self.CHAR_TO_ID:
                    x.append(self.CHAR_TO_ID[char])
                else:
                    if char not in self.DIACRITICS:
                        x.append(self.CHAR_TO_ID['<UNK>'])

                if char not in self.LETTERS:
                    y.append(self.DIACRITIC_TO_ID[''])
                else:
                    char_diac = ''
                    if index + 1 < len(line) and line[index + 1] in self.DIACRITICS:
                        char_diac = line[index + 1]
                        if index + 2 < len(line) and line[index + 2] in self.DIACRITICS and char_diac + line[index + 2] in self.DIACRITIC_TO_ID:
                            char_diac += line[index + 2]
                        elif index + 2 < len(line) and line[index + 2] in self.DIACRITICS and line[index + 2] + char_diac in self.DIACRITIC_TO_ID:
                            char_diac = line[index + 2] + char_diac
                    y.append(self.DIACRITIC_TO_ID[char_diac])

            x.append(self.CHAR_TO_ID['<EOS>'])
            y.append(self.DIACRITIC_TO_ID['<EOS>'])
            y = self._one_hot_encode(y, len(self.DIACRITIC_TO_ID))
            
            inputs_embeddings.append((x, y)) 
            
        return inputs_embeddings

In [12]:
train_dataset = TashkeelDataset('train dataset','test.txt')

Reading train dataset Lines: 2it [00:00, 1000.79it/s]
Tokenizing train dataset Lines: 100%|██████████| 2/2 [00:00<00:00, 1986.41it/s]
Embedding train dataset Lines: 100%|██████████| 2/2 [00:00<00:00, 993.32it/s]


In [13]:
print(train_dataset.tokenized_lines)


['هَذِهِ تَجْرِبَةٌ لِلتَّشْكِيلِ بِالذَّكَاءِ الِاصْطِناعيِّ', 'هَذِهِ تَجْرِبَةٌ لِلتَّشْكِيلِ بِالذَّكَاءِ الِاصْطِناعيِّ']


In [14]:
print(train_dataset.CHAR_TO_ID)

{'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3, '\n': 4, ' ': 5, '!': 6, '"': 7, '&': 8, "'": 9, '(': 10, ')': 11, '*': 12, '+': 13, ',': 14, '-': 15, '.': 16, '/': 17, '0': 18, '1': 19, '2': 20, '3': 21, '4': 22, '5': 23, '6': 24, '7': 25, '8': 26, '9': 27, ':': 28, ';': 29, '=': 30, '[': 31, ']': 32, '_': 33, '`': 34, '{': 35, '}': 36, '~': 37, '«': 38, '»': 39, '،': 40, '؛': 41, '؟': 42, 'ء': 43, 'آ': 44, 'أ': 45, 'ؤ': 46, 'إ': 47, 'ئ': 48, 'ا': 49, 'ب': 50, 'ة': 51, 'ت': 52, 'ث': 53, 'ج': 54, 'ح': 55, 'خ': 56, 'د': 57, 'ذ': 58, 'ر': 59, 'ز': 60, 'س': 61, 'ش': 62, 'ص': 63, 'ض': 64, 'ط': 65, 'ظ': 66, 'ع': 67, 'غ': 68, 'ف': 69, 'ق': 70, 'ك': 71, 'ل': 72, 'م': 73, 'ن': 74, 'ه': 75, 'و': 76, 'ى': 77, 'ي': 78, '٠': 79, '١': 80, '٢': 81, '٤': 82, '\u200d': 83, '\u200f': 84, '–': 85, '’': 86, '“': 87, '…': 88, '﴾': 89, '﴿': 90}


In [15]:
print(train_dataset.embedded_data[0][0])

[1, 75, 58, 75, 5, 52, 54, 59, 50, 51, 5, 72, 72, 52, 62, 71, 78, 72, 5, 50, 49, 72, 58, 71, 49, 43, 5, 49, 72, 49, 63, 65, 74, 49, 67, 78, 2]


In [31]:
print(train_dataset.embedded_data)

# List of tuples
# Each tuple containts one list representing x, and list of lists representing y values

[([1, 75, 58, 75, 5, 52, 54, 59, 50, 51, 5, 72, 72, 52, 62, 71, 78, 72, 5, 50, 49, 72, 58, 71, 49, 43, 5, 49, 72, 49, 63, 65, 74, 49, 67, 78, 2], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0,

In [32]:
# Indexing first tuple
print(len(train_dataset.embedded_data[0]))

2


In [36]:
print(len(train_dataset.embedded_data[0][1]))

61


In [45]:
print(train_dataset[0][0].shape)
print(train_dataset[0][1].shape)

torch.Size([37])
torch.Size([61, 19])


In [30]:
print(len(train_dataset.embedded_data[0][0]))
print(len(train_dataset.embedded_data[0][1]))

37
61


In [46]:
with open( '../utilities/pickle_files/ID_TO_DIACRITIC.pickle', 'rb') as file:
    ID2DIAC = pkl.load(file)

print(ID2DIAC)

{0: '', 1: 'َ', 2: 'ً', 3: 'ُ', 4: 'ٌ', 5: 'ِ', 6: 'ٍ', 7: 'ْ', 8: 'ّ', 9: 'َّ', 10: 'ًّ', 11: 'ُّ', 12: 'ٌّ', 13: 'ِّ', 14: 'ٍّ', 15: '<PAD>', 16: '<SOS>', 17: '<EOS>', 18: '<N/A>'}


In [49]:


for one_hot in train_dataset[0][1]:
    print(f'{ID2DIAC[one_hot.argmax().item()]} ')

<SOS> 
َ 
 
ِ 
 
ِ 
 
 
َ 
 
ْ 
 
ِ 
 
َ 
 
ٌ 
 
 
ِ 
 
 
َّ 
 
 
ْ 
 
ِ 
 
 
ِ 
 
 
ِ 
 
 
 
َّ 
 
 
َ 
 
 
ِ 
 
 
 
ِ 
 
 
ْ 
 
ِ 
 
 
 
 
ِّ 
 
 
<EOS> 


In [29]:
for idx, x,y in enumerate(train_dataset.embedded_data):
    print(f'{len(x)} token in x: {x}\n')
    for i,diac in enumerate(y[idx]):
        print(f'y{i}: {diac}\n')

ValueError: not enough values to unpack (expected 3, got 2)

In [17]:
from torch.utils.data import DataLoader

import torch.nn.utils.rnn as rnn_utils

def collate_fn(batch):
    x_batch, y_batch = zip(*batch)
    x_padded = rnn_utils.pad_sequence(x_batch, batch_first=True, padding_value=train_dataset.CHAR_TO_ID['<PAD>'])
    y_padded = rnn_utils.pad_sequence(y_batch, batch_first=True, padding_value=train_dataset.DIACRITIC_TO_ID['<PAD>'])
    return x_padded, y_padded

# Create a DataLoader instance with collate_fn
dataloader_train = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
dataloader_test = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

NameError: name 'val_dataset' is not defined

In [None]:
import torch.nn as nn 

class MeshakkelatyModel(nn.Module):
    def __init__(self, char_to_id, diacritic_to_id):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=len(char_to_id),
            embedding_dim=25,
            padding_idx=char_to_id['<PAD>']  
        )
        self.lstm1 = nn.LSTM(
            input_size=25,
            hidden_size=256,
            num_layers=2,
            bidirectional=True,
            dropout=0.5,
            batch_first=True  
        )
        self.linear1 = nn.Linear(2*256, 512)
        self.linear2 = nn.Linear(512, len(diacritic_to_id))

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x = nn.functional.relu(self.linear1(x))
        x = self.linear2(x)
        return x

In [None]:
import torch.optim as optim 
from torchmetrics import Accuracy

meshakkelaty = MeshakkelatyModel(train_dataset.CHAR_TO_ID, train_dataset.DIACRITIC_TO_ID).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(meshakkelaty.parameters())
epochs = 10
metric = Accuracy(task="multiclass", num_classes=len(train_dataset.DIACRITIC_TO_ID)).to(device)

In [None]:
x,y=next(iter(dataloader_train))
print(y.shape)

torch.Size([64, 775, 19])


In [None]:
print(y)

tensor([[[ 0,  0,  0,  ...,  1,  0,  0],
         [ 1,  0,  0,  ...,  0,  0,  0],
         [ 1,  0,  0,  ...,  0,  0,  0],
         ...,
         [15, 15, 15,  ..., 15, 15, 15],
         [15, 15, 15,  ..., 15, 15, 15],
         [15, 15, 15,  ..., 15, 15, 15]],

        [[ 0,  0,  0,  ...,  1,  0,  0],
         [ 0,  1,  0,  ...,  0,  0,  0],
         [ 1,  0,  0,  ...,  0,  0,  0],
         ...,
         [15, 15, 15,  ..., 15, 15, 15],
         [15, 15, 15,  ..., 15, 15, 15],
         [15, 15, 15,  ..., 15, 15, 15]],

        [[ 0,  0,  0,  ...,  1,  0,  0],
         [ 0,  1,  0,  ...,  0,  0,  0],
         [ 1,  0,  0,  ...,  0,  0,  0],
         ...,
         [15, 15, 15,  ..., 15, 15, 15],
         [15, 15, 15,  ..., 15, 15, 15],
         [15, 15, 15,  ..., 15, 15, 15]],

        ...,

        [[ 0,  0,  0,  ...,  1,  0,  0],
         [ 0,  1,  0,  ...,  0,  0,  0],
         [ 1,  0,  0,  ...,  0,  0,  0],
         ...,
         [15, 15, 15,  ..., 15, 15, 15],
         [15, 15, 15, 

In [None]:
y_pred = meshakkelaty(x)
print(y_pred.shape)

torch.Size([64, 775, 19])


In [None]:
for epoch in range(epochs):
    meshakkelaty.train()
    total_batches = len(dataloader_train)
    total_accuracy = 0.0

    for batch_idx, (x_batch, y_batch) in enumerate(tqdm(dataloader_train, desc=f"Epoch {epoch + 1}/{epochs}")):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = meshakkelaty(x_batch)
        loss = criterion(y_pred, y_batch.float())
        loss.backward()
        optimizer.step()
        train_acc = metric(y_pred.argmax(dim=-1), y_batch)
        total_accuracy += train_acc.item()

        # Print intermediate accuracy
        if (batch_idx + 1) % 10 == 0 or (batch_idx + 1) == total_batches:
            intermediate_accuracy = total_accuracy / (batch_idx + 1)
            print(f'Epoch {epoch + 1}/{epochs}, Batch {batch_idx + 1}/{total_batches}, Train Accuracy: {intermediate_accuracy:.4f}')

    # Compute and print the accuracy for the entire epoch
    train_acc = metric.compute()
    print(f'Epoch {epoch + 1}/{epochs}, Train Accuracy: {train_acc:.4f}')


Epoch 1/10:   0%|          | 0/241 [00:00<?, ?it/s]


ValueError: Either `preds` and `target` both should have the (same) shape (N, ...), or `target` should be (N, ...) and `preds` should be (N, C, ...).

In [None]:
for epoch in range(epochs):
    meshakkelaty.train()
    for x_batch, y_batch in tqdm(dataloader_train, desc=f"Epoch {epoch + 1}/{epochs}"):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = meshakkelaty(x_batch)
        loss = criterion(y_pred, y_batch.float())
        loss.backward()
        optimizer.step()
        train_acc = metric(y_pred.argmax(dim=-1), y_batch)
    train_acc = metric.compute()       
    print(f'Epoch {epoch + 1}/{epochs}, Train Accuracy: {train_acc:.4f}')
   

Epoch 1/10: 100%|██████████| 241/241 [05:11<00:00,  1.29s/it]


Epoch 1/10, Train Accuracy: 0.0000


Epoch 2/10:   2%|▏         | 5/241 [00:05<04:30,  1.14s/it]


KeyboardInterrupt: 