# Arabic Diacritization Model
This notebook trains a small model to predict Arabic diacritics, exports to ONNX, and benchmarks performance.

## 1. Install Dependencies

In [1]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --break-system-packages
!pip install onnx onnxruntime pyarabic --break-system-packages

[0m

## 2. Import Libraries

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import re
import time
from collections import Counter
import onnx
import onnxruntime as ort
from pathlib import Path

## 3. Configuration

Download dataset: https://huggingface.co/datasets/TigreGotico/arabic_diacritized_text

In [2]:
!ls /datasets/arabic-diacritics

test.txt  train.txt  val.txt


In [4]:
# File paths
TRAIN_FILE = '/datasets/arabic-diacritics/train.txt'  # Training data file
VAL_FILE = '/datasets/arabic-diacritics/val.txt'      # Validation data file
TEST_FILE = '/datasets/arabic-diacritics/test.txt'    # Test data file
MODEL_PATH = 'diacritization_model.pth'
ONNX_PATH = 'diacritization_model.onnx'

# Model hyperparameters
EMBEDDING_DIM = 64
HIDDEN_DIM = 128
NUM_LAYERS = 2
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 0.001
MAX_SEQ_LENGTH = 100

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


## 4. Arabic Diacritics Utilities

In [5]:
# Arabic diacritics
ARABIC_DIACRITICS = {
    'َ': 'FATHA',           # Fatha
    'ً': 'TANWIN_FATH',    # Tanwin Fath
    'ُ': 'DAMMA',           # Damma
    'ٌ': 'TANWIN_DAMM',    # Tanwin Damm
    'ِ': 'KASRA',           # Kasra
    'ٍ': 'TANWIN_KASR',    # Tanwin Kasr
    'ْ': 'SUKUN',           # Sukun
    'ّ': 'SHADDA',          # Shadda
}

DIACRITIC_CHARS = ''.join(ARABIC_DIACRITICS.keys())

def remove_diacritics(text):
    """Remove all Arabic diacritics from text"""
    return re.sub(f'[{DIACRITIC_CHARS}]', '', text)

def extract_diacritics(text):
    """Extract diacritics aligned with characters"""
    result = []
    i = 0
    while i < len(text):
        char = text[i]
        if char in DIACRITIC_CHARS:
            i += 1
            continue
        
        # Collect diacritics after this character
        diacritics = []
        j = i + 1
        while j < len(text) and text[j] in DIACRITIC_CHARS:
            diacritics.append(text[j])
            j += 1
        
        # Store character and its diacritics
        result.append((char, ''.join(diacritics)))
        i = j
    
    return result

print("Diacritics defined:", list(ARABIC_DIACRITICS.keys()))

Diacritics defined: ['َ', 'ً', 'ُ', 'ٌ', 'ِ', 'ٍ', 'ْ', 'ّ']


## 5. Load and Prepare Data

In [6]:
import random

def load_sentences(file_path):
    """Load sentences from a file"""
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]
    random.shuffle(sentences)
    return sentences[:150000]

# Load all datasets
print("Loading datasets...")
train_sentences = load_sentences(TRAIN_FILE)
val_sentences = load_sentences(VAL_FILE)
test_sentences = load_sentences(TEST_FILE)

print(f"Loaded {len(train_sentences)} training sentences")
print(f"Loaded {len(val_sentences)} validation sentences")
print(f"Loaded {len(test_sentences)} test sentences")
print(f"\nSample training sentence: {train_sentences[0][:100]}...")

# Create paired dataset function
def create_paired_data(sentences):
    """Convert sentences to paired undiacritized/diacritized format"""
    paired_data = []
    for sentence in sentences:
        undiacritized = remove_diacritics(sentence)
        char_diac_pairs = extract_diacritics(sentence)
        
        if len(undiacritized) > 0 and len(char_diac_pairs) > 0:
            paired_data.append({
                'undiacritized': undiacritized,
                'diacritized': sentence,
                'char_diac_pairs': char_diac_pairs
            })
    return paired_data

# Create paired datasets
train_paired_data = create_paired_data(train_sentences)
val_paired_data = create_paired_data(val_sentences)
test_paired_data = create_paired_data(test_sentences)

print(f"\nCreated {len(train_paired_data)} training pairs")
print(f"Created {len(val_paired_data)} validation pairs")
print(f"Created {len(test_paired_data)} test pairs")
print(f"\nExample:")
print(f"Undiacritized: {train_paired_data[0]['undiacritized'][:50]}")
print(f"Diacritized: {train_paired_data[0]['diacritized'][:50]}")

Loading datasets...
Loaded 150000 training sentences
Loaded 150000 validation sentences
Loaded 150000 test sentences

Sample training sentence: مُنَاوِيٌّ .( 6 / 155 )...

Created 150000 training pairs
Created 150000 validation pairs
Created 150000 test pairs

Example:
Undiacritized: مناوي .( 6 / 155 )
Diacritized: مُنَاوِيٌّ .( 6 / 155 )


## 6. Build Vocabulary

In [7]:
# Build character vocabulary from all datasets
all_chars = set()
for item in train_paired_data + val_paired_data + test_paired_data:
    all_chars.update(item['undiacritized'])

# Special tokens
char_to_idx = {'<PAD>': 0, '<UNK>': 1}
for char in sorted(all_chars):
    char_to_idx[char] = len(char_to_idx)

idx_to_char = {v: k for k, v in char_to_idx.items()}

# Build diacritic vocabulary (combinations of diacritics) from all datasets
all_diacritics = set()
for item in train_paired_data + val_paired_data + test_paired_data:
    for _, diacs in item['char_diac_pairs']:
        all_diacritics.add(diacs)

# Include empty string for no diacritic
diac_to_idx = {'': 0}  # No diacritic
for diac in sorted(all_diacritics):
    if diac:  # Skip empty string as it's already added
        diac_to_idx[diac] = len(diac_to_idx)

idx_to_diac = {v: k for k, v in diac_to_idx.items()}

print(f"Character vocabulary size: {len(char_to_idx)}")
print(f"Diacritic vocabulary size: {len(diac_to_idx)}")
print(f"\nDiacritic classes: {list(diac_to_idx.keys())[:10]}...")

Character vocabulary size: 201
Diacritic vocabulary size: 28

Diacritic classes: ['', 'ً', 'ًّ', 'ٌ', 'ٌّ', 'ٍ', 'ٍّ', 'َ', 'ًَ', 'ََ']...


## 7. Dataset Class

In [8]:
class ArabicDiacritizationDataset(Dataset):
    def __init__(self, paired_data, char_to_idx, diac_to_idx, max_length):
        self.data = paired_data
        self.char_to_idx = char_to_idx
        self.diac_to_idx = diac_to_idx
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        # Encode characters
        chars = item['undiacritized'][:self.max_length]
        char_indices = [self.char_to_idx.get(c, self.char_to_idx['<UNK>']) for c in chars]
        
        # Encode diacritics
        diac_indices = []
        for char, diacs in item['char_diac_pairs'][:self.max_length]:
            diac_idx = self.diac_to_idx.get(diacs, 0)
            diac_indices.append(diac_idx)
        
        # Pad sequences
        seq_len = len(char_indices)
        char_indices += [0] * (self.max_length - seq_len)
        diac_indices += [0] * (self.max_length - seq_len)
        
        return {
            'chars': torch.tensor(char_indices, dtype=torch.long),
            'diacritics': torch.tensor(diac_indices, dtype=torch.long),
            'length': seq_len
        }

# Create datasets from the separate files
train_dataset = ArabicDiacritizationDataset(train_paired_data, char_to_idx, diac_to_idx, MAX_SEQ_LENGTH)
val_dataset = ArabicDiacritizationDataset(val_paired_data, char_to_idx, diac_to_idx, MAX_SEQ_LENGTH)
test_dataset = ArabicDiacritizationDataset(test_paired_data, char_to_idx, diac_to_idx, MAX_SEQ_LENGTH)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

Training samples: 150000
Validation samples: 150000
Test samples: 150000


## 8. Model Definition

In [9]:
class DiacritizationModel(nn.Module):
    def __init__(self, vocab_size, diac_size, embedding_dim, hidden_dim, num_layers):
        super(DiacritizationModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            bidirectional=True
        )
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim * 2, diac_size)
    
    def forward(self, x):
        # x: (batch, seq_len)
        embedded = self.embedding(x)  # (batch, seq_len, embedding_dim)
        lstm_out, _ = self.lstm(embedded)  # (batch, seq_len, hidden_dim*2)
        lstm_out = self.dropout(lstm_out)
        output = self.fc(lstm_out)  # (batch, seq_len, diac_size)
        return output

model = DiacritizationModel(
    vocab_size=len(char_to_idx),
    diac_size=len(diac_to_idx),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS
).to(device)

print(model)
print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters()):,}")

DiacritizationModel(
  (embedding): Embedding(201, 64, padding_idx=0)
  (lstm): LSTM(64, 128, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=28, bias=True)
)

Total parameters: 613,980


## 9. Training

In [10]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    for batch in loader:
        chars = batch['chars'].to(device)
        diacritics = batch['diacritics'].to(device)
        
        optimizer.zero_grad()
        outputs = model(chars)
        
        # Reshape for loss calculation
        outputs = outputs.view(-1, outputs.size(-1))
        diacritics = diacritics.view(-1)
        
        loss = criterion(outputs, diacritics)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        # Calculate accuracy
        predictions = outputs.argmax(dim=1)
        mask = diacritics != 0
        correct += (predictions[mask] == diacritics[mask]).sum().item()
        total += mask.sum().item()
    
    return total_loss / len(loader), correct / total if total > 0 else 0

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in loader:
            chars = batch['chars'].to(device)
            diacritics = batch['diacritics'].to(device)
            
            outputs = model(chars)
            
            outputs_flat = outputs.view(-1, outputs.size(-1))
            diacritics_flat = diacritics.view(-1)
            
            loss = criterion(outputs_flat, diacritics_flat)
            total_loss += loss.item()
            
            predictions = outputs_flat.argmax(dim=1)
            mask = diacritics_flat != 0
            correct += (predictions[mask] == diacritics_flat[mask]).sum().item()
            total += mask.sum().item()
    
    return total_loss / len(loader), correct / total if total > 0 else 0

# Training loop
print("Starting training...\n")
for epoch in range(EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    
    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}\n")

# Save model
torch.save(model.state_dict(), MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")

Starting training...

Epoch 1/10
  Train Loss: 0.3882, Train Acc: 0.8679
  Val Loss: 0.2259, Val Acc: 0.9238

Epoch 2/10
  Train Loss: 0.2093, Train Acc: 0.9298
  Val Loss: 0.1796, Val Acc: 0.9393

Epoch 3/10
  Train Loss: 0.1738, Train Acc: 0.9417
  Val Loss: 0.1632, Val Acc: 0.9449

Epoch 4/10
  Train Loss: 0.1551, Train Acc: 0.9479
  Val Loss: 0.1501, Val Acc: 0.9496

Epoch 5/10
  Train Loss: 0.1431, Train Acc: 0.9519
  Val Loss: 0.1440, Val Acc: 0.9521

Epoch 6/10
  Train Loss: 0.1340, Train Acc: 0.9548
  Val Loss: 0.1390, Val Acc: 0.9534

Epoch 7/10
  Train Loss: 0.1270, Train Acc: 0.9571
  Val Loss: 0.1364, Val Acc: 0.9546

Epoch 8/10
  Train Loss: 0.1215, Train Acc: 0.9588
  Val Loss: 0.1340, Val Acc: 0.9554

Epoch 9/10
  Train Loss: 0.1167, Train Acc: 0.9604
  Val Loss: 0.1338, Val Acc: 0.9560

Epoch 10/10
  Train Loss: 0.1126, Train Acc: 0.9618
  Val Loss: 0.1325, Val Acc: 0.9567

Model saved to diacritization_model.pth


## 10. Export to ONNX

In [11]:
# Prepare dummy input for ONNX export
dummy_input = torch.randint(0, len(char_to_idx), (1, MAX_SEQ_LENGTH)).to(device)

# Export
model.eval()
torch.onnx.export(
    model,
    dummy_input,
    ONNX_PATH,
    export_params=True,
    opset_version=12,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['output'],
    dynamic_axes={
        'input': {0: 'batch_size', 1: 'sequence'},
        'output': {0: 'batch_size', 1: 'sequence'}
    }
)

print(f"Model exported to {ONNX_PATH}")

# Verify ONNX model
onnx_model = onnx.load(ONNX_PATH)
onnx.checker.check_model(onnx_model)
print("ONNX model verified successfully!")

Model exported to diacritization_model.onnx
ONNX model verified successfully!




## 11. Benchmark

In [12]:
def benchmark_pytorch(model, device, num_iterations=100):
    """Benchmark PyTorch model"""
    model.eval()
    test_input = torch.randint(0, len(char_to_idx), (1, MAX_SEQ_LENGTH)).to(device)
    
    # Warmup
    with torch.no_grad():
        for _ in range(10):
            _ = model(test_input)
    
    # Benchmark
    times = []
    with torch.no_grad():
        for _ in range(num_iterations):
            start = time.time()
            _ = model(test_input)
            times.append(time.time() - start)
    
    return np.mean(times), np.std(times)

def benchmark_onnx(onnx_path, num_iterations=100):
    """Benchmark ONNX model"""
    session = ort.InferenceSession(onnx_path)
    test_input = np.random.randint(0, len(char_to_idx), (1, MAX_SEQ_LENGTH), dtype=np.int64)
    
    # Warmup
    for _ in range(10):
        _ = session.run(None, {'input': test_input})
    
    # Benchmark
    times = []
    for _ in range(num_iterations):
        start = time.time()
        _ = session.run(None, {'input': test_input})
        times.append(time.time() - start)
    
    return np.mean(times), np.std(times)

# Run benchmarks
print("Benchmarking PyTorch model...")
pytorch_mean, pytorch_std = benchmark_pytorch(model, device)
print(f"PyTorch - Mean: {pytorch_mean*1000:.2f}ms, Std: {pytorch_std*1000:.2f}ms")

print("\nBenchmarking ONNX model...")
onnx_mean, onnx_std = benchmark_onnx(ONNX_PATH)
print(f"ONNX - Mean: {onnx_mean*1000:.2f}ms, Std: {onnx_std*1000:.2f}ms")

print(f"\nSpeedup: {pytorch_mean/onnx_mean:.2f}x")

# Throughput calculation
pytorch_throughput = 1.0 / pytorch_mean
onnx_throughput = 1.0 / onnx_mean

print(f"\nThroughput:")
print(f"PyTorch: {pytorch_throughput:.2f} sequences/second")
print(f"ONNX: {onnx_throughput:.2f} sequences/second")

Benchmarking PyTorch model...
PyTorch - Mean: 0.49ms, Std: 0.04ms

Benchmarking ONNX model...
ONNX - Mean: 5.33ms, Std: 5.37ms

Speedup: 0.09x

Throughput:
PyTorch: 2054.99 sequences/second
ONNX: 187.46 sequences/second


## 12. Test Inference

In [13]:
def predict_diacritics(text, model, char_to_idx, idx_to_diac, device, max_length=MAX_SEQ_LENGTH):
    """Predict diacritics for undiacritized text"""
    model.eval()
    
    # Remove any existing diacritics
    text = remove_diacritics(text)
    
    # Encode
    char_indices = [char_to_idx.get(c, char_to_idx['<UNK>']) for c in text[:max_length]]
    char_indices += [0] * (max_length - len(char_indices))
    
    # Predict
    with torch.no_grad():
        input_tensor = torch.tensor([char_indices], dtype=torch.long).to(device)
        output = model(input_tensor)
        predictions = output.argmax(dim=-1).squeeze().cpu().numpy()
    
    # Reconstruct
    result = []
    for i, char in enumerate(text):
        result.append(char)
        diac = idx_to_diac.get(predictions[i], '')
        if diac:
            result.append(diac)
    
    return ''.join(result)

# Test on test samples
print("Testing predictions on test samples:\n")
for i in range(min(3, len(test_paired_data))):
    sample = test_paired_data[i]
    original = sample['diacritized']
    undiacritized = sample['undiacritized']
    predicted = predict_diacritics(undiacritized, model, char_to_idx, idx_to_diac, device)
    
    print(f"Sample {i+1}:")
    print(f"Undiacritized: {undiacritized[:80]}")
    print(f"Original:      {original[:80]}")
    print(f"Predicted:     {predicted[:80]}")
    print()

Testing predictions on test samples:

Sample 1:
Undiacritized: وإذا اختلفا في حائط بين دارين ،
Original:      وَإِذَا اخْتَلَفَا فِي حَائِطٍ بَيْنَ دَارَيْنِ ،
Predicted:     وَإِذَاَ َاُخْتَلَفَاَ َفِيَ ِحَاْئِطٍ ِبَيْنَ ِدَاَرَيْنِ َ،َ

Sample 2:
Undiacritized: فلم يكن في العرب ملوك يدفع بعضهم عن بعض ،
Original:      فَلَمْ يَكُنْ فِي العَرَبِ مُلُوكٌ يَدْفَعُ بَعْضُهُمْ عَنْ بَعْضٍ ،
Predicted:     فَلَمْ ِيَكُنْ ِفِيَ ِاَلْعَرَبِ ِمُلُوِّكٌ َيَدْفَعُ ِبَعْضُهُمْ ِعَنْ ِبَعْضٍ 

Sample 3:
Undiacritized: بين الخطبتين .
Original:      بَيْنَ الخُطْبَتَيْنِ .
Predicted:     بَيْنَ ِاَلْخُطْبَتَيْنِ ِ.َ



## 13. Test Set Evaluation

In [14]:
# Evaluate on test set
print("Evaluating on test set...\n")
test_loss, test_acc = evaluate(model, test_loader, criterion, device)

print(f"Test Results:")
print(f"  Test Loss: {test_loss:.4f}")
print(f"  Test Accuracy: {test_acc:.4f}")
print(f"  Test Error Rate: {(1-test_acc)*100:.2f}%")

Evaluating on test set...

Test Results:
  Test Loss: 0.1326
  Test Accuracy: 0.9567
  Test Error Rate: 4.33%


## 14. Model Summary

In [15]:
import os

print("=" * 60)
print("MODEL SUMMARY")
print("=" * 60)
print(f"\nDataset:")
print(f"  Training sentences: {len(train_sentences)} ({len(train_paired_data)} pairs)")
print(f"  Validation sentences: {len(val_sentences)} ({len(val_paired_data)} pairs)")
print(f"  Test sentences: {len(test_sentences)} ({len(test_paired_data)} pairs)")
print(f"  Total: {len(train_sentences) + len(val_sentences) + len(test_sentences)} sentences")

print(f"\nModel Architecture:")
print(f"  Embedding dim: {EMBEDDING_DIM}")
print(f"  Hidden dim: {HIDDEN_DIM}")
print(f"  Num layers: {NUM_LAYERS}")
print(f"  Total parameters: {sum(p.numel() for p in model.parameters()):,}")

print(f"\nVocabulary:")
print(f"  Character vocab size: {len(char_to_idx)}")
print(f"  Diacritic classes: {len(diac_to_idx)}")

print(f"\nPerformance:")
print(f"  Final validation accuracy: {val_acc:.4f}")
print(f"  Test accuracy: {test_acc:.4f}")

print(f"\nFiles:")
print(f"  PyTorch model: {MODEL_PATH} ({os.path.getsize(MODEL_PATH)/1024:.2f} KB)")
print(f"  ONNX model: {ONNX_PATH} ({os.path.getsize(ONNX_PATH)/1024:.2f} KB)")

print("\n" + "=" * 60)

MODEL SUMMARY

Dataset:
  Training sentences: 150000 (150000 pairs)
  Validation sentences: 150000 (150000 pairs)
  Test sentences: 150000 (150000 pairs)
  Total: 450000 sentences

Model Architecture:
  Embedding dim: 64
  Hidden dim: 128
  Num layers: 2
  Total parameters: 613,980

Vocabulary:
  Character vocab size: 201
  Diacritic classes: 28

Performance:
  Final validation accuracy: 0.9567
  Test accuracy: 0.9567

Files:
  PyTorch model: diacritization_model.pth (2401.87 KB)
  ONNX model: diacritization_model.onnx (2402.70 KB)

