In [None]:
import sentencepiece as spm
from pathlib import Path


# Notebook-relative locations, assembled one path component at a time:
# the cleaned corpus files are read from data_dir, and the trained
# tokenizer models are written to / loaded from tokenizer_dir.
data_dir = Path("..") / "Data" / "cleaned_data"
tokenizer_dir = Path("..") / "tokenizer_models"

In [2]:
# The trainer writes "<model_prefix>.model" and "<model_prefix>.vocab" into
# tokenizer_dir but does not create that directory itself, so make sure it
# exists before training (otherwise a fresh checkout fails at save time).
tokenizer_dir.mkdir(parents=True, exist_ok=True)

# Special-token IDs shared by both tokenizers. Defined once so the Arabic and
# English vocabularies cannot drift apart.
special_token_ids = dict(pad_id=0, bos_id=1, eos_id=2, unk_id=3)

# Train Arabic tokenizer
spm.SentencePieceTrainer.Train(
    input=str(data_dir/"train.cleaned.ar"),
    model_prefix=str(tokenizer_dir/"spm_ar_unigram"),
    vocab_size=32000,
    model_type="unigram",
    character_coverage=0.9995,    # cover Arabic characters; the rarest 0.05% fall back to <unk>
    **special_token_ids,
)

# Train English tokenizer
spm.SentencePieceTrainer.Train(
    input=str(data_dir/"train.cleaned.en"),
    model_prefix=str(tokenizer_dir/"spm_en_unigram"),
    vocab_size=26000,
    model_type="unigram",
    character_coverage=1.0,       # keep every character seen in the English training text
    **special_token_ids,
)

print("✅ Tokenizers trained for Arabic and English")



✅ تم تدريب التوكنيزر للعربي والإنجليزي


# Try the trained models

In [3]:
tokenizer_dir = Path("..") / "tokenizer_models"

# Load the two trained unigram models (Arabic and English) from disk.
sp_ar = spm.SentencePieceProcessor(
    model_file=str(tokenizer_dir.joinpath("spm_ar_unigram.model"))
)
sp_en = spm.SentencePieceProcessor(
    model_file=str(tokenizer_dir.joinpath("spm_en_unigram.model"))
)


In [4]:
# Sample Arabic sentence, reused by the next cell to inspect its pieces.
x = "الزمالك احسن نادي في مصر"
# encode_as_ids(x) is shorthand for encode(x, out_type=int).
print(sp_ar.encode_as_ids(x))

[18024, 5558, 7280, 3558, 6, 2518]


In [5]:
# Show the subword pieces (as strings) that the Arabic model produces for x.
sp_ar.encode(x, out_type=str)

['▁الزم', 'الك', '▁احسن', '▁نادي', '▁في', '▁مصر']

In [6]:
# Quick sanity check of the English model: print the token IDs of a short sentence.
print(sp_en.encode_as_ids("The book is nice"))

[30, 1268, 23, 621]


# Encode all data

In [7]:
import sentencepiece as spm
from pathlib import Path
import os

# Locations: cleaned text in, trained tokenizer models, encoded ID files out.
data_dir = Path("..") / "Data" / "cleaned_data"
tokenizer_dir = Path("..") / "tokenizer_models"
output_dir = Path("..") / "Data" / "encoded_data"
# Create the output folder (and any missing parents); a no-op if it already exists.
output_dir.mkdir(parents=True, exist_ok=True)

# Load the trained Arabic and English unigram models from tokenizer_dir.
sp_ar = spm.SentencePieceProcessor(
    model_file=str(tokenizer_dir.joinpath("spm_ar_unigram.model"))
)
sp_en = spm.SentencePieceProcessor(
    model_file=str(tokenizer_dir.joinpath("spm_en_unigram.model"))
)

def encode_file(in_file, out_file, tokenizer):
    """Encode a UTF-8 text file line by line into space-separated token IDs.

    Each input line has its surrounding whitespace stripped, is passed to
    ``tokenizer.encode(..., out_type=int)``, and the returned IDs are written
    as one space-separated line. Output lines therefore correspond one-to-one
    with input lines (an empty input line gives an empty output line).

    Args:
        in_file: Path of the text file to read.
        out_file: Path of the file to write; it is overwritten if present.
        tokenizer: Object exposing ``encode(text, out_type=int)`` that returns
            an iterable of IDs.
    """
    with open(in_file, "r", encoding="utf-8") as fin:
        with open(out_file, "w", encoding="utf-8") as fout:
            for raw_line in fin:
                token_ids = tokenizer.encode(raw_line.strip(), out_type=int)
                encoded_line = " ".join(str(token_id) for token_id in token_ids)
                fout.write(f"{encoded_line}\n")

# Encode every dataset split with both tokenizers (Arabic first, then English).
splits = ["train", "validation", "test"]
for split in splits:
    # (source text file, destination ID file, tokenizer) for each language.
    jobs = (
        (data_dir / f"{split}.cleaned.ar", output_dir / f"{split}.ids.ar", sp_ar),
        (data_dir / f"{split}.cleaned.en", output_dir / f"{split}.ids.en", sp_en),
    )
    for source_path, target_path, tokenizer in jobs:
        encode_file(source_path, target_path, tokenizer)
    print(f"✅ {split} encoded and saved as IDs")

✅ train encoded and saved as IDs
✅ validation encoded and saved as IDs
✅ test encoded and saved as IDs
