In [1]:
import re
from itertools import zip_longest
from pathlib import Path


# 1) Normalization Functions


In [10]:

# Regex for Arabic diacritics: combining marks in U+0610-U+061A, U+064B-U+065F
# (tanwin, harakat, shadda, sukun, ...) and U+06D6-U+06ED.
AR_DIAC = re.compile(r"[\u0610-\u061A\u064B-\u065F\u06D6-\u06ED]")
TATWEEL = "\u0640"  # Arabic tatweel (elongation character)

# Character-level rewrites, built once at definition time instead of on every
# call (normalize_ar runs once per corpus line). Code-point escapes are used so
# the mapping reads unambiguously regardless of editor bidi rendering.
_AR_CHAR_TABLE = str.maketrans(
    # alef with hamza above / hamza below / madda, alef maqsura, digits 0-9
    "\u0623\u0625\u0622\u0649" "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669",
    # bare alef (x3), yaa, Latin digits
    "\u0627\u0627\u0627\u064A" "0123456789",
    # characters to delete outright
    TATWEEL,
)
_WHITESPACE_RUN = re.compile(r"\s+")


def normalize_ar(text: str) -> str:
    """
    Normalize Arabic text.

    Steps applied:
    - Remove diacritics matched by ``AR_DIAC``
    - Remove tatweel (U+0640)
    - Unify alef/hamza forms (U+0623, U+0625, U+0622 -> U+0627)
    - Normalize alef maqsura to yaa (U+0649 -> U+064A)
    - Convert Arabic-Indic digits (U+0660-U+0669) to Latin digits (0-9)
    - Collapse whitespace runs to one space and trim both ends

    Args:
        text: Raw input string (may include a trailing newline).

    Returns:
        The normalized string; empty if the input held only whitespace
        and/or removable characters.
    """
    # The diacritic ranges and the translation-table keys are disjoint, and no
    # replacement character is itself a key, so the order of these two steps
    # does not affect the result.
    text = AR_DIAC.sub("", text)
    text = text.translate(_AR_CHAR_TABLE)
    return _WHITESPACE_RUN.sub(" ", text).strip()

def normalize_en(text: str) -> str:
    """
    Normalize English text.

    Every run of whitespace is collapsed to a single space and leading and
    trailing whitespace is dropped. Case and punctuation are left untouched.

    Args:
        text: Raw input string (may include a trailing newline).

    Returns:
        The whitespace-normalized string; empty for whitespace-only input.
    """
    # split() with no separator cuts on whitespace runs and discards empty
    # leading/trailing pieces, so re-joining with one space both collapses
    # and trims in a single step.
    words = text.split()
    return " ".join(words)




# 2) Apply Normalization to Files

In [12]:


# Corpus locations relative to the notebook. Paths are joined component-wise
# so they resolve to nested directories on both Windows and POSIX (a literal
# backslash string is a single odd file name on POSIX).
data_dir = Path("..") / "Data" / "raw_data"
output_dir = Path("..") / "Data" / "cleaned_data"

# open(..., "w") does not create missing parent directories.
output_dir.mkdir(parents=True, exist_ok=True)

for split in ["train", "validation", "test"]:
    # Input files
    en_file = data_dir / f"{split}.en"
    ar_file = data_dir / f"{split}.ar"

    # Output files after normalization
    en_out = output_dir / f"{split}.cleaned.en"
    ar_out = output_dir / f"{split}.cleaned.ar"

    # Non-blank lines present in one file but with no partner in the other.
    unmatched = 0

    with open(en_file, "r", encoding="utf-8") as f_en, \
         open(ar_file, "r", encoding="utf-8") as f_ar, \
         open(en_out, "w", encoding="utf-8") as f_en_out, \
         open(ar_out, "w", encoding="utf-8") as f_ar_out:

        # zip_longest (instead of zip) keeps iterating after the shorter file
        # ends, so a length mismatch can be detected rather than silently
        # truncated. A missing side is reported as None.
        for en_line, ar_line in zip_longest(f_en, f_ar):
            if en_line is None or ar_line is None:
                leftover = en_line if ar_line is None else ar_line
                # Trailing blank lines are harmless; only count real content.
                if leftover.strip():
                    unmatched += 1
                continue

            # Apply normalization
            en_clean = normalize_en(en_line)
            ar_clean = normalize_ar(ar_line)

            # Write the pair only if both sides are non-empty, so the two
            # output files stay line-aligned.
            if en_clean and ar_clean:
                f_en_out.write(en_clean + "\n")
                f_ar_out.write(ar_clean + "\n")

    if unmatched:
        print(
            f"⚠️ {split}: {unmatched} non-blank line(s) had no counterpart in the "
            f"other language file and were skipped; check that {en_file.name} and "
            f"{ar_file.name} are aligned"
        )

    print(f"✅ Saved normalized files: {split}.cleaned.en & {split}.cleaned.ar")


✅ Saved normalized files: train.cleaned.en & train.cleaned.ar
✅ Saved normalized files: validation.cleaned.en & validation.cleaned.ar
✅ Saved normalized files: test.cleaned.en & test.cleaned.ar
