# 🧠 Arabic Preprocessing with CAMeL Tools
This notebook preprocesses Arabic text with CAMeL Tools: normalization (diacritic removal), tokenization, lemmatization, and optional dialect detection. It is designed to run before alignment or semantic validation.

In [None]:

# ✅ Install compatible versions of NumPy and CAMeL Tools
!pip install numpy==1.23.5 --force-reinstall --no-cache-dir
!pip install camel-tools


In [None]:

# 📥 Upload your files: transcript (.txt) and captions (.json)
# Requires Google Colab (see the google.colab import below).
# Later cells take the first entry of `uploaded` whose name ends in ".txt" and
# the first whose name ends in ".json" and open them as file paths, so upload
# both files in this step.
from google.colab import files
uploaded = files.upload()


In [None]:

# 📄 Parse transcript file with timecodes
import re

def load_transcript(path):
    """Parse a timecoded transcript file into a list of segments.

    Each usable line has the form ``[<start> - <end>] <text>``. Timestamps are
    unsigned numbers with an optional fractional part, so both
    ``[12.5 - 15.0]`` and ``[12 - 15]`` are accepted. Lines that do not match
    this shape are skipped.

    Args:
        path: Path to a UTF-8 encoded text file. A leading byte-order mark is
            tolerated.

    Returns:
        A list of dicts, in file order, each with the keys ``"start"`` and
        ``"end"`` (floats) and ``"text"`` (the rest of the line, stripped).
    """
    # The fractional part is optional so integer timecodes are not dropped.
    pattern = re.compile(r"\[(\d+(?:\.\d+)?) - (\d+(?:\.\d+)?)\]\s+(.*)")

    segments = []
    # "utf-8-sig" decodes exactly like "utf-8" but strips a leading BOM. With a
    # BOM left in place, the first line would start with "\ufeff" instead of
    # "[" and its segment would be lost without any warning.
    with open(path, encoding="utf-8-sig") as f:
        for line in f:
            match = pattern.match(line)
            if match is None:
                continue
            start, end, text = match.groups()
            segments.append({
                "start": float(start),
                "end": float(end),
                "text": text.strip()
            })
    return segments

# Use the first uploaded file with a .txt extension (matched case-insensitively).
transcript_path = next((f for f in uploaded if f.lower().endswith(".txt")), None)
if transcript_path is None:
    # Fail with an actionable message instead of an opaque IndexError.
    raise FileNotFoundError(
        "No .txt transcript found among the uploaded files; "
        "re-run the upload cell and include the transcript."
    )
segments = load_transcript(transcript_path)
print(f"Loaded {len(segments)} transcript segments.")


In [None]:

# 📄 Load caption JSON
import json

def load_captions(path):
    """Load scene captions from a JSON file.

    The file's top level must be an object mapping each scene id to an object
    that has at least the keys ``"scene_time"`` and ``"arabic"``.

    Args:
        path: Path to a UTF-8 encoded JSON file. A leading byte-order mark is
            tolerated.

    Returns:
        A list of dicts, in the file's key order, each with ``"scene_id"``,
        ``"scene_time"`` and ``"caption"`` (taken from the ``"arabic"`` field).

    Raises:
        KeyError: If a scene entry lacks ``"scene_time"`` or ``"arabic"``.
    """
    # "utf-8-sig" strips a leading BOM; with plain "utf-8" json.load rejects a
    # BOM-prefixed file with "Unexpected UTF-8 BOM".
    with open(path, encoding="utf-8-sig") as f:
        data = json.load(f)
    return [
        {
            "scene_id": scene_id,
            "scene_time": meta["scene_time"],
            "caption": meta["arabic"]
        }
        for scene_id, meta in data.items()
    ]

# Use the first uploaded file with a .json extension (matched case-insensitively).
captions_path = next((f for f in uploaded if f.lower().endswith(".json")), None)
if captions_path is None:
    # Fail with an actionable message instead of an opaque IndexError.
    raise FileNotFoundError(
        "No .json captions file found among the uploaded files; "
        "re-run the upload cell and include the captions."
    )
captions = load_captions(captions_path)
print(f"Loaded {len(captions)} scene captions.")


In [None]:

# 🧪 Normalize + Lemmatize using CAMeL Tools
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar

# Load the pretrained MLE disambiguator once; preprocess() reuses it on every call.
disambig = MLEDisambiguator.pretrained()

def preprocess(text):
    """Dediacritize, tokenize, and lemmatize one Arabic string.

    Args:
        text: The raw Arabic text.

    Returns:
        A dict with:
          - ``"original"``: the input text exactly as it was passed in.
          - ``"tokens"``: word tokens of the dediacritized text.
          - ``"lemmas"``: one entry per token. This is the lemma (``'lex'``
            feature) of the token's top-ranked analysis, or the token itself
            when the disambiguator returned no analysis for it.
    """
    # Keep the caller's text untouched so "original" really is the original.
    normalized = dediac_ar(text)  # Remove diacritics
    tokens = simple_word_tokenize(normalized)
    disambig_results = disambig.disambiguate(tokens)

    lemmas = []
    for tok, res in zip(tokens, disambig_results):
        if res.analyses:
            # Each entry of `analyses` is a ScoredAnalysis; its feature dict
            # lives under `.analysis`, and the lemma is the 'lex' feature.
            lemmas.append(res.analyses[0].analysis.get("lex", tok))
        else:
            lemmas.append(tok)

    return {
        "original": text,
        "tokens": tokens,
        "lemmas": lemmas
    }


In [None]:

# 🔁 Process all transcript segments
# Build one output record per segment: timing and raw text come from the
# segment itself, tokens and lemmas come from preprocess().
processed_segments = []
for segment in segments:
    analysis = preprocess(segment["text"])
    record = {field: segment[field] for field in ("start", "end")}
    record["original"] = segment["text"]
    record.update(tokens=analysis["tokens"], lemmas=analysis["lemmas"])
    processed_segments.append(record)

# ensure_ascii=False keeps the Arabic text readable in the file instead of
# writing \u escapes.
with open("processed_transcript.json", "w", encoding="utf-8") as out_file:
    json.dump(processed_segments, out_file, ensure_ascii=False, indent=2)

print("✅ Saved: processed_transcript.json")


In [None]:

# 🔁 Process all captions
# Build one output record per caption: scene metadata and raw caption come
# from the caption entry, tokens and lemmas come from preprocess().
processed_captions = []
for caption in captions:
    analysis = preprocess(caption["caption"])
    record = {field: caption[field] for field in ("scene_id", "scene_time")}
    record["original"] = caption["caption"]
    record.update(tokens=analysis["tokens"], lemmas=analysis["lemmas"])
    processed_captions.append(record)

# ensure_ascii=False keeps the Arabic text readable in the file instead of
# writing \u escapes.
with open("processed_captions.json", "w", encoding="utf-8") as out_file:
    json.dump(processed_captions, out_file, ensure_ascii=False, indent=2)

print("✅ Saved: processed_captions.json")
