<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/03_ArabicPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Arabic Preprocessing with CAMeL Tools
This notebook preprocesses Arabic transcript text with CAMeL Tools: normalization, tokenization, and lemmatization/diacritization via morphological disambiguation.

In [None]:

# ✅ Install compatible versions of NumPy and CAMeL Tools
# NumPy is pinned to 1.23.5 and force-reinstalled with the pip cache disabled.
# camel-tools is NOT pinned, so the version installed depends on when this cell runs.
# NOTE(review): the second command runs its own dependency resolution and may
# change the NumPy version again if camel-tools' requirements conflict with the
# pin — confirm the resolved versions (e.g. with `pip list`) after this cell.
# NOTE(review): if NumPy was already imported in this runtime, a runtime restart
# is presumably needed before the reinstalled version is picked up — confirm.
!pip install numpy==1.23.5 --force-reinstall --no-cache-dir
!pip install camel-tools

In [None]:
# Install the two CAMeL Tools data packages used by this notebook via the
# `camel_data` command-line tool (`-i` takes the package name to install).
# NOTE(review): presumably these are the resources that
# `MLEDisambiguator.pretrained()` loads later in the notebook — confirm.

# Download the morphology DB for MSA
!camel_data -i morphology-db-msa-r13

# Download the MLE disambiguator for MSA
!camel_data -i disambig-mle-calima-msa-r13

In [None]:
from google.colab import drive
import os
import shutil

MOUNT_POINT = "/content/drive"

# A previous session can leave a plain (non-mounted) directory with files at the
# mount point, which makes drive.mount() refuse to proceed. Remove it ONLY when
# it is verifiably not a mount point: an unconditional `rm -rf` here would delete
# real Google Drive content whenever the preceding unmount had failed.
if (
    os.path.isdir(MOUNT_POINT)
    and not os.path.ismount(MOUNT_POINT)
    and os.listdir(MOUNT_POINT)
):
    shutil.rmtree(MOUNT_POINT)

# force_remount=True makes Colab unmount an existing mount before mounting again,
# so no manual `fusermount -u` is required.
drive.mount(MOUNT_POINT, force_remount=True)

In [None]:
import os, json, re
from typing import List, Dict, Any

# Project paths
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
param_path = os.path.join(base_path, "params.json")
with open(param_path, "r", encoding="utf-8") as f:
    params = json.load(f)

# Explicit override of the video named in params.json.
# Set to None to process the file configured in params.json instead.
VIDEO_FILE_OVERRIDE = "Almasbagha.mp4"
video_filename = VIDEO_FILE_OVERRIDE or params.get("video_file")
if VIDEO_FILE_OVERRIDE and VIDEO_FILE_OVERRIDE != params.get("video_file"):
    # Make the override visible instead of silently ignoring params.json
    print(f"⚠️ Overriding params.json video_file={params.get('video_file')!r} with {VIDEO_FILE_OVERRIDE!r}")

assert video_filename, "params.json must include 'video_file'."
video_name = os.path.splitext(video_filename)[0]

# Input/output locations derived from the project root and the video's base name
videos_path = os.path.join(base_path, "videos")
transcripts_path = os.path.join(base_path, "transcripts")
preprocessed_path= os.path.join(base_path, "Preprocessed")
video_path = os.path.join(videos_path, video_filename)
assert os.path.exists(video_path), f"Video file not found: {video_path}"
transcript_path = os.path.join(transcripts_path, f"{video_name}_ar.json")
# Fail here, next to the path definitions, rather than later when the file is opened
assert os.path.exists(transcript_path), f"Transcript file not found: {transcript_path}"
preprocessed_transcript_path= os.path.join(preprocessed_path, f"{video_name}_CleanTranscript.json")

print("Input :", transcript_path)
print("Output:", preprocessed_transcript_path)


Input : /content/drive/MyDrive/ArabicVideoSummariser/transcripts/Almasbagha_ar.json
Output: /content/drive/MyDrive/ArabicVideoSummariser/Preprocessed/Almasbagha_CleanTranscript.json


In [None]:
# Patterns shared by the text-cleaning helpers: the diacritic character class
# that gets stripped, and the tatweel (elongation) character.
_AR_DIAC = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")
_TATWEEL = "\u0640"

def normalize_ar(text: str) -> str:
    """Return a normalized copy of `text`.

    Steps, in order: remove characters matched by `_AR_DIAC`, remove tatweel,
    fold the alef variants أ / إ / آ to bare alef, fold alef maqsura to ya,
    then collapse whitespace runs to single spaces and trim both ends.
    Falsy input (empty string, None) yields "".
    """
    if not text:
        return ""
    folded = _AR_DIAC.sub("", text).replace(_TATWEEL, "")
    # Single-character foldings; each is independent of the others
    for variant, target in (("أ", "ا"), ("إ", "ا"), ("آ", "ا"), ("ى", "ي")):
        folded = folded.replace(variant, target)
    return re.sub(r"\s+", " ", folded).strip()

# Word characters of the Arabic block: letters, tatweel and combining marks.
# Deliberately leaves out the block's punctuation (e.g. U+060C comma, U+061B
# semicolon, U+061F question mark, U+06D4 full stop), its digits and its symbol
# code points, so punctuation never sticks to a word and digits form their own tokens.
_AR_WORD_OR_NUMBER = re.compile(
    r"[\u0610-\u061A\u0620-\u065F\u066E-\u06D3\u06D5-\u06DC"
    r"\u06DF-\u06E8\u06EA-\u06EF\u06FA-\u06FC\u06FF]+"
    r"|[0-9\u0660-\u0669\u06F0-\u06F9]+"
)

def simple_word_tokens(text: str) -> List[str]:
    """Split `text` into Arabic word tokens and digit-run tokens.

    Returns, in order of appearance, every maximal run of Arabic word characters
    and every maximal run of digits (ASCII, Arabic-Indic or extended
    Arabic-Indic). Everything else, including Arabic and Latin punctuation and
    Latin letters, acts as a separator and is dropped. Falsy input yields [].
    """
    if not text:
        return []
    return _AR_WORD_OR_NUMBER.findall(text)

In [None]:
from camel_tools.disambig.mle import MLEDisambiguator

# Pretrained MSA disambiguator (context-aware; reliable baseline)
# `pretrained()` is called without arguments, so whichever model CAMeL Tools
# treats as its default is loaded.
# NOTE(review): presumably the `disambig-mle-calima-msa-r13` package installed
# earlier in this notebook — confirm.
# Kept at module level: `disambiguate_tokens` reads `msa_mle` as a global.
msa_mle = MLEDisambiguator.pretrained()

def dediac(s: str) -> str:
    """Return `s` with every character matched by `_AR_DIAC` removed.

    Falsy input (empty string, None) yields "".
    """
    if not s:
        return ""
    return _AR_DIAC.sub("", s)

def _analysis_dict_from_scored(best):
    # CAMeL returns ScoredAnalysis objects; the dict is in .analysis
    if hasattr(best, "analysis"):
        return best.analysis
    if isinstance(best, dict):
        return best
    if hasattr(best, "__dict__"):
        return best.__dict__
    return {}

def disambiguate_tokens(tokens: List[str]) -> Dict[str, List[Any]]:
    """Disambiguate `tokens` with the module-level `msa_mle` model.

    Args:
        tokens: Word tokens of one segment, in order.

    Returns:
        A dict of four lists parallel to `tokens`:
          - "lemmas": dediacritized lemma (the analysis' "lemma" or "lex"
            feature), or the dediacritized token when the analysis has neither;
          - "pos":    the "pos" feature, else "bw", else "";
          - "diac":   the "diac" feature, else "";
          - "oov":    True when no analysis was returned, or the top analysis
            carried no lemma, POS or diacritization.

    A failure while reading one token's analysis is reported with a
    RuntimeWarning and that token falls back to the defaults (token as lemma,
    empty pos/diac, oov=True); the remaining tokens are still processed.
    """
    import warnings  # local: only needed on the failure path

    lemmas, poss, diacs, oov_flags = [], [], [], []
    if not tokens:
        # Nothing to analyse; skip the model call entirely
        return {"lemmas": lemmas, "pos": poss, "diac": diacs, "oov": oov_flags}

    results = msa_mle.disambiguate(tokens)
    for tok, res in zip(tokens, results):
        lemma, pos, diac, oov = tok, "", "", True
        try:
            analyses = getattr(res, "analyses", None)
            if analyses:
                # Only the first entry of `analyses` is used
                feats = _analysis_dict_from_scored(analyses[0]) or {}
                found_lemma = feats.get("lemma") or feats.get("lex") or ""
                pos = feats.get("pos") or feats.get("bw") or ""
                diac = feats.get("diac") or ""
                lemma = dediac(found_lemma or tok)
                # Judge OOV on what the analysis actually provided; the token
                # fallback used for `lemma` must not count as evidence.
                oov = not (found_lemma or pos or diac)
        except Exception as exc:
            # Best-effort by design: keep going, but do not hide the failure.
            lemma, pos, diac, oov = tok, "", "", True
            warnings.warn(
                f"Could not read analysis for token {tok!r}: {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        lemmas.append(lemma)
        poss.append(pos)
        diacs.append(diac)
        oov_flags.append(oov)
    return {"lemmas": lemmas, "pos": poss, "diac": diacs, "oov": oov_flags}


In [None]:
from collections import defaultdict

class DiacSmoother:
    """
    Keeps one canonical diacritization per (lemma, POS, undiacritized form)
    across the document to reduce flip-flops.

    The undiacritized form is part of the key because different inflections of
    one lemma share (lemma, POS) but are different words: keying on (lemma, POS)
    alone would overwrite one word's diacritization with another's.

    Conservative: a diacritization is only replaced when a canonical one exists
    for exactly the same lemma, POS and undiacritized form. A missing (empty)
    diacritization is returned unchanged, because without it the surface form
    is unknown and any fill-in could belong to a different word.
    """

    # Same character class as the notebook-level `_AR_DIAC`; duplicated here so
    # the class does not depend on cell execution order.
    _DIAC_RE = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")

    def __init__(self):
        self.canonical = {}                    # (lemma, pos, bare form) -> diac
        self.freqs = defaultdict(lambda: defaultdict(int))  # (lemma, pos, bare form)[diac] -> count

    @classmethod
    def _key(cls, lemma: str, pos: str, diac: str):
        """Build the lookup key; the bare form is `diac` with diacritics removed."""
        return (lemma, pos, cls._DIAC_RE.sub("", diac))

    def observe(self, lemma: str, pos: str, diac: str):
        """Count one occurrence of `diac`; ignored if any argument is empty."""
        if not lemma or not pos or not diac:
            return
        key = self._key(lemma, pos, diac)
        counts = self.freqs[key]
        counts[diac] += 1
        # Most frequent so far; on ties max() keeps the earliest-seen candidate
        self.canonical[key] = max(counts, key=counts.get)

    def smooth(self, lemma: str, pos: str, diac: str) -> str:
        """Return the canonical diacritization for this word form, else `diac`."""
        if not lemma or not pos or not diac:
            return diac
        return self.canonical.get(self._key(lemma, pos, diac), diac)

smoother = DiacSmoother()


In [None]:
# Load transcript; accept either {"segments":[...]} or a plain list
with open(transcript_path, "r", encoding="utf-8") as f:
    raw = json.load(f)

# A plain list has no .get(), so pick the segment list based on the container type
segments = raw.get("segments") if isinstance(raw, dict) else raw
assert isinstance(segments, list), "Input JSON must be a list or have a 'segments' list."

# Thresholds for the per-segment diacritization reliability flag
MIN_TOKENS_FOR_RELIABLE_DIAC = 3
MIN_DIAC_COVERAGE = 0.6
MAX_OOV_RATIO = 0.5

# Start from a fresh smoother so re-running this cell does not carry over
# counts from a previous run.
smoother = DiacSmoother()

clean_segments = []
for seg in segments:
    # Flexible field names; a segment may also be a bare string
    if isinstance(seg, dict):
        start = seg.get("start")
        end = seg.get("end")
        text = seg.get("text") or seg.get("original") or seg.get("utterance") or ""
    else:
        start = end = None
        text = seg if isinstance(seg, str) else ""
    text = text.strip()

    text_norm = normalize_ar(text)
    tokens = simple_word_tokens(text_norm)

    if tokens:
        ana = disambiguate_tokens(tokens)
    else:
        ana = {"lemmas": [], "pos": [], "diac": [], "oov": []}

    # Update smoother with observations and apply smoothing.
    # Single pass: each token is smoothed against what has been seen so far,
    # so earlier segments see fewer observations than later ones.
    diac_smooth = []
    for lemma, pos, diac in zip(ana["lemmas"], ana["pos"], ana["diac"]):
        if diac:
            smoother.observe(lemma, pos, diac)
        diac_smooth.append(smoother.smooth(lemma, pos, diac))

    # Simple quality heuristics for a per-segment reliability flag
    n = max(1, len(tokens))  # avoid division by zero on empty segments
    diac_cov = sum(1 for d in diac_smooth if d) / n
    oov_ratio = sum(1 for o in ana["oov"] if o) / n
    diac_reliable = (
        len(tokens) >= MIN_TOKENS_FOR_RELIABLE_DIAC
        and diac_cov >= MIN_DIAC_COVERAGE
        and oov_ratio <= MAX_OOV_RATIO
    )

    clean_segments.append({
        "start": start,
        "end": end,
        "original": text,          # raw ASR line (human-readable)
        "text_norm": text_norm,    # normalized (search/display)
        "tokens": tokens,          # cleaned tokens
        "lemmas": ana["lemmas"],   # context-aware lemmas
        "pos": ana["pos"],         # coarse POS
        "diac": diac_smooth,       # smoothed diacritics (can hide if unreliable)
        "diac_coverage": round(diac_cov, 3),
        "oov_ratio": round(oov_ratio, 3),
        "diac_reliable": diac_reliable,  # the flag the heuristics above compute
    })

clean_doc = {
    "video": video_name,
    "num_segments": len(clean_segments),
    "segments": clean_segments
}

# Make sure the output folder exists before writing into it
os.makedirs(os.path.dirname(preprocessed_transcript_path), exist_ok=True)
with open(preprocessed_transcript_path, "w", encoding="utf-8") as f:
    json.dump(clean_doc, f, ensure_ascii=False, indent=2)

print(f"✅ Saved cleaned transcript → {preprocessed_transcript_path}")
print(f"Segments: {len(clean_segments)}")


✅ Saved cleaned transcript → /content/drive/MyDrive/ArabicVideoSummariser/Preprocessed/Almasbagha_CleanTranscript.json
Segments: 82
