<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/03_ArabicPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Arabic Preprocessing with CAMeL Tools
This notebook performs Arabic text preprocessing using CAMeL Tools, including normalization, lemmatization, and optional dialect detection. Designed for use before alignment or semantic validation.

In [1]:

# ✅ Install compatible versions of NumPy and CAMeL Tools
# NumPy is pinned to 1.23.5 and force-reinstalled without the pip cache;
# camel-tools is installed unpinned, so its version is whatever pip resolves.
# NOTE(review): the output saved with this cell reports
# "subprocess-exited-with-error" while building a wheel — confirm that both
# installs still succeed on the current runtime before relying on this cell.
!pip install numpy==1.23.5 --force-reinstall --no-cache-dir
!pip install camel-tools

  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m│[0m exit code: [1;36m1[0m
[31m╰─>[0m See above for output.

[1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
Usage:
    camel_data (-i | --install) [-f | --force] <PACKAGE>
    camel_data (-p | --post-install) <PACKAGE> <ARGS>...
    camel_data (-l | --list)
    camel_data (-u | --update)
    camel_data (-v | --version)
    camel_data (-h | --help)


In [9]:
# Download the morphology DB for MSA
# (`-i` is the install flag of the `camel_data` command-line tool)
!camel_data -i morphology-db-msa-r13

# Download the MLE disambiguator for MSA
# NOTE(review): presumably this is the model that
# `MLEDisambiguator.pretrained()` loads later in the notebook — confirm
# against the CAMeL Tools documentation.
!camel_data -i disambig-mle-calima-msa-r13

The following packages will be installed: 'morphology-db-msa-r13'
Downloading package 'morphology-db-msa-r13': 100% 40.5M/40.5M [00:02<00:00, 13.8MB/s]
Extracting package 'morphology-db-msa-r13': 100% 40.5M/40.5M [00:00<00:00, 553MB/s]
The following packages will be installed: 'disambig-mle-calima-msa-r13'
Downloading package 'disambig-mle-calima-msa-r13': 100% 88.7M/88.7M [00:23<00:00, 3.84MB/s]
Extracting package 'disambig-mle-calima-msa-r13': 100% 88.7M/88.7M [00:00<00:00, 563MB/s]


In [2]:
from google.colab import drive
import os

# (Re)mount Google Drive at /content/drive.
# `force_remount=True` lets Colab drop an existing mount and mount again, so
# this cell is safe to re-run. Do not unmount by hand and `rm -rf` the mount
# point instead: if the unmount fails (for example because a file is still
# open), the recursive delete walks the still-mounted folder and removes the
# files stored in Drive.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import os

# ---- Which video this run works on ----
video_filename = "PaperMaking.mp4"

# ---- Project folders on Google Drive ----
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
videos_path = os.path.join(base_path, "videos")
transcripts_path = os.path.join(base_path, "transcripts")
captions_path = os.path.join(base_path, "captions")
preprocessed_path = os.path.join(base_path, "Preprocessed")

# ---- Source video: stop here if it is not where we expect it ----
video_path = os.path.join(videos_path, video_filename)
assert os.path.exists(video_path), f"Video file not found: {video_path}"

# File name without its extension; every derived file name is built from it.
video_name = os.path.splitext(video_filename)[0]

# ---- Inputs read by the loading cells ----
transcript_path = os.path.join(transcripts_path, f"{video_name}_ar_with_timecodes.txt")
captions_json_path = os.path.join(captions_path, f"{video_name}.json")

# ---- Outputs of the preprocessing cells ----
preprocessed_captions_path = os.path.join(preprocessed_path, f"captions_{video_name}_ar.json")
preprocessed_transcript_path = os.path.join(preprocessed_path, f"transcript_{video_name}_ar.json")

# ---- Outputs of the cleaning cell ----
clean_captions_path = os.path.join(preprocessed_path, f"clean_captions_{video_name}_ar.json")
clean_transcript_path = os.path.join(preprocessed_path, f"clean_transcript_{video_name}_ar.json")

In [4]:

# 📄 Parse transcript file with timecodes
import re

def load_transcript(path):
    """Parse a time-coded transcript file into a list of segments.

    A usable line has the form ``[<start> - <end>] <text>``, where both
    timestamps are written as digits with an optional decimal part
    (``12.5`` and ``12`` are both accepted). Lines that do not start with
    such a time code are skipped.

    Args:
        path: Path of the UTF-8 transcript file. A leading byte-order mark
            is tolerated.

    Returns:
        A list of dicts in file order, each with ``"start"`` and ``"end"``
        (floats) and ``"text"`` (surrounding whitespace stripped).
    """
    # The decimal part is optional so whole-number time codes are not dropped.
    pattern = re.compile(r"\[(\d+(?:\.\d+)?) - (\d+(?:\.\d+)?)\]\s+(.*)")

    segments = []
    # "utf-8-sig" strips a BOM if the file has one; with plain "utf-8" the BOM
    # stays in front of the first "[" and that first segment never matches.
    with open(path, encoding='utf-8-sig') as f:
        for line in f:
            match = pattern.match(line)
            if not match:
                continue
            start, end, text = match.groups()
            segments.append({
                "start": float(start),
                "end": float(end),
                "text": text.strip()
            })
    return segments

# Fail fast with a readable message if the transcript file is missing.
assert os.path.exists(transcript_path), f"Transcript file not found: {transcript_path}"

# Parse the transcript; `segments` is what the transcript-processing cell iterates over.
segments = load_transcript(transcript_path)
print(f"Loaded {len(segments)} transcript segments from {os.path.basename(transcript_path)}.")

Loaded 69 transcript segments from PaperMaking_ar_with_timecodes.txt.


In [5]:

# 📄 Load caption JSON
import json

def load_captions(path):
    """Read the scene-caption JSON file and flatten it into a list of records.

    The file must contain one JSON object that maps each scene id to a dict
    holding at least the keys ``"scene_time"`` and ``"arabic"``.

    Args:
        path: Path of the UTF-8 encoded JSON file.

    Returns:
        A list of dicts with keys ``"scene_id"``, ``"scene_time"`` and
        ``"caption"`` (the value stored under ``"arabic"``), in the order the
        scenes appear in the JSON object.

    Raises:
        KeyError: If a scene entry lacks ``"scene_time"`` or ``"arabic"``.
    """
    with open(path, encoding='utf-8') as handle:
        scenes = json.load(handle)

    return [
        {
            "scene_id": scene_id,
            "scene_time": info["scene_time"],
            "caption": info["arabic"],
        }
        for scene_id, info in scenes.items()
    ]
# Fail fast if the captions JSON is missing; the message names the captions
# file so it cannot be mistaken for the transcript check.
assert os.path.exists(captions_json_path), f"Captions file not found: {captions_json_path}"

# Flatten the per-scene captions; `captions` is what the caption-processing cell iterates over.
captions = load_captions(captions_json_path)
print(f"Loaded {len(captions)} scene captions from {os.path.basename(captions_json_path)}.")


Loaded 54 scene captions from PaperMaking.json.


In [10]:
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar

# Build the pretrained MLE disambiguator once at cell level; `preprocess`
# reuses this single instance on every call.
disambig = MLEDisambiguator.pretrained()

def preprocess(text: str):
    """Tokenize Arabic text and attach morphological features to each token.

    The text is stripped, split with ``simple_word_tokenize`` (blank tokens
    are discarded) and passed to the module-level ``disambig``. For each
    token the first listed analysis supplies the features; a token with no
    analysis keeps its surface form as lemma and gets ``None`` for the rest.

    Args:
        text: Raw text; ``None`` is treated as an empty string.

    Returns:
        A dict with keys ``"original"`` (the stripped text), ``"tokens"``,
        ``"lemmas"``, ``"lemmas_dediac"``, ``"pos"``, ``"diac"`` and
        ``"gloss"``. All per-token lists are aligned with ``"tokens"``.
    """
    stripped = (text or "").strip()
    tokens = [tok for tok in simple_word_tokenize(stripped) if tok.strip()]

    lemmas, pos_tags, diacs, glosses = [], [], [], []
    for tok, disambiguated in zip(tokens, disambig.disambiguate(tokens)):
        if not disambiguated.analyses:
            # Nothing to read features from: fall back to the surface token.
            lemmas.append(tok)
            pos_tags.append(None)
            diacs.append(None)
            glosses.append(None)
            continue

        feats = disambiguated.analyses[0].analysis  # feature dict of the first analysis
        # Lexeme first, then progressively weaker substitutes, finally the token.
        lemmas.append(
            feats.get('lex')
            or feats.get('lemma')
            or feats.get('stem')
            or feats.get('diac')
            or tok
        )
        pos_tags.append(feats.get('pos'))
        diacs.append(feats.get('diac'))
        glosses.append(feats.get('gloss'))

    lemmas_dediac = [dediac_ar(lem) if isinstance(lem, str) else lem for lem in lemmas]

    return {
        "original": stripped,
        "tokens": tokens,
        "lemmas": lemmas,
        "lemmas_dediac": lemmas_dediac,
        "pos": pos_tags,
        "diac": diacs,
        "gloss": glosses,
    }


In [11]:

# 🔁 Process all transcript segments
import os

# Run every transcript segment through `preprocess`, keeping its timing, the
# original text, the tokens and the lemmas.
processed_segments = []
for seg in segments:
    proc = preprocess(seg["text"])
    processed_segments.append({
        "start": seg["start"],
        "end": seg["end"],
        "original": seg["text"],
        "tokens": proc["tokens"],
        "lemmas": proc["lemmas"]
    })

# No earlier cell creates the output folder, so create it here; otherwise
# open(..., "w") raises FileNotFoundError on a Drive that lacks it.
os.makedirs(os.path.dirname(preprocessed_transcript_path), exist_ok=True)
with open(preprocessed_transcript_path, "w", encoding="utf-8") as f:
    json.dump(processed_segments, f, ensure_ascii=False, indent=2)

print("✅ Saved: "+ preprocessed_transcript_path)


✅ Saved: /content/drive/MyDrive/ArabicVideoSummariser/Preprocessed/transcript_PaperMaking_ar.json


In [12]:

# 🔁 Process all captions
import os

# Run every scene caption through `preprocess`, keeping the scene id and
# time, the original caption, the tokens and the lemmas.
processed_captions = []
for cap in captions:
    proc = preprocess(cap["caption"])
    processed_captions.append({
        "scene_id": cap["scene_id"],
        "scene_time": cap["scene_time"],
        "original": cap["caption"],
        "tokens": proc["tokens"],
        "lemmas": proc["lemmas"]
    })

# Create the output folder if it is missing so this cell does not depend on
# another cell (or a manual step) having created it first.
os.makedirs(os.path.dirname(preprocessed_captions_path), exist_ok=True)
with open(preprocessed_captions_path, "w", encoding="utf-8") as f:
    json.dump(processed_captions, f, ensure_ascii=False, indent=2)

print("✅ Saved: "+ preprocessed_captions_path)


✅ Saved: /content/drive/MyDrive/ArabicVideoSummariser/Preprocessed/captions_PaperMaking_ar.json


In [13]:
# ---- Clean preprocessed captions & transcript (alignment-ready) ----
# Uses your already-defined variables:
# base_path, videos_path, transcripts_path, captions_path, preprocessed_path,
# video_filename, video_path, video_name,
# transcript_path, captions_json_path,
# preprocessed_captions_path, preprocessed_transcript_path,
# clean_captions_path, clean_transcript_path

import os, json, re, string, csv

# Where to save the summary CSV (one row per dataset, written at the end of this cell)
summary_csv_path = os.path.join(preprocessed_path, f"cleaning_summary_{video_name}.csv")

# ---------- Config ----------
# (tweak if needed)
# A caption is dropped when the Jaccard similarity between its cleaned lemmas
# and those of the previously kept caption is >= this value.
NEAR_DUP_JACCARD = 0.90  # consecutive near-duplicate threshold (captions-only)

# Arabic punctuation + ASCII
# A token made only of these characters (or whitespace) is treated as pure punctuation.
AR_PUNCT = set('''ـ،؛؟…«»“”''') | set('"\'()[]{}:;,.!?-/\\|@#$%^&*_+=~`')

# Light Arabic stopwords (lemmas)
# NOTE(review): `clean_tokens` tests membership with an exact string match, so
# a lemma that carries diacritics would not match these bare forms — confirm
# the lemma format written by the preprocessing cells.
AR_STOP = set("""
في من على عن إلى حتى منذ خلال لدى لكل لها له لهم هو هي هم هن نحن انا أنت انتي انتم هذا هذه ذلك تلك هناك هنا حيث لما لا ما يا أو أم ثم بل إن أن كان تكون يكون كانت كانوا كن كنت كنا إذ إذا إذن قد لن لم ألا الا إلا غير سوى كلا كما كيف متى أي الذي التي الذين اللواتي هنالك هكذا كل جدا فقط مثل بعض ربما ايضا أيضًا أيضاً ايضاً ليس ليسوا دون بدون مع بين
""".split())

# Minimal dialect→MSA normalization for lemmas (extend as needed)
# Looked up by exact lemma string after stopword trimming; a lemma that is not
# a key is left unchanged.
# NOTE(review): "يعني" maps to itself (a no-op) and the key "أحّ" looks like a
# truncated spelling — confirm both entries are intended.
DIALECT_MAP = {
    "كويس": "جيد", "كُوَيِّس": "جَيِّد",
    "ازاي": "كيف", "ازاى": "كيف", "إزاي": "كيف",
    "احنا": "نحن", "أحّ": "نحن",
    "عايز": "يريد", "عايزين": "نريد",
    "بنعمل": "نَعْمَل",
    "بنحطه": "نَضَع",
    "بننقعوه": "نَنْقَعُهُ",
    "كده": "هكذا",
    "فيعني": "يعني", "يعني": "يعني",
    "كتير": "كثير",
    "اللي": "الذي",
    "بيبقى": "يكون",
    "بيجي": "يأتي",
    "بيترسم": "يُرْسَم",
    "مفيش": "لا يوجد",
    "مش": "ليس",
    "طبعا": "طبعاً",
    "بنجيب": "نَجْلِب",
}

def is_arabic_char(ch: str) -> bool:
    """Tell whether a single character lies in one of three Arabic code-point ranges.

    The ranges checked are U+0600–U+06FF, U+0750–U+077F and U+08A0–U+08FF
    (both ends inclusive).

    Args:
        ch: A one-character string.

    Returns:
        True if the character's code point falls in any of the ranges.
    """
    codepoint = ord(ch)
    arabic_ranges = (
        (0x0600, 0x06FF),
        (0x0750, 0x077F),
        (0x08A0, 0x08FF),
    )
    return any(low <= codepoint <= high for low, high in arabic_ranges)

def is_mostly_arabic(token: str) -> bool:
    """Tell whether at least 60% of a token's alphabetic characters are Arabic.

    Only characters for which ``str.isalpha()`` is true are counted; each is
    classified with ``is_arabic_char``.

    Args:
        token: The string to inspect; empty or ``None`` yields False.

    Returns:
        False when the token is empty or has no alphabetic character,
        otherwise whether the Arabic share of its letters is >= 0.6.
    """
    if not token:
        return False

    total_letters = 0
    arabic_letters = 0
    for ch in token:
        if ch.isalpha():
            total_letters += 1
            if is_arabic_char(ch):
                arabic_letters += 1

    if total_letters == 0:
        return False
    return arabic_letters / total_letters >= 0.6

def normalize_punct(s: str) -> str:
    """Normalize punctuation in a string.

    Steps, in order:
      1. remove the tatweel character (``ـ``);
      2. map curly double/single quotes and guillemets to straight quotes;
      3. collapse a run of the same punctuation mark into a single mark.

    Args:
        s: Input text; ``None`` is treated as an empty string.

    Returns:
        The normalized string.
    """
    s = (s or "").replace('ـ', '')
    s = s.replace('“','"').replace('”','"').replace('’',"'").replace('‘',"'")
    s = s.replace('«','"').replace('»','"')
    # collapse repeated punctuation
    # `\1` must be a single-backslash backreference: in a raw string `\\1`
    # means "a literal backslash followed by 1", which never matches a run.
    s = re.sub(r'([:;,.\-—–!؟?"])\1+', r'\1', s)
    return s

def clean_tokens(tokens, lemmas):
    """Clean aligned token/lemma lists and return ``(tokens_clean, lemmas_clean)``.

    Tokens and lemmas are paired positionally (extra items in the longer list
    are ignored). Each pair goes through these steps, in order:
      - strip and punctuation-normalize both strings; skip the pair if both
        end up empty;
      - skip the pair if the token is pure punctuation/whitespace or a single
        ASCII letter; blank the lemma if the lemma is;
      - if the lemma is empty or not mostly Arabic, use the token as lemma
        when the token is mostly Arabic;
      - if the lemma is a stopword, drop the pair, unless the token is a
        mostly-Arabic non-stopword, in which case the token is kept as both
        token and lemma;
      - otherwise map the lemma through ``DIALECT_MAP`` and keep the pair.
    Consecutive identical (token, lemma) pairs are collapsed into one.

    NOTE(review): the stopword and dialect lookups are exact string matches;
    if incoming lemmas carry diacritics or other decorations they will not
    match the bare-form entries — verify against the preprocessing output.

    Args:
        tokens: List of surface tokens, or ``None``.
        lemmas: List of lemmas aligned with ``tokens``, or ``None``.

    Returns:
        A tuple of two equally long lists: cleaned tokens and cleaned lemmas.
    """
    def _is_noise(text):
        # Only punctuation/whitespace characters, or exactly one ASCII letter.
        only_punct = all((ch in AR_PUNCT or ch.isspace()) for ch in text)
        return only_punct or bool(re.fullmatch(r"[A-Za-z]", text))

    kept = []  # (token, lemma) pairs with consecutive repeats already collapsed

    def _keep(token, lemma):
        pair = (token, lemma)
        if not kept or kept[-1] != pair:
            kept.append(pair)

    for raw_tok, raw_lem in zip(tokens or [], lemmas or []):
        tok = normalize_punct((raw_tok or "").strip())
        lem = normalize_punct((raw_lem or "").strip())

        # Nothing left on either side.
        if not (tok or lem):
            continue

        # A noise token removes the whole pair; a noise lemma only forces the fallback.
        if tok and _is_noise(tok):
            continue
        if lem and _is_noise(lem):
            lem = ""

        # Fall back to the token when the lemma is missing or not Arabic.
        if not (lem and is_mostly_arabic(lem)) and is_mostly_arabic(tok):
            lem = tok

        # Stopword lemma: keep the surface token only if it looks meaningful.
        if lem in AR_STOP:
            if tok and tok not in AR_STOP and is_mostly_arabic(tok):
                _keep(tok, tok)
            continue

        _keep(tok, DIALECT_MAP.get(lem, lem))

    return [tok for tok, _ in kept], [lem for _, lem in kept]

def jaccard(a, b):
    """Jaccard similarity of two collections, compared as sets.

    Args:
        a: First iterable of hashable items.
        b: Second iterable of hashable items.

    Returns:
        ``1.0`` when both are empty, ``0.0`` when exactly one is empty,
        otherwise ``|a ∩ b| / |a ∪ b|``.
    """
    left, right = set(a), set(b)

    if not (left or right):
        return 1.0
    if not (left and right):
        return 0.0

    shared = left & right
    combined = left | right
    return len(shared) / max(1, len(combined))

# ---------- Load inputs ----------
# Both files must already exist on disk; stop early if either one is missing.
assert os.path.exists(preprocessed_captions_path), f"Missing: {preprocessed_captions_path}"
assert os.path.exists(preprocessed_transcript_path), f"Missing: {preprocessed_transcript_path}"

# NOTE: from here on `captions` holds the records read from the preprocessed
# file (with "tokens"/"lemmas"), not the raw scene captions loaded earlier.
with open(preprocessed_captions_path, "r", encoding="utf-8") as f:
    captions = json.load(f)
with open(preprocessed_transcript_path, "r", encoding="utf-8") as f:
    transcript = json.load(f)

# ---------- Clean captions (+ remove consecutive near-duplicates) ----------
cleaned_caps = []
removed_indices = []   # positions of captions dropped as empty or near-duplicate
prev_lems = None       # cleaned lemmas of the last caption that was kept

for i, item in enumerate(captions):
    toks = item.get("tokens", [])
    lems = item.get("lemmas", [])
    ctoks, clems = clean_tokens(toks, lems)

    # Drop the caption when nothing survives cleaning, or when its lemmas are
    # a near-duplicate of the previously kept caption's lemmas.
    if not (clems or ctoks) or (
        prev_lems is not None and jaccard(prev_lems, clems) >= NEAR_DUP_JACCARD
    ):
        removed_indices.append(i)
        continue

    cleaned_caps.append(dict(
        scene_id=item.get("scene_id"),
        scene_time=item.get("scene_time"),
        original=item.get("original"),
        tokens_clean=ctoks,
        lemmas_clean=clems,
    ))
    prev_lems = clems

# ---------- Clean transcript ----------
# Every utterance is kept, even if its cleaned lists come out empty.
cleaned_tr = []
for utt in transcript:
    toks = utt.get("tokens", [])
    lems = utt.get("lemmas", [])
    ctoks, clems = clean_tokens(toks, lems)
    cleaned_tr.append(dict(
        start=utt.get("start"),
        end=utt.get("end"),
        original=utt.get("original"),
        tokens_clean=ctoks,
        lemmas_clean=clems,
    ))

# ---------- Save outputs ----------
os.makedirs(preprocessed_path, exist_ok=True)
with open(clean_captions_path, "w", encoding="utf-8") as f:
    json.dump(cleaned_caps, f, ensure_ascii=False, indent=2)
with open(clean_transcript_path, "w", encoding="utf-8") as f:
    json.dump(cleaned_tr, f, ensure_ascii=False, indent=2)

# ---------- Summary CSV ----------
def count(items, key):
    """Add up the lengths of the values stored under `key` across `items`.

    Args:
        items: Iterable of dicts.
        key: Key whose value (a list or other sized object) is measured; a
            dict without the key contributes 0.

    Returns:
        The total length as an int (0 for an empty `items`).
    """
    total = 0
    for entry in items:
        total += len(entry.get(key, []))
    return total

# One row per dataset: item, token and lemma counts before and after cleaning.
# NOTE(review): "duplicates_removed" for captions is the length of
# `removed_indices`, which also holds captions dropped for being empty after
# cleaning — confirm that is the intended meaning of the column.
summary_rows = [
    dict(
        dataset="captions",
        items_before=len(captions),
        items_after=len(cleaned_caps),
        tokens_before=count(captions, "tokens"),
        tokens_after=count(cleaned_caps, "tokens_clean"),
        lemmas_before=count(captions, "lemmas"),
        lemmas_after=count(cleaned_caps, "lemmas_clean"),
        duplicates_removed=len(removed_indices),
    ),
    dict(
        dataset="transcript",
        items_before=len(transcript),
        items_after=len(cleaned_tr),
        tokens_before=count(transcript, "tokens"),
        tokens_after=count(cleaned_tr, "tokens_clean"),
        lemmas_before=count(transcript, "lemmas"),
        lemmas_after=count(cleaned_tr, "lemmas_clean"),
        duplicates_removed=0,  # no utterance is ever dropped from the transcript
    ),
]

# The column order follows the key order of the first row.
with open(summary_csv_path, "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=list(summary_rows[0]))
    writer.writeheader()
    writer.writerows(summary_rows)

print("✅ Cleaning done.")
print(f"• Clean captions  → {clean_captions_path}")
print(f"• Clean transcript → {clean_transcript_path}")
print(f"• Summary CSV      → {summary_csv_path}")


✅ Cleaning done.
• Clean captions  → /content/drive/MyDrive/ArabicVideoSummariser/Preprocessed/clean_captions_PaperMaking_ar.json
• Clean transcript → /content/drive/MyDrive/ArabicVideoSummariser/Preprocessed/clean_transcript_PaperMaking_ar.json
• Summary CSV      → /content/drive/MyDrive/ArabicVideoSummariser/Preprocessed/cleaning_summary_PaperMaking.csv
