<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/03_ArabicPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧠 Arabic Preprocessing with CAMeL Tools
This notebook performs Arabic text preprocessing using CAMeL Tools, including normalization, lemmatization, and optional dialect detection. Designed for use before alignment or semantic validation.

In [None]:

# Install compatible versions of NumPy and CAMeL Tools
!pip install numpy==1.26.4 --force-reinstall --no-cache-dir
!pip install camel-tools==1.5.6

In [13]:

# 📥 Upload your files: transcript (.txt) and captions (.json)
from google.colab import drive
drive.mount('/content/drive')
video_filename="PaperMaking.mp4"
import os
# Define base paths
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
videos_path = os.path.join(base_path, "videos")
transcripts_path = os.path.join(base_path, "transcripts")
captions_path = os.path.join(base_path, "captions")
preprocessing_path = os.path.join(base_path, "Preprocessed")

video_path = os.path.join(videos_path, video_filename)
video_name = os.path.splitext(video_filename)[0]



Mounted at /content/drive


In [None]:
!camel_data -l

In [19]:
!camel_data -i morphology-db-msa-r13
!camel_data -i disambig-mle-calima-msa-r13

!camel_data -i morphology-db-egy-r13
!camel_data -i disambig-mle-calima-egy-r13

!camel_data -i dialectid-model26

No new packages will be installed.
No new packages will be installed.
No new packages will be installed.
The following packages will be installed: 'morphology-db-egy-r13'
Downloading package 'morphology-db-egy-r13': 100% 67.3M/67.3M [00:02<00:00, 25.4MB/s]
Extracting package 'morphology-db-egy-r13': 100% 67.3M/67.3M [00:00<00:00, 193MB/s]
The following packages will be installed: 'disambig-mle-calima-egy-r13'
Downloading package 'disambig-mle-calima-egy-r13': 100% 27.2M/27.2M [00:00<00:00, 29.0MB/s]
Extracting package 'disambig-mle-calima-egy-r13': 100% 27.2M/27.2M [00:00<00:00, 234MB/s]
No new packages will be installed.


In [24]:
import re
import json
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar
from camel_tools.dialectid import DialectIdentifier
from camel_tools.disambig.mle import MLEDisambiguator

# 📥 Initialize all tools
dialect_id = DialectIdentifier.pretrained()
msa_disambig = MLEDisambiguator.pretrained(model='calima-msa-r13')
egy_disambig = MLEDisambiguator.pretrained(model='calima-egy-r13')

# ✅ Detect dialect
def get_dialect(text):
    return dialect_id.predict([text])[0]  # returns 'MSA', 'EGY', etc.

# 🧠 Map sub-dialect label to broader disambiguation class
def map_dialect(did_label):
    if did_label == 'MSA':
        return 'MSA'
    elif did_label in ['EGY', 'CAI', 'ALX', 'ASW', 'SKH', 'MNF', 'ALY']:
        return 'EGY'
    else:
        return 'OTHER'


# ✅ Preprocessing function
def prepr2ocess(text):
    text = dediac_ar(text)
    tokens = simple_word_tokenize(text)
    detected = dialect_id.predict_sentence(text)
    dialect = map_dialect(detected)

    if dialect == "MSA":
        disambig = msa_disambig
    elif dialect == "EGY":
        disambig = egy_disambig
    else:
        print(f"⚠️ Skipping disambiguation. Unsupported dialect: {detected}")
        return {
            "dialect": detected,
            "original": text,
            "tokens": tokens,
            "lemmas": tokens  # fallback: no lemmatization
        }


    # Disambiguate and extract lemmas
    result = disambig.disambiguate(tokens)
    lemmas = []
    for i, r in enumerate(result):
        if r.analyses:
            analysis = r.analyses[0][1]
            lemma = analysis.get('lemma', r.word)
            lemmas.append(lemma)
        else:
            print(f"❌ No analysis for token: '{tokens[i]}'")
            lemmas.append(tokens[i])

    return {
        "original": text,
        "dialect": detected,
        "tokens": tokens,
        "lemmas": lemmas
    }


In [25]:
# Parse and lemmatize transcript file with timecodes
def load_transcript(path):
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()

    segments = []
    pattern = re.compile(r"\[(\d+\.\d+)\s*-\s*(\d+\.\d+)\]\s+(.*)")
    for line in lines:
        match = pattern.match(line)
        if match:
            start, end, text = match.groups()
            result = preprocess(text.strip())
            segments.append({
                "start": float(start),
                "end": float(end),
                "original": text.strip(),
                "tokens": result["tokens"],
                "lemmas": result["lemmas"]
            })
    return segments

segments = load_transcript(f"{transcripts_path}/{video_name}_ar_with_timecodes.txt")
print(f"Loaded and lemmatized {len(segments)} transcript segments.")
transcript_preprocess_path = os.path.join(preprocessing_path, f"{video_name}_transcript_ar.json")
with open(transcript_preprocess_path, "w", encoding="utf-8") as f:
    json.dump(segments, f, ensure_ascii=False, indent=2)
print(f" Saved: {transcript_preprocess_path}")

AttributeError: 'DIDModel26' object has no attribute 'predict_sentence'

In [17]:
# Parse and lemmatize captions file
def load_captions(path, disambig):
    with open(path, encoding='utf-8') as f:
        data = json.load(f)

    captions = []
    for scene_id, meta in data.items():
        scene_time = meta.get("scene_time", "UNKNOWN")
        arabic_caption = meta.get("arabic", "")
        lemmatized_caption = preprocess(arabic_caption, disambig)

        captions.append({
            "scene_id": scene_id,
            "scene_time": scene_time,
            "caption": arabic_caption,
            "lemmas": lemmatized_caption
        })

    return captions

# ✅ Run
captions = load_captions(f"{captions_path}/{video_name}.json", disambig)
print(f"Loaded and lemmatized {len(captions)} scene captions.")
captions_preprocess_path = os.path.join(preprocessing_path, f"{video_name}_captions_ar.json")
with open(captions_preprocess_path, "w", encoding="utf-8") as f:
    json.dump(segments, f, ensure_ascii=False, indent=2)
print(f"Saved: {captions_preprocess_path}")

Loaded and lemmatized 54 scene captions.
Saved: /content/drive/MyDrive/ArabicVideoSummariser/Preprocessed/PaperMaking_captions_ar.json
