<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/03_ArabicPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧠 Arabic Preprocessing with CAMeL Tools
This notebook performs Arabic text preprocessing using CAMeL Tools, including normalization, lemmatization, and optional dialect detection. It is designed to be run before alignment or semantic validation.

In [1]:

# Install compatible versions of NumPy and CAMeL Tools
# NumPy is pinned and force-reinstalled first; CAMeL Tools is then installed with
# --no-deps so pip does not install or re-resolve its dependency set.
# NOTE(review): the recorded output shows pip reporting other preinstalled packages
# (bigframes, geopandas, xarray-einstats) as incompatible with numpy 1.23.5 —
# confirm none of them is needed in this runtime.
# NOTE(review): with --no-deps, CAMeL Tools' remaining requirements must already be
# present in the environment — TODO confirm.
!pip install numpy==1.23.5 --force-reinstall --no-cache-dir
!pip install camel-tools==1.5.6 --no-deps

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m133.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Uninstalling numpy-1.23.5:
      Successfully uninstalled numpy-1.23.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.14.0 requires numpy>=1.24.0, but you have numpy 1.23.5 which is incompatible.
geopandas 1.1.1 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
xarray-einstats 0.9.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
xarray 2025.7



In [24]:

# 📥 Upload your files: transcript (.txt) and captions (.json)
from google.colab import files
# `uploaded` is iterated by later cells to pick the first file name ending in
# ".txt" (transcript) and the first ending in ".json" (captions).
# NOTE(review): the recorded output shows re-uploaded files being saved under a
# suffixed name (e.g. "PaperMaking (3).json"). Confirm that the names in
# `uploaded` refer to the freshly saved copy and not to an older file on disk.
uploaded = files.upload()


Saving PaperMaking.json to PaperMaking (3).json
Saving PaperMaking_ar_with_timecodes.txt to PaperMaking_ar_with_timecodes (2).txt


In [11]:
# List the CAMeL Tools data packages (name, size, license, description).
!camel_data -l

Package Name                      Size  License     Description
----------------------------  --------  ----------  -------------------------------------------------------------------------------------------------------------------------
all                                                 All available CAMeL Tools packages
defaults                                            Default datasets for all CAMeL Tools components
dialectid-all                                       All available dialect identification models
dialectid-model26             371.2 MB  MIT         Dialect identification model trained to differentiating between 25 Arabic city dialects as well as Modern Standard Arabic
dialectid-model6              153.0 MB  MIT         Dialect identification model trained to differentiating between 5 Arabic city dialects as well as Modern Standard Arabic
disambig-bert-unfactored-all                        All available unfactored BERT disambiguation models
disambig-bert-unfactored-egy

In [14]:
# Download and install the two CAMeL Tools data packages named below (going by
# their names: an MSA morphology database and an MLE disambiguation model).
# NOTE(review): presumably these are what MLEDisambiguator.pretrained() loads
# in the next cell — TODO confirm.
!camel_data -i morphology-db-msa-r13
!camel_data -i disambig-mle-calima-msa-r13

The following packages will be installed: 'morphology-db-msa-r13'
Downloading package 'morphology-db-msa-r13': 100% 40.5M/40.5M [00:00<00:00, 84.2MB/s]
Extracting package 'morphology-db-msa-r13': 100% 40.5M/40.5M [00:00<00:00, 234MB/s]
The following packages will be installed: 'disambig-mle-calima-msa-r13'
Downloading package 'disambig-mle-calima-msa-r13': 100% 88.7M/88.7M [00:00<00:00, 114MB/s]
Extracting package 'disambig-mle-calima-msa-r13': 100% 88.7M/88.7M [00:00<00:00, 151MB/s]


In [28]:
import re
import json
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize

# Load the disambiguator once; the same instance is passed explicitly to
# preprocess(), load_transcript() and load_captions() in the cells below.
# NOTE(review): pretrained() is called without arguments, so the library's
# default model is used — presumably the calima-msa-r13 package installed
# earlier; confirm.
disambig = MLEDisambiguator.pretrained()

In [29]:
def preprocess(text, disambig):
    """Lemmatize `text` and return the lemmas as one space-separated string.

    Args:
        text: Raw text; it is split into tokens with `simple_word_tokenize`.
        disambig: Object exposing `disambiguate(tokens)`. Each item it returns
            must provide `.word` and `.analyses`.

    Returns:
        One entry per token, joined by single spaces. A token's entry is the
        'lemma' value of its first analysis; tokens with no analyses, or whose
        first analysis has no 'lemma' key, are kept as the original word.
    """
    def lemma_of(entry):
        # No candidate analyses: fall back to the surface word.
        if not entry.analyses:
            return entry.word
        # Only the first analysis is consulted; element 1 of that item is the
        # mapping that holds the 'lemma' feature.
        features = entry.analyses[0][1]
        return features.get('lemma', entry.word)

    disambiguated = disambig.disambiguate(simple_word_tokenize(text))
    return ' '.join([lemma_of(entry) for entry in disambiguated])

# 📄 Parse and lemmatize transcript file with timecodes
def load_transcript(path, disambig):
    """Parse a timecoded transcript file and lemmatize every segment.

    Lines of the form ``[<start> - <end>] <text>`` are turned into segments;
    any line that does not match is skipped. Timecodes may be integers or
    decimals, and whitespace around the brackets and the dash is optional.

    Args:
        path: Path of the transcript text file (UTF-8, with or without BOM).
        disambig: Disambiguator forwarded unchanged to `preprocess`.

    Returns:
        A list of dicts in file order, each with keys "start" and "end"
        (floats), "text" (the stripped segment text) and "lemmas" (the
        string returned by `preprocess` for that text).
    """
    pattern = re.compile(
        r"\s*\[\s*(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\s*\]\s*(.*)"
    )

    segments = []
    # utf-8-sig reads plain UTF-8 as well, but strips a leading BOM that would
    # otherwise stop the first line from matching and silently drop it.
    with open(path, encoding='utf-8-sig') as f:
        for line in f:
            match = pattern.match(line)
            if not match:
                continue
            start, end, text = match.groups()
            text = text.strip()
            segments.append({
                "start": float(start),
                "end": float(end),
                "text": text,
                "lemmas": preprocess(text, disambig),
            })
    return segments

# ✅ Run
# Use the first uploaded file whose name ends in ".txt" (case-insensitive) and
# fail with an explicit message, rather than a bare IndexError, if there is none.
transcript_path = next(
    (name for name in uploaded if name.lower().endswith(".txt")), None
)
if transcript_path is None:
    raise FileNotFoundError("No .txt transcript found among the uploaded files.")

segments = load_transcript(transcript_path, disambig)
print(f"Loaded and lemmatized {len(segments)} transcript segments.")


Loaded and lemmatized 69 transcript segments.


In [27]:
# NOTE(review): this repeats the preprocess() definition from the transcript
# cell; it is kept so this cell still runs when executed on its own.
def preprocess(text, disambig):
    """Lemmatize `text` and return the lemmas as one space-separated string.

    Args:
        text: Raw text; it is split into tokens with `simple_word_tokenize`.
        disambig: Object exposing `disambiguate(tokens)`. Each item it returns
            must provide `.word` and `.analyses`.

    Returns:
        One entry per token, joined by single spaces. A token's entry is the
        'lemma' value of its first analysis; tokens with no analyses, or whose
        first analysis has no 'lemma' key, are kept as the original word.
    """
    def lemma_of(entry):
        # No candidate analyses: fall back to the surface word.
        if not entry.analyses:
            return entry.word
        # Only the first analysis is consulted; element 1 of that item is the
        # mapping that holds the 'lemma' feature.
        features = entry.analyses[0][1]
        return features.get('lemma', entry.word)

    disambiguated = disambig.disambiguate(simple_word_tokenize(text))
    return ' '.join([lemma_of(entry) for entry in disambiguated])

def load_captions(path, disambig):
    """Load scene captions from a JSON file and lemmatize each Arabic caption.

    The file must contain a JSON object mapping a scene id to a metadata
    object. From each metadata object the keys "scene_time" and "arabic" are
    read; both are optional.

    Args:
        path: Path of the captions JSON file (UTF-8, with or without BOM).
        disambig: Disambiguator forwarded unchanged to `preprocess`.

    Returns:
        A list of dicts in file order with keys "scene_id", "scene_time"
        ("UNKNOWN" when absent), "caption" (the Arabic caption, "" when
        absent or null) and "lemmas" (the string returned by `preprocess`).
    """
    # utf-8-sig also accepts plain UTF-8; json.load rejects text that still
    # carries a leading BOM.
    with open(path, encoding='utf-8-sig') as f:
        data = json.load(f)

    captions = []
    for scene_id, meta in data.items():
        # `or ""` also covers an explicit JSON null, which the .get default
        # would let through and which preprocess cannot tokenize.
        arabic_caption = meta.get("arabic") or ""
        captions.append({
            "scene_id": scene_id,
            "scene_time": meta.get("scene_time", "UNKNOWN"),
            "caption": arabic_caption,
            "lemmas": preprocess(arabic_caption, disambig),
        })

    return captions

# ✅ Run
# Use the first uploaded file whose name ends in ".json" (case-insensitive) and
# fail with an explicit message, rather than a bare IndexError, if there is none.
captions_path = next(
    (name for name in uploaded if name.lower().endswith(".json")), None
)
if captions_path is None:
    raise FileNotFoundError("No .json captions file found among the uploaded files.")

captions = load_captions(captions_path, disambig)
print(f"Loaded and lemmatized {len(captions)} scene captions.")


Loaded and lemmatized 54 scene captions.


In [None]:

# 🔁 Process all transcript segments
# preprocess() requires the disambiguator and returns a plain lemma string, not
# a dict, so it cannot be indexed with "tokens"/"lemmas". The lemmas were
# already computed by load_transcript(); only the token list is derived here.
processed_segments = [
    {
        "start": seg["start"],
        "end": seg["end"],
        "original": seg["text"],
        "tokens": simple_word_tokenize(seg["text"]),
        # Space-joined lemma string, exactly as produced by preprocess().
        "lemmas": seg["lemmas"],
    }
    for seg in segments
]

# ensure_ascii=False keeps the Arabic text readable in the saved file.
with open("processed_transcript.json", "w", encoding="utf-8") as f:
    json.dump(processed_segments, f, ensure_ascii=False, indent=2)

print("✅ Saved: processed_transcript.json")


In [None]:

# 🔁 Process all captions
# preprocess() requires the disambiguator and returns a plain lemma string, not
# a dict, so it cannot be indexed with "tokens"/"lemmas". The lemmas were
# already computed by load_captions(); only the token list is derived here.
processed_captions = [
    {
        "scene_id": cap["scene_id"],
        "scene_time": cap["scene_time"],
        "original": cap["caption"],
        "tokens": simple_word_tokenize(cap["caption"]),
        # Space-joined lemma string, exactly as produced by preprocess().
        "lemmas": cap["lemmas"],
    }
    for cap in captions
]

# ensure_ascii=False keeps the Arabic text readable in the saved file.
with open("processed_captions.json", "w", encoding="utf-8") as f:
    json.dump(processed_captions, f, ensure_ascii=False, indent=2)

print("✅ Saved: processed_captions.json")
