<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/03_ArabicPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧠 Arabic Preprocessing with CAMeL Tools
This notebook performs Arabic text preprocessing using CAMeL Tools, including normalization, lemmatization, and optional dialect detection. Designed for use before alignment or semantic validation.

In [None]:

# Install compatible versions of NumPy and CAMeL Tools
!pip install numpy==1.23.5 --force-reinstall --no-cache-dir
!pip install camel-tools==1.5.6 --no-deps

In [47]:

# 📥 Upload your files: transcript (.txt) and captions (.json)
from google.colab import files
#uploaded = files.upload()
video_filename="PaperMaking.mp4"
import os
# Define base paths
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
videos_path = os.path.join(base_path, "videos")
transcripts_path = os.path.join(base_path, "transcripts")
captions_path = os.path.join(base_path, "captions")
preprocessing_path = os.path.join(base_path, "Preprocessed")

video_path = os.path.join(videos_path, video_filename)
video_name = os.path.splitext(video_filename)[0]



In [None]:
!camel_data -l

In [None]:
!camel_data -i morphology-db-msa-r13
!camel_data -i disambig-mle-calima-msa-r13

In [None]:
import re
import json
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize

# Load the disambiguator once
disambig = MLEDisambiguator.pretrained()

In [52]:
def preprocess(text, disambig):
    tokens = simple_word_tokenize(text)
    result = disambig.disambiguate(tokens)

    lemmas = []
    for i, r in enumerate(result):
        if r.analyses:
            analysis = r.analyses[0][1]
            lemma = analysis.get('lemma', r.word)
            lemmas.append(lemma)
        else:
            print(f"❌ No analysis for token: '{tokens[i]}'")
            lemmas.append(r.word)
    return ' '.join(lemmas)


# 📄 Parse and lemmatize transcript file with timecodes
def load_transcript(path, disambig):
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()

    segments = []
    pattern = re.compile(r"\[(\d+\.\d+) - (\d+\.\d+)\]\s+(.*)")
    for line in lines:
        match = pattern.match(line)
        if match:
            start, end, text = match.groups()
            lemmatized = preprocess(text.strip(), disambig)
            segments.append({
                "start": float(start),
                "end": float(end),
                "text": text.strip(),
                "lemmas": lemmatized
            })
    return segments

# ✅
#print (f"{transcripts_path}/{video_name}_ar_with_timecodes.txt")
segments = load_transcript(f"{transcripts_path}/{video_name}_ar_with_timecodes.txt", disambig)
print(f"Loaded and lemmatized {len(segments)} transcript segments.")
transcript_preprocess_path = os.path.join(preprocessing_path, f"{video_name}_transcript_ar.json")
with open(transcript_preprocess_path, "w", encoding="utf-8") as f:
    json.dump(segments, f, ensure_ascii=False, indent=2)


Loaded and lemmatized 69 transcript segments.


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/ArabicVideoSummariser/preprocessed/PaperMaking_transcript_ar.json'

In [38]:
def load_captions(path, disambig):
    with open(path, encoding='utf-8') as f:
        data = json.load(f)

    captions = []
    for scene_id, meta in data.items():
        scene_time = meta.get("scene_time", "UNKNOWN")
        arabic_caption = meta.get("arabic", "")
        lemmatized_caption = preprocess(arabic_caption, disambig)

        captions.append({
            "scene_id": scene_id,
            "scene_time": scene_time,
            "caption": arabic_caption,
            "lemmas": lemmatized_caption
        })

    return captions

# ✅ Run
captions_path = [f for f in uploaded if f.endswith(".json")][0]
captions = load_captions(captions_path, disambig)
print(f"Loaded and lemmatized {len(captions)} scene captions.")


Loaded and lemmatized 54 scene captions.


In [40]:
transcript_preprocess_path = os.path.join(preprocessing_path, f"{video_name}_transcript_ar.json")
caption_preprocess_path = os.path.join(preprocessing_path, f"{video_name}_caption_ar.json")
with open(transcript_preprocess_path, "w", encoding="utf-8") as f:
    json.dump(segments, f, ensure_ascii=False, indent=2)

with open(caption_preprocess_path, "w", encoding="utf-8") as f:
    json.dump(captions, f, ensure_ascii=False, indent=2)

print("✅ Saved: $transcript_preprocess_path and $caption_preprocess_path")





✅ Saved: processed_transcript.json and processed_captions.json
