<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/04_validate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Transcript Validation using CAMeLBERT-MSA & Arabic Captions
Compare a cleaned Arabic ASR transcript against scene captions to improve transcript accuracy.
- Uses the `diac`, `lemmas`, and `pos` fields of each transcript segment
- Uses Arabic captions generated previously
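- CAMeLBERT-MSA embeddings (cosine similarity) for semantic validation
- Sliding window over neighbouring captions to tolerate timing misalignment between transcript and scenes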
- Outputs: **replace**, **append**, and **flag** transcript versions

In [1]:
# Mount Google Drive so the project folder below is reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')


import os, json, re
from typing import List, Dict, Any

# Project paths
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
param_path = os.path.join(base_path, "params.json")
with open(param_path, "r", encoding="utf-8") as f:
    params = json.load(f)

# Manual override of the video selected in params.json.
# Previously this was a bare reassignment that silently discarded the configured
# value; keeping it as a named constant makes the override visible.
# Set it to None to use params["video_file"] instead.
VIDEO_FILENAME_OVERRIDE = "Almasbagha.mp4"
video_filename = VIDEO_FILENAME_OVERRIDE or params.get("video_file")

assert video_filename, "params.json must include 'video_file'."
video_name = os.path.splitext(video_filename)[0]

# Input / output folders, all relative to the project root on Drive.
videos_path = os.path.join(base_path, "videos")
captions_path = os.path.join(base_path, "captions")
preprocessed_path = os.path.join(base_path, "Preprocessed")
validated_path = os.path.join(base_path, "Validated")

# Files belonging to the selected video.
video_path = os.path.join(videos_path, video_filename)
caption_path = os.path.join(captions_path, f"{video_name}.json")
transcript_path = os.path.join(preprocessed_path, f"{video_name}_CleanTranscript.json")

# Fail early, with a readable message, if any expected input is missing.
assert os.path.exists(video_path), f"Video file not found: {video_path}"
assert os.path.exists(transcript_path), f"Transcript file not found: {transcript_path}"
assert os.path.exists(caption_path), f"Caption file not found: {caption_path}"

with open(transcript_path, encoding='utf-8') as f:
    transcript_data = json.load(f)

with open(caption_path, encoding='utf-8') as f:
    captions_data = json.load(f)


Mounted at /content/drive


In [8]:

from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

# ✅ Setup Model: the CAMeLBERT-MSA checkpoint wrapped as a SentenceTransformer encoder.
# NOTE(review): the recorded output of this cell is an ImportError raised while
# importing sentence_transformers — presumably an installed-version mismatch;
# confirm the environment before relying on `model` further down.
MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-msa"
model = SentenceTransformer(MODEL_NAME)


ImportError: cannot import name 'VersionComparison' from 'transformers.utils.import_utils' (/usr/local/lib/python3.11/dist-packages/transformers/utils/import_utils.py)

In [10]:
from bisect import bisect_left
import torch
from sentence_transformers import util

def _as_text(x, prefer_keys):
    """Return the first existing key's value (string) from dict x; else stringify x."""
    if isinstance(x, dict):
        for k in prefer_keys:
            if k in x and x[k] is not None:
                return str(x[k])
        return ""
    return str(x)

def _as_joined_list(x, key):
    """Return space-joined list if dict[key] is a list; if str -> return it; else ''."""
    if isinstance(x, dict):
        v = x.get(key, "")
        if isinstance(v, list):
            return " ".join(map(str, v))
        elif isinstance(v, str):
            return v
    return ""

def _to_float(value):
    """Return `value` as a float, or None when it is missing or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _segment_record(segment):
    """Return a fresh dict for `segment` so results never leak into the input."""
    if isinstance(segment, dict):
        return dict(segment)  # shallow copy: caller's transcript stays untouched
    return {"text": str(segment)}  # wrap plain strings so results can be attached


def validate_segments(
    transcript_data,
    captions_data,
    model,
    window_size=3,
    threshold_replace=0.82,
    threshold_append=0.75
):
    """Classify transcript segments by their similarity to nearby scene captions.

    Each segment is embedded (text plus its POS / lemma / diacritised fields)
    and compared by cosine similarity with the captions inside a window of
    `window_size` captions on either side of the caption closest in time.

    Args:
        transcript_data: iterable of segments. Dict segments are read through
            the keys 'text_norm' / 'text' / 'original', 'pos', 'lemmas',
            'diac', and optionally 'start' / 'end'; other values are wrapped
            as {"text": str(value)}.
        captions_data: dict (or list) of caption entries, each a dict with
            'scene_time' and 'arabic'. Entries that are not dicts or whose
            'arabic' text is blank are ignored, since a blank caption can
            never be a meaningful replacement.
        model: object exposing
            `encode(texts, convert_to_tensor=True, show_progress_bar=False)`.
        window_size: number of captions inspected on each side of the centre.
        threshold_replace: minimum score for the "replace" action.
        threshold_append: minimum score for the "append" action.

    Returns:
        (replace, append, flag): three lists of segment dicts. Every dict is a
        copy of the input segment extended with 'match_score', 'best_caption'
        and 'validation_action'; replace/append entries also carry
        'revised_text'. When no usable caption exists, every segment is
        returned in `flag` with 'match_score' and 'best_caption' set to None
        (previously all segments were silently dropped in that case).
    """
    replace, append, flag = [], [], []

    # --- Prepare captions: keep usable Arabic text, sort by scene time ---
    if isinstance(captions_data, dict):
        caption_items = captions_data.values()
    else:
        caption_items = captions_data or []

    captions_sorted = []
    for item in caption_items:
        if not isinstance(item, dict):
            continue
        text = item.get("arabic")
        if not isinstance(text, str) or not text.strip():
            continue
        scene_time = _to_float(item.get("scene_time"))
        # Missing / unparseable times sort first, like the original 0.0 default.
        captions_sorted.append({"t": 0.0 if scene_time is None else scene_time, "text": text})
    captions_sorted.sort(key=lambda c: c["t"])

    caption_texts = [c["text"] for c in captions_sorted]
    caption_times = [c["t"] for c in captions_sorted]

    # Nothing to compare against: flag everything instead of losing the segments.
    if not caption_texts:
        for segment in transcript_data:
            seg_dict = _segment_record(segment)
            seg_dict["match_score"] = None
            seg_dict["best_caption"] = None
            seg_dict["validation_action"] = "flag"
            flag.append(seg_dict)
        return replace, append, flag

    # Caption embeddings are computed once and sliced per segment.
    cap_emb = model.encode(caption_texts, convert_to_tensor=True, show_progress_bar=False)
    last_caption = len(caption_texts) - 1

    # --- Helper: find nearest caption index for a given time ---
    def nearest_caption_index(t):
        j = bisect_left(caption_times, t)
        if j <= 0:
            return 0
        if j >= len(caption_times):
            return last_caption
        # pick the closer one of j-1 or j (ties go to the earlier caption)
        return j if abs(caption_times[j] - t) < abs(caption_times[j - 1] - t) else (j - 1)

    # --- Iterate segments ---
    for idx, segment in tqdm(enumerate(transcript_data), total=len(transcript_data)):
        seg_dict = _segment_record(segment)

        # Text preference order: normalised text, then 'text', then 'original'.
        seg_text  = _as_text(seg_dict, ["text_norm", "text", "original"])
        seg_pos   = _as_joined_list(seg_dict, "pos")
        seg_lemma = _as_joined_list(seg_dict, "lemmas")
        seg_diac  = _as_joined_list(seg_dict, "diac")

        # The string that is actually embedded for this segment.
        enriched_segment = f"{seg_text} | POS: {seg_pos} | Lemma: {seg_lemma} | Diac: {seg_diac}"

        # Centre the window on the caption nearest the segment midpoint; without
        # usable timing, fall back to aligning roughly by position.
        start = _to_float(seg_dict.get("start"))
        end = _to_float(seg_dict.get("end"))
        if start is not None and end is not None:
            center = nearest_caption_index((start + end) / 2.0)
        else:
            center = min(idx, last_caption)

        lo = max(0, center - window_size)
        hi = min(len(caption_texts), center + window_size + 1)

        seg_emb = model.encode(enriched_segment, convert_to_tensor=True, show_progress_bar=False)

        # Cosine scores vs. the candidate captions only.
        scores_vec = util.cos_sim(seg_emb, cap_emb[lo:hi]).squeeze(0)
        best_local_idx = int(torch.argmax(scores_vec).item())
        best_score = float(scores_vec[best_local_idx].item())
        best_caption = caption_texts[lo + best_local_idx]

        seg_dict["match_score"] = best_score
        seg_dict["best_caption"] = best_caption

        if best_score >= threshold_replace:
            seg_dict["validation_action"] = "replace"
            seg_dict["revised_text"] = best_caption
            replace.append(seg_dict)
        elif best_score >= threshold_append:
            seg_dict["validation_action"] = "append"
            seg_dict["revised_text"] = (seg_text + " " + best_caption).strip()
            append.append(seg_dict)
        else:
            seg_dict["validation_action"] = "flag"
            flag.append(seg_dict)

    return replace, append, flag


ImportError: cannot import name 'cached_download' from 'huggingface_hub' (/usr/local/lib/python3.11/dist-packages/huggingface_hub/__init__.py)

In [None]:
# ✅ Run matching: sort every transcript segment into replace / append / flag.
replace, append, flag = validate_segments(
    transcript_data=transcript_data,
    captions_data=captions_data,
    model=model,
)

# Output files reuse the transcript's file name without its extension.
base_name = os.path.splitext(os.path.split(transcript_path)[1])[0]
def save_json(data, suffix):
    """Write `data` as indented UTF-8 JSON into the validated-output folder.

    The file is named `<base_name>_<suffix>.json` inside `validated_path`
    (both are notebook-level variables). Non-ASCII text is written as-is
    rather than escaped.

    Args:
        data: JSON-serialisable object to store.
        suffix: label appended to the base name, e.g. "replace".
    """
    # No cell visible here creates the output folder, so make sure it exists;
    # otherwise the first open() fails with FileNotFoundError.
    os.makedirs(validated_path, exist_ok=True)
    output_path = os.path.join(validated_path, f"{base_name}_{suffix}.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"✅ Saved: {output_path}")


# Persist each category to its own file, in replace -> append -> flag order.
for suffix, segments in (("replace", replace), ("append", append), ("flag", flag)):
    save_json(segments, suffix)
print("✅ Validation results saved.")

