<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/04_validate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Transcript Validation using CAMeLBERT-MSA & Arabic Captions
Compare a cleaned Arabic ASR transcript against scene captions to improve transcript accuracy.
- Uses `diac`, `lemma`, `pos` from transcript segments
- Uses Arabic captions generated previously
- CAMeLBERT-MSA for semantic validation
- Sliding window for misalignment
- Outputs: **replace**, **append**, and **flag** transcript versions

In [None]:
# Reset environment first
# Uninstalls the listed packages so that the pinned versions below are the ones
# that end up installed.
# NOTE(review): the runtime may need a restart after numpy/torch are replaced — confirm.
!pip uninstall -y torch torchvision torchaudio transformers tokenizers \
  sentence-transformers huggingface_hub camel_tools opencv-python opencv-contrib-python \
  opencv-python-headless numpy

# Core installs
# Each package is pinned to an exact version; --no-cache-dir skips pip's download cache.
!pip install --no-cache-dir numpy==1.23.5
!pip install --no-cache-dir torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2
!pip install --no-cache-dir transformers==4.41.2 tokenizers==0.19.1
!pip install --no-cache-dir sentence-transformers==2.2.2
!pip install --no-cache-dir huggingface_hub==0.23.2 tqdm==4.66.5

# ✅ OpenCV
# NOTE(review): both opencv-python and opencv-contrib-python are installed here;
# the OpenCV packaging notes advise installing only one variant per environment —
# confirm which one is needed. None of the cells visible here import cv2.
!pip install --no-cache-dir opencv-python==4.7.0.72 opencv-contrib-python==4.7.0.72

# ✅ CAMeL Tools
!pip install --no-cache-dir camel-tools==1.5.2


In [None]:
# Mount Google Drive
# The project paths used by the following cells are built under
# /content/drive/MyDrive, so this cell has to run before them.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os, json, re
from bisect import bisect_left, bisect_right
from typing import List, Dict, Any, Tuple, Iterable
from collections import Counter

import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

# ---------------------------
# Project Paths
# ---------------------------
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
params_path = os.path.join(base_path, "params.json")

with open(params_path, "r", encoding="utf-8") as f:
    params = json.load(f)

# Optional manual override for ad-hoc runs, e.g. "Almasbagha.mp4".
# Leave it as None so that the video named in params.json is the one processed;
# a hard-coded file name here used to replace the params.json value on every
# run, which also made the "must include 'video_file'" check unreachable.
VIDEO_FILE_OVERRIDE = None
video_filename = VIDEO_FILE_OVERRIDE or params.get("video_file")

# Explicit raise instead of assert: asserts are skipped when Python runs with -O.
if not video_filename:
    raise ValueError("params.json must include 'video_file'.")
video_name = os.path.splitext(video_filename)[0]

# Input / output folders below the project root
videos_path = os.path.join(base_path, "videos")
captions_path = os.path.join(base_path, "captions")
preprocessed_path = os.path.join(base_path, "Preprocessed")
validated_path = os.path.join(base_path, "Validated")

os.makedirs(validated_path, exist_ok=True)

# Inputs read by this notebook
caption_path = os.path.join(captions_path, f"{video_name}.json")
transcript_path = os.path.join(preprocessed_path, f"{video_name}_CleanTranscript.json")
# Outputs written by this notebook
merged_file = os.path.join(validated_path, f"{video_name}_ScenesIntervalTranscripts.json")
validation_file = os.path.join(validated_path, f"{video_name}_Validation.json")
validation_results = os.path.join(validated_path, f"{video_name}_result.txt")

# Fail early, with the offending path in the message, if an input is missing.
if not os.path.exists(caption_path):
    raise FileNotFoundError(f"Missing captions file: {caption_path}")
if not os.path.exists(transcript_path):
    raise FileNotFoundError(f"Missing transcript file: {transcript_path}")

# ---------------------------
# Load multilingual SBERT model fine-tuned for semantic similarity
# ---------------------------
MODEL_ID = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(MODEL_ID, device=device)


# Align scenes and transcript segments by timecode and combine them into a single file

In [None]:
import json, os

# --- Read the cleaned transcript and the per-scene captions ---
with open(transcript_path, "r", encoding="utf-8") as f:
    transcript_obj = json.load(f)
with open(caption_path, "r", encoding="utf-8") as f:
    captions_data = json.load(f)

# Transcript segments ordered by their start time
segments = sorted(transcript_obj["segments"], key=lambda s: float(s.get("start", 0.0)))

# (scene_time, caption) pairs ordered by time; the Arabic caption is preferred
# and the English one is used when the Arabic one is missing or empty
scene_items = []
for v in captions_data.values():
    scene_time = float(v.get("scene_time", 0.0))
    caption = (v.get("arabic") or v.get("english") or "").strip()
    scene_items.append((scene_time, caption))
scene_items.sort(key=lambda x: x[0])

scene_times = [item[0] for item in scene_items]
scene_caps = [item[1] for item in scene_items]

# Numeric start/end of every segment, in the same order as `segments`
segment_starts = [float(s.get("start", 0.0)) for s in segments]
segment_ends = [float(s.get("end", 0.0)) for s in segments]

# The final scene is closed at the latest transcript end time
max_end = max(segment_ends, default=0.0)

# --- For every scene interval, gather the transcript segments overlapping it ---
out = []
n_segs = len(segments)
n_scenes = len(scene_times)
seg_idx = 0  # sweep pointer into the start-sorted segments

for i, (scene_start, caption_text) in enumerate(scene_items):
    # A scene runs until the next scene starts; the last one runs to max_end
    is_last_scene = (i + 1) >= n_scenes
    scene_end = max_end if is_last_scene else scene_times[i + 1]

    # Skip leading segments that are already over when this scene begins
    while seg_idx < n_segs and segment_ends[seg_idx] <= scene_start:
        seg_idx += 1

    cur_texts, cur_lemmas, cur_pos = [], [], []
    for k in range(seg_idx, n_segs):
        if segment_starts[k] >= scene_end:
            break  # segments are start-sorted, so nothing later can overlap

        overlaps = (segment_ends[k] > scene_start) and (segment_starts[k] < scene_end)
        if not overlaps:
            continue

        seg = segments[k]
        cur_texts.append(seg.get("text_norm") or seg.get("original", "") or "")
        cur_lemmas.extend(seg.get("lemmas", []))
        cur_pos.extend(seg.get("pos", []))

    out.append({
        "scene_index": i,
        "scene_start": scene_start,
        "scene_end": scene_end,
        "caption": caption_text,
        "transcript_text": " ".join(filter(None, cur_texts)).strip(),
        "lemmas": cur_lemmas,
        "pos": cur_pos
    })

# --- Write the aligned scenes ---
with open(merged_file, "w", encoding="utf-8") as f:
    json.dump({"scenes": out}, f, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(out)} scenes:", merged_file)


# Merge consecutive scenes that have empty transcript text (their captions are concatenated)

In [None]:
import json, os


# Reload the aligned scenes from the merged file
with open(merged_file, "r", encoding="utf-8") as f:
    data = json.load(f)

scenes = data["scenes"]

merged = []
pending = None  # accumulates a run of consecutive scenes without transcript text

for scene in scenes:
    if scene.get("transcript_text", "").strip():
        # A scene with transcript text closes any open run of empty scenes
        if pending:
            merged.append(pending)
            pending = None
        merged.append(scene)
        continue

    if pending is None:
        # First empty scene of a run: copy it so the run can be extended
        pending = dict(scene)
        continue

    # Extend the run: push the end time forward and concatenate the captions;
    # transcript_text / lemmas / pos remain those of the first (empty) scene
    pending["scene_end"] = scene["scene_end"]
    pending["caption"] = " ".join(
        [pending.get("caption", ""), scene.get("caption", "")]
    ).strip()

# A run of empty scenes at the very end still has to be emitted
if pending:
    merged.append(pending)

# Renumber the scenes after merging
for new_index, entry in enumerate(merged):
    entry["scene_index"] = new_index

# --- Write the result back to the same file ---
data["scenes"] = merged
with open(merged_file, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"✅ Consecutive empty transcript scenes merged.")
print(f"   Original scenes: {len(scenes)} -> Updated scenes: {len(merged)}")
print("💾 File updated in place:", merged_file)


# Validation

In [None]:
import json, os, re
from tqdm import tqdm
import torch
from sentence_transformers import util

# ---------------------------
# Helpers
# ---------------------------

# Diacritics / Quranic annotation marks removed by ar_normalize (compiled once).
_AR_DIACRITICS_RE = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")

# Letter unification applied by ar_normalize:
#   alef with hamza above / hamza below / madda -> bare alef
#   ta marbuta -> ha, alef maqsura -> ya
_AR_LETTER_MAP = str.maketrans({
    "\u0623": "\u0627",
    "\u0625": "\u0627",
    "\u0622": "\u0627",
    "\u0629": "\u0647",
    "\u0649": "\u064A",
})


def ar_normalize(text: str) -> str:
    """Light Arabic normalization (remove diacritics, unify alef/ya/ta marbuta).

    Returns an empty string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    return _AR_DIACRITICS_RE.sub("", text).translate(_AR_LETTER_MAP)


# Stop-words as conventionally spelled.
_AR_STOPWORDS_RAW = {
    "و","في","على","من","إلى","عن","أن","إن","كان","كانت","يكون","مع","هذا","هذه",
    "ذلك","تلك","هناك","هنا","هو","هي","هم","هن","كما","لكن","بل","قد","تم","ثم",
    "كل","أي","أو","أمام","خلال","بعد","قبل","حتى","حيث","إذا","إنما","إما","لدى",
    "لدي","لها","له","لهم","لنا","ما","ماذا","لماذا","كيف","متى","أيضا","بدون","أمام",
    "داخل","خارج","بين","أكثر","أقل"
}

# Lookups are made with *normalized* words, so the set must contain the
# normalized spellings too; otherwise entries with hamza-alef or alef maqsura
# can never match. The raw spellings are kept as well, so the set is a
# superset of the original one.
AR_STOPWORDS = _AR_STOPWORDS_RAW | {ar_normalize(w) for w in _AR_STOPWORDS_RAW}


def ar_tokens(text: str):
    """Normalize ``text``, split it on whitespace and drop stop-words.

    Returns the remaining tokens as a list, in their original order.
    """
    return [t for t in ar_normalize(text).split() if t not in AR_STOPWORDS]

def lexical_overlap(seg_lemmas, cap_tokens, seg_tokens, use_ngrams=True):
    """Share of the transcript vocabulary that also occurs in the caption vocabulary.

    The transcript vocabulary is the union of ``seg_lemmas`` and ``seg_tokens``.
    The caption vocabulary is built from every token list in ``cap_tokens``;
    with ``use_ngrams`` each pair of adjacent tokens is also added, concatenated
    without a separator. Returns 0.0 when either vocabulary is empty.
    """
    transcript_vocab = set(seg_lemmas).union(seg_tokens)

    caption_vocab = set()
    for toks in cap_tokens:
        caption_vocab.update(toks)
        if not use_ngrams:
            continue
        last_pair_start = len(toks) - 1
        caption_vocab.update("".join(toks[i:i+2]) for i in range(last_pair_start))

    if not (transcript_vocab and caption_vocab):
        return 0.0

    shared = transcript_vocab.intersection(caption_vocab)
    return len(shared) / len(transcript_vocab)

def fusion_score(lexical, cosine, alpha=0.5):
    """Blend the two scores: ``alpha`` weights ``lexical``, ``1 - alpha`` weights ``cosine``."""
    lexical_part = alpha * lexical
    cosine_part = (1 - alpha) * cosine
    return lexical_part + cosine_part

# Substrings (upper-cased) that mark a proper noun: "PROPN" is the Universal
# Dependencies tag; "NOUN_PROP" covers the "noun_prop" tag.
# NOTE(review): "noun_prop" is assumed to be what the upstream CAMeL Tools
# preprocessing writes into "pos" — confirm against the transcript files.
_PROPN_MARKERS = ("PROPN", "NOUN_PROP")


def is_propn(pos_tag):
    """Check if POS tag indicates proper noun.

    The comparison is case-insensitive and substring-based. ``pos_tag`` is
    converted with ``str()`` first, so non-string values return False
    instead of raising.
    """
    tag = str(pos_tag).upper()
    return any(marker in tag for marker in _PROPN_MARKERS)

def should_backoff_too_much_removed(kept_words, dropped_words, max_removed_ratio=0.3):
    """Tell whether the dropped share of words is strictly above ``max_removed_ratio``.

    Returns True as well when there are no words at all.
    """
    n_kept = len(kept_words)
    n_dropped = len(dropped_words)
    n_total = n_kept + n_dropped
    if not n_total:
        return True
    removed_ratio = n_dropped / n_total
    return removed_ratio > max_removed_ratio

# ---------------------------
# Main validation
# ---------------------------

def validate_words_by_visual_support(
    scenes,
    model,
    alpha_fusion: float = 0.5,
    sim_threshold: float = 0.55,
    min_word_len: int = 3,
    propn_keep_margin: float = 0.15,
    backoff_removed_ratio: float = 0.30
):
    """Score every scene's transcript against its caption and filter its words.

    For a scene that has both a caption and transcript text, the cosine
    similarity of their embeddings (``model.encode``) and a lexical overlap
    score are fused with ``alpha_fusion``. Words shorter than ``min_word_len``
    or present in the stop-word set are always kept; every other word is kept
    when the scene's fused score reaches ``sim_threshold`` (lowered by
    ``propn_keep_margin`` for words whose POS tag marks a proper noun). Since
    the fused score is per scene, all such words share the same decision apart
    from the proper-noun margin. If more than ``backoff_removed_ratio`` of the
    words would be dropped, the original words are kept unchanged.

    Scenes lacking a caption or transcript text are passed through with
    ``validated_text`` set to the transcript text, ``support_score`` 0.0 and
    ``validated`` False.

    Returns ``(enriched_scenes, summary)``, where ``summary`` holds the scene
    count, kept/dropped word totals and the parameters used.
    """
    results = []
    total_kept = 0
    total_dropped = 0

    for scene in tqdm(scenes):
        transcript = scene.get("transcript_text", "").strip()
        lemmas = scene.get("lemmas", [])
        pos_tags = scene.get("pos", [])
        caption = scene.get("caption", "").strip()

        # Defaults, used as-is when the scene cannot be scored
        record = dict(scene)
        record["validated_text"] = transcript
        record["support_score"] = 0.0
        record["validated"] = False

        if not (caption and transcript):
            results.append(record)
            continue

        # Semantic similarity between transcript and caption
        with torch.no_grad():
            transcript_emb = model.encode(transcript, convert_to_tensor=True)
            caption_emb = model.encode(caption, convert_to_tensor=True)
            cos = float(util.cos_sim(transcript_emb, caption_emb).item())

        # Lexical overlap, then the fused score
        lex = lexical_overlap(
            lemmas, [ar_tokens(caption)], ar_tokens(transcript), use_ngrams=True
        )
        fused = fusion_score(lex, cos, alpha=alpha_fusion)

        # Per-word keep/drop decision
        words = [w for w in re.sub(r"\s+", " ", transcript).split(" ") if w]
        pos_is_list = isinstance(pos_tags, list)
        kept, dropped = [], []

        for idx, word in enumerate(words):
            normalized = ar_normalize(word)
            if len(normalized) < min_word_len or normalized in AR_STOPWORDS:
                kept.append(word)
                continue

            # Proper nouns get a lower bar; the POS tag is looked up by word position
            is_name = False
            if pos_is_list and idx < len(pos_tags):
                is_name = is_propn(str(pos_tags[idx]))

            threshold = sim_threshold - (propn_keep_margin if is_name else 0.0)
            target = kept if fused >= threshold else dropped
            target.append(word)

        # Too much would be removed: fall back to the untouched word list
        if should_backoff_too_much_removed(kept, dropped, max_removed_ratio=backoff_removed_ratio):
            kept = words
            dropped = []

        record["scores"] = {"cosine": cos, "lexical": lex, "fused": fused}
        record["validated_text"] = " ".join(kept)
        record["dropped_words"] = dropped
        record["kept_words"] = kept
        record["support_score"] = fused
        record["validated"] = fused >= sim_threshold

        total_kept += len(kept)
        total_dropped += len(dropped)
        results.append(record)

    run_params = {
        "alpha_fusion": alpha_fusion,
        "sim_threshold": sim_threshold,
        "min_word_len": min_word_len,
        "propn_keep_margin": propn_keep_margin,
        "backoff_removed_ratio": backoff_removed_ratio
    }
    summary = {
        "scenes": len(results),
        "kept_words": total_kept,
        "dropped_words": total_dropped,
        "params": run_params
    }
    return results, summary

# ---------------------------
# Run on your aligned file
# ---------------------------


# Load the aligned scenes
with open(merged_file, encoding="utf-8") as f:
    data = json.load(f)

scenes = data["scenes"]

# Parameters of this validation run
validation_params = dict(
    alpha_fusion=0.5,
    sim_threshold=0.55,
    min_word_len=3,
    propn_keep_margin=0.15,
    backoff_removed_ratio=0.30,
)

# `model` is the SentenceTransformer instance loaded in the setup cell
enriched, stats = validate_words_by_visual_support(scenes, model, **validation_params)

print("Summary:", stats)

# Save
# NOTE: the embeddings-only validation cell further down writes to the same
# validation_file, so running it replaces this output.
payload = {"summary": stats, "scenes": enriched}
with open(validation_file, "w", encoding="utf-8") as f:
    f.write(json.dumps(payload, ensure_ascii=False, indent=2))

print(f"✅ Saved validated file: {validation_file}")


In [None]:
import json, os
from tqdm import tqdm
import torch
from sentence_transformers import util, SentenceTransformer

# ---------------------------
# Validation function
# ---------------------------
def validate_arabic_embeddings(
    scenes,
    model,
    sim_threshold: float = 0.35,   # lower threshold for Arabic cosine
    propn_keep_margin: float = 0.15,
    model_name=None
):
    """Mark each scene as validated when its transcript is close to its caption.

    For a scene that has both a caption and transcript text, the cosine
    similarity between their embeddings (``model.encode``) is stored as
    ``support_score``. The scene is ``validated`` when that score reaches
    ``sim_threshold``; the threshold is lowered by ``propn_keep_margin`` when
    any POS tag of the scene marks a proper noun. Other scenes keep
    ``support_score`` 0.0 and ``validated`` False.

    Args:
        scenes: iterable of scene dicts with ``transcript_text``, ``caption``
            and optionally ``pos``.
        model: object exposing ``encode(text, convert_to_tensor=True)``.
        sim_threshold: minimum cosine similarity for a scene to be validated.
        propn_keep_margin: amount subtracted from the threshold for scenes
            containing a proper noun.
        model_name: label recorded in the summary. When omitted, a global
            ``model_name`` or ``MODEL_ID`` is used if one exists, otherwise
            the class name of ``model``.

    Returns:
        ``(enriched_scenes, summary)``.
    """
    if model_name is None:
        # The summary used to read an undefined global `model_name`, which
        # raised NameError. Resolve the label defensively instead.
        module_globals = globals()
        model_name = (
            module_globals.get("model_name")
            or module_globals.get("MODEL_ID")
            or type(model).__name__
        )

    # "PROPN" is the Universal Dependencies tag; "NOUN_PROP" covers "noun_prop".
    # NOTE(review): "noun_prop" is assumed to be the tag the upstream CAMeL
    # Tools preprocessing writes into "pos" — confirm against the data.
    propn_markers = ("PROPN", "NOUN_PROP")

    enriched = []

    for scene in tqdm(scenes):
        seg_text = scene.get("transcript_text", "").strip()
        seg_pos  = scene.get("pos", [])
        caption  = scene.get("caption", "").strip()

        # Defaults, used as-is when the scene cannot be scored
        scene_out = dict(scene)
        scene_out["validated_text"] = seg_text
        scene_out["support_score"] = 0.0
        scene_out["validated"] = False

        if caption and seg_text:
            with torch.no_grad():
                seg_emb = model.encode(seg_text, convert_to_tensor=True)
                cap_emb = model.encode(caption, convert_to_tensor=True)
                cos = float(util.cos_sim(seg_emb, cap_emb).item())

            scene_out["support_score"] = cos

            has_propn = any(
                marker in str(p).upper() for p in seg_pos for marker in propn_markers
            )
            thr = sim_threshold - (propn_keep_margin if has_propn else 0.0)
            if cos >= thr:
                scene_out["validated"] = True

        enriched.append(scene_out)

    summary = {
        "scenes": len(enriched),
        "params": {
            "model": model_name,
            "sim_threshold": sim_threshold,
            "propn_keep_margin": propn_keep_margin,
        }
    }
    return enriched, summary

# ---------------------------
# Run validation on aligned file
# ---------------------------
# Load the aligned scenes
with open(merged_file, encoding="utf-8") as f:
    data = json.load(f)

scenes = data["scenes"]

# sim_threshold: tune between 0.25–0.4
embedding_params = {"sim_threshold": 0.35, "propn_keep_margin": 0.15}
enriched, stats = validate_arabic_embeddings(scenes, model, **embedding_params)

print("Summary:", stats)

# Save the summary together with the per-scene results
report = {"summary": stats, "scenes": enriched}
with open(validation_file, "w", encoding="utf-8") as f:
    f.write(json.dumps(report, ensure_ascii=False, indent=2))

print(f"✅ Saved validated file: {validation_file}")


In [None]:
import json, os

# ==== CONFIG ====
THRESH = 0.30  # "exceed 0.3" -> use > 0.30

# ==== LOAD ====
with open(validation_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Expected structure: {"summary": {...}, "scenes": [ ... ]}.
# A bare list of scenes is accepted too. The type has to be checked explicitly:
# a list has no .get(), so `data.get("scenes", data)` could never act as the
# list fallback.
if isinstance(data, dict):
    scenes = data.get("scenes", [])
else:
    scenes = data

# ==== COLLECT ====
# Keep the transcript text of every scene whose support score exceeds THRESH.
selected = []
for s in scenes:
    # `or 0.0` also covers a null score, which float() would reject
    score = float(s.get("support_score") or 0.0)
    text  = (s.get("transcript_text") or "").strip()
    if score > THRESH and text:
        selected.append(text)

collective_text = " ".join(selected).strip()

# ==== SAVE ====
with open(validation_results, "w", encoding="utf-8") as f:
    f.write(collective_text)

print(f"✅ Scenes in file: {len(scenes)}")
print(f"✅ Selected transcripts (> {THRESH}): {len(selected)}")
print("💾 Saved collective text to:", validation_results)
print("\nPreview:\n", collective_text[:400] + ("..." if len(collective_text) > 400 else ""))
