<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/04_validate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transcript Validation Using ASR-Generated Transcript & Arabic Captions



In [None]:
!pip install -q --no-cache-dir torch==2.6.0
!pip install -q --no-cache-dir transformers==4.44.2 sentence-transformers==2.6.1
!pip install -q --no-cache-dir numpy==1.26.4 tqdm==4.67.1
!pip install -q torch torchvision torchaudio sentence-transformers evaluate rouge-score


In [1]:
# =========================================================
# Google Drive: mount once, then set the project root
# =========================================================
import os
from google.colab import drive

# Re-running this cell is harmless: the mount call is skipped when the
# mount point is already attached.
if not os.path.ismount("/content/drive"):
    drive.mount("/content/drive", force_remount=False)

# Root folder for all project files; created if it does not exist yet.
BASE_PATH = "/content/drive/MyDrive/ArabicVideoSummariser"
os.makedirs(BASE_PATH, exist_ok=True)


In [2]:
# =========================================================
# Obtain Video File Name
# =========================================================
import os, json

# Set to a file name to force a specific video; set to None to read the name
# from params.json instead.
VIDEO_FILE_OVERRIDE = "Almasbagha.mp4"

params_path = os.path.join(BASE_PATH, "params.json")

if VIDEO_FILE_OVERRIDE:
    video_filename = VIDEO_FILE_OVERRIDE
else:
    # No override: the video name must come from the "video_file" key of params.json.
    with open(params_path, "r", encoding="utf-8") as f:
        params = json.load(f)
    video_filename = params.get("video_file")
    assert video_filename, "params.json must include 'video_file'."

# File name without its extension; used to derive every per-video file name.
video_name = os.path.splitext(video_filename)[0]

In [3]:
# =========================================================
# Directory layout and per-video file names
# =========================================================

# --- Folders under the project root ---
videos_path       = os.path.join(BASE_PATH, "videos")
captions_path     = os.path.join(BASE_PATH, "captions")
preprocessed_path = os.path.join(BASE_PATH, "Preprocessed")
validated_path    = os.path.join(BASE_PATH, "Validated")
os.makedirs(validated_path, exist_ok=True)

# --- Inputs read by this notebook ---
caption_path    = os.path.join(captions_path, f"{video_name}.json")
transcript_path = os.path.join(preprocessed_path, f"{video_name}_CleanTranscript.json")

# --- Outputs: transcript segments with captions attached (WA = window, SA = strict) ---
merged_file = os.path.join(validated_path, f"{video_name}_ScenesIntervalTranscripts_WA.json")
strict_file = os.path.join(validated_path, f"{video_name}_ScenesIntervalTranscripts_SA.json")

# --- Outputs: scored records per alignment, the combined records, and the final text ---
validated_alignment_WA = os.path.join(validated_path, f"{video_name}_Validated_WA.json")
validated_alignment_SA = os.path.join(validated_path, f"{video_name}_Validated_SA.json")
validated_alignment    = os.path.join(validated_path, f"{video_name}_Validated.json")
validated_result       = os.path.join(validated_path, f"{video_name}_Validated.txt")

# Stop here if either input file is missing.
assert os.path.exists(caption_path),    f"Missing captions file: {caption_path}"
assert os.path.exists(transcript_path), f"Missing transcript file: {transcript_path}"



In [5]:
# ---------------------------
# Load multilingual SBERT model
# ---------------------------
import os, json, re
from bisect import bisect_left, bisect_right
from typing import List, Dict, Any, Tuple, Iterable
from collections import Counter

import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm


# Sentence-embedding checkpoint used for the semantic (cosine) score.
MODEL_ID = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = SentenceTransformer(MODEL_ID, device=device)


# Alignment

## Strict Alignment

In [6]:
import json, os

# --- Read the cleaned transcript and the scene captions ---
with open(transcript_path, "r", encoding="utf-8") as f:
    transcripts = json.load(f)

with open(caption_path, "r", encoding="utf-8") as f:
    captions_data = json.load(f)

# --- Collect time / English / Arabic for every caption entry that has a scene_time ---
scene_times = []
scene_texts_en = []
scene_texts_ar = []

for val in captions_data.values():
    if "scene_time" not in val:
        continue
    scene_times.append(float(val["scene_time"]))
    scene_texts_en.append((val.get("english") or "").strip())
    scene_texts_ar.append((val.get("arabic") or "").strip())

# --- Strict rule: a caption belongs to a segment only when start <= scene_time < end ---
for segment in transcripts["segments"]:
    start = float(segment.get("start", 0))
    end = float(segment.get("end", 0))

    en_strings = []
    ar_strings = []
    for t, en, ar in zip(scene_times, scene_texts_en, scene_texts_ar):
        if start <= t < end:
            en_strings.append(en)
            ar_strings.append(ar)

    # Empty captions are dropped before joining; a segment with no caption gets "".
    segment["captions_en"] = " ".join(filter(None, en_strings)).strip()
    segment["captions_ar"] = " ".join(filter(None, ar_strings)).strip()

# --- Write the transcript, now carrying captions, to the strict-alignment file ---
with open(strict_file, "w", encoding="utf-8") as f:
    json.dump(transcripts, f, ensure_ascii=False, indent=2)

print(f"Strict alignment saved: {strict_file}")
print(f"Updated captions for {len(transcripts['segments'])} transcript segments.")


Strict alignment saved: /content/drive/MyDrive/ArabicVideoSummariser/Validated/Almasbagha_ScenesIntervalTranscripts_SA.json
Updated captions for 81 transcript segments.


## Scene-Based Sliding-Window Alignment

In [7]:
import json

# --- Load data ---
with open(transcript_path, 'r', encoding='utf-8') as f:
    transcripts = json.load(f)
with open(caption_path, 'r', encoding='utf-8') as f:
    captions_data = json.load(f)

# --- Build caption lists sorted by scene time ---
# Entries without a 'scene_time' key are skipped, the same filter the
# strict-alignment cell applies, so both alignments see the same captions.
scene_items = sorted(
    (
        (
            float(item['scene_time']),
            (item.get('english') or '').strip(),
            (item.get('arabic')  or '').strip(),
        )
        for item in captions_data.values()
        if 'scene_time' in item
    ),
    key=lambda x: x[0]
)

scene_times = [t for t, _, _ in scene_items]
scene_texts_en = [en for _, en, _ in scene_items]
scene_texts_ar = [ar for _, _, ar in scene_items]

# --- Align captions with transcript segments ---
for segment in transcripts['segments']:
    start = float(segment.get('start', 0))
    end   = float(segment.get('end', 0))

    # With no usable captions there is nothing to pick a nearest caption from;
    # leave the caption fields empty instead of failing on min() of an empty range.
    if not scene_times:
        segment['captions_en'] = ''
        segment['captions_ar'] = ''
        continue

    # Indices of captions whose time falls in [start, end); ascending because
    # scene_times is sorted.
    in_window = [i for i, t in enumerate(scene_times) if start <= t < end]

    if in_window:
        first_idx = in_window[0]
        last_idx  = in_window[-1]
    else:
        # No caption inside the segment: anchor on the caption closest to its start.
        nearest_idx = min(range(len(scene_times)), key=lambda i: abs(scene_times[i] - start))
        first_idx = last_idx = nearest_idx

    # Widen by one caption on each side (when it exists), then take the whole
    # contiguous run: previous caption, in-window captions, next caption.
    lo = max(first_idx - 1, 0)
    hi = min(last_idx + 1, len(scene_times) - 1)
    gathered_en = scene_texts_en[lo:hi + 1]
    gathered_ar = scene_texts_ar[lo:hi + 1]

    # Join captions, dropping empty ones
    segment['captions_en'] = ' '.join([s for s in gathered_en if s]).strip()
    segment['captions_ar'] = ' '.join([s for s in gathered_ar if s]).strip()

# --- Save updated file ---
with open(merged_file, 'w', encoding='utf-8') as f:
    json.dump(transcripts, f, ensure_ascii=False, indent=2)

print(f"Updated captions for {len(transcripts['segments'])} transcript segments")
print(f"Saved to {merged_file}")


Updated captions for 81 transcript segments
Saved to /content/drive/MyDrive/ArabicVideoSummariser/Validated/Almasbagha_ScenesIntervalTranscripts_WA.json


# Config

In [24]:
# ==============================
# CONFIG
# ==============================
# Stored on a record as "Propn_Keep_Margin" when it has named entities; the
# selection cell subtracts it from SIM_THRESHOLD, lowering the bar for that record.
KEEP_MARGIN = 0.15
# Weight of the lexical (ROUGE) score in compute_fusion_score; the cosine
# score is weighted by 1 - ALFA_FUSION.
ALFA_FUSION = 0.35
# A segment is kept when its fused score is strictly greater than
# SIM_THRESHOLD minus the record's margin.
SIM_THRESHOLD = 0.25


# Complete Validation

## Validation

### Helper Functions

In [25]:
import evaluate

# ==============================
# Arabic  ROUGE
# ==============================
# Loaded once; arabic_rouge_score reuses this instance on every call.
rouge_metric = evaluate.load("rouge")


def arabic_rouge_score(pred_text, ref_text):
    """Return the mean of ROUGE-1, ROUGE-2 and ROUGE-L for one text pair.

    Both texts are tokenised character by character (``list(text)``), so every
    character, spaces included, is a token.

    Args:
        pred_text: Candidate text (here: the transcript segment).
        ref_text: Reference text (here: the Arabic caption).

    Returns:
        float: ``(rouge1 + rouge2 + rougeL) / 3``.
    """
    # Use the module-level metric: loading it inside the function repeated the
    # load for every record scored.
    scores = rouge_metric.compute(
        predictions=[pred_text],
        references=[ref_text],
        tokenizer=list
    )
    return (scores["rouge1"] + scores["rouge2"] + scores["rougeL"]) / 3.0

def compute_fusion_score(lexical, cosine, alpha=None):
    """Blend a lexical score and a cosine score into one fused score.

    Args:
        lexical: Lexical similarity score.
        cosine: Embedding cosine similarity score.
        alpha: Weight given to ``lexical``; ``cosine`` is weighted by
            ``1 - alpha``. When None, the notebook-level ``ALFA_FUSION`` is used.

    Returns:
        ``alpha * lexical + (1 - alpha) * cosine``.
    """
    # Compare against None so that an explicit alpha of 0 is honoured.
    if alpha is None:
        alpha = ALFA_FUSION
    return alpha * lexical + (1 - alpha) * cosine

# ==============================
# Field getters
# ==============================
def first_nonempty(rec, keys):
    """Return the stripped value of the first key in ``keys`` holding non-blank text.

    Keys are tried in order. Values that are not strings, or that are empty after
    stripping, are skipped. Returns "" when no key qualifies.
    """
    for key in keys:
        candidate = rec.get(key)
        if not isinstance(candidate, str):
            continue
        cleaned = candidate.strip()
        if cleaned:
            return cleaned
    return ""

def get_seg_text(rec):
    """Return the segment text from the first populated transcript field of ``rec``."""
    # Tried in this order; the first non-blank string wins.
    text_keys = ("transcript_text", "text_norm", "original", "text", "segment_text")
    return first_nonempty(rec, text_keys)

def get_caption_en(rec):
    """Return the English caption text of ``rec`` (used for the semantic score)."""
    # Tried in this order; the first non-blank string wins.
    english_keys = ("captions_en", "english_caption", "caption_en", "captionsEnglish")
    return first_nonempty(rec, english_keys)

def get_caption_ar(rec):
    """Return the Arabic caption text of ``rec`` (used for the lexical score)."""
    # Tried in this order; the first non-blank string wins.
    arabic_keys = (
        "captions_ar",
        "caption_ar",
        "captionsArabic",
        "captions",
        "scene_caption",
        "visual_caption",
        "blip_caption",
        "clip_caption",
        "yolo_caption",
    )
    return first_nonempty(rec, arabic_keys)

def get_lemma_text(rec):
    """Return the record's lemmas joined by single spaces.

    Returns "" unless ``rec["lemmas"]`` is a non-empty list. Each element is
    converted with ``str`` before joining.
    """
    lemma_list = rec.get("lemmas")
    if not isinstance(lemma_list, list) or not lemma_list:
        return ""
    return " ".join(str(item) for item in lemma_list)

# ==============================
# Stopwords & POS helpers
# ==============================
# Arabic function words (prepositions, pronouns, conjunctions, question words, ...).
AR_STOPWORDS = {
    "و", "في", "على", "من", "إلى", "عن", "أن", "إن",
    "كان", "كانت", "يكون", "مع", "هذا", "هذه", "ذلك", "تلك",
    "هناك", "هنا", "هو", "هي", "هم", "هن", "كما", "لكن",
    "بل", "قد", "تم", "ثم", "كل", "أي", "أو", "أمام",
    "خلال", "بعد", "قبل", "حتى", "حيث", "إذا", "إنما", "إما",
    "لدى", "لدي", "لها", "له", "لهم", "لنا", "ما", "ماذا",
    "لماذا", "كيف", "متى", "أيضا", "بدون", "داخل", "خارج", "بين",
    "أكثر", "أقل",
}

def is_propn(pos_tag):
    """Return True when the upper-cased string form of ``pos_tag`` contains "NOUN".

    NOTE(review): despite the name, this is true for any tag containing "NOUN"
    (e.g. "noun", "noun_prop") and false for the tag "PROPN" - confirm which
    tagset is expected and whether plain nouns are meant to match.
    """
    tag_text = str(pos_tag).upper()
    return tag_text.find("NOUN") != -1


# ==============================
# Safe JSON serialization
# ==============================
def to_serializable(obj):
    """``default`` hook for ``json.dump``: convert numpy/torch values to plain Python.

    Args:
        obj: An object the JSON encoder could not serialise on its own.

    Returns:
        - ``int`` / ``float`` / ``bool`` for numpy integer, floating and bool scalars;
        - a (nested) list for a numpy array;
        - a Python scalar for a one-element tensor, a (nested) list for any
          other tensor;
        - ``str(obj)`` for everything else.
    """
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        # Keep the array's structure instead of falling through to its str() form.
        return obj.tolist()
    if isinstance(obj, torch.Tensor):
        # .item() is only valid for exactly one element; use tolist() otherwise.
        return obj.item() if obj.numel() == 1 else obj.tolist()
    return str(obj)


### Main Validation Function

In [26]:
# ==============================
# Main validation
# ==============================
def validate_words_by_visual_support(
    records
):
    """Score every transcript record against the captions attached to it.

    For each record this computes:
      * a semantic score: cosine similarity between the sentence embeddings of
        the segment text (lemmas when present, otherwise the raw text) and the
        English caption;
      * a lexical score: ``arabic_rouge_score`` between the segment tokens
        (or the lemma/raw text when there are no tokens) and the Arabic caption;
      * a fused score from ``compute_fusion_score``.
    A score is 0.0 when either of its two inputs is empty.

    Args:
        records: Iterable of dict records (transcript segments with captions).

    Returns:
        tuple: ``(enriched, summary)``. ``enriched`` is a list of shallow copies
        of the input records with ``validated_text``, ``cosine_score``,
        ``scores`` and ``Propn_Keep_Margin`` added. ``summary`` holds the record
        count, the number of records that received the keep margin, and the
        parameters used.
    """
    enriched = []
    records_with_margin = 0

    for rec in tqdm(records):
        seg_text = get_seg_text(rec)
        seg_lemma_text = get_lemma_text(rec) or seg_text

        cap_en = get_caption_en(rec)   # English for semantic
        cap_ar = get_caption_ar(rec)   # Arabic for lexical

        rec_out = dict(rec)
        rec_out["validated_text"] = seg_text

        # --- Semantic similarity ---
        cosine_val = 0.0
        if cap_en and seg_lemma_text:
            with torch.no_grad():
                seg_emb = model.encode(seg_lemma_text, convert_to_tensor=True)
                cap_emb = model.encode(cap_en, convert_to_tensor=True)
                cosine_val = float(util.cos_sim(seg_emb, cap_emb).item())
        rec_out["cosine_score"] = cosine_val

        # --- Lexical similarity ---
        # A null "tokens" value is treated as no tokens; items are coerced to
        # str so a non-string token cannot break the join.
        tokens = rec.get("tokens") or []
        seg_joined = " ".join(map(str, tokens)) if tokens else seg_lemma_text
        lex = arabic_rouge_score(seg_joined, cap_ar) if cap_ar and seg_joined else 0.0

        # --- Fusion score ---
        fused = compute_fusion_score(lex, cosine_val)
        rec_out["scores"] = {
            "cosine": float(cosine_val),
            "lexical_rouge": float(lex),
            "fused": float(fused)
        }

        # --- Named-entity margin ---
        # Records carrying any entry under "named_entities" get KEEP_MARGIN,
        # all others 0.0. (Presumably filled by an upstream NER step - confirm.)
        named_entities = rec.get("named_entities") or []
        has_named_entity = bool(named_entities)
        rec_out["Propn_Keep_Margin"] = float(KEEP_MARGIN) if has_named_entity else 0.0
        records_with_margin += int(has_named_entity)

        enriched.append(rec_out)

    summary = {
        "records": len(enriched),
        # Counted directly: no record ever carries a "Proper_Keep" key, so
        # looking that key up always produced 0.
        "Proper_Keep_true": records_with_margin,
        "params": {
            "alpha_fusion": ALFA_FUSION,
            "keep_margin": KEEP_MARGIN,
            "model": MODEL_ID
        }
    }
    return enriched, summary


### Strict Alignment Validation

In [27]:
# --- Load the strict-alignment transcript ---
with open(strict_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Use the first non-empty of "scenes" / "segments"; otherwise the loaded object itself.
records = data.get("scenes")
if not records:
    records = data.get("segments")
if not records:
    records = data

# --- Score every record ---
enriched_records, stats = validate_words_by_visual_support(records=records)

# --- Save full output ---
with open(validated_alignment_SA, "w", encoding="utf-8") as f:
    json.dump(
        {"summary": stats, "records": enriched_records},
        f,
        ensure_ascii=False,
        indent=2,
        default=to_serializable,
    )

print("Summary:", stats)

print(f"Strict Allignment Validation complete : {validated_alignment_SA}")


100%|██████████| 81/81 [00:45<00:00,  1.80it/s]

Summary: {'records': 81, 'Proper_Keep_true': 0, 'params': {'alpha_fusion': 0.35, 'keep_margin': 0.15, 'model': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'}}
Strict Allignment Validation complete : /content/drive/MyDrive/ArabicVideoSummariser/Validated/Almasbagha_Validated_SA.json





### Scene-Window Alignment Validation

In [28]:
# --- Load the window-alignment transcript ---
with open(merged_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Use the first non-empty of "scenes" / "segments"; otherwise the loaded object itself.
records = data.get("scenes")
if not records:
    records = data.get("segments")
if not records:
    records = data

# --- Score every record ---
enriched_records, stats = validate_words_by_visual_support(records=records)

print("Summary:", stats)

# --- Save full output ---
with open(validated_alignment_WA, "w", encoding="utf-8") as f:
    json.dump(
        {"summary": stats, "records": enriched_records},
        f,
        ensure_ascii=False,
        indent=2,
        default=to_serializable,
    )

print(f"Window Allignment Validation complete : {validated_alignment_WA}")


100%|██████████| 81/81 [01:54<00:00,  1.41s/it]

Summary: {'records': 81, 'Proper_Keep_true': 0, 'params': {'alpha_fusion': 0.35, 'keep_margin': 0.15, 'model': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'}}
Window Allignment Validation complete : /content/drive/MyDrive/ArabicVideoSummariser/Validated/Almasbagha_Validated_WA.json





## Combining & Final Output

In [29]:
# ==========================================
# Combine Strict + Window by Highest Cosine
# ==========================================
import json, os

# --- Load both validated alignment files ---
with open(validated_alignment_SA, "r", encoding="utf-8") as f:
    strict_data = json.load(f)
with open(validated_alignment_WA, "r", encoding="utf-8") as f:
    window_data = json.load(f)

# --- Extract record lists ---
strict_segments = strict_data.get("records") or strict_data.get("segments") or strict_data.get("scenes") or []
window_segments = window_data.get("records") or window_data.get("segments") or window_data.get("scenes") or []

# Records are paired by position, so only the common prefix can be compared.
# Say so explicitly instead of dropping the remainder silently.
common_len = min(len(strict_segments), len(window_segments))
if len(strict_segments) != len(window_segments):
    print(
        f"Warning: record counts differ (strict={len(strict_segments)}, "
        f"window={len(window_segments)}); only the first {common_len} are combined."
    )

# --- Combine based on highest cosine score ---
combined_records = []

num_from_strict = 0
num_from_window = 0

for i in range(common_len):
    s_rec = strict_segments[i]
    w_rec = window_segments[i]

    s_scores = s_rec.get("scores") or {}
    w_scores = w_rec.get("scores") or {}

    s_cosine = float(s_scores.get("cosine", 0.0) or 0.0)
    w_cosine = float(w_scores.get("cosine", 0.0) or 0.0)

    # The window record wins only with a strictly higher cosine; ties keep the strict record.
    choose_window = (w_cosine > s_cosine)
    best = w_rec if choose_window else s_rec

    best_out = dict(best)  # shallow copy so we don't modify the original
    num_from_window += int(choose_window)
    num_from_strict += int(not choose_window)

    combined_records.append(best_out)

# --- Summary metadata ---
combined_summary = {
    "records_total": len(combined_records),
    "source_files": {
        "strict": validated_alignment_SA,
        "window": validated_alignment_WA
    },
    # Records the criterion the loop above actually applies (cosine, not fused).
    "selection_criterion": "highest cosine score per segment",
    "chosen_source_counts": {
        "from_strict": num_from_strict,
        "from_window": num_from_window
    }
}


# --- Save combined JSON file ---
with open(validated_alignment, "w", encoding="utf-8") as f:
    json.dump(
        {"summary": combined_summary, "records": combined_records},
        f,
        ensure_ascii=False,
        indent=2
    )

print(f"Combined alignment saved: {validated_alignment}")
print(f"Total segments compared: {common_len}")
print(f"Chosen from Strict Alignment: {num_from_strict}")
print(f"Chosen from Window Alignment: {num_from_window}")
print("Selection criteria: Highest cosine score per segment")


Combined alignment saved: /content/drive/MyDrive/ArabicVideoSummariser/Validated/Almasbagha_Validated.json
Total segments compared: 81
Chosen from Strict Alignment: 11
Chosen from Window Alignment: 70
Selection criteria: Highest cosine score per segment


In [30]:
# ==========================================
# Select and Concatenate Validated Segments
# ==========================================
import json, os

# --- Load validated alignment file ---
with open(validated_alignment, "r", encoding="utf-8") as f:
    data = json.load(f)

records = data.get("records") or data.get("segments") or data.get("scenes") or []

selected_segments = []

# --- Iterate and select segments based on condition ---
for rec in records:
    # A missing or null "scores" / "fused" / margin value counts as 0.0.
    scores = rec.get("scores") or {}
    fusion_score = float(scores.get("fused") or 0.0)
    margin = float(rec.get("Propn_Keep_Margin", 0.0) or 0.0)

    text = (
        rec.get("validated_text")
        or rec.get("transcript_text")
        or rec.get("text")
        or ""
    ).strip()

    # Keep a segment when its fused score strictly exceeds the threshold
    # lowered by the record's own margin. Segments without text are neither
    # kept nor reported.
    if fusion_score>SIM_THRESHOLD-margin:
        if text:
            selected_segments.append(text)
    else:
        if text:
            print(f" Rejected segment: {text[:80]}...")

# --- Concatenate selected text ---
validated_text_all = " ".join(selected_segments).strip()

with open(validated_result, "w", encoding="utf-8") as f:
    f.write(validated_text_all)

# --- Print stats ---
print(f"Saved concatenated validated text → {validated_result}")
print(f"Segments meeting condition: {len(selected_segments)} / {len(records)}")
print(f"Threshold applied: SIM_THRESHOLD - Propn_Keep_Margin ({SIM_THRESHOLD})")


 Rejected segment: مصبعة...
 Rejected segment: ستشعر فور دخولك ان حقبة زمانية عاثرها هذا المكان...
 Rejected segment: منها ما زال يروي حتي الان...
 Rejected segment: التي لا تزال تعمل...
 Rejected segment: كي يوم...
 Rejected segment: اربعة واربعين. اه مواليد سبعة وتلاتين شهر اربعة يوم اربعة....
 Rejected segment: عندي حوالي مشي بالتلاتة وتمانين سنة. اه كنت شغال متعلم...
 Rejected segment: دي ايه؟ دي حاجة...
 Rejected segment: ده فوق كل شيء...
 Rejected segment: دخلت الجيش ععدت فيه 9 سنين...
 Rejected segment: حضرت حاجة...
 Rejected segment: حرب سبعة وشتين وحضرت حرب...
 Rejected segment: فاخدت المسبغة دي المسبغة دي عمرها...
 Rejected segment: 119 سنة من سنة واحدة...
 Rejected segment: الروح صبت عيش...
 Rejected segment: تقول كده...
 Rejected segment: انا حاجة جيدة...
 Rejected segment: ها هم اولاد عميقين...
 Rejected segment: هتغرب عليك؟...
 Rejected segment: في فترة انا مسكتي فيها المزبغة...
 Rejected segment: وفترة بعد كده انا قعدت ودخلت جيشي ونظامي...
 Rejected segment: الاول كان كل

# Test Results

In [31]:
# Recap of the selection outcome together with the parameters that produced it.
print(f"Segments meeting condition: {len(selected_segments)} / {len(records)}")
print(f"Threshold applied: SIM_THRESHOLD - Propn_Keep_Margin ({SIM_THRESHOLD})")
# ALFA_FUSION is the lexical weight inside the fused score, not a threshold.
print(f"Fusion weight applied: ALFA_FUSION ({ALFA_FUSION})")


Segments meeting condition: 44 / 81
Threshold applied: SIM_THRESHOLD - Propn_Keep_Margin (0.25)
Threshold applied: AlFA_Fusion (0.35)
