<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/04_validate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transcript Validation using CAMeLBERT-MSA & Arabic Captions
Compare a cleaned Arabic ASR transcript against scene captions to improve transcript accuracy.
- Uses the `diac`, `lemma`, and `pos` fields of the transcript segments (the matching step below compares the `diac` text)
- Uses Arabic captions generated previously
- CAMeLBERT-MSA for semantic validation
- Sliding window over consecutive transcript segments to tolerate misalignment between the transcript and the captions
- Outputs: three JSON files — segments to **replace** with the best-matching caption, segments to **append** the caption to, and segments to **flag** as low similarity

In [None]:
# ✅ Install dependencies
!pip install -q numpy==1.23.5 transformers==4.35.2 sentence-transformers==2.2.2

In [None]:
# ✅ Imports
import json
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import numpy as np

In [None]:
# ✅ Load Files
# Read the cleaned ASR transcript and the scene captions from their JSON files.
transcript_path = '/content/Almasbagha_CleanTranscript.json'
captions_path = '/content/Almasbagha_visualcaptions.json'

with open(transcript_path, 'r', encoding='utf-8') as f:
    transcript = json.load(f)

with open(captions_path, 'r', encoding='utf-8') as f:
    captions = json.load(f)

In [None]:
# ✅ Setup Model
# Load the CAMeLBERT-MSA checkpoint through sentence-transformers; `model.encode`
# is what the matching step uses to embed transcript windows and captions.
# NOTE(review): this id looks like a plain BERT checkpoint rather than a model
# trained for sentence similarity — presumably sentence-transformers wraps it
# with a default pooling layer. Confirm the similarity thresholds are calibrated
# for the embeddings this produces.
model = SentenceTransformer('CAMeL-Lab/bert-base-camelbert-msa')

In [None]:
# ✅ Validation config
# Number of consecutive transcript segments joined into one window before
# the window is compared against the captions.
WINDOW_SIZE = 3
# Cosine similarity at or above which a segment's 'adjusted' text is set to
# the best-matching caption.
THRESHOLD_REPLACE = 0.75
# Cosine similarity at or above which (but below THRESHOLD_REPLACE) the
# best-matching caption is appended to the segment's own text; anything lower
# is flagged as 'low_sim'.
THRESHOLD_APPEND = 0.60

In [None]:
# ✅ Helper: Sliding window match
def sliding_match(transcript, captions):
    """Match sliding windows of transcript segments against scene captions.

    Each window joins the ``'diac'`` text of up to ``WINDOW_SIZE`` consecutive
    segments, embeds it with the module-level ``model``, and finds the caption
    (``'caption_ar'``) with the highest cosine similarity. Every segment in the
    window is then routed by that similarity:

    * ``>= THRESHOLD_REPLACE``: copied with ``'adjusted'`` set to the caption.
    * ``>= THRESHOLD_APPEND``: copied with ``'adjusted'`` set to
      ``diac + ' / ' + caption``.
    * otherwise: copied with ``'flag': 'low_sim'``.

    All emitted copies also carry the window's best similarity under ``'sim'``.

    Args:
        transcript: Sequence of segment dicts, each with a ``'diac'`` string.
        captions: Sequence of caption dicts, each with a ``'caption_ar'`` string.

    Returns:
        A tuple ``(updated_replace, updated_append, updated_flag)`` of lists of
        segment dicts.

    NOTE(review): windows advance one segment at a time, so a segment that
    falls inside several windows is emitted once per window and may land in
    more than one list. Confirm whether downstream consumers expect that
    before deduplicating.
    """
    updated_replace, updated_append, updated_flag = [], [], []

    # Caption embeddings do not depend on the window, so compute them once
    # instead of re-encoding every caption for every window.
    cap_texts = [cap['caption_ar'] for cap in captions]
    cap_embs = model.encode(cap_texts, convert_to_tensor=True) if cap_texts else None

    # A transcript shorter than WINDOW_SIZE would otherwise yield no windows
    # and silently drop every segment; treat it as a single shorter window.
    window = min(WINDOW_SIZE, len(transcript))
    n_windows = len(transcript) - window + 1 if window > 0 else 0

    for i in tqdm(range(n_windows)):
        chunk = transcript[i:i + window]
        chunk_text = ' '.join(seg['diac'] for seg in chunk)

        # With no captions there is nothing to match; the -1 sentinel sends
        # every segment to the flagged list.
        best_sim = -1
        best_cap = None
        if cap_embs is not None:
            chunk_emb = model.encode(chunk_text, convert_to_tensor=True)
            # One similarity row: this window against all captions at once.
            sims = util.cos_sim(chunk_emb, cap_embs)[0]
            best_idx = int(sims.argmax())
            best_sim = sims[best_idx].item()
            best_cap = cap_texts[best_idx]

        for seg in chunk:
            if best_sim >= THRESHOLD_REPLACE:
                updated_replace.append({**seg, 'adjusted': best_cap, 'sim': best_sim})
            elif best_sim >= THRESHOLD_APPEND:
                updated_append.append({**seg, 'adjusted': seg['diac'] + ' / ' + best_cap, 'sim': best_sim})
            else:
                updated_flag.append({**seg, 'flag': 'low_sim', 'sim': best_sim})

    return updated_replace, updated_append, updated_flag

In [None]:
# ✅ Run matching
replace, append, flag = sliding_match(transcript, captions)

# Write each result list to its own JSON file; ensure_ascii=False keeps
# non-ASCII text readable instead of escaping it.
outputs = (
    ('/content/Almasbagha_transcript_replace.json', replace),
    ('/content/Almasbagha_transcript_append.json', append),
    ('/content/Almasbagha_transcript_flag.json', flag),
)
for out_path, records in outputs:
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)