<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/04_validate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Transcript Validation using CAMeLBERT-MSA & Arabic Captions
Compare a cleaned Arabic ASR transcript against scene captions to improve transcript accuracy.
- Uses `diac`, `lemma`, `pos` from transcript segments
- Uses Arabic captions generated previously
- CAMeLBERT-MSA for semantic validation
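- Sliding window over neighbouring captions (±3 positions by default) to tolerate misalignment between transcript segments and captions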
- Outputs: three JSON files of transcript segments, grouped by validation action: **replace**, **append**, and **flag**

In [4]:

# ✅ Install only necessary packages (Python 3.11+ compatible)
!pip install -q transformers==4.35.2 sentence-transformers==2.2.2 ftfy


In [6]:
import json
import os
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

# Mount Google Drive at /content/drive. The transcript and caption files read
# later in this notebook are addressed under /content/drive/MyDrive, so this
# cell has to run before they are loaded.
# NOTE(review): presumably this only works on a Colab runtime — confirm before
# running the notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')


AttributeError: partially initialized module 'torch' has no attribute 'fx' (most likely due to a circular import)

In [None]:

# Name of the video being processed; both input files are keyed by it, so it
# only needs to be changed here to validate a different video.
VIDEO_NAME = "Almasbagha"
# Project folder on the mounted Google Drive.
DRIVE_ROOT = "/content/drive/MyDrive/ArabicVideoSummariser"

transcript_path = f"{DRIVE_ROOT}/Preprocessed/{VIDEO_NAME}_CleanTranscript.json"
captions_path = f"{DRIVE_ROOT}/captions/{VIDEO_NAME}.json"


def _load_json(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the parsed object."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


transcript_data = _load_json(transcript_path)
captions_data = _load_json(captions_path)


In [None]:
# ✅ Setup Model
# Load the CAMeLBERT-MSA checkpoint through SentenceTransformer so that
# `model.encode(...)` can produce the segment and caption embeddings that
# `validate_segments` compares.
# NOTE(review): this looks like a plain BERT checkpoint rather than a dedicated
# sentence-embedding model — confirm which pooling SentenceTransformer applies
# and that the similarity thresholds were chosen with that in mind.
model = SentenceTransformer("CAMeL-Lab/bert-base-camelbert-msa")

In [None]:

def validate_segments(transcript_data, captions_data, model, window_size=3, threshold_replace=0.82, threshold_append=0.75):
    """Score each transcript segment against nearby captions and bucket it by similarity.

    For the segment at position ``idx`` the candidate captions are those at
    positions ``idx - window_size`` .. ``idx + window_size`` (clipped to the
    caption list); the candidate with the highest cosine similarity wins.
    NOTE(review): candidates are chosen purely by list position, so this assumes
    segments and captions are roughly index-aligned — confirm against the data.

    Args:
        transcript_data: Sized iterable of segment dicts. Each must have a
            ``'text'`` key; ``'pos'``, ``'lemma'`` and ``'diac'`` are optional
            and default to ``''``. The dicts are updated IN PLACE with
            ``'match_score'``, ``'best_caption'``, ``'validation_action'`` and,
            for replace/append, ``'revised_text'``.
        captions_data: Iterable of dicts, each with an ``'arabic'`` key.
        model: Object exposing ``encode(text_or_list, convert_to_tensor=True)``.
        window_size: Number of caption positions searched on each side of ``idx``.
        threshold_replace: Minimum score for the caption to replace the segment text.
        threshold_append: Minimum score for the caption to be appended to the text.

    Returns:
        Tuple ``(replace, append, flag)`` of lists holding the (mutated) segment
        dicts assigned to each action.
    """
    replace, append, flag = [], [], []
    captions = [c['arabic'] for c in captions_data]

    # Encode every caption exactly once. Previously a caption was re-encoded for
    # each segment whose window covered it (up to 2 * window_size + 1 times).
    # The empty case is skipped so the model is never asked to encode nothing.
    caption_embeddings = model.encode(captions, convert_to_tensor=True) if captions else []

    for idx, segment in tqdm(enumerate(transcript_data), total=len(transcript_data)):
        seg_text = segment['text']
        seg_pos = segment.get('pos', '')
        seg_lemma = segment.get('lemma', '')
        seg_diac = segment.get('diac', '')

        # Combine all info into enriched sentence
        enriched_segment = f"{seg_text} | POS: {seg_pos} | Lemma: {seg_lemma} | Diac: {seg_diac}"
        seg_embedding = model.encode(enriched_segment, convert_to_tensor=True)

        # A caption is only selected when it scores strictly above 0.0; ties keep
        # the earliest caption in the window.
        best_score = 0.0
        best_caption = None

        window_start = max(0, idx - window_size)
        window_end = min(len(captions), idx + window_size + 1)
        for i in range(window_start, window_end):
            score = util.cos_sim(seg_embedding, caption_embeddings[i]).item()
            if score > best_score:
                best_score = score
                best_caption = captions[i]

        segment['match_score'] = best_score
        segment['best_caption'] = best_caption

        if best_caption is None:
            # Empty window (segment index beyond the captions) or no positive
            # score: there is nothing to replace/append with, so always flag.
            # Without this guard a threshold <= 0 would store revised_text=None
            # or raise TypeError when concatenating in the append branch.
            segment['validation_action'] = 'flag'
            flag.append(segment)
        elif best_score >= threshold_replace:
            segment['validation_action'] = 'replace'
            segment['revised_text'] = best_caption
            replace.append(segment)
        elif best_score >= threshold_append:
            segment['validation_action'] = 'append'
            segment['revised_text'] = seg_text + " " + best_caption
            append.append(segment)
        else:
            segment['validation_action'] = 'flag'
            flag.append(segment)

    return replace, append, flag


In [None]:
# ✅ Run matching

replace, append, flag = validate_segments(transcript_data, captions_data, model)

# Output files are named after the transcript file, without its extension.
base_name = os.path.splitext(os.path.basename(transcript_path))[0]


def save_json(data, suffix, output_dir="."):
    """Write ``data`` as indented UTF-8 JSON to ``<output_dir>/<base_name>_<suffix>.json``.

    Args:
        data: JSON-serialisable object to write.
        suffix: Label appended to the module-level ``base_name`` in the file name.
        output_dir: Directory to write into; created if missing. Defaults to the
            current working directory, which matches the previous behaviour.

    Returns:
        The path of the file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, f"{base_name}_{suffix}.json")
    with open(out_path, "w", encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
        json.dump(data, f, ensure_ascii=False, indent=2)
    return out_path


# NOTE(review): results land in the current working directory, not on the
# mounted Drive — pass output_dir=... if they need to live next to the inputs.
save_json(replace, "replace")
save_json(append, "append")
save_json(flag, "flag")
print("✅ Validation results saved.")