<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/04_validate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Arabic Transcript Validation: CAMeLBERT + CLIP
This notebook validates transcript segments by comparing:
1. Textual semantics via [CAMeLBERT](https://huggingface.co/CAMeL-Lab/bert-base-camelbert-mix)
2. Visual alignment via [CLIP multilingual model](https://huggingface.co/sentence-transformers/clip-ViT-B-32-multilingual-v1)

Each transcript segment is written to exactly one of three JSON output files — `replace`, `append`, or `flag` — according to its match confidence.

In [None]:
# ✅ Install dependencies
!pip install -q transformers==4.35.2 camel-tools==1.5.0 numpy==1.23.5 sentence-transformers==2.2.2 opencv-python-headless ftfy


In [None]:
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch

# Use the GPU when torch can see one; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CAMeLBERT tokenizer and encoder; the encoder is moved onto `device`.
camelbert_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-camelbert-mix")
camelbert = AutoModel.from_pretrained("CAMeL-Lab/bert-base-camelbert-mix").to(device)

# Multilingual CLIP encoder, loaded through sentence-transformers.
clip_model = SentenceTransformer("clip-ViT-B-32-multilingual-v1")


In [None]:
from google.colab import files
import json

uploaded = files.upload()


def _pick_upload(names, marker):
    """Return the first uploaded file name that contains `marker`.

    Args:
        names: iterable of uploaded file names.
        marker: case-sensitive substring the wanted file name must contain.

    Raises:
        FileNotFoundError: if no name contains `marker`; the message lists
            the names that were actually uploaded.
    """
    match = next((name for name in names if marker in name), None)
    if match is None:
        raise FileNotFoundError(
            f"No uploaded file name contains {marker!r}; uploaded: {list(names)}"
        )
    return match


# Files are told apart purely by a substring of their name.
transcript_file = _pick_upload(uploaded, "Transcript")
captions_file = _pick_upload(uploaded, "captions")

with open(transcript_file, encoding='utf-8') as f:
    transcript = json.load(f)

with open(captions_file, encoding='utf-8') as f:
    captions = json.load(f)

print(f"Loaded {len(transcript)} transcript segments, {len(captions)} captions.")


In [None]:
from torch.nn.functional import cosine_similarity
from PIL import Image
from tqdm import tqdm
import os

def embed_text_camelbert(text):
    """Embed `text` with CAMeLBERT by mean-pooling the last hidden state.

    The text is tokenized (truncation and padding enabled), moved to
    `device`, and run through `camelbert` with gradient tracking disabled.
    The last hidden state is then averaged over dimension 1 and squeezed.

    NOTE(review): the average covers every position the tokenizer emits, so
    padding positions would be included if a batch of strings were passed.
    Presumably this is only called with one string at a time; confirm.
    """
    encoded = camelbert_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        model_output = camelbert(**encoded)
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze()

def embed_text_clip(text):
    """Encode `text` with `clip_model` and return the embedding as a tensor."""
    text_embedding = clip_model.encode(text, convert_to_tensor=True)
    return text_embedding

# Lazily created image encoder; see embed_image_clip.
_clip_image_model = None


def embed_image_clip(image_path):
    """Embed the image at `image_path` into the CLIP ViT-B/32 vector space.

    `clip-ViT-B-32-multilingual-v1` (the model behind `clip_model`) is a
    text-only encoder whose vectors are aligned to the image vectors of the
    original `clip-ViT-B-32` model, so images must be encoded with that
    original model rather than with `clip_model`. It is loaded on first use
    and reused for subsequent calls.

    Args:
        image_path: path to an image file readable by PIL.

    Returns:
        The image embedding as a tensor (`convert_to_tensor=True`).
    """
    global _clip_image_model
    if _clip_image_model is None:
        _clip_image_model = SentenceTransformer("clip-ViT-B-32")

    # Close the file handle deterministically; convert() loads the pixel data
    # and normalises grayscale/RGBA inputs to RGB before the handle is closed.
    with Image.open(image_path) as img:
        rgb_image = img.convert("RGB")
    return _clip_image_model.encode(rgb_image, convert_to_tensor=True)


In [None]:
scene_folder = "scenes"
validated = {"replace": [], "append": [], "flag": []}

# Decision thresholds on the two cosine similarities.
# "replace" needs both scores to clear their bar; "append" needs either one.
REPLACE_MIN_SEMANTIC = 0.75
REPLACE_MIN_VISUAL = 0.7
APPEND_MIN_SEMANTIC = 0.5
APPEND_MIN_VISUAL = 0.6


def _first_present(record, keys):
    """Return the first value under `keys` that is neither None nor "".

    Unlike an `a or b or c` chain, a legitimate falsy id such as 0 is kept.
    """
    for key in keys:
        value = record.get(key)
        if value not in (None, ""):
            return value
    return None


# Index captions once by stringified scene id instead of rescanning the list
# for every segment. setdefault keeps the first caption seen for each id, and
# .get("arabic") lets an entry without that key fall into the "missing
# caption" branch below instead of raising.
caption_by_scene = {}
for c in captions:
    caption_by_scene.setdefault(str(c.get("scene_id")), c.get("arabic"))

for seg in tqdm(transcript):
    scene_id = _first_present(seg, ("scene_id", "id", "scene"))
    # Prefer the "diac" field when it is present and non-empty.
    seg_text = seg.get("diac") or seg.get("text")
    image_path = os.path.join(scene_folder, f"{scene_id}.jpg")

    caption = caption_by_scene.get(str(scene_id))

    if not caption or not os.path.exists(image_path):
        validated["flag"].append({**seg, "reason": "Missing caption or image"})
        continue

    if not seg_text:
        # Nothing to embed; flag explicitly rather than via a tokenizer error.
        validated["flag"].append({**seg, "reason": "Missing text"})
        continue

    try:
        # Text-vs-caption similarity in CAMeLBERT space.
        emb_text = embed_text_camelbert(seg_text)
        emb_caption = embed_text_camelbert(caption)
        sem_sim = cosine_similarity(emb_text, emb_caption, dim=0).item()

        # Text-vs-image similarity in CLIP space.
        emb_img = embed_image_clip(image_path)
        emb_txt_clip = embed_text_clip(seg_text)
        vis_sim = cosine_similarity(emb_img, emb_txt_clip, dim=0).item()

        result = {
            "scene_id": scene_id,
            "start": seg.get("start"),
            "end": seg.get("end"),
            "text": seg_text,
            "caption": caption,
            "semantic_sim": round(sem_sim, 4),
            "visual_sim": round(vis_sim, 4)
        }

        # Bucketing uses the unrounded scores; rounding is for output only.
        if sem_sim >= REPLACE_MIN_SEMANTIC and vis_sim >= REPLACE_MIN_VISUAL:
            bucket = "replace"
        elif sem_sim >= APPEND_MIN_SEMANTIC or vis_sim >= APPEND_MIN_VISUAL:
            bucket = "append"
        else:
            bucket = "flag"
        validated[bucket].append(result)
    except Exception as e:
        # Best effort: one failing segment is recorded and the run continues.
        validated["flag"].append({**seg, "error": str(e)})


In [None]:
# One JSON file per bucket, written as UTF-8 with non-ASCII text left unescaped.
for bucket, records in validated.items():
    with open(f"Validated_{bucket}_CAMEL_CLIP.json", "w", encoding="utf-8") as out_file:
        json.dump(records, out_file, ensure_ascii=False, indent=2)

print("✅ Output files saved:")
for bucket in validated:
    print(f"  - Validated_{bucket}_CAMEL_CLIP.json")