<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/02_scenedetect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
    import scenedetect
except Exception:
    !pip -q install "scenedetect>=0.6,<0.7"

try:
    import sentence_transformers
except Exception:
    !pip -q install "sentence-transformers>=2.2,<2.7"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/130.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m122.9/130.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.8/130.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# --- Imports
import os, json, random, re
import numpy as np
import torch
import cv2
from PIL import Image

from scenedetect import open_video, SceneManager
from scenedetect.detectors import ContentDetector

# --- Reproducibility (deterministic decoding across runs/devices)
# One seed for every RNG the notebook touches: Python's `random`, NumPy's legacy
# global generator and torch (CPU, plus all CUDA devices when present).
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(SEED)


In [3]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive.
# `force_remount=True` lets Colab handle an existing/stale mount itself. This
# avoids manually unmounting and then running `rm -rf` on the mount folder, which
# would recurse into the user's Drive if the unmount had failed for any reason.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

fusermount: failed to unmount /content/drive: No such file or directory
Already unmounted
Mounted at /content/drive


In [4]:
# --- Paths & params
# All inputs and outputs live under one project folder on the mounted Drive.
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
videos_path = os.path.join(base_path, "videos")
captions_path = os.path.join(base_path, "captions")
keyframes_path = os.path.join(base_path, "keyframes")
transcripts_path = os.path.join(base_path, "transcripts")

# Output folders are created on demand so a fresh Drive layout works.
for output_dir in (captions_path, keyframes_path):
    os.makedirs(output_dir, exist_ok=True)

# Read params.json (must contain: {"video_file": "..."} )
param_path = os.path.join(base_path, "params.json")
with open(param_path, "r", encoding="utf-8") as f:
    params = json.load(f)

# Manual override for one-off runs. Set to None to take the file name from
# params.json ("video_file") instead.
video_file_override = "PaperMaking.mp4"
video_filename = video_file_override or params.get("video_file")
assert video_filename, "params.json must include 'video_file'."
video_path = os.path.join(videos_path, video_filename)
assert os.path.exists(video_path), f"Video not found: {video_path}"

# Per-video outputs are keyed by the file name without its extension.
video_name = os.path.splitext(video_filename)[0]
keyframe_dir = os.path.join(keyframes_path, video_name)
os.makedirs(keyframe_dir, exist_ok=True)

captions_json_path = os.path.join(captions_path, f"{video_name}.json")

print(f"🎥 Processing video file: {video_filename}")

🎥 Processing video file: PaperMaking.mp4


In [5]:
from transformers import AutoProcessor, Blip2ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import CLIPProcessor, CLIPModel
from sentence_transformers import SentenceTransformer, util
import torch

# Device: use the GPU whenever torch can see one, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("🖥️ Vision/Text device:", device)

# BLIP-2 (English captioning). Weight placement is delegated to
# `device_map="auto"`; half precision is only requested when running on CUDA.
blip2_checkpoint = "Salesforce/blip2-opt-2.7b"
if device == "cuda":
    blip2_dtype = torch.float16
else:
    blip2_dtype = torch.float32

caption_processor = AutoProcessor.from_pretrained(blip2_checkpoint, use_fast=False)
caption_model = Blip2ForConditionalGeneration.from_pretrained(
    blip2_checkpoint,
    device_map="auto",
    torch_dtype=blip2_dtype,
)

# --- Robust NLLB lang-id resolver (works across tokenizer variants)
def resolve_lang_id(tokenizer, lang_code: str) -> int:
    """Return the vocabulary id of the language token for ``lang_code``.

    Several lookup strategies are tried in order, and a strategy that does not
    know the code falls through to the next one instead of raising:

    1. ``tokenizer.lang_code_to_id[lang_code]``
    2. ``tokenizer.get_lang_id(lang_code)``
    3. ``tokenizer.lang_code_to_token[lang_code]`` converted to an id
    4. the spellings ``__code__``, ``<<code>>`` and the bare code, converted
       with ``tokenizer.convert_tokens_to_ids``

    Args:
        tokenizer: Object exposing some of the attributes listed above.
        lang_code: Language code to resolve, e.g. ``"arb_Arab"``.

    Returns:
        The token id for the language.

    Raises:
        RuntimeError: If none of the strategies yields a usable id.
    """
    unk_id = getattr(tokenizer, "unk_token_id", None)

    def _usable(token_id) -> bool:
        # A missing token may come back as None or as the <unk> id.
        return token_id is not None and token_id != unk_id

    # 1) Direct code -> id mapping.
    code_to_id = getattr(tokenizer, "lang_code_to_id", None)
    if code_to_id is not None:
        try:
            return code_to_id[lang_code]
        except (KeyError, TypeError):
            pass  # mapping present but does not cover this code

    # 2) Accessor method.
    get_lang_id = getattr(tokenizer, "get_lang_id", None)
    if callable(get_lang_id):
        try:
            return get_lang_id(lang_code)
        except (KeyError, ValueError):
            pass

    # 3) Code -> token mapping, then token -> id.
    code_to_token = getattr(tokenizer, "lang_code_to_token", None)
    if code_to_token is not None:
        try:
            token = code_to_token[lang_code]
        except (KeyError, TypeError):
            token = None
        if token is not None:
            tid = tokenizer.convert_tokens_to_ids(token)
            if _usable(tid):
                return tid

    # 4) Common spellings of the language token in the vocabulary.
    for cand in (f"__{lang_code}__", f"<<{lang_code}>>", lang_code):
        tid = tokenizer.convert_tokens_to_ids(cand)
        if _usable(tid):
            return tid

    raise RuntimeError(f"Could not resolve language id for: {lang_code}")

# NLLB-200 (EN -> AR) — pinned to the CPU to avoid GPU OOM
TRANS_DEVICE = "cpu"
nllb_model_name = "facebook/nllb-200-distilled-600M"
translator_tokenizer = AutoTokenizer.from_pretrained(nllb_model_name, src_lang="eng_Latn")
translator_model = AutoModelForSeq2SeqLM.from_pretrained(nllb_model_name)
translator_model = translator_model.to(TRANS_DEVICE)
# Generation is forced to start with the Arabic language token.
forced_bos_token_id = resolve_lang_id(translator_tokenizer, "arb_Arab")
print("✅ NLLB-200 loaded on CPU. forced_bos_token_id:", forced_bos_token_id)

# CLIP for image<->EN text grounding — placed on `device` (GPU when available)
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)
clip_model = CLIPModel.from_pretrained(clip_checkpoint).to(device)

print("✅ Models ready.")


🖥️ Vision/Text device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✅ NLLB-200 loaded on CPU. forced_bos_token_id: 256011


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

✅ Models ready.


In [17]:
import re
from PIL import Image
import cv2
import numpy as np

# One transcript cue per line: "[<start> - <end>] <text>", times as plain numbers.
timecode_re = re.compile(r"\[(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\]\s+(.*)")

def get_transcript_context_ar(scene_time: float, window_s: float = 10.0, transcript_path=None) -> str:
    """Return the transcript text overlapping ``scene_time ± window_s``.

    Lines that do not match ``timecode_re`` are ignored. A cue is kept when its
    [start, end] interval overlaps the window (touching boundaries count). Cue
    times are compared directly with ``scene_time``, so both must use the same
    unit.

    Args:
        scene_time: Centre of the window.
        window_s: Half-width of the window.
        transcript_path: Path of the time-coded transcript. When omitted, the
            module-level ``transcript_tc_path`` is used, which must then be
            defined before the call.

    Returns:
        The matching cue texts joined by single spaces and truncated to 1500
        characters, or ``""`` when the transcript file does not exist.
    """
    path = transcript_tc_path if transcript_path is None else transcript_path
    if not os.path.exists(path):
        return ""

    start_win, end_win = scene_time - window_s, scene_time + window_s
    overlapping = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            m = timecode_re.match(line.strip())
            if m is None:
                continue
            cue_start, cue_end = float(m.group(1)), float(m.group(2))
            if cue_end >= start_win and cue_start <= end_win:
                overlapping.append(m.group(3))
    return " ".join(overlapping)[:1500]  # keep short for embeddings

def grab_frames_around(cap: cv2.VideoCapture, fps: float, base_time_s: float, offsets=(0.0, 0.2, 0.4)):
    """Sample frames at ``base_time_s + offset`` for every offset (seconds).

    Each timestamp is clamped at 0, converted to a frame index with
    ``int(t * fps)``, and the capture is seeked to that index before reading.
    Frames that fail to decode are skipped, so the result may be shorter than
    ``offsets`` (or empty).

    Returns:
        List of PIL images converted from OpenCV's BGR order to RGB.
    """
    sampled = []
    for delta in offsets:
        timestamp = max(0.0, base_time_s + delta)
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(timestamp * fps))
        success, bgr_frame = cap.read()
        if not success:
            continue
        rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
        sampled.append(Image.fromarray(rgb_frame))
    return sampled

@torch.no_grad()
def blip2_nbest_en(images, beams=5, returns=3):
    """Caption every image with BLIP-2 beam search and pool the hypotheses.

    For each image, ``returns`` sequences are requested from a ``beams``-wide
    beam search with sampling disabled. The decoded, stripped captions of all
    images are merged, and duplicates are dropped while keeping first-seen order.

    Returns:
        List of unique caption strings (empty when ``images`` is empty).
    """
    pooled = []
    for frame in images:
        batch = caption_processor(images=[frame], return_tensors="pt", padding=True)
        batch = batch.to(device, torch.float16 if device == "cuda" else torch.float32)
        generated = caption_model.generate(
            **batch,
            do_sample=False,
            num_beams=beams,
            num_return_sequences=returns,
            length_penalty=1.0,
            repetition_penalty=1.05,
            no_repeat_ngram_size=3,
            max_new_tokens=50,
        )
        pooled.extend(
            caption_processor.decode(sequence, skip_special_tokens=True).strip()
            for sequence in generated
        )
    # dict keys keep insertion order, giving an order-preserving de-duplication.
    return list(dict.fromkeys(pooled))

@torch.no_grad()
def mt_nbest_ar(en_list, beams=4, returns=2):
    """Translate each English caption with the NLLB model on ``TRANS_DEVICE``.

    Every caption is translated on its own with beam search (no sampling) and
    the output is forced to start with ``forced_bos_token_id``.

    Returns:
        List of unique ``(english, arabic)`` tuples in first-seen order
        (empty when ``en_list`` is empty).
    """
    unique_pairs = {}
    for source_text in en_list:
        encoded = translator_tokenizer([source_text], return_tensors="pt", padding=True)
        encoded = encoded.to(TRANS_DEVICE)
        hypotheses = translator_model.generate(
            **encoded,
            do_sample=False,                 # deterministic
            num_beams=beams,
            num_return_sequences=returns,    # smaller to keep CPU fast
            max_new_tokens=64,
            no_repeat_ngram_size=3,
            repetition_penalty=1.15,
            forced_bos_token_id=forced_bos_token_id,  # Arabic output
        )
        for hypothesis in hypotheses:
            arabic = translator_tokenizer.decode(hypothesis, skip_special_tokens=True).strip()
            # setdefault keeps the first occurrence and its position.
            unique_pairs.setdefault((source_text, arabic), None)
    return list(unique_pairs)

@torch.no_grad()
def clip_image_text_score(image_pil: Image.Image, text_en: str) -> float:
    """Cosine similarity between one image and one English caption using CLIP.

    Both embeddings are L2-normalised before the dot product, so the returned
    value is a cosine similarity.

    Args:
        image_pil: Image to score.
        text_en: English caption to compare against the image.

    Returns:
        The similarity as a Python float.
    """
    # truncation=True: CLIP's text encoder has a fixed maximum sequence length,
    # so an over-long caption is cut to fit instead of failing in the model.
    inputs = clip_processor(
        text=[text_en],
        images=image_pil,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    outs = clip_model(**inputs)
    img = outs.image_embeds / outs.image_embeds.norm(dim=-1, keepdim=True)
    txt = outs.text_embeds / outs.text_embeds.norm(dim=-1, keepdim=True)
    return float((img @ txt.T).squeeze().detach().cpu())

@torch.no_grad()
def sbert_sim_ar(ar_caption: str, ar_context: str) -> float:
    """Cosine similarity between an Arabic caption and its transcript context.

    Returns 0.0 without touching the encoder when ``ar_context`` is empty.
    Otherwise both texts are encoded together with normalised embeddings and
    compared with ``util.cos_sim``.

    NOTE(review): relies on a module-level ``sbert`` encoder that is not created
    in the cells visible here — confirm it is defined before this is called.
    """
    if ar_context:
        caption_vec, context_vec = sbert.encode(
            [ar_caption, ar_context],
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        similarity = util.cos_sim(caption_vec, context_vec)
        return float(similarity.item())
    return 0.0


In [9]:
# Scene detection (PySceneDetect) & video prep
# Detect scenes with the content-aware detector (threshold 30.0).
scene_manager = SceneManager()
scene_manager.add_detector(ContentDetector(threshold=30.0))

video = open_video(video_path)
scene_manager.detect_scenes(video)
scene_list = scene_manager.get_scene_list()

# VideoCapture for frame access (separate from the PySceneDetect reader above).
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail fast: with an unopened capture every later frame read fails, and the
    # captioning step would silently skip all scenes.
    raise RuntimeError(f"OpenCV could not open video: {video_path}")
# Some containers report an FPS of 0; fall back to 25 in that case.
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0

print(f"🎬 Detected {len(scene_list)} scenes, FPS={fps:.2f}")


INFO:pyscenedetect:Detecting scenes...


🎬 Detected 54 scenes, FPS=25.00


In [None]:
# ===== captioning & Visual-grounding re-ranking =====

import os, json, cv2, numpy as np
from PIL import Image

# Container for results: keyframe file name -> caption record
captions = {}

# Hyperparameters (keep consistent with main run for fair comparison)
frame_offsets = (0.0, 0.2, 0.4)   # multi-frame sampling around scene start
beams_caption, returns_caption = 5, 3
beams_mt, returns_mt = 4, 2       # translate best EN for parity (not used for scoring)
save_keyframes = True

# This cell releases the capture when it finishes. If it is re-run without
# re-running the scene-detection cell, a closed capture would make every scene
# "undecodable" and the captions file would be overwritten with {}.
if not cap.isOpened():
    cap.open(video_path)

try:
    for i, (start, _) in enumerate(scene_list):
        scene_t = start.get_seconds()

        # 1) Multi-frame sampling around the same scene boundary
        images = grab_frames_around(cap, fps, scene_t, offsets=frame_offsets)
        if not images:
            print(f"⚠️ Scene {i:03} has no decodable frames; skipping.")
            continue

        # Save representative keyframe (first sampled image)
        frame_name = f"scene_{i:03}.jpg"
        if save_keyframes:
            frame_path = os.path.join(keyframe_dir, frame_name)
            cv2.imwrite(frame_path, cv2.cvtColor(np.array(images[0]), cv2.COLOR_RGB2BGR))

        # 2) English N-best captions across frames with deterministic decoding
        en_cands = blip2_nbest_en(images, beams=beams_caption, returns=returns_caption)
        if not en_cands:
            print(f"⚠️ No EN candidates for scene {i:03}; skipping.")
            continue

        # 3) Visual grounding: average CLIP score across sampled frames
        en_vis = {
            en: float(np.mean([clip_image_text_score(img, en) for img in images]))
            for en in en_cands
        }

        # 4) Pick best English by visual score
        best_en = max(en_vis, key=en_vis.get)
        best_vscore = en_vis[best_en]

        # 5) Translate best EN (N-best) for Arabic output.
        # Only `best_en` is passed in, so every returned pair belongs to it and
        # the first pair is the first hypothesis returned by the translator.
        pair_cands = mt_nbest_ar([best_en], beams=beams_mt, returns=returns_mt)
        best_ar = pair_cands[0][1] if pair_cands else ""

        # 6) Save visually validated result
        captions[frame_name] = {
            "scene_time": round(scene_t, 2),
            "english": best_en,
            "arabic": best_ar,
            "scores": {"visual": round(best_vscore, 4)},
        }

        print(f"✓ {frame_name} @ {scene_t:.2f}s | EN*: {best_en} | AR: {best_ar} | V={best_vscore:.3f}")
finally:
    # Release the decoder even when a scene raises part-way through the loop.
    cap.release()

# Written only after the whole loop succeeded, so a failed run never replaces an
# existing captions file with partial results.
with open(captions_json_path, "w", encoding="utf-8") as f:
    json.dump(captions, f, ensure_ascii=False, indent=2)

print(f"✅ Captions saved to: {captions_json_path}")
print(f"🖼️ Keyframes dir: {keyframe_dir}")
