<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/02_object_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive.
# force_remount=True makes this cell safe to re-run: an existing mount is
# replaced by the Colab helper itself, so no manual `fusermount -u` shell call
# is needed (that call prints an error when nothing is mounted yet).
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

In [None]:
pip install scenedetect[opencv] transformers accelerate bitsandbytes

In [None]:
import os
import cv2
import json
from PIL import Image
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
from transformers import AutoProcessor, Blip2ForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
import torch

# ============ SETUP ============
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 weights on GPU, float32 on CPU.
model_dtype = torch.float16 if device == "cuda" else torch.float32

# BLIP-2 model
caption_processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
# device_map="auto" already lets accelerate place the weights on the available
# device(s). Do not chain .to(device) here: moving a model that accelerate has
# dispatched is warned against and fails outright if any module was offloaded.
caption_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map="auto",
    torch_dtype=model_dtype,
)

# Translation model (EN → AR)
translator_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
translator_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ar").to(device)

# Base paths (all under the mounted Google Drive project folder)
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
videos_path = os.path.join(base_path, "videos")        # input videos
keyframes_base = os.path.join(base_path, "keyframes")  # one sub-folder per video
captions_base = os.path.join(base_path, "captions")    # one JSON file per video
os.makedirs(keyframes_base, exist_ok=True)
os.makedirs(captions_base, exist_ok=True)

# ============ FUNCTION ============
def extract_and_caption(video_path, video_name):
    """Extract one keyframe per detected scene and caption it in EN and AR.

    For every scene found by PySceneDetect's ContentDetector, the scene's
    first frame is written to ``<keyframes_base>/<video_name>/scene_NNN.jpg``,
    captioned in English with the BLIP-2 model and translated to Arabic with
    the Marian model. All captions are then written to
    ``<captions_base>/<video_name>.json``, keyed by keyframe file name.

    Relies on the module-level ``keyframes_base``, ``captions_base``,
    ``device``, ``caption_processor``, ``caption_model``,
    ``translator_tokenizer`` and ``translator_model``.

    Args:
        video_path: Path of the video file to process.
        video_name: Name used for the keyframe folder and the JSON file.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    keyframe_dir = os.path.join(keyframes_base, video_name)
    os.makedirs(keyframe_dir, exist_ok=True)
    captions = {}

    # Same dtype rule as used when loading the caption model; loop-invariant.
    input_dtype = torch.float16 if device == "cuda" else torch.float32

    # --- Scene detection ---
    video_manager = VideoManager([video_path])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=30.0))
    try:
        video_manager.set_downscale_factor()
        video_manager.start()
        scene_manager.detect_scenes(video_manager)
        # start_in_scene=True: a video without any detected cut is reported as
        # one scene spanning the whole video instead of an empty list, so
        # single-shot videos still get a keyframe and a caption.
        scene_list = scene_manager.get_scene_list(start_in_scene=True)
    finally:
        # Release the video even if detection raised.
        video_manager.release()

    # --- Extract frames ---
    cap = cv2.VideoCapture(video_path)
    try:
        for i, (start, _) in enumerate(scene_list):
            # Use the detector's exact frame index. Recomputing it as
            # int(seconds * fps) can truncate to the previous frame and
            # depends on the FPS value reported by OpenCV.
            cap.set(cv2.CAP_PROP_POS_FRAMES, start.get_frames())
            ret, frame = cap.read()
            if not ret:
                # Frame could not be decoded; skip this scene.
                continue

            frame_name = f"scene_{i:03}.jpg"
            frame_path = os.path.join(keyframe_dir, frame_name)
            cv2.imwrite(frame_path, frame)

            # Convert to PIL (OpenCV delivers BGR, the processor is given RGB)
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # --- Captioning with BLIP-2 ---
            inputs = caption_processor(images=image, return_tensors="pt").to(device, input_dtype)
            generated_ids = caption_model.generate(**inputs, max_new_tokens=50)
            english_caption = caption_processor.decode(generated_ids[0], skip_special_tokens=True).strip()

            # --- Translate to Arabic ---
            translation_inputs = translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(device)
            translated = translator_model.generate(**translation_inputs)
            arabic_caption = translator_tokenizer.decode(translated[0], skip_special_tokens=True).strip()

            # --- Save result with scene start time ---
            scene_seconds = start.get_seconds()
            captions[frame_name] = {
                "scene_time": round(scene_seconds, 2),  # Time in seconds, rounded for readability
                "english": english_caption,
                "arabic": arabic_caption
            }

            print(f"✓ {frame_name} @ {scene_seconds:.2f}s | EN: {english_caption} | AR: {arabic_caption}")
    finally:
        # Release the capture even if captioning or translation raised.
        cap.release()

    # Save JSON (ensure_ascii=False keeps the Arabic text readable in the file)
    json_path = os.path.join(captions_base, f"{video_name}.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(captions, f, ensure_ascii=False, indent=2)
    print(f"✅ Captions saved to: {json_path}")

# ============ MAIN LOOP ============
# Collect the video files by extension (case-insensitive). os.listdir returns
# entries in arbitrary order, so sort them to get a reproducible processing
# order and log.
video_files = sorted(
    f for f in os.listdir(videos_path)
    if f.lower().endswith(('.mp4', '.mov', '.avi', '.mkv'))
)
print(f"🎬 Found {len(video_files)} videos.")

for video_file in video_files:
    # The file name without its extension names the keyframe folder and JSON file.
    video_name = os.path.splitext(video_file)[0]
    video_path = os.path.join(videos_path, video_file)
    print(f"\n🔄 Processing: {video_file}")
    extract_and_caption(video_path, video_name)
