<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/02_object_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive.
# force_remount=True lets drive.mount handle the "already mounted" case itself,
# so re-running this cell works without shelling out to `fusermount -u`
# (which prints an error whenever the drive is not currently mounted).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Install YOLOv8 (currently disabled: uncomment both lines below to enable it)
#!pip install -q ultralytics
#from ultralytics import YOLO


In [None]:
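# Load YOLOv8 model (currently disabled: needs the ultralytics install/import to be enabled first)
# NOTE: the name `model` is rebound by the BLIP captioning model further down.
#model = YOLO("yolov8n.pt")  # Use 'n' for speed, 's' or 'm' for better accuracy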


In [None]:
import os
import cv2
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
import torch

# Locations on the mounted Drive: the keyframe output folder and the input video.
output_dir = "/content/drive/MyDrive/ArabicVideoSummariser/keyframes"
video_path = "/content/drive/MyDrive/ArabicVideoSummariser/videos/KhanElkhalili.mp4"

# Create the keyframe folder up front; an already-existing folder is fine.
os.makedirs(name=output_dir, exist_ok=True)

# 1. Extract keyframes (simple interval)
def extract_keyframes(video_path, output_folder, interval=30):
    """Save every ``interval``-th frame of a video as a JPEG keyframe.

    Frames are read sequentially. Frame 0 and every ``interval``-th frame after
    it are written to ``output_folder`` as ``frame_0.jpg``, ``frame_1.jpg``, ...
    (the number in the file name counts saved keyframes, not video frames).

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory that receives the JPEG files; created if it
            does not exist yet.
        interval: Sampling step in frames. Must be positive.

    Returns:
        The number of keyframes written.

    Raises:
        ValueError: If ``interval`` is not positive.
        OSError: If the video cannot be opened or a keyframe cannot be written.
    """
    # Validate first: interval == 0 would otherwise fail with ZeroDivisionError
    # only after the video has been opened and the first frame decoded.
    if interval <= 0:
        raise ValueError(f"interval must be a positive number of frames, got {interval!r}")

    os.makedirs(output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    saved = 0
    try:
        # An unopened capture would otherwise silently produce zero keyframes.
        if not cap.isOpened():
            raise OSError(f"Could not open video: {video_path}")

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No more frames (end of stream or read failure).
                break
            if frame_count % interval == 0:
                path = os.path.join(output_folder, f"frame_{saved}.jpg")
                # cv2.imwrite reports failure through its return value.
                if not cv2.imwrite(path, frame):
                    raise OSError(f"Could not write keyframe: {path}")
                saved += 1
            frame_count += 1
    finally:
        # Always free the capture handle, even when an error is raised above.
        cap.release()
    return saved

# Sample keyframes from the input video using a 60-frame step.
extract_keyframes(video_path, output_dir, interval=60)

# 2. Caption model (BLIP), placed on the GPU when one is available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
blip_checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)
model = model.to(device)

# 3. Translation model (MarianMT, English -> Arabic)
en_ar_model = "Helsinki-NLP/opus-mt-en-ar"
tokenizer = MarianTokenizer.from_pretrained(en_ar_model)
translator = MarianMTModel.from_pretrained(en_ar_model)
translator = translator.to(device)

# 4. Process keyframes
def _frame_sort_key(name):
    """Order ``frame_<n>.jpg`` files by n; any other name sorts after them, by name."""
    stem = os.path.splitext(name)[0]
    index = stem.rpartition("_")[2]
    if index.isdecimal():
        return (0, int(index), name)
    return (1, 0, name)


# Maps keyframe file name -> {"english": caption, "arabic": translated caption}.
captions = {}

# os.listdir returns entries in arbitrary order, so sort by frame index to
# caption the keyframes in the order they were extracted.
# NOTE: every .jpg in output_dir is processed, including any left over from
# earlier runs; clear the folder first if only the current video is wanted.
keyframe_files = sorted(
    (name for name in os.listdir(output_dir) if name.endswith(".jpg")),
    key=_frame_sort_key,
)

for filename in keyframe_files:
    path = os.path.join(output_dir, filename)
    # convert() returns a fully loaded copy, so the file can be closed right away.
    with Image.open(path) as raw_image:
        image = raw_image.convert("RGB")

    # Captioning
    inputs = processor(image, return_tensors="pt").to(device)
    output = model.generate(**inputs)
    english = processor.decode(output[0], skip_special_tokens=True)

    # Translation
    encoded = tokenizer(english, return_tensors="pt", padding=True).to(device)
    translated = translator.generate(**encoded)
    arabic = tokenizer.decode(translated[0], skip_special_tokens=True)

    captions[filename] = {"english": english, "arabic": arabic}
    print(f"{filename}: EN: {english} | AR: {arabic}")
