<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/VideoProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎥 Arabic Video Multimodal Validator and Summarizer

This Colab notebook lets you input the name of an Arabic video file and automatically performs:
- Audio transcription (Arabic)
- Scene/keyframe caption validation using Sentence-BERT and CLIP
- (Optional) Abstractive summarization with mBART

## 🔊 1. Setup Environment

In [5]:
# Install Whisper and Torch
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q torch torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install scenedetect[opencv] transformers accelerate bitsandbytes
!pip install -q transformers sentence-transformers torchaudio opencv-python Pillow
from google.colab import drive
import os
import torch



  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m110.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m87.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m39.8 MB/s[0m eta [36m0

In [6]:
# Mount Google Drive
# Unmount first
!fusermount -u /content/drive || echo "Already unmounted"

# Delete the mount folder entirely
!rm -rf /content/drive

# Now mount again
from google.colab import drive
drive.mount('/content/drive')

fusermount: failed to unmount /content/drive: No such file or directory
Already unmounted
Mounted at /content/drive


In [7]:
# Define base paths
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
videos_path = os.path.join(base_path, "videos")
transcripts_path = os.path.join(base_path, "transcripts")
captions_path = os.path.join(base_path, "captions")
keyframes_path = os.path.join(base_path, "keyframes")
os.makedirs(transcripts_path, exist_ok=True)
os.makedirs(captions_path, exist_ok=True)
os.makedirs(keyframes_path, exist_ok=True)

## 🔊 2. Input Video Filename

In [8]:
# Input Video Filename
video_filename = input("Enter the name of the video file (e.g., MyVideo.mp4): ")

Enter the name of the video file (e.g., MyVideo.mp4): Calligraphy.mp4


In [9]:
video_path = os.path.join(videos_path, video_filename)
assert os.path.exists(video_path), f"Video file not found: {video_path}"
video_name = os.path.splitext(video_filename)[0]



## 🔊 3. Transcribe Arabic Audio using Whisper



In [None]:
import whisper

# Load Whisper model
model = whisper.load_model("large", device="cuda")

# transcribe (Arabic)
result = model.transcribe(video_path, language="ar", task="transcribe")
transcript_txt = os.path.join(transcripts_path, f"{video_name}_ar.txt")
with open(transcript_txt, "w", encoding="utf-8") as f:
    f.write(result['text'])
print(f"✅ Saved Arabic transcript to: {transcript_txt}")


# Translate (Arabic → English)
result_en = model.transcribe(video_path, language="ar", task="translate")
translation_txt = os.path.join(transcripts_path, f"{video_name}_en.txt")
with open(translation_txt, "w", encoding="utf-8") as f:
    f.write(result_en["text"])
print(f"✅ Saved English translation to: {translation_txt}")

100%|█████████████████████████████████████| 2.88G/2.88G [01:19<00:00, 38.9MiB/s]


✅ Saved Arabic transcript to: /content/drive/MyDrive/ArabicVideoSummariser/transcripts/Calligraphy_ar.txt
✅ Saved English translation to: /content/drive/MyDrive/ArabicVideoSummariser/transcripts/Calligraphy_en.txt


## 🖼️ 4. KeyFrame Detection & Captioning

In [None]:
import cv2
import json
from PIL import Image
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
from transformers import AutoProcessor, Blip2ForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer

# ============ SETUP ============
device = "cuda" if torch.cuda.is_available() else "cpu"

# BLIP-2 model
caption_processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
caption_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map="auto",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)

# Translation model (EN → AR)
translator_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
translator_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ar").to(device)


keyframe_dir = os.path.join(keyframes_path, video_name)
os.makedirs(keyframe_dir, exist_ok=True)
captions = {}

# --- Scene detection ---
video_manager = VideoManager([video_path])
scene_manager = SceneManager()
scene_manager.add_detector(ContentDetector(threshold=30.0))
video_manager.set_downscale_factor()
video_manager.start()
scene_manager.detect_scenes(video_manager)
scene_list = scene_manager.get_scene_list()
video_manager.release()

# --- Extract frames ---
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)

for i, (start, _) in enumerate(scene_list):
  frame_num = int(start.get_seconds() * fps)
  cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
  ret, frame = cap.read()
  if not ret:
    continue

  frame_name = f"scene_{i:03}.jpg"
  frame_path = os.path.join(keyframe_dir, frame_name)
  cv2.imwrite(frame_path, frame)

  # Convert to PIL
  image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

  # --- Captioning with BLIP-2 ---
  inputs = caption_processor(images=image, return_tensors="pt").to(device, torch.float16 if device == "cuda" else torch.float32)
  generated_ids = caption_model.generate(**inputs, max_new_tokens=50)
  english_caption = caption_processor.decode(generated_ids[0], skip_special_tokens=True).strip()

  # --- Translate to Arabic ---
  translation_inputs = translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(device)
  translated = translator_model.generate(**translation_inputs)
  arabic_caption = translator_tokenizer.decode(translated[0], skip_special_tokens=True).strip()

  # --- Save result with scene start time ---
  captions[frame_name] = {
    "scene_time": round(start.get_seconds(), 2),  # Time in seconds, rounded for readability
    "english": english_caption,
    "arabic": arabic_caption
    }

  print(f"✓ {frame_name} @ {start.get_seconds():.2f}s | EN: {english_caption} | AR: {arabic_caption}")

cap.release()

# Save JSON
json_path = os.path.join(captions_path, f"{video_name}.json")
with open(json_path, "w", encoding="utf-8") as f:
  json.dump(captions, f, ensure_ascii=False, indent=2)
print(f"✅ Captions saved to: {json_path}")

## 🧠 5. Process Transcript into Overlapping Chunks

In [None]:
!pip install transformers --quiet

from transformers import AutoTokenizer

# === Load tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# === Load transcript text ===
with open(transcript_txt, encoding="utf-8") as f:
    full_transcript = f.read()

# === Tokenize in small overlapping windows ===
tokens = tokenizer.tokenize(full_transcript)

# Define chunking parameters
chunk_size = 128
step = 64

# Generate safe overlapping chunks (as token strings)
token_chunks = [tokens[i:i+chunk_size] for i in range(0, len(tokens) - chunk_size + 1, step)]

# Convert token chunks back to readable strings
text_chunks = [tokenizer.convert_tokens_to_string(chunk) for chunk in token_chunks]

# Prepare model-ready input (≤512 tokens, padded)
tokenized_chunks = [
    tokenizer(chunk_text, return_tensors="pt", truncation=True, max_length=512, padding="max_length")
    for chunk_text in text_chunks
]

# Preview
for i, chunk in enumerate(text_chunks[:3]):
    print(f"\n--- Chunk {i+1} ---\n{chunk}")



Token indices sequence length is longer than the specified maximum sequence length for this model (1667 > 512). Running this sequence through the model will result in indexing errors



--- Chunk 1 ---
في قلب القاهرة التاريخية حيث تتجسد الحضارة الإسلامية في كل زاوية يروي الخط العربي حكاية أزلية عن الجمال والإبداع إنه أكثر من مجرد خط إنه فن ، إنه هوية عربية الخط العربي موجود من قبل الإسلام يعني هو بقدم العرب ولكن تطوره اختلف ما بعد الإسلام فالخط العربي موجود ولكن لأنه هو يعني ببساطة شديدة هو من عائلة الخطوط النباتية ولكن الأمثلة اللي موجودة عندنا فعلا ما قبل الإسلام شيء وما بعد الإسلام على طول شيء تاني خالص ففي حاج

--- Chunk 2 ---
##كن تطوره اختلف ما بعد الإسلام فالخط العربي موجود ولكن لأنه هو يعني ببساطة شديدة هو من عائلة الخطوط النباتية ولكن الأمثلة اللي موجودة عندنا فعلا ما قبل الإسلام شيء وما بعد الإسلام على طول شيء تاني خالص ففي حاجة حصلت مع نزول الرسالة وتتنوع الخطوط العربية بشكل كبير حيث يروي كل خط عن عصر إصداره فيه واستخدامات تميز بها أنواع الخط العربي طبعا هم كذا نوع كتير مهماش بس نسخ والرقع زي ما بيعلمونا في المدرسة وحتى الن

--- Chunk 3 ---
##ة حصلت مع نزول الرسالة وتتنوع الخطوط العربية بشكل كبير حيث يروي كل خط عن عصر إصداره فيه واستخدامات تميز بها أنواع ا

## 🖼️ 6. Load Scene Captions

In [None]:
# Load captions from JSON
import json
captions_json = os.path.join(captions_path, f"{os.path.splitext(video_filename)[0]}.json")
with open(captions_json, encoding='utf-8') as f:
    scenes = json.load(f)
scene_captions = [(scene, data["arabic"]) for scene, data in scenes.items()]

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/ArabicVideoSummariser/captions/Calligraphy.json'

## 🔡 7. Embed Captions and Transcript Chunks

In [None]:
# Encode using multilingual Sentence-BERT
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

caption_texts = [text for _, text in scene_captions]
caption_embeddings = model.encode(caption_texts, convert_to_tensor=True)
transcript_embeddings = model.encode(transcript_chunks, convert_to_tensor=True)

## 🔗 8. Match Captions to Transcript Chunks

In [None]:
# Find best transcript match for each caption
results = []
similarities = util.cos_sim(caption_embeddings, transcript_embeddings)
for i, (scene_id, caption_text) in enumerate(scene_captions):
    sim_scores = similarities[i]
    top_idx = sim_scores.argmax().item()
    results.append({
        "scene_id": scene_id,
        "caption": caption_text,
        "best_transcript_chunk": transcript_chunks[top_idx],
        "similarity_score": float(sim_scores[top_idx])
    })

## 📥 9. Output Results

In [None]:
# Display a few matches
import pandas as pd
df = pd.DataFrame(results)
df[['scene_id', 'caption', 'best_transcript_chunk', 'similarity_score']].head(10)