<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/VideoProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎥 Arabic Video Multimodal Validator and Summarizer

This Colab notebook lets you input the name of an Arabic video file and automatically performs:
- Audio transcription (Arabic)
- Scene/keyframe caption validation using Sentence-BERT and CLIP
- (Optional) Abstractive summarization with mBART

##♻️ 1. Setup Environment

In [None]:
# Uninstall all conflicting packages
!pip uninstall -y torch torchaudio torchvision whisper transformers tokenizers camel-tools opencv-python opencv-contrib-python scenedetect numpy
# Reinstall compatible base stack
!pip install --upgrade pip setuptools wheel
# Step 1: Torch + CUDA 11.8 (Compatible with Whisper and BLIP2)
!pip install torch==2.0.1+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
# Step 2: NumPy compatible with Whisper and CAMeL Tools
!pip install numpy==1.23.5
# Step 3: Whisper from GitHub
!pip install git+https://github.com/openai/whisper.git
# Step 4: OpenCV and SceneDetect
!pip install opencv-python==4.7.0.72 opencv-contrib-python==4.7.0.72 scenedetect==0.6.6
# Step 5: Transformers for BLIP2 / Sentence-BERT / AraBERT
!pip install "transformers==4.37.2" "tokenizers>=0.14,<0.19" sentence-transformers sacremoses
# Step 6: CAMeL Tools (Arabic NLP)
!pip install git+https://github.com/CAMeL-Lab/camel_tools.git@master
# Step 7: Utilities
!pip install librosa==0.10.0.post2 Pillow accelerate bitsandbytes
# Step 8. Suppress warnings in runtime output
import warnings
warnings.filterwarnings("ignore")

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
[0mFound existing installation: transformers 4.54.0
Uninstalling transformers-4.54.0:
  Successfully uninstalled transformers-4.54.0
Found existing installation: tokenizers 0.21.2
Uninstalling tokenizers-0.21.2:
  Successfully uninstalled tokenizers-0.21.2
[0mFound existing installation: opencv-python 4.12.0.88
Uninstalling opencv-python-4.12.0.88:
  Successfully uninstalled opencv-python-4.12.0.88
Found existing installation: opencv-contrib-python 4.12.0.88
Uninstalling opencv-contrib-python-4.12.0.88:
  Successfully uninstalled opencv-contrib-python-4.12.0.88
[0mFound existing ins

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.0.1+cu118
  Downloading https://download.pytorch.org/whl/cu118/torch-2.0.1%2Bcu118-cp311-cp311-linux_x86_64.whl (2267.3 MB)
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m1.4/2.3 GB[0m [31m132.7 MB/s[0m eta [36m0:00:07[0m^C


In [3]:
import torch
import whisper
import cv2
import numpy as np
import transformers
import tokenizers
import scenedetect
import camel_tools
from PIL import Image

ModuleNotFoundError: No module named 'transformers'

## 2. Mount Google Drive & Define Folder Paths

In [None]:
from google.colab import drive
import os

# Unmount first
!fusermount -u /content/drive || echo "Already unmounted"

# Delete the mount folder entirely
!rm -rf /content/drive

# Now mount again
from google.colab import drive
drive.mount('/content/drive')

fusermount: failed to unmount /content/drive: No such file or directory
Already unmounted
Mounted at /content/drive


## ▶️ 3. Input Video Filename

In [None]:
# Input Video Filename
video_filename = input("Enter the name of the video file (e.g., MyVideo.mp4): ")

Enter the name of the video file (e.g., MyVideo.mp4): Aluminium.mp4


In [2]:
# Define base paths
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
videos_path = os.path.join(base_path, "videos")
transcripts_path = os.path.join(base_path, "transcripts")
captions_path = os.path.join(base_path, "captions")
keyframes_path = os.path.join(base_path, "keyframes")
os.makedirs(transcripts_path, exist_ok=True)
os.makedirs(captions_path, exist_ok=True)
os.makedirs(keyframes_path, exist_ok=True)

video_path = os.path.join(videos_path, video_filename)
assert os.path.exists(video_path), f"Video file not found: {video_path}"
video_name = os.path.splitext(video_filename)[0]
transcript_path = os.path.join(transcripts_path, f"{video_name}_ar.txt")
translation_path = os.path.join(transcripts_path, f"{video_name}_en.txt")
keyframe_dir = os.path.join(keyframes_path, video_name)
os.makedirs(keyframe_dir, exist_ok=True)
captions_json_path = os.path.join(captions_path, f"{video_name}.json")
trascription_json_path = os.path.join(transcripts_path, f"{video_name}.json")


NameError: name 'video_filename' is not defined


## 🔊 4. Transcribe Arabic Audio using Whisper



In [1]:
import shutil
import os
import whisper

# 1. Get Whisper model cache dir
cache_dir = os.path.expanduser("~/.cache/whisper")

# 2. Delete the corrupted model if it exists
model_name = "large"
model_path = os.path.join(cache_dir, f"{model_name}.pt")
if os.path.exists(model_path):
    print(f"Removing corrupted model: {model_path}")
    os.remove(model_path)

# Optional: remove download temp directory if present
tmp_download = os.path.join(cache_dir, "downloads")
if os.path.exists(tmp_download):
    print(f"Removing temp downloads: {tmp_download}")
    shutil.rmtree(tmp_download)

# 3. Re-download and load the model
model = whisper.load_model(model_name, device="cuda")


  import scipy
100%|█████████████████████████████████████| 2.88G/2.88G [01:22<00:00, 37.3MiB/s]
  return self.fget.__get__(instance, owner)()


In [None]:
import os
import torch, whisper, json

# Load Whisper model
model = whisper.load_model("large", device="cuda")

# transcribe (Arabic)
result = model.transcribe(video_path, language="ar", task="transcribe", verbose=True)

with open(transcript_path, "w", encoding="utf-8") as f:
    f.write(result['text'])
print(f"✅ Saved Arabic transcript to: {transcript_path}")

with open(transcript_path.replace(".txt", "_with_timecodes.txt"), "w", encoding="utf-8") as f:
    for segment in result["segments"]:
        start = segment["start"]
        end = segment["end"]
        text = segment["text"]
        f.write(f"[{start:.2f} - {end:.2f}] {text}\n")

# ✅ Save full result as JSON (NEW)
with open(trascription_json_path, "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)
print(f"✅ Saved full Whisper output (AR) to: {trascription_json_path}")

# Translate (Arabic → English)
result_en = model.transcribe(video_path, language="ar", task="translate", verbose=True)
with open(translation_path, "w", encoding="utf-8") as f:
    f.write(result_en["text"])
print(f"✅ Saved English translation to: {translation_path}")

# Save timecoded translation
with open(translation_path.replace(".txt", "_with_timecodes.txt"), "w", encoding="utf-8") as f:
    for segment in result_en["segments"]:
        start = segment["start"]
        end = segment["end"]
        text = segment["text"]
        f.write(f"[{start:.2f} - {end:.2f}] {text}\n")

 17%|██████▍                               | 499M/2.88G [00:10<00:53, 48.3MiB/s]


RuntimeError: Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model.

## 🖼️ 5. KeyFrame Detection & Captioning

In [None]:
import os, torch, cv2, json
from PIL import Image
from scenedetect import open_video, SceneManager
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
from transformers import AutoProcessor, Blip2ForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer

# ============ SETUP ============
device = "cuda" if torch.cuda.is_available() else "cpu"

# BLIP-2 model
caption_processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b",use_fast=False)
caption_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map="auto",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)

# Translation model (EN → AR)
translator_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
translator_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ar").to(device)



captions = {}

scene_manager = SceneManager()
scene_manager.add_detector(ContentDetector(threshold=30.0))
video = open_video(video_path)
scene_manager.detect_scenes(video)
scene_list = scene_manager.get_scene_list()

# --- Extract frames ---
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)

for i, (start, _) in enumerate(scene_list):
  frame_num = int(start.get_seconds() * fps)
  cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
  ret, frame = cap.read()
  if not ret:
    continue

  frame_name = f"scene_{i:03}.jpg"
  frame_path = os.path.join(keyframe_dir, frame_name)
  cv2.imwrite(frame_path, frame)

  # Convert to PIL
  image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

  # --- Captioning with BLIP-2 ---
  inputs = caption_processor(images=image, return_tensors="pt").to(device, torch.float16 if device == "cuda" else torch.float32)
  generated_ids = caption_model.generate(**inputs, max_new_tokens=50)
  english_caption = caption_processor.decode(generated_ids[0], skip_special_tokens=True).strip()

  # --- Translate to Arabic ---
  translation_inputs = translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(device)
  translated = translator_model.generate(**translation_inputs)
  arabic_caption = translator_tokenizer.decode(translated[0], skip_special_tokens=True).strip()

  # --- Save result with scene start time ---
  captions[frame_name] = {
    "scene_time": round(start.get_seconds(), 2),  # Time in seconds, rounded for readability
    "english": english_caption,
    "arabic": arabic_caption
    }

  print(f"✓ {frame_name} @ {start.get_seconds():.2f}s | EN: {english_caption} | AR: {arabic_caption}")

cap.release()

# Save JSON
with open(captions_json_path, "w", encoding="utf-8") as f:
  json.dump(captions, f, ensure_ascii=False, indent=2)
print(f"✅ Captions saved to: {captions_json_path}")

## 🧠 7. Process Transcript into Overlapping Chunks

In [None]:
!pip install transformers --quiet

from transformers import AutoTokenizer

# === Load tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# === Load transcript text ===
with open(transcript_txt, encoding="utf-8") as f:
    full_transcript = f.read()

# === Tokenize in small overlapping windows ===
tokens = tokenizer.tokenize(full_transcript)

# Define chunking parameters
chunk_size = 128
step = 64

# Generate safe overlapping chunks (as token strings)
token_chunks = [tokens[i:i+chunk_size] for i in range(0, len(tokens) - chunk_size + 1, step)]

# Convert token chunks back to readable strings
text_chunks = [tokenizer.convert_tokens_to_string(chunk) for chunk in token_chunks]

# Prepare model-ready input (≤512 tokens, padded)
tokenized_chunks = [
    tokenizer(chunk_text, return_tensors="pt", truncation=True, max_length=512, padding="max_length")
    for chunk_text in text_chunks
]

# Preview
for i, chunk in enumerate(text_chunks[:3]):
    print(f"\n--- Chunk {i+1} ---\n{chunk}")



## 🖼️ 8. Load Scene Captions

In [None]:
# Load captions from JSON
import json
captions_json = os.path.join(captions_path, f"{os.path.splitext(video_filename)[0]}.json")
with open(captions_json, encoding='utf-8') as f:
    scenes = json.load(f)
scene_captions = [(scene, data["arabic"]) for scene, data in scenes.items()]

## 🔡 9. Embed Captions and Transcript Chunks

In [None]:
# Encode using multilingual Sentence-BERT
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

caption_texts = [text for _, text in scene_captions]
caption_embeddings = model.encode(caption_texts, convert_to_tensor=True)
transcript_embeddings = model.encode(transcript_chunks, convert_to_tensor=True)

## 🔗 10. Match Captions to Transcript Chunks

In [None]:
# Find best transcript match for each caption
results = []
similarities = util.cos_sim(caption_embeddings, transcript_embeddings)
for i, (scene_id, caption_text) in enumerate(scene_captions):
    sim_scores = similarities[i]
    top_idx = sim_scores.argmax().item()
    results.append({
        "scene_id": scene_id,
        "caption": caption_text,
        "best_transcript_chunk": transcript_chunks[top_idx],
        "similarity_score": float(sim_scores[top_idx])
    })

## 📥 11. Output Results

In [None]:
# Display a few matches
import pandas as pd
df = pd.DataFrame(results)
df[['scene_id', 'caption', 'best_transcript_chunk', 'similarity_score']].head(10)