<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/VideoProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎥 Arabic Video Multimodal Validator and Summarizer

This Colab notebook lets you input the name of an Arabic video file and automatically performs:
- Audio transcription (Arabic)
- Scene detection and keyframe captioning (BLIP-2), with caption validation against the transcript using Sentence-BERT
- (Optional) Abstractive summarization with mBART

## ♻️ 1. Setup Environment

In [2]:
# === CORE BUILD TOOLS ===
!pip install -U pip setuptools wheel --quiet
# === TORCH (CUDA 11.8) ===
!pip install torch==2.0.1 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118 --quiet
# === NUMPY (compatible with OpenCV, Whisper, SceneDetect) ===
!pip install numpy==1.24.4 --quiet
# === OpenCV + SceneDetect ===
!pip install opencv-python==4.7.0.72 opencv-contrib-python==4.7.0.72 scenedetect==0.6.6 --quiet
# === Whisper for ASR ===
!pip install git+https://github.com/openai/whisper.git --quiet
# === Transformers for BLIP-2, MarianMT, etc. ===
!pip install transformers==4.38.2 tokenizers>=0.14,<0.19 sentence-transformers sacremoses --quiet
# === CAMeL Tools (for Arabic sentence tokenization etc.) ===
!pip install git+https://github.com/CAMeL-Lab/camel_tools.git@master --quiet
# === Audio & Image processing ===
!pip install librosa==0.10.0.post2 soundfile Pillow --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorboard 2.18.0 requires numpy>=1.12.0, which is not installed.
ipython 7.34.0 requires jedi>=0.16, which is not installed.
cufflinks 0.17.3 requires numpy>=1.9.2, which is not installed.
pandas-gbq 0.29.2 requires numpy>=1.18.1, which is not installed.
pytensor 2.31.7 requires numpy>=1.17.0, which is not installed.
spanner-graph-notebook 1.1.6 re

## 2. Mount Google Drive & Define Folder Paths

In [1]:
from google.colab import drive
import os
import torch

In [2]:

# (Re)mount Google Drive at /content/drive.
#
# force_remount=True asks Colab to unmount any existing mount and mount again.
# This replaces the previous manual sequence of `fusermount -u` followed by
# `rm -rf /content/drive`: if the unmount failed (e.g. the mount was busy), that
# `rm -rf` would have recursed into the still-mounted Drive and deleted user files.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Project root on Google Drive; every other folder used by the notebook lives under it.
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"

# Input folder (expected to exist already and to contain the source videos).
videos_path = os.path.join(base_path, "videos")

# Output folders for the artefacts produced by the later sections.
transcripts_path, captions_path, keyframes_path = (
    os.path.join(base_path, folder_name)
    for folder_name in ("transcripts", "captions", "keyframes")
)

# Create the output folders up front; exist_ok keeps this cell safe to re-run.
for output_dir in (transcripts_path, captions_path, keyframes_path):
    os.makedirs(output_dir, exist_ok=True)

## ▶️ 3. Input Video Filename

In [4]:
# Ask for the video file name (relative to the videos folder).
# Surrounding whitespace is stripped so a pasted name with a trailing space or
# newline does not fail the existence check in the next cell.
video_filename = input("Enter the name of the video file (e.g., MyVideo.mp4): ").strip()

Enter the name of the video file (e.g., MyVideo.mp4): Calligraphy.mp4


In [5]:
# Locate the requested video and stop immediately if it is not in the videos folder.
video_path = os.path.join(videos_path, video_filename)
assert os.path.exists(video_path), f"Video file not found: {video_path}"

# File name without its extension; used as the stem of every derived artefact.
video_name, _video_ext = os.path.splitext(video_filename)

# Transcript artefacts (Arabic text, English translation, full Whisper JSON).
transcript_path = os.path.join(transcripts_path, f"{video_name}_ar.txt")
translation_path = os.path.join(transcripts_path, f"{video_name}_en.txt")
trascription_json_path = os.path.join(transcripts_path, f"{video_name}.json")

# Caption artefacts: one JSON file per video, one keyframe folder per video.
captions_json_path = os.path.join(captions_path, f"{video_name}.json")
keyframe_dir = os.path.join(keyframes_path, video_name)
os.makedirs(keyframe_dir, exist_ok=True)



## 🔊 4. Transcribe Arabic Audio using Whisper



In [6]:
import whisper
import json


def _timecoded_path(path):
    """Return `path` with `_with_timecodes` inserted just before its extension.

    Uses os.path.splitext so only the final extension is touched, unlike
    `path.replace(".txt", ...)` which rewrites every ".txt" occurrence in the path.
    """
    stem, ext = os.path.splitext(path)
    return f"{stem}_with_timecodes{ext}"


def _write_timecoded(segments, path):
    """Write one `[start - end] text` line per Whisper segment to `path` (UTF-8)."""
    with open(path, "w", encoding="utf-8") as f:
        for segment in segments:
            f.write(f"[{segment['start']:.2f} - {segment['end']:.2f}] {segment['text']}\n")


# Load Whisper model. Fall back to CPU when no GPU is attached so the cell fails
# slowly rather than crashing (the "large" model is very slow on CPU).
whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("large", device=whisper_device)

# --- Transcribe (Arabic) ---
result = model.transcribe(video_path, language="ar", task="transcribe", verbose=True)

with open(transcript_path, "w", encoding="utf-8") as f:
    f.write(result['text'])
print(f"✅ Saved Arabic transcript to: {transcript_path}")

# Per-segment transcript with start/end times in seconds.
_write_timecoded(result["segments"], _timecoded_path(transcript_path))

# Full Whisper result (text, segments, language) as JSON.
with open(trascription_json_path, "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)
print(f"✅ Saved full Whisper output (AR) to: {trascription_json_path}")

# --- Translate (Arabic → English) ---
result_en = model.transcribe(video_path, language="ar", task="translate", verbose=True)
with open(translation_path, "w", encoding="utf-8") as f:
    f.write(result_en["text"])
print(f"✅ Saved English translation to: {translation_path}")

# Per-segment translation with start/end times in seconds.
_write_timecoded(result_en["segments"], _timecoded_path(translation_path))

  import scipy
100%|█████████████████████████████████████| 2.88G/2.88G [01:06<00:00, 46.2MiB/s]
  return self.fget.__get__(instance, owner)()


[00:00.000 --> 00:10.620]  في قلب القاهرة التاريخية حيث تتجسد الحضارة الإسلامية في كل زاوية
[00:10.620 --> 00:16.380]  يروي الخط العربي حكاية أزلية عن الجمال والإبداع
[00:16.380 --> 00:19.300]  إنه أكثر من مجرد خط
[00:19.300 --> 00:23.180]  إنه فن، إنه هوية عربية
[00:23.180 --> 00:26.760]  الخط العربي موجود من قبل الإسلام
[00:26.760 --> 00:29.580]  يعني هو بقدم العرب
[00:29.580 --> 00:32.180]  ولكن تطوره اختلف ما بعد الإسلام
[00:32.180 --> 00:35.020]  فالخط العربي موجود ولكن لأنه هو
[00:35.020 --> 00:40.380]  يعني ببساطة شديدة هو من عيلة الخطوط النباتية
[00:40.380 --> 00:44.180]  ولكن الأمثلة اللي موجودة عندنا فعلا ما قبل الإسلام شيء
[00:44.180 --> 00:46.560]  وما بعد الإسلام على طول شيء تاني خالص
[00:46.560 --> 00:50.060]  ففي حاجة حصلت مع نزول الرسالة
[00:50.060 --> 00:56.300]  وتتنوع الخطوط العربية بشكل كبير
[00:56.300 --> 01:00.240]  حيث يروي كل خط عن عصر إصداره فيه
[01:00.240 --> 01:02.480]  واستخدامات تميز بها
[01:02.480 --> 01:07.320]  الأنواع الخط العربي طبعا هم كذا نوع كتير
[0

## 🖼️ 5. KeyFrame Detection & Captioning

In [None]:
import cv2
import json
from PIL import Image
from scenedetect import open_video, SceneManager
from scenedetect.detectors import ContentDetector
from transformers import AutoProcessor, Blip2ForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer


# ============ SETUP ============
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on GPU, full precision on CPU.
model_dtype = torch.float16 if device == "cuda" else torch.float32

# BLIP-2 model (image -> English caption).
# device_map="auto" already places the weights; the model is deliberately NOT
# moved again with .to(device), which is not supported for a dispatched model
# when some modules end up offloaded.
caption_processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b", use_fast=False)
caption_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map="auto",
    torch_dtype=model_dtype
)

# Translation model (EN → AR)
translator_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
translator_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ar").to(device)

# ============ SCENE DETECTION ============
scene_manager = SceneManager()
scene_manager.add_detector(ContentDetector(threshold=30.0))
video = open_video(video_path)
scene_manager.detect_scenes(video)
scene_list = scene_manager.get_scene_list()

# ============ KEYFRAME EXTRACTION + CAPTIONING ============
# Maps keyframe file name -> {"scene_time", "english", "arabic"}.
captions = {}

cap = cv2.VideoCapture(video_path)
try:
    for i, (start, _) in enumerate(scene_list):
        # Use the detector's own frame index. Recomputing it as
        # int(seconds * fps) can truncate to the last frame of the previous scene.
        frame_num = start.get_frames()
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if not ret:
            # Frame could not be decoded; skip this scene.
            continue

        frame_name = f"scene_{i:03}.jpg"
        frame_path = os.path.join(keyframe_dir, frame_name)
        cv2.imwrite(frame_path, frame)

        # OpenCV delivers BGR; convert to RGB before building the PIL image.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # --- Captioning with BLIP-2 ---
        inputs = caption_processor(images=image, return_tensors="pt").to(device, model_dtype)
        generated_ids = caption_model.generate(**inputs, max_new_tokens=50)
        english_caption = caption_processor.decode(generated_ids[0], skip_special_tokens=True).strip()

        # --- Translate to Arabic ---
        translation_inputs = translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(device)
        translated = translator_model.generate(**translation_inputs)
        arabic_caption = translator_tokenizer.decode(translated[0], skip_special_tokens=True).strip()

        # --- Save result with scene start time ---
        captions[frame_name] = {
            "scene_time": round(start.get_seconds(), 2),  # Time in seconds, rounded for readability
            "english": english_caption,
            "arabic": arabic_caption
        }

        print(f"✓ {frame_name} @ {start.get_seconds():.2f}s | EN: {english_caption} | AR: {arabic_caption}")
finally:
    # Always release the capture handle, even if captioning raises mid-loop.
    cap.release()

# Save JSON
with open(captions_json_path, "w", encoding="utf-8") as f:
    json.dump(captions, f, ensure_ascii=False, indent=2)
# Previously printed the undefined name `json_path`, which raised NameError.
print(f"✅ Captions saved to: {captions_json_path}")

## 🧹 6. Clean the Script

### 6.1. Clean the Transcript

In [None]:
import re
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.charmap import CharMapper
from camel_tools.disambig.mle import MLEDisambiguator

# ==== Shared CAMeL Tools objects (built once, reused by the functions below) ====

# Pretrained maximum-likelihood disambiguator, used for POS tagging.
disambig = MLEDisambiguator.pretrained()

# Built-in 'arclean' character mapper, used to normalize the raw text.
normalizer = CharMapper.builtin_mapper('arclean')

# ==== Function: Clean, Segment, POS tag ====
def arabic_segment_and_analyze(text):
    """Normalize Arabic text, split it into sentence-like segments and POS-tag them.

    Relies on the module-level `normalizer` (character mapper) and `disambig`
    (disambiguator) objects.

    Args:
        text: Raw Arabic text, e.g. a full transcript.

    Returns:
        A list with one dict per non-empty segment:
            "sentence": the segment text (str)
            "tokens_with_pos": list of (token, pos) tuples; pos is None when the
                disambiguator returned no analysis for that token.
    """
    # Character-level cleanup with the 'arclean' mapper.
    # NOTE(review): confirm against the CAMeL Tools docs exactly which mappings
    # 'arclean' applies before relying on specific normalizations.
    normalized_text = normalizer.map_string(text)

    # Put a sentence break in front of a stand-alone connective (followed by
    # whitespace), then turn every Arabic comma / period into ".\n".
    segmented = re.sub(r'(?<=\S)\s+(?=(و|ثم|لكن|ف|بعد|إلا أن|غير أن)\s)', r'. ', normalized_text)
    segmented = re.sub(r'،|\.\s*', '.\n', segmented)
    sentences = [s.strip() for s in segmented.split('\n') if s.strip()]

    # POS tagging: take the first analysis of each token.
    results = []
    for sent in sentences:
        tokens = simple_word_tokenize(sent)
        disambig_results = disambig.disambiguate(tokens)
        tagged = [
            # Guard against tokens with an empty analysis list, which would
            # otherwise raise IndexError and abort the whole transcript.
            (tok, d.analyses[0].analysis['pos'] if d.analyses else None)
            for tok, d in zip(tokens, disambig_results)
        ]
        results.append({
            "sentence": sent,
            "tokens_with_pos": tagged
        })
    return results

# ==== Read the raw Arabic transcript written by the transcription step ====
with open(transcript_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# ==== Segment (and POS-tag) the transcript ====
segmented = arabic_segment_and_analyze(raw_text)

# ==== Write one segment per line next to the other transcript files ====
output_txt = os.path.join(transcripts_path, f"{video_name}_segmented_ar.txt")
with open(output_txt, "w", encoding="utf-8") as f:
    f.writelines(entry['sentence'] + '\n' for entry in segmented)

print(f"✅ Segmented transcript saved to: {output_txt}")


### 6.2. Clean the captions

## 🧠 7. Process Transcript into Overlapping Chunks

In [None]:
# transformers is already installed (pinned) by the setup cell, so it is not
# reinstalled here: an unpinned install could silently change the pinned version.
from transformers import AutoTokenizer


def _overlapping_chunks(tokens, chunk_size, step):
    """Split `tokens` into windows of up to `chunk_size` items, one every `step` items.

    The final window may be shorter than `chunk_size`, so no trailing tokens are
    dropped and an input shorter than `chunk_size` still yields one chunk.
    An empty input yields an empty list.

    Raises:
        ValueError: if `chunk_size` or `step` is not positive.
    """
    if chunk_size <= 0 or step <= 0:
        raise ValueError("chunk_size and step must be positive")
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(tokens[start:start + chunk_size])
        if start + chunk_size >= len(tokens):
            # This window already reaches the end of the token list.
            break
    return chunks


# === Load tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# === Load transcript text ===
# Reads the Arabic transcript saved by the transcription step. The previous name
# `transcript_txt` was never defined and raised NameError.
with open(transcript_path, encoding="utf-8") as f:
    full_transcript = f.read()

# === Tokenize in small overlapping windows ===
tokens = tokenizer.tokenize(full_transcript)

# Define chunking parameters (in tokens); step < chunk_size gives the overlap.
chunk_size = 128
step = 64

# Generate overlapping chunks (as token strings), including the trailing partial one.
token_chunks = _overlapping_chunks(tokens, chunk_size, step)

# Convert token chunks back to readable strings
text_chunks = [tokenizer.convert_tokens_to_string(chunk) for chunk in token_chunks]

# Prepare model-ready input (≤512 tokens, padded)
tokenized_chunks = [
    tokenizer(chunk_text, return_tensors="pt", truncation=True, max_length=512, padding="max_length")
    for chunk_text in text_chunks
]

# Preview
for i, chunk in enumerate(text_chunks[:3]):
    print(f"\n--- Chunk {i+1} ---\n{chunk}")



## 🖼️ 8. Load Scene Captions

In [None]:
import json

# The captions file is named after the video (extension removed) and lives in
# the captions folder.
video_stem = os.path.splitext(video_filename)[0]
captions_json = os.path.join(captions_path, f"{video_stem}.json")

with open(captions_json, encoding='utf-8') as f:
    scenes = json.load(f)

# Keep only what the matching step needs: (scene id, Arabic caption) pairs,
# in the order they appear in the file.
scene_captions = [
    (scene_id, entry["arabic"])
    for scene_id, entry in scenes.items()
]

## 🔡 9. Embed Captions and Transcript Chunks

In [None]:
# Encode captions and transcript chunks with multilingual Sentence-BERT so they
# can be compared by cosine similarity.
from sentence_transformers import SentenceTransformer, util

# NOTE: this rebinds `model`, which earlier cells used for the Whisper model.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

caption_texts = [text for _, text in scene_captions]
caption_embeddings = model.encode(caption_texts, convert_to_tensor=True)

# The chunking cell produces `text_chunks`; the previously used name
# `transcript_chunks` was never defined and raised NameError.
transcript_embeddings = model.encode(text_chunks, convert_to_tensor=True)

## 🔗 10. Match Captions to Transcript Chunks

In [None]:
# Find the best-matching transcript chunk for each caption.
# similarities[i, j] is the cosine similarity of caption i and transcript chunk j.
similarities = util.cos_sim(caption_embeddings, transcript_embeddings)

# Row-wise maximum: best score and the index of the chunk that achieved it.
best_scores, best_indices = similarities.max(dim=1)

# `text_chunks` comes from the chunking cell; the previously used name
# `transcript_chunks` was never defined and raised NameError.
results = [
    {
        "scene_id": scene_id,
        "caption": caption_text,
        "best_transcript_chunk": text_chunks[chunk_idx],
        "similarity_score": score
    }
    for (scene_id, caption_text), chunk_idx, score in zip(
        scene_captions, best_indices.tolist(), best_scores.tolist()
    )
]

## 📥 11. Output Results

In [None]:
import pandas as pd

# Tabulate the caption/transcript matches and show the first ten rows.
df = pd.DataFrame(results)
preview_columns = ['scene_id', 'caption', 'best_transcript_chunk', 'similarity_score']
df[preview_columns].head(10)