<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/VideoProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎥 Arabic Video Multimodal Validator and Summarizer

This Colab notebook lets you input the name of an Arabic video file and automatically performs:
- Audio transcription (Arabic)
- Scene/keyframe caption validation using Sentence-BERT and CLIP
- (Optional) Abstractive summarization with mBART

---

# 🔊 1. Setup Environment

In [1]:
# Install Whisper and Torch
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q torch torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers sentence-transformers torchaudio opencv-python Pillow
from google.colab import drive
import os
import torch


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [6]:
# Mount Google Drive
# Unmount first
!fusermount -u /content/drive || echo "Already unmounted"

# Delete the mount folder entirely
!rm -rf /content/drive

# Now mount again
from google.colab import drive
drive.mount('/content/drive')




fusermount: failed to unmount /content/drive: Invalid argument
Already unmounted
Mounted at /content/drive


In [7]:
# Define base paths
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
videos_path = os.path.join(base_path, "videos")
transcripts_path = os.path.join(base_path, "transcripts")
captions_path = os.path.join(base_path, "captions")
keyframes_path = os.path.join(base_path, "keyframes")
os.makedirs(transcripts_path, exist_ok=True)
os.makedirs(captions_path, exist_ok=True)
os.makedirs(keyframes_path, exist_ok=True)

# 🔊 2. Input Video Filename

In [8]:
# Input Video Filename
video_filename = input("Enter the name of the video file (e.g., MyVideo.mp4): ")

Enter the name of the video file (e.g., MyVideo.mp4): Calligraphy.mp4


In [9]:
video_path = os.path.join(videos_path, video_filename)
assert os.path.exists(video_path), f"Video file not found: {video_path}"


In [None]:
# Transcribe the video
import whisper
# Load Whisper model
model = whisper.load_model("large", device="cuda")

# transcribe (Arabic)
result = model.transcribe(video_path, language="ar", task="transcribe")
transcript_txt = os.path.join(transcripts_path, f"{os.path.splitext(video_filename)[0]}_ar.txt")
with open(transcript_txt, "w", encoding="utf-8") as f:
    f.write(result['text'])
print(f"✅ Saved Arabic transcript to: {transcript_txt}")


# Translate (Arabic → English)
result_en = model.transcribe(video_path, language="ar", task="translate")
translation_txt = os.path.join(transcripts_path, f"{os.path.splitext(video_filename)[0]}_en.txt")
with open(translation_txt, "w", encoding="utf-8") as f:
    f.write(result_en["text"])
print(f"✅ Saved English translation to: {translation_txt}")

100%|█████████████████████████████████████| 2.88G/2.88G [01:19<00:00, 38.9MiB/s]


# 🧠 4. Process Transcript into Overlapping Chunks

In [None]:
# Load transcript and split into chunks
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

with open(transcript_txt, encoding='utf-8') as f:
    full_transcript = f.read()

words = word_tokenize(full_transcript)
chunk_size = 30
step = 15
transcript_chunks = [' '.join(words[i:i+chunk_size]) for i in range(0, len(words)-chunk_size, step)]

# 🖼️ 5. Load Scene Captions

In [None]:
# Load captions from JSON
import json
captions_json = os.path.join(captions_path, f"{os.path.splitext(video_filename)[0]}.json")
with open(captions_json, encoding='utf-8') as f:
    scenes = json.load(f)
scene_captions = [(scene, data["arabic"]) for scene, data in scenes.items()]

# 🔡 6. Embed Captions and Transcript Chunks

In [None]:
# Encode using multilingual Sentence-BERT
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

caption_texts = [text for _, text in scene_captions]
caption_embeddings = model.encode(caption_texts, convert_to_tensor=True)
transcript_embeddings = model.encode(transcript_chunks, convert_to_tensor=True)

# 🔗 7. Match Captions to Transcript Chunks

In [None]:
# Find best transcript match for each caption
results = []
similarities = util.cos_sim(caption_embeddings, transcript_embeddings)
for i, (scene_id, caption_text) in enumerate(scene_captions):
    sim_scores = similarities[i]
    top_idx = sim_scores.argmax().item()
    results.append({
        "scene_id": scene_id,
        "caption": caption_text,
        "best_transcript_chunk": transcript_chunks[top_idx],
        "similarity_score": float(sim_scores[top_idx])
    })

# 📥 8. Output Results

In [None]:
# Display a few matches
import pandas as pd
df = pd.DataFrame(results)
df[['scene_id', 'caption', 'best_transcript_chunk', 'similarity_score']].head(10)