<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/VideoProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎥 Arabic Video Multimodal Validator and Summarizer

This Colab notebook lets you input the name of an Arabic video file and automatically performs:
- Audio transcription (Arabic)
- Scene/keyframe caption validation using Sentence-BERT and CLIP
- (Optional) Abstractive summarization with mBART

## ♻️ 1. Setup Environment

In [None]:
# Install Whisper and Torch
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q torch torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install scenedetect[opencv] transformers accelerate bitsandbytes
!pip install -q transformers sentence-transformers torchaudio opencv-python Pillow
!pip install sacremoses
#! pip install pyarabic
! pip install camel-tools
from google.colab import drive
import os
import torch



In [None]:
# Mount Google Drive.
# force_remount=True refreshes an existing or stale mount in place. The previous
# approach (fusermount -u followed by `rm -rf /content/drive`) is unsafe: if the
# unmount fails, the recursive delete runs against the still-mounted Drive and
# can remove the user's files.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Define base paths
# Everything lives under one project folder on the mounted Google Drive.
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
# Input videos are read from here; this folder is NOT created below, so it must already exist.
videos_path = os.path.join(base_path, "videos")
# Output folders for the artefacts written by later cells.
transcripts_path = os.path.join(base_path, "transcripts")
captions_path = os.path.join(base_path, "captions")
keyframes_path = os.path.join(base_path, "keyframes")
# exist_ok=True keeps the cell idempotent: re-running it does not fail on existing folders.
os.makedirs(transcripts_path, exist_ok=True)
os.makedirs(captions_path, exist_ok=True)
os.makedirs(keyframes_path, exist_ok=True)

## ▶️ 2. Input Video Filename

In [None]:
# Input Video Filename
# Strip surrounding whitespace so an accidental leading/trailing space in the
# typed name does not make the file-existence check in the next cell fail.
video_filename = input("Enter the name of the video file (e.g., MyVideo.mp4): ").strip()

In [None]:
# Full path of the selected video inside the Drive "videos" folder.
video_path = os.path.join(videos_path, video_filename)
# Fail fast with a clear message before any expensive model is loaded.
assert os.path.exists(video_path), f"Video file not found: {video_path}"
# File name without its extension; used to name the transcript, caption and keyframe outputs.
video_name = os.path.splitext(video_filename)[0]



## 🔊 3. Transcribe Arabic Audio using Whisper



In [None]:
import whisper

# Load Whisper model.
# Pick the GPU when one is available and fall back to CPU otherwise, so the cell
# does not crash on a CPU-only runtime (same fallback the captioning cell uses).
whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("large", device=whisper_device)

# Transcribe (Arabic speech -> Arabic text)
result = model.transcribe(video_path, language="ar", task="transcribe")
transcript_txt = os.path.join(transcripts_path, f"{video_name}_ar.txt")
with open(transcript_txt, "w", encoding="utf-8") as f:
    f.write(result['text'])
print(f"✅ Saved Arabic transcript to: {transcript_txt}")


# Translate (Arabic speech -> English text); this is a second full pass over the audio
result_en = model.transcribe(video_path, language="ar", task="translate")
translation_txt = os.path.join(transcripts_path, f"{video_name}_en.txt")
with open(translation_txt, "w", encoding="utf-8") as f:
    f.write(result_en["text"])
print(f"✅ Saved English translation to: {translation_txt}")

## 🖼️ 4. KeyFrame Detection & Captioning

In [None]:
import cv2
import json
from PIL import Image
from scenedetect import open_video, SceneManager
#from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
from transformers import AutoProcessor, Blip2ForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer


# ============ SETUP ============
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on GPU, full precision on CPU.
caption_dtype = torch.float16 if device == "cuda" else torch.float32

# BLIP-2 model
# device_map="auto" already places the weights on the available device(s), so the
# model must not be moved again with .to(): accelerate warns about moving a
# dispatched model and raises if any layer was offloaded.
caption_processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b", use_fast=False)
caption_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map="auto",
    torch_dtype=caption_dtype
)

# Translation model (EN → AR)
translator_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
translator_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ar").to(device)


keyframe_dir = os.path.join(keyframes_path, video_name)
os.makedirs(keyframe_dir, exist_ok=True)
captions = {}

# --- Detect scenes ---
scene_manager = SceneManager()
scene_manager.add_detector(ContentDetector(threshold=30.0))
video = open_video(video_path)
scene_manager.detect_scenes(video)
# start_in_scene=True: a video with no detected cut still yields one scene
# covering the whole video instead of an empty list (and therefore no captions).
scene_list = scene_manager.get_scene_list(start_in_scene=True)

# --- Extract frames ---
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)

try:
    for i, (start, _) in enumerate(scene_list):
        # Use the detector's exact frame index. Rebuilding it as int(seconds * fps)
        # truncates floating-point error and can select the last frame of the
        # previous scene.
        frame_num = start.get_frames()
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if not ret:
            # Best effort: skip scenes whose first frame cannot be decoded.
            continue

        frame_name = f"scene_{i:03}.jpg"
        frame_path = os.path.join(keyframe_dir, frame_name)
        cv2.imwrite(frame_path, frame)

        # Convert to PIL (OpenCV decodes as BGR, the processor expects RGB)
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # --- Captioning with BLIP-2 ---
        # Send the inputs to wherever the model was actually placed.
        inputs = caption_processor(images=image, return_tensors="pt").to(caption_model.device, caption_dtype)
        generated_ids = caption_model.generate(**inputs, max_new_tokens=50)
        english_caption = caption_processor.decode(generated_ids[0], skip_special_tokens=True).strip()

        # --- Translate to Arabic ---
        translation_inputs = translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(device)
        translated = translator_model.generate(**translation_inputs)
        arabic_caption = translator_tokenizer.decode(translated[0], skip_special_tokens=True).strip()

        # --- Save result with scene start time ---
        captions[frame_name] = {
            "scene_time": round(start.get_seconds(), 2),  # Time in seconds, rounded for readability
            "english": english_caption,
            "arabic": arabic_caption
        }

        print(f"✓ {frame_name} @ {start.get_seconds():.2f}s | EN: {english_caption} | AR: {arabic_caption}")
finally:
    # Release the capture even if captioning or translation raises mid-loop.
    cap.release()

# Save JSON
json_path = os.path.join(captions_path, f"{video_name}.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(captions, f, ensure_ascii=False, indent=2)
print(f"✅ Captions saved to: {json_path}")

## 🧹 5. Clean the Script

In [None]:

import re
import pyarabic.araby as araby



def clean_arabic_text(text):
    """Tidy a piece of Arabic text and return the stripped result.

    The steps run in this order:
      1. pyarabic ligature normalisation (`araby.normalize_ligature`),
      2. removal of Tatweel (ـ) characters,
      3. pyarabic `araby.autocorrect`,
      4. collapsing a word repeated immediately after itself (separated by a
         single space) into one occurrence.
    """
    without_ligatures = araby.normalize_ligature(text)
    without_tatweel = re.sub(r'[ـ]+', '', without_ligatures)
    corrected = araby.autocorrect(without_tatweel)
    deduplicated = re.sub(r'\b(\w+)( \1\b)+', r'\1', corrected)
    return deduplicated.strip()


In [None]:


import re
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.charmap import CharMapper
from camel_tools.disambig.mle import MLEDisambiguator

# ==== Step 1: Setup CAMeL Tools ====
# Character mapper built from CAMeL Tools' bundled 'arclean' map; used to normalise text before segmentation.
normalizer = CharMapper.builtin_mapper('arclean')
# Pretrained MLE disambiguator; used below to pick one morphological analysis (and its POS tag) per token.
disambig = MLEDisambiguator.pretrained()

# ==== Step 2: Define segmentation and analysis function ====

def arabic_segment_and_analyze(text):
    """Split Arabic text into heuristic sentences and POS-tag every token.

    Args:
        text: Raw Arabic text (e.g. an unpunctuated speech transcript).

    Returns:
        A list with one dict per sentence:
        ``{"sentence": str, "tokens_with_pos": [(token, pos), ...]}``.
        ``pos`` is ``None`` for a token the disambiguator returned no
        analysis for.

    Relies on the module-level ``normalizer`` (CAMeL 'arclean' character map)
    and ``disambig`` (pretrained MLE disambiguator).
    """
    # --- Normalize text with the 'arclean' character map ---
    normalized_text = normalizer.map_string(text)

    # --- Heuristic-based sentence segmentation ---
    # Insert a sentence break before a standalone conjunction/connector word.
    segmented = re.sub(r'(?<=\S)\s+(?=(و|ثم|لكن|ف|بعد|إلا أن|غير أن)\s)', r'. ', normalized_text)
    # Turn Arabic commas and full stops into "end of sentence + newline".
    segmented = re.sub(r'،|\.\s*', '.\n', segmented)
    # Keep only segments that contain at least one word character. A connector
    # that follows existing punctuation otherwise produces a sentence that is
    # just ".".
    sentences = [s.strip() for s in segmented.split('\n') if re.search(r'\w', s)]

    # --- Morphological disambiguation & POS tagging ---
    results = []
    for sent in sentences:
        tokens = simple_word_tokenize(sent)
        disambig_results = disambig.disambiguate(tokens)
        tagged = []
        for tok, d in zip(tokens, disambig_results):
            # The top-ranked analysis comes first. Guard against an empty
            # analysis list so one unknown token cannot abort the run with
            # an IndexError.
            pos = d.analyses[0].analysis.get('pos') if d.analyses else None
            tagged.append((tok, pos))
        results.append({
            "sentence": sent,
            "tokens_with_pos": tagged
        })

    return results

# ==== Step 3: Read raw transcript and apply segmentation ====
# Reuse transcripts_path and video_name from the earlier cells. Overwriting them
# here with placeholder values ("/content/transcripts", "my_video") pointed this
# cell at a file that does not exist and silently redirected every later cell.
transcript_txt = os.path.join(transcripts_path, f"{video_name}_ar.txt")
output_txt = os.path.join(transcripts_path, f"{video_name}_segmented_ar.txt")

with open(transcript_txt, "r", encoding="utf-8") as f:
    raw_text = f.read()

segmented = arabic_segment_and_analyze(raw_text)

# ==== Step 4: Write segmented sentences to output ====
# One sentence per line; the POS tags are kept in `segmented` only.
with open(output_txt, "w", encoding="utf-8") as f:
    for item in segmented:
        f.write(item['sentence'] + '\n')

print(f"✅ Segmented transcript saved to: {output_txt}")


### 5.1. Clean the Transcript

In [None]:
# Destination for the cleaned transcript, stored in the transcripts folder.
transcript_clean_txt = os.path.join(transcripts_path, f"{video_name}_clean_ar.txt")

# Read the transcript file at transcript_txt as UTF-8 text.
with open(transcript_txt, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Apply the cleaning function to the whole transcript at once.
cleaned_transcript = clean_arabic_text(raw_text)

# Write the cleaned text to its own file; the raw transcript is left untouched.
with open(transcript_clean_txt, "w", encoding="utf-8") as f:
    f.write(cleaned_transcript)

### 5.2. Clean the Captions

In [None]:

import json

json_clean_path = os.path.join(captions_path, f"{video_name}_clean.json")

with open(json_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Parse the caption file and clean only the Arabic caption of each scene.
# Running the Arabic cleaner over the raw JSON text would also rewrite the keys,
# the English captions and the surrounding JSON syntax.
caption_records = json.loads(raw_text)
for caption_record in caption_records.values():
    if isinstance(caption_record, dict) and isinstance(caption_record.get("arabic"), str):
        caption_record["arabic"] = clean_arabic_text(caption_record["arabic"])

# Serialise with the same settings used when the captions were first saved.
cleaned_captions = json.dumps(caption_records, ensure_ascii=False, indent=2)

with open(json_clean_path, "w", encoding="utf-8") as f:
    f.write(cleaned_captions)

## 🧠 6. Process Transcript into Overlapping Chunks

In [None]:
!pip install transformers --quiet

from transformers import AutoTokenizer

# === Load tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# === Load transcript text ===
with open(transcript_txt, encoding="utf-8") as f:
    full_transcript = f.read()

# === Tokenize in small overlapping windows ===
tokens = tokenizer.tokenize(full_transcript)

# Define chunking parameters
chunk_size = 128  # tokens per window
step = 64         # stride between window starts (50% overlap)

# Generate overlapping chunks (as token strings).
# Windows start every `step` tokens and stop once one reaches the end of the
# transcript. The final window may be shorter than chunk_size, so a transcript
# under chunk_size tokens still yields one chunk and the tail is never dropped.
token_chunks = []
for chunk_start in range(0, len(tokens), step):
    token_chunks.append(tokens[chunk_start:chunk_start + chunk_size])
    if chunk_start + chunk_size >= len(tokens):
        break

# Convert token chunks back to readable strings
text_chunks = [tokenizer.convert_tokens_to_string(chunk) for chunk in token_chunks]

# Prepare model-ready input (≤512 tokens, padded)
tokenized_chunks = [
    tokenizer(chunk_text, return_tensors="pt", truncation=True, max_length=512, padding="max_length")
    for chunk_text in text_chunks
]

# Preview
for i, chunk in enumerate(text_chunks[:3]):
    print(f"\n--- Chunk {i+1} ---\n{chunk}")



## 🖼️ 7. Load Scene Captions

In [None]:
# Load the per-scene captions saved for this video
import json

# The caption file is named after the video file, minus its extension.
captions_json = os.path.join(captions_path, os.path.splitext(video_filename)[0] + ".json")
with open(captions_json, encoding='utf-8') as f:
    scenes = json.load(f)

# Keep (scene key, Arabic caption) pairs in file order.
scene_captions = [(scene_id, info["arabic"]) for scene_id, info in scenes.items()]

## 🔡 8. Embed Captions and Transcript Chunks

In [None]:
# Encode using multilingual Sentence-BERT
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# The chunking step stores its readable chunks in `text_chunks`. Expose them
# under the name this cell and the matching cell use; it was never defined,
# which raised a NameError.
transcript_chunks = text_chunks
assert transcript_chunks, "No transcript chunks to embed - is the transcript empty?"

caption_texts = [text for _, text in scene_captions]
caption_embeddings = model.encode(caption_texts, convert_to_tensor=True)
transcript_embeddings = model.encode(transcript_chunks, convert_to_tensor=True)

## 🔗 9. Match Captions to Transcript Chunks

In [None]:
# Pair every scene caption with its most similar transcript chunk.
# Row i of the similarity matrix holds the cosine scores of caption i against all chunks.
similarities = util.cos_sim(caption_embeddings, transcript_embeddings)
results = []
for i, (scene_id, caption_text) in enumerate(scene_captions):
    sim_scores = similarities[i]
    # Index of the highest-scoring chunk for this caption.
    top_idx = int(sim_scores.argmax())
    results.append(dict(
        scene_id=scene_id,
        caption=caption_text,
        best_transcript_chunk=transcript_chunks[top_idx],
        similarity_score=sim_scores[top_idx].item(),
    ))

## 📥 10. Output Results

In [None]:
# Show the first ten caption-to-transcript matches as a table
import pandas as pd
df = pd.DataFrame(results)
df.loc[:, ['scene_id', 'caption', 'best_transcript_chunk', 'similarity_score']].head(10)