<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/01_transcription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Make Google Drive available inside the Colab VM's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


In [None]:
# Install Whisper and Torch
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q torch torchaudio --index-url https://download.pytorch.org/whl/cu118



In [None]:
import os
import whisper
import torch

# Paths: media is read from `input_dir`; transcripts are written to `output_dir`.
input_dir = "/content/drive/MyDrive/ArabicVideoSummariser/videos"
output_dir = "/content/drive/MyDrive/ArabicVideoSummariser/transcripts"

# Lower-case file extensions that are treated as transcribable media.
media_extensions = ('.mp4', '.mp3', '.wav', '.m4a')

# Create output folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load Whisper model. Use the GPU when one is available, otherwise fall back
# to CPU instead of failing on a hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    print("⚠️ No CUDA device detected; running Whisper on CPU (this may be very slow).")
model = whisper.load_model("large", device=device)

# Get list of video/audio files.
# - sorted(): os.listdir() returns entries in arbitrary order; sorting makes
#   the processing order reproducible between runs.
# - isfile(): skip directories whose name happens to end in a media extension.
video_files = sorted(
    f for f in os.listdir(input_dir)
    if f.lower().endswith(media_extensions)
    and os.path.isfile(os.path.join(input_dir, f))
)

print(f"Found {len(video_files)} files.")

# Loop through files. Each file is handled independently: a failure is
# reported and the loop moves on to the next file.
for filename in video_files:
    input_path = os.path.join(input_dir, filename)
    # Output files are named after the input file without its extension.
    base_name = os.path.splitext(filename)[0]
    print(f"\n🔄 Transcribing: {filename} ...")

    try:
        # 1) Transcribe (Arabic → Arabic) and save as <base_name>_ar.txt
        result_ar = model.transcribe(input_path, language="ar", task="transcribe")
        transcript_path = os.path.join(output_dir, base_name + "_ar.txt")
        with open(transcript_path, "w", encoding="utf-8") as f:
            f.write(result_ar["text"])
        print(f"✅ Saved Arabic transcript to: {transcript_path}")

        # 2) Translate (Arabic → English) and save as <base_name>_en.txt
        result_en = model.transcribe(input_path, language="ar", task="translate")
        translation_path = os.path.join(output_dir, base_name + "_en.txt")
        with open(translation_path, "w", encoding="utf-8") as f:
            f.write(result_en["text"])
        print(f"✅ Saved English translation to: {translation_path}")

    except Exception as e:
        # Deliberate best-effort: one bad file must not abort the whole batch.
        print(f"❌ Failed to transcribe {filename}: {e}")
