<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/01_transcription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab filesystem so later cells can read and
# write files underneath the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)


In [None]:
# Install Whisper and Torch
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q torch torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
import os
import whisper

# Paths
input_dir = "/content/drive/MyDrive/ArabicVideoSummariser/videos"
output_dir = "/content/drive/MyDrive/ArabicVideoSummariser/transcripts"

# Lower-case file extensions that are treated as transcribable media
media_extensions = ('.mp4', '.mp3', '.wav', '.m4a')

# Set to True to keep transcripts that already exist instead of running
# Whisper on those files again (handy when resuming an interrupted session).
# The default, False, transcribes every file and overwrites old transcripts.
skip_existing = False

# Create output folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Get list of video/audio files: regular files only, sorted so the processing
# order is the same on every run. This happens before the model is loaded so
# that a missing input folder fails immediately instead of after the load.
video_files = sorted(
    name
    for name in os.listdir(input_dir)
    if name.lower().endswith(media_extensions)
    and os.path.isfile(os.path.join(input_dir, name))
)

print(f"Found {len(video_files)} files.")

# Load Whisper model
model = whisper.load_model("large")  # or "small", "medium", etc.

# Bookkeeping for the end-of-run report
transcript_sources = {}  # transcript filename -> input file that claimed it first
failed_files = []        # inputs whose transcription raised an exception

# Loop through files
for filename in video_files:
    input_path = os.path.join(input_dir, filename)

    # The transcript is named after the input file's stem
    output_filename = os.path.splitext(filename)[0] + ".txt"
    output_path = os.path.join(output_dir, output_filename)

    # Inputs that differ only by extension (talk.mp4 / talk.mp3) share one
    # transcript name; report it instead of losing a transcript silently.
    if output_filename in transcript_sources:
        print(
            f"⚠️ {filename} and {transcript_sources[output_filename]} share the "
            f"transcript name {output_filename}; only one transcript will be kept."
        )
    else:
        transcript_sources[output_filename] = filename

    if skip_existing and os.path.exists(output_path):
        print(f"⏭️ Skipping {filename}: {output_filename} already exists.")
        continue

    print(f"\n🔄 Transcribing: {filename} ...")

    try:
        # Run Whisper transcription.
        # NOTE(review): no language is passed, so Whisper picks one itself;
        # if every input is Arabic, consider language="ar" — confirm first.
        result = model.transcribe(input_path)

        # Save transcript
        with open(output_path, "w", encoding="utf-8") as transcript_file:
            transcript_file.write(result["text"])

        print(f"✅ Saved transcript to: {output_filename}")

    except Exception as e:
        # Best effort: one bad file must not abort the whole batch, but the
        # failure is remembered so it can be reported once the loop is done.
        print(f"❌ Failed to transcribe {filename}: {e}")
        failed_files.append(filename)

# Repeat the failures at the end so they are not buried in the log above
if failed_files:
    print(f"\n❌ {len(failed_files)} of {len(video_files)} files failed:")
    for failed_name in failed_files:
        print(f"   - {failed_name}")
