<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/01_transcription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Uninstall all conflicting packages
!pip uninstall -y torch torchaudio torchsummary torchtune torchdata torchao sentence-transformers torchvision whisper transformers tokenizers camel-tools camel-kenlm openai-whisper opencv-python opencv-contrib-python scenedetect numpy opencv-python-headless

In [None]:
# Sanity check of the environment after the uninstall step.
# NOTE(review): `sys` is imported here but not referenced by any cell visible
# in this notebook.
import sys
# Print the version reported by the `python` executable found on the shell PATH.
!echo "Python version: $(python --version)"
# List any still-installed packages whose names match the libraries removed
# above (grep exits non-zero and prints nothing when none remain).
!pip list | grep -E 'torch|numpy|transformers|whisper|camel|opencv|scene'

In [None]:
# Install Whisper and Torch
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q torch torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# Mount Google Drive at /content/drive. Later cells read params.json and the
# input video from, and write transcripts to, paths under
# /content/drive/MyDrive/ArabicVideoSummariser.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import json
import os

# Path to the params.json run configuration on Google Drive.
param_path = "/content/drive/MyDrive/ArabicVideoSummariser/params.json"

# JSON is UTF-8 by specification; pass the encoding explicitly so non-ASCII
# (e.g. Arabic) file names are read correctly regardless of the locale default.
with open(param_path, "r", encoding="utf-8") as f:
    params = json.load(f)

# Name of the video to transcribe. Validate it here so a missing or empty
# "video_file" entry fails with a clear message instead of surfacing later as
# an obscure TypeError when the name is joined onto a directory path.
video_filename = params.get("video_file")
if not isinstance(video_filename, str) or not video_filename.strip():
    raise ValueError(
        f'"video_file" is missing or empty in {param_path}; '
        "expected a file name such as MyVideo.mp4"
    )
print("🎥 Transcribing video file:", video_filename)

# Alternative: to prompt for the name interactively instead of using
# params.json, assign video_filename from input(...) here.

In [None]:

# Define base paths (all under the project folder on the mounted Drive).
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
videos_path = os.path.join(base_path, "videos")
transcripts_path = os.path.join(base_path, "transcripts")

# Input video and the stem used to name every output file.
video_path = os.path.join(videos_path, video_filename)
video_name = os.path.splitext(video_filename)[0]

# Fail early with a readable error rather than letting the transcription step
# fail later on a missing input.
if not os.path.isfile(video_path):
    raise FileNotFoundError(f"Video file not found: {video_path}")

# The output files are opened for writing later; make sure their directory
# exists so a fresh Drive folder does not cause the first write to fail.
os.makedirs(transcripts_path, exist_ok=True)

# Output locations: plain-text and JSON for Arabic (_ar) and English (_en).
transcript_path = os.path.join(transcripts_path, f"{video_name}_ar.txt")
translation_path = os.path.join(transcripts_path, f"{video_name}_en.txt")
# NOTE: the misspelled name `trascription_json_path` is kept because other
# cells reference it; `transcription_json_path` is a correctly spelled alias.
trascription_json_path = os.path.join(transcripts_path, f"{video_name}_ar.json")
transcription_json_path = trascription_json_path
translation_json_path = os.path.join(transcripts_path, f"{video_name}_en.json")

In [None]:
import json
import os

import torch
import whisper


def _timecoded_path(text_path):
    """Return `text_path` with `_with_timecodes` inserted before its extension."""
    # splitext touches only the final extension, unlike str.replace(".txt", ...),
    # which would also rewrite any earlier ".txt" occurring inside the path.
    root, ext = os.path.splitext(text_path)
    return f"{root}_with_timecodes{ext}"


def _write_timecoded(segments, path):
    """Write one `[start - end] text` line per segment to `path` (UTF-8)."""
    with open(path, "w", encoding="utf-8") as f:
        for segment in segments:
            start = segment["start"]
            end = segment["end"]
            text = segment["text"]
            f.write(f"[{start:.2f} - {end:.2f}] {text}\n")


def _write_json(result, path):
    """Dump the full transcribe() result to `path`, keeping non-ASCII text readable."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)


# Load the Whisper model on the GPU when one is available; otherwise fall back
# to CPU instead of failing on a hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    print("⚠️ No CUDA GPU detected; running Whisper on CPU (this will be slow).")
model = whisper.load_model("large", device=device)

# --- Transcribe (Arabic) ---
result = model.transcribe(video_path, language="ar", task="transcribe", verbose=True)

with open(transcript_path, "w", encoding="utf-8") as f:
    f.write(result["text"])
print(f"✅ Saved Arabic transcript to: {transcript_path}")

# Save timecoded transcript
_write_timecoded(result["segments"], _timecoded_path(transcript_path))

# Save full result as JSON
_write_json(result, trascription_json_path)
print(f"✅ Saved full Whisper output (AR) to: {trascription_json_path}")

# --- Translate (Arabic → English) ---
result_en = model.transcribe(video_path, language="ar", task="translate", verbose=True)

with open(translation_path, "w", encoding="utf-8") as f:
    f.write(result_en["text"])
print(f"✅ Saved English translation to: {translation_path}")

# Save timecoded translation
_write_timecoded(result_en["segments"], _timecoded_path(translation_path))

# Save the full English result as JSON too, mirroring the Arabic output;
# `translation_json_path` was defined for this but previously never written.
_write_json(result_en, translation_json_path)
print(f"✅ Saved full Whisper output (EN) to: {translation_json_path}")