<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/01_transcribe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 🚨 Reset Colab Python Kernel Automatically After Downgrading NumPy
!pip install numpy==1.23.5 --force-reinstall --no-cache-dir
import os; os.kill(os.getpid(), 9)

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m275.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blosc2 3.6.1 requires numpy>=1.26, but you have numpy 1.23.5 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.23.5 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "

In [1]:
# Install Whisper and Torch
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q torch==2.0.1 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
!apt-get install ffmpeg
!pip install -q pydub

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [2]:
# Expose Google Drive under /content/drive so the project folder
# (videos, transcripts, params.json) can be read and written.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import json
import os

# Path to your params.json on Google Drive
param_path = "/content/drive/MyDrive/ArabicVideoSummariser/params.json"

# Load it
with open(param_path, "r") as f:
    params = json.load(f)

# Get the filename
video_filename = params.get("video_file")
print("🎥 Transcribing video file:", video_filename)

from pydub import AudioSegment
import math

# Define base paths
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
videos_path = os.path.join(base_path, "videos")
transcripts_path = os.path.join(base_path, "transcripts")

video_path = os.path.join(videos_path, video_filename)
video_name = os.path.splitext(video_filename)[0]
transcript_path = os.path.join(transcripts_path, f"{video_name}_ar.txt")
translation_path = os.path.join(transcripts_path, f"{video_name}_en.txt")
trascription_json_path = os.path.join(transcripts_path, f"{video_name}_ar.json")
translation_json_path = os.path.join(transcripts_path, f"{video_name}_en.json")

# Convert video to audio
audio_path = os.path.join(videos_path, f"{video_name}.wav")
!ffmpeg -y -i "{video_path}" -ar 16000 -ac 1 "{audio_path}"  # Resample to 16kHz mono

# Load audio using pydub
audio = AudioSegment.from_wav(audio_path)
chunk_length_ms = 30 * 1000  # 30 seconds
total_chunks = math.ceil(len(audio) / chunk_length_ms)

print(f"🔊 Audio duration: {len(audio) / 1000:.1f}s, Chunks: {total_chunks}")


🎥 Transcribing video file: PaperMaking.mp4
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidsta

In [4]:
import gc
import json
import os

import torch
import whisper

# Free whatever earlier runs left behind before loading the large model.
# Collect Python garbage first so the CUDA caching allocator can actually
# release the blocks those objects were holding.
gc.collect()
torch.cuda.empty_cache()

# Prefer the GPU, but keep the cell runnable on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("large", device=device, in_memory=True)

# Transcribe the audio chunk by chunk and merge all segments into one list
# whose timestamps are relative to the start of the full recording.
results = []
for i in range(total_chunks):
    start_ms = i * chunk_length_ms
    end_ms = min((i + 1) * chunk_length_ms, len(audio))
    offset_s = start_ms / 1000  # chunk start, in seconds, within the full audio

    chunk = audio[start_ms:end_ms]
    chunk_file = f"/content/chunk_{i}.wav"
    # export() returns the open file object; close it so handles do not pile up.
    chunk.export(chunk_file, format="wav").close()

    print(f"⏱️ Transcribing chunk {i+1}/{total_chunks} ({start_ms/1000:.1f}s - {end_ms/1000:.1f}s)")
    try:
        result = model.transcribe(chunk_file, language="ar", task="transcribe", verbose=False, fp16=False)
    finally:
        # The chunk file is only a hand-off to Whisper; remove it even on failure.
        os.remove(chunk_file)

    for segment in result["segments"]:
        # Whisper reports times relative to the chunk; shift them to the full timeline.
        segment["start"] += offset_s
        segment["end"] += offset_s
        # Segment ids restart for every transcribe() call; renumber so they are
        # unique across the merged transcript.
        segment["id"] = len(results)
        results.append(segment)

# Save text transcript (segment texts stripped so words are separated by a single space)
with open(transcript_path, "w", encoding="utf-8") as f:
    f.write(" ".join(seg["text"].strip() for seg in results))

# Save time-coded transcript.
# splitext swaps only the real extension, whereas str.replace would rewrite
# every ".txt" occurrence anywhere in the path.
timecoded_path = os.path.splitext(transcript_path)[0] + "_with_timecodes.txt"
with open(timecoded_path, "w", encoding="utf-8") as f:
    for seg in results:
        f.write(f"[{seg['start']:.2f} - {seg['end']:.2f}] {seg['text'].strip()}\n")

# Save JSON (ensure_ascii=False keeps the Arabic text human-readable in the file)
with open(trascription_json_path, "w", encoding="utf-8") as f:
    json.dump({"segments": results}, f, ensure_ascii=False, indent=2)


  import scipy
  return self.fget.__get__(instance, owner)()


⏱️ Transcribing chunk 1/12 (0.0s - 30.0s)


100%|██████████| 3000/3000 [00:02<00:00, 1226.13frames/s]


⏱️ Transcribing chunk 2/12 (30.0s - 60.0s)


100%|██████████| 3000/3000 [00:06<00:00, 448.61frames/s]


⏱️ Transcribing chunk 3/12 (60.0s - 90.0s)


100%|██████████| 3000/3000 [00:05<00:00, 556.52frames/s]


⏱️ Transcribing chunk 4/12 (90.0s - 120.0s)


100%|██████████| 3000/3000 [00:05<00:00, 592.67frames/s]


⏱️ Transcribing chunk 5/12 (120.0s - 150.0s)


100%|██████████| 3000/3000 [00:05<00:00, 501.32frames/s]


⏱️ Transcribing chunk 6/12 (150.0s - 180.0s)


100%|██████████| 3000/3000 [00:01<00:00, 1654.82frames/s]


⏱️ Transcribing chunk 7/12 (180.0s - 210.0s)


100%|██████████| 3000/3000 [00:02<00:00, 1247.34frames/s]


⏱️ Transcribing chunk 8/12 (210.0s - 240.0s)


100%|██████████| 3000/3000 [00:04<00:00, 721.09frames/s]


⏱️ Transcribing chunk 9/12 (240.0s - 270.0s)


100%|██████████| 3000/3000 [00:04<00:00, 626.66frames/s]


⏱️ Transcribing chunk 10/12 (270.0s - 300.0s)


100%|██████████| 3000/3000 [00:02<00:00, 1226.01frames/s]


⏱️ Transcribing chunk 11/12 (300.0s - 330.0s)


100%|██████████| 3000/3000 [00:07<00:00, 421.04frames/s]


⏱️ Transcribing chunk 12/12 (330.0s - 354.7s)


100%|██████████| 2473/2473 [00:03<00:00, 623.59frames/s]


In [None]:
# Drop the Whisper model and hand its GPU memory back.
# pop() instead of `del` so re-running this cell does not raise NameError.
globals().pop("model", None)
# Collect first: empty_cache() can only release blocks that no live (or not yet
# collected) Python object still references.
gc.collect()
torch.cuda.empty_cache()