In [None]:
# ================================================
# CELL 1: FIX - Reinstall Compatible Versions
# ================================================

print("🔧 Uninstalling incompatible packages...")
!pip uninstall -y torch torchvision torchaudio transformers

print("\n📦 Installing compatible versions...")
# Install compatible PyTorch and torchvision versions
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121

# Install transformers with compatible version
!pip install transformers==4.40.0

# Install WhisperX
!pip install whisperx

# Install FFmpeg
!apt-get install ffmpeg -y

print("\n✅ Installation complete with compatible versions!")

# Verify installations
import torch
import torchvision
print(f"\n🔍 Verification:")
print(f"PyTorch version: {torch.__version__}")
print(f"Torchvision version: {torchvision.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


🔧 Uninstalling incompatible packages...
Found existing installation: torch 2.8.0
Uninstalling torch-2.8.0:
  Successfully uninstalled torch-2.8.0
Found existing installation: torchvision 0.24.0+cu126
Uninstalling torchvision-0.24.0+cu126:
  Successfully uninstalled torchvision-0.24.0+cu126
Found existing installation: torchaudio 2.8.0
Uninstalling torchaudio-2.8.0:
  Successfully uninstalled torchaudio-2.8.0
Found existing installation: transformers 4.57.6
Uninstalling transformers-4.57.6:
  Successfully uninstalled transformers-4.57.6

📦 Installing compatible versions...
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.3.1
  Downloading https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp312-cp312-linux_x86_64.whl (780.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.9/780.9 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.18.1
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.

Collecting transformers==4.40.0
  Downloading transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/137.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.0)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m116.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizer

Collecting torch~=2.8.0 (from whisperx)
  Using cached torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting torchaudio~=2.8.0 (from whisperx)
  Using cached torchaudio-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (7.2 kB)
Collecting transformers>=4.48.0 (from whisperx)
  Downloading transformers-5.0.0-py3-none-any.whl.metadata (37 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch~=2.8.0->whisperx)
  Using cached nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch~=2.8.0->whisperx)
  Using cached nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch~=2.8.0->whisperx)
  Using cached nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cudnn-cu12==9.10.2.21 (from t

In [1]:

# ================================================
# CELL 2: Configuration
# ================================================
import whisperx
import gc

# Your HuggingFace token (get from https://huggingface.co/settings/tokens)
HF_TOKEN = "INSERT-YOUR-HF-TOKEN"

# Configuration
device = "cuda"  # Colab provides free GPU

batch_size = 16
compute_type = "float16"

# # Upload your video file to Colab or use Google Drive
# video_file = "your-video.mp4"  # Change this to your file name

In [2]:

# ================================================
# CELL 3: Alternative - Mount Google Drive
# ================================================
# Uncomment if your video is in Google Drive

from google.colab import drive
drive.mount('/content/drive')

# Set path to your video in Drive
video_file = '/content/drive/MyDrive/Udacity_meeting_capstone_record_04-Feb-26.mp4'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:

# ================================================
# CELL 4: Load and Transcribe
# ================================================
print("🎤 Starting transcription process...\n")

# Load audio
print("1️⃣ Loading audio...")
audio = whisperx.load_audio(video_file)
print("✅ Audio loaded\n")

# Load Whisper model
print("2️⃣ Loading Whisper model...")
model = whisperx.load_model("large-v2", device, compute_type=compute_type)
print("✅ Model loaded\n")

# Transcribe
print("3️⃣ Transcribing... (this takes ~10-15 min for 1-hour video)")
result = model.transcribe(audio, batch_size=batch_size, language='en')
print("✅ Transcription complete\n")


🎤 Starting transcription process...

1️⃣ Loading audio...
✅ Audio loaded

2️⃣ Loading Whisper model...


DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json: 0.00B [00:00, ?B/s]

vocabulary.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

2026-02-05 14:54:58 - whisperx.asr - INFO - No language specified, language will be detected for each audio file (increases inference time)
2026-02-05 14:54:58 - whisperx.vads.pyannote - INFO - Performing voice activity detection using Pyannote...


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.6.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../usr/local/lib/python3.12/dist-packages/whisperx/assets/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.1+cu121. Bad things might happen unless you revert torch to 1.x.
✅ Model loaded

3️⃣ Transcribing... (this takes ~10-15 min for 1-hour video)


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.



✅ Transcription complete



In [4]:
# ================================================
# CELL 5: Align Timestamps
# ================================================
print("4️⃣ Aligning timestamps...")
model_a, metadata = whisperx.load_align_model(
    language_code="en",
    device=device
)

result = whisperx.align(
    result["segments"],
    model_a,
    metadata,
    audio,
    device,
    return_char_alignments=False
)
print("✅ Timestamps aligned\n")

# Free memory
del model
gc.collect()

4️⃣ Aligning timestamps...


Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth
100%|██████████| 360M/360M [00:01<00:00, 276MB/s]


✅ Timestamps aligned



0

In [7]:
# ================================================
# CELL 6: Speaker Diarization (FIXED)
# ================================================
print("5️⃣ Loading speaker diarization model...")

# FIXED: Import from whisperx.diarize
from whisperx.diarize import DiarizationPipeline

diarize_model = DiarizationPipeline(
    use_auth_token=HF_TOKEN,
    device=device
)

print("6️⃣ Identifying speakers...")
diarize_segments = diarize_model(
    audio,
    min_speakers=3,
    max_speakers=3
)
print("✅ Speaker identification complete\n")


5️⃣ Loading speaker diarization model...
2026-02-05 15:02:59 - whisperx.diarize - INFO - Loading diarization model: pyannote/speaker-diarization-3.1


config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

6️⃣ Identifying speakers...


  std = sequences.std(dim=-1, correction=1)


✅ Speaker identification complete



In [8]:
# ================================================
# CELL 7: Assign Speakers to Transcript
# ================================================
print("7️⃣ Assigning speakers...")
result = whisperx.assign_word_speakers(diarize_segments, result)
print("✅ Complete!\n")


7️⃣ Assigning speakers...
✅ Complete!



In [9]:

# ================================================
# CELL 8: Display and Save Results
# ================================================
print("=" * 70)
print(" TRANSCRIPT WITH SPEAKER LABELS")
print("=" * 70 + "\n")

# Display transcript
for segment in result["segments"]:
    speaker = segment.get("speaker", "UNKNOWN")
    text = segment["text"]
    start = segment.get("start", 0)
    end = segment.get("end", 0)
    print(f"{speaker} [{start:.1f}s-{end:.1f}s]: {text}")

# Save to text file
output_file = "transcript_with_speakers.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write("VIDEO TRANSCRIPT WITH SPEAKER IDENTIFICATION\n")
    f.write("=" * 70 + "\n\n")
    for segment in result["segments"]:
        speaker = segment.get("speaker", "UNKNOWN")
        text = segment["text"]
        start = segment.get("start", 0)
        end = segment.get("end", 0)
        f.write(f"{speaker} [{start:.1f}s-{end:.1f}s]: {text}\n")

print(f"\n✅ Transcript saved to: {output_file}")

# Download the transcript
from google.colab import files
files.download(output_file)

print("\n🎉 All done! Your transcript has been downloaded.")


 TRANSCRIPT WITH SPEAKER LABELS

SPEAKER_01 [135.0s-140.1s]:  A lot of people saying hello in the chat, sharing where they're from, reporting the weather.
SPEAKER_01 [140.9s-150.1s]: Very glad to see you all and excited this morning, evening, afternoon, to talk about our Masters in AI capstone project.
SPEAKER_01 [151.6s-157.2s]: First, by way of introducing myself, my name is Patrick, and I lead consumer marketing at Udacity.
SPEAKER_01 [157.8s-171.5s]:  And I am very, very excited to be joined by Jared Moulton, our VP of Consumer at Udacity, and more or less the brains behind making this master's in AI a reality for students.
SPEAKER_01 [172.8s-173.7s]: Thanks for joining, Jared.
SPEAKER_02 [174.4s-175.1s]: Thanks for having me.
SPEAKER_02 [175.1s-176.5s]: Hi from Seattle, Washington.
SPEAKER_02 [177.1s-179.8s]: Great to see so many faces from all over the world.
SPEAKER_01 [181.2s-184.4s]:  We are also very fortunate to be joined by Dr.
SPEAKER_01 [184.4s-194.4s]: Brandy Robinson, w

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🎉 All done! Your transcript has been downloaded.
