In [None]:
# ================================================
# CELL 1: FIX - Reinstall Compatible Versions
# ================================================

print("üîß Uninstalling incompatible packages...")
!pip uninstall -y torch torchvision torchaudio transformers

print("\nüì¶ Installing compatible versions...")
# Install compatible PyTorch and torchvision versions
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121

# Install transformers with compatible version
!pip install transformers==4.40.0

# Install WhisperX
!pip install whisperx

# Install FFmpeg
!apt-get install ffmpeg -y

print("\n‚úÖ Installation complete with compatible versions!")

# Verify installations
import torch
import torchvision
print(f"\nüîç Verification:")
print(f"PyTorch version: {torch.__version__}")
print(f"Torchvision version: {torchvision.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


In [None]:

# ================================================
# CELL 2: Configuration
# ================================================
# Imports the pipeline dependencies and defines the settings shared by the
# cells below: HF_TOKEN, device, batch_size and compute_type.
import gc
import os
from getpass import getpass

import torch
import whisperx

# HuggingFace access token (create one at https://huggingface.co/settings/tokens).
# Taken from the HF_TOKEN environment variable when it is set; otherwise the
# user is prompted, so the secret is never stored in the notebook itself.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("HuggingFace token: ")

# Configuration
# Use the GPU when the runtime provides one, otherwise fall back to CPU so the
# notebook still runs (slowly) on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

batch_size = 16
# float16 is kept for GPU runs (the original setting); int8 is the setting the
# WhisperX README recommends when running on CPU.
compute_type = "float16" if device == "cuda" else "int8"

# # Upload your video file to Colab or use Google Drive
# video_file = "your-video.mp4"  # Change this to your file name

In [None]:

# ================================================
# CELL 3: Alternative - Mount Google Drive
# ================================================
# Mounts Google Drive and points `video_file` at the recording to transcribe.
# If the video was uploaded directly to Colab instead, skip this cell and set
# `video_file` to the uploaded file's path.
import os

from google.colab import drive
drive.mount('/content/drive')

# Set path to your video in Drive
video_file = '/content/drive/MyDrive/Udacity_meeting_capstone_record_04-Feb-26.mp4'

# Fail fast with a clear message here rather than later, when the audio is
# loaded and the models have already been downloaded.
if not os.path.isfile(video_file):
    raise FileNotFoundError(
        f"Video file not found: {video_file} "
        "(check the path and that Google Drive mounted successfully)"
    )


In [None]:

# ================================================
# CELL 4: Load and Transcribe
# ================================================
# Produces three names used by later cells:
#   audio  - what whisperx.load_audio returns for `video_file`
#   model  - the loaded "large-v2" Whisper model
#   result - the transcription of `audio`
print("üé§ Starting transcription process...\n")

# Step 1: read the audio from the input file
print("1Ô∏è‚É£ Loading audio...")
audio = whisperx.load_audio(video_file)
print("‚úÖ Audio loaded\n")

# Step 2: load the Whisper model on the configured device / precision
print("2Ô∏è‚É£ Loading Whisper model...")
model = whisperx.load_model(
    "large-v2",
    device,
    compute_type=compute_type,
)
print("‚úÖ Model loaded\n")

# Step 3: run the transcription (language is fixed to English)
print("3Ô∏è‚É£ Transcribing... (this takes ~10-15 min for 1-hour video)")
result = model.transcribe(
    audio,
    language='en',
    batch_size=batch_size,
)
print("‚úÖ Transcription complete\n")


In [None]:
# ================================================
# CELL 5: Align Timestamps
# ================================================
# Aligns the transcribed segments in `result` against `audio` and rebinds
# `result` to the aligned output.
import gc

import torch

# Free memory
# The Whisper model is not used by the alignment step, so release it *before*
# loading the alignment model to keep peak GPU memory down. Popping the name
# from globals() (instead of a bare `del model`) keeps this cell re-runnable:
# a second run no longer fails with NameError.
globals().pop("model", None)
gc.collect()
if torch.cuda.is_available():
    # Garbage collection alone does not hand PyTorch's cached CUDA memory back.
    torch.cuda.empty_cache()

print("4Ô∏è‚É£ Aligning timestamps...")
model_a, metadata = whisperx.load_align_model(
    language_code="en",
    device=device
)

result = whisperx.align(
    result["segments"],
    model_a,
    metadata,
    audio,
    device,
    return_char_alignments=False
)
print("‚úÖ Timestamps aligned\n")

In [None]:
# ================================================
# CELL 6: Speaker Diarization (FIXED)
# ================================================
# Builds the diarization pipeline and runs it on `audio`, producing
# `diarize_segments` for the speaker-assignment cell.
print("5Ô∏è‚É£ Loading speaker diarization model...")

# FIXED: Import from whisperx.diarize
from whisperx.diarize import DiarizationPipeline

# Number of speakers expected in the recording; adjust per video.
# Equal bounds force exactly that many speakers.
MIN_SPEAKERS = 3
MAX_SPEAKERS = 3

# The pipeline authenticates with HF_TOKEN. An empty value or the untouched
# placeholder cannot work, so stop here with a clear message instead of
# failing inside the model download.
if not HF_TOKEN or HF_TOKEN == "INSERT-YOUR-HF-TOKEN":
    raise ValueError(
        "HF_TOKEN is not set. Provide a valid HuggingFace token in the "
        "configuration cell before running speaker diarization."
    )

diarize_model = DiarizationPipeline(
    use_auth_token=HF_TOKEN,
    device=device
)

print("6Ô∏è‚É£ Identifying speakers...")
diarize_segments = diarize_model(
    audio,
    min_speakers=MIN_SPEAKERS,
    max_speakers=MAX_SPEAKERS
)
print("‚úÖ Speaker identification complete\n")


In [None]:
# ================================================
# CELL 7: Assign Speakers to Transcript
# ================================================
# Passes the diarization output (`diarize_segments`) and the aligned
# transcript (`result`) to `whisperx.assign_word_speakers`, then rebinds
# `result` to the returned value. The display/save cell reads a "speaker"
# field from each segment of this result.
# NOTE(review): `result` is overwritten rather than given a new name, so
# re-running this cell feeds its own previous output back in - confirm that
# is harmless.
print("7Ô∏è‚É£ Assigning speakers...")
result = whisperx.assign_word_speakers(diarize_segments, result)
print("‚úÖ Complete!\n")


In [None]:

# ================================================
# CELL 8: Display and Save Results
# ================================================
# Prints the speaker-labelled transcript, writes the same lines to a text
# file, and triggers a browser download of that file.


def _format_segment(segment):
    """Return one transcript line: '<speaker> [<start>s-<end>s]: <text>'.

    Missing "speaker" falls back to "UNKNOWN"; missing "start"/"end" fall
    back to 0. "text" is required.
    """
    speaker = segment.get("speaker", "UNKNOWN")
    text = segment["text"]
    start = segment.get("start", 0)
    end = segment.get("end", 0)
    return f"{speaker} [{start:.1f}s-{end:.1f}s]: {text}"


rule = "=" * 70

print(rule)
print(" TRANSCRIPT WITH SPEAKER LABELS")
print(rule + "\n")

# Show the transcript in the cell output
for seg in result["segments"]:
    print(_format_segment(seg))

# Write the same lines to a UTF-8 text file, preceded by a short header
output_file = "transcript_with_speakers.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write("VIDEO TRANSCRIPT WITH SPEAKER IDENTIFICATION\n")
    f.write(rule + "\n\n")
    for seg in result["segments"]:
        f.write(_format_segment(seg) + "\n")

print(f"\n‚úÖ Transcript saved to: {output_file}")

# Hand the file to the browser as a download (Colab only)
from google.colab import files
files.download(output_file)

print("\nüéâ All done! Your transcript has been downloaded.")
