# Attempt 4

In [2]:
# Complete Audio Transcription with Speaker Diarization for Google Colab
# Designed for free tier constraints with one-file-at-a-time processing

# ============================================
# BLOCK 1: Environment Reset and GPU Check
# ============================================
# Run this first to ensure clean environment
# This cell only reports the runtime (GPU model, Python version) and clears
# pip leftovers; it installs nothing.
import subprocess
import sys
import os

# Check current environment
print("Checking environment...")
# `-L` lists the attached GPUs one per line.
!nvidia-smi -L
!python --version

# Clean any corrupted installations
# NOTE(review): `~orch` is presumably the half-removed `torch` directory left
# behind by an interrupted pip uninstall - confirm. Errors are silenced and
# `|| true` keeps the cell from failing when nothing matches.
!rm -rf /usr/local/lib/python3.*/dist-packages/~orch 2>/dev/null || true
# Drop cached wheels so the pinned versions installed in Block 2 are fetched fresh.
!pip cache purge -q

print("\n✓ Environment cleaned. Proceed to Block 2.")

Checking environment...
GPU 0: Tesla T4 (UUID: GPU-3f5ef6a4-1e4f-ecac-a352-ec20ecfd4813)
Python 3.11.13

✓ Environment cleaned. Proceed to Block 2.


In [3]:
# ============================================
# BLOCK 2: Strategic Dependency Installation
# ============================================
# CRITICAL: Run this in exact order
print("Installing dependencies in correct order...")

# Step 1: Force NumPy 1.x to avoid compatibility issues
!pip uninstall -y numpy -q
!pip install numpy==1.24.3 -q

# Step 2: Install PyTorch with specific CUDA version
!pip install torch==2.1.2+cu118 torchaudio==2.1.2+cu118 --index-url https://download.pytorch.org/whl/cu118 -q

# Step 3: Install critical dependencies with version pins
!pip install transformers==4.36.2 -q
!pip install faster-whisper==1.0.3 -q
!pip install ctranslate2==4.4.0 -q  # Critical for Colab

# Step 4: Install audio processing libraries
!pip install pydub==0.25.1 -q
!pip install librosa==0.10.1 -q

# Step 5: Install pyannote.audio
!pip install pyannote.audio==3.1.1 -q

# Step 6: Install WhisperX without dependencies
!pip install --no-deps git+https://github.com/m-bain/whisperx.git@v3.1.1 -q

# Step 7: Install remaining WhisperX requirements
!pip install pandas==2.0.3 -q  # Compatible with Colab
!pip install nltk>=3.8 -q
!pip install ffmpeg-python==0.2.0 -q

print("\n✓ Dependencies installed. Restart runtime if you see errors.")
print("After restart, run from Block 3 onwards.")

Installing dependencies in correct order...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
whisperx 3.1.1 requires setuptools==65.6.3, but you have setuptools 75.2.0 which is incompatible.
whisperx 3.1.1 requires torch==2.0.0, but you have torch 2.1.2+cu118 which is incompatible.
whisperx 3.1.1 requires torchaudio==2.0.1, but you have torchaudio 2.1.2+cu118 which is incompatible.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.0.3 which is incompatible.
xarray 2025.3.1 requires pandas>=2.1, but you have pandas 2.0.3 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.
albumentations 2.0.8 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.
albuc

In [1]:
# ============================================
# BLOCK 3: Import and Verify Installation
# ============================================
# Imports the modules the later blocks rely on (torch, gc, os, datetime,
# whisperx) and reports whether the installation from Block 2 is usable.
import torch
import gc
import os
import json
from pathlib import Path
from datetime import datetime
import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Verify installations
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display.
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# Test critical imports
# Failures are reported instead of raised so both checks always run; a failed
# import here means the processing blocks below cannot work.
try:
    import whisperx
    print("✓ WhisperX imported successfully")
except Exception as e:
    print(f"❌ WhisperX import error: {e}")
    print("Please restart runtime and run from Block 3")

try:
    from pyannote.audio import Pipeline
    print("✓ pyannote.audio imported successfully")
except Exception as e:
    print(f"❌ pyannote.audio import error: {e}")

PyTorch version: 2.1.2+cu118
CUDA available: True
GPU: Tesla T4
GPU Memory: 14.7 GB
❌ WhisperX import error: module 'numpy' has no attribute 'dtypes'
Please restart runtime and run from Block 3
❌ pyannote.audio import error: module 'numpy' has no attribute 'dtypes'


In [None]:
# ============================================
# BLOCK 4: Mount Drive and Setup Paths
# ============================================
# Defines the notebook-wide work lists used by the processing blocks:
# `all_files` (every input recording), `completed` (finished transcript
# files) and `remaining` (recordings that still need a transcript).
import glob
from google.colab import drive

drive.mount('/content/drive')

# Configure paths - adjust these to your structure
INPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Recordings_PRUT"
OUTPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Transcripts"

# The transcript folder may not exist on a first run
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Collect the recordings: MP4s first, then MP3s, each group sorted by path
mp4_files = glob.glob(os.path.join(INPUT_PATH, "*.mp4"))
mp4_files.sort()
mp3_files = glob.glob(os.path.join(INPUT_PATH, "*.mp3"))
mp3_files.sort()
all_files = [*mp4_files, *mp3_files]

print(f"\nFound {len(all_files)} audio files:")
print(f"  - {len(mp4_files)} MP4 files")
print(f"  - {len(mp3_files)} MP3 files")

# A recording counts as done when "<stem>_transcript.md" exists in OUTPUT_PATH
completed = [name for name in os.listdir(OUTPUT_PATH) if name.endswith('_transcript.md')]
completed_bases = [name.replace('_transcript.md', '') for name in completed]

remaining = [
    path for path in all_files
    if os.path.splitext(os.path.basename(path))[0] not in completed_bases
]

print(f"\nProgress:")
print(f"  - Completed: {len(completed)}")
print(f"  - Remaining: {len(remaining)}")

In [None]:
# ============================================
# BLOCK 5: Configure HuggingFace Token
# ============================================
# Resolves HF_TOKEN (needed to download the gated pyannote diarization models)
# from Colab secrets, falling back to a hidden prompt.
from getpass import getpass

from google.colab import userdata

# Try to get from Colab secrets first. userdata.get() raises when the secret
# is missing or the notebook has not been granted access, so treat any failure
# as "no token stored" and fall back to asking for it.
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
except Exception as e:
    print(f"Could not read HF_TOKEN from Colab secrets ({type(e).__name__})")
    HF_TOKEN = None

if HF_TOKEN:
    print("✓ Found HF token in Colab secrets")
else:
    print("\nHuggingFace token required for speaker diarization.")
    print("Get your token from: https://huggingface.co/settings/tokens")
    print("Accept conditions at: https://huggingface.co/pyannote/speaker-diarization-3.1")
    # getpass() does not echo the input, so the token is not stored in the
    # notebook's saved cell output the way it is with input().
    HF_TOKEN = getpass("Enter your HuggingFace token: ").strip()

# Fail here with a clear message rather than later inside the diarization step.
if not HF_TOKEN:
    raise ValueError("No HuggingFace token provided; speaker diarization cannot run without one.")

# Set as environment variable
os.environ["HF_TOKEN"] = HF_TOKEN

In [None]:
# ============================================
# BLOCK 6: Audio Conversion Function
# ============================================
from pydub import AudioSegment
import subprocess

def convert_to_wav(input_path, temp_dir="/content/temp_audio"):
    """Convert an audio/video file to a 16 kHz mono 16-bit PCM WAV using ffmpeg.

    Args:
        input_path: Path of the source recording (e.g. MP4 or MP3).
        temp_dir: Directory that holds the converted WAV files; created if missing.

    Returns:
        ``input_path`` unchanged when it already ends in ``.wav``; otherwise the
        path of the converted (or previously converted) WAV inside ``temp_dir``;
        ``None`` when the conversion fails.
    """
    os.makedirs(temp_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(input_path))[0]
    output_path = os.path.join(temp_dir, f"{base_name}.wav")
    # ffmpeg writes here first; the file is renamed to output_path only after
    # ffmpeg succeeds, so an interrupted conversion can never be mistaken for
    # a finished, cached WAV on the next run.
    partial_path = os.path.join(temp_dir, f"{base_name}.partial.wav")

    # Skip if already WAV
    if input_path.lower().endswith('.wav'):
        return input_path

    # Check if already converted
    if os.path.exists(output_path):
        print(f"  Using cached WAV: {output_path}")
        return output_path

    print(f"  Converting to WAV: {os.path.basename(input_path)}")

    # Use ffmpeg for robust conversion
    cmd = [
        'ffmpeg', '-i', input_path,
        '-acodec', 'pcm_s16le',
        '-ar', '16000',  # 16kHz sample rate
        '-ac', '1',      # Mono
        '-y',            # Overwrite
        partial_path
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
        os.replace(partial_path, output_path)
        print("  ✓ Converted successfully")
        return output_path
    except Exception as e:
        print(f"  ❌ Conversion failed: {e}")
        # CalledProcessError carries ffmpeg's captured stderr; its last lines
        # usually name the actual problem (missing stream, unreadable file, ...).
        stderr = getattr(e, "stderr", None)
        if stderr:
            for line in stderr.decode("utf-8", errors="replace").strip().splitlines()[-5:]:
                print(f"    {line}")
        return None
    finally:
        # Only still present when the conversion did not complete.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [None]:
# ============================================
# BLOCK 7: Memory-Efficient Processing Function
# ============================================
def _write_transcript(segments, transcript_path, base_name, duration):
    """Write the speaker-grouped Markdown transcript atomically.

    The text goes to ``<transcript_path>.tmp`` first and is moved into place
    with ``os.replace`` only after the whole file was written, so a failed or
    interrupted write never leaves a truncated ``*_transcript.md`` that the
    "already processed" checks would treat as finished.
    """
    tmp_path = f"{transcript_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(f"# Transcript: {base_name}\n\n")
            f.write(f"**Date processed**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"**Duration**: {duration:.1f} seconds\n\n")
            f.write("---\n\n")

            current_speaker = None
            for segment in segments:
                # Segments the diarizer could not attribute carry no 'speaker' key
                speaker = segment.get('speaker', 'UNKNOWN')

                # New speaker section
                if speaker != current_speaker:
                    f.write(f"\n## {speaker}\n\n")
                    current_speaker = speaker

                # Write segment with timestamp
                start = segment['start']
                end = segment['end']
                text = segment['text'].strip()

                # Keep all text including fillers
                f.write(f"[{start:.2f}s - {end:.2f}s] {text}\n\n")
        os.replace(tmp_path, transcript_path)
    finally:
        # Only still present when writing failed before the rename.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def process_single_file(audio_path, output_path, hf_token):
    """Transcribe, align and diarize one recording and save it as Markdown.

    The three models (transcription, alignment, diarization) are loaded one
    after another and each is released before the next is loaded, to keep peak
    GPU memory low.

    Args:
        audio_path: Path of the recording to process.
        output_path: Directory that receives ``<stem>_transcript.md``.
        hf_token: HuggingFace token passed to the diarization pipeline.

    Returns:
        True when the transcript already existed or was written successfully,
        False when conversion or any processing step failed (the error and
        traceback are printed, not raised).
    """

    base_name = os.path.splitext(os.path.basename(audio_path))[0]
    transcript_path = os.path.join(output_path, f"{base_name}_transcript.md")

    # Skip if already processed
    if os.path.exists(transcript_path):
        print(f"✓ Already processed: {base_name}")
        return True

    print(f"\n{'='*60}")
    print(f"🎯 Processing: {os.path.basename(audio_path)}")
    print(f"{'='*60}")

    try:
        # Convert to WAV if needed
        wav_path = convert_to_wav(audio_path)
        if not wav_path:
            return False

        # Load audio
        print("\n1️⃣ Loading audio...")
        audio = whisperx.load_audio(wav_path)
        duration = len(audio) / 16000  # 16kHz sample rate
        print(f"  Duration: {duration:.1f} seconds")

        # Initialize WhisperX model
        print("\n2️⃣ Loading transcription model...")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # int8 on both GPU and CPU: smallest memory footprint
        compute_type = "int8"

        # WhisperX's pipeline transcribe() accepts only audio/batch/language
        # style arguments; decoding options have to be supplied once, at load
        # time, through `asr_options` (note the key names `temperatures` and
        # `log_prob_threshold`). Passing them to transcribe() raises TypeError.
        model = whisperx.load_model(
            "large-v2",  # Using v2 for stability
            device,
            compute_type=compute_type,
            language="en",
            asr_options={
                # -1 selects the model's default set of suppressed symbols
                "suppress_tokens": [-1],
                "condition_on_previous_text": True,
                "temperatures": [0.0],  # single temperature, no fallback
                "compression_ratio_threshold": 2.4,
                "log_prob_threshold": -1.0,
                "no_speech_threshold": 0.6,
            },
        )

        # Transcribe with small batch size
        print("\n3️⃣ Transcribing audio...")
        result = model.transcribe(
            audio,
            batch_size=4,  # Small batch for memory
            language="en"
        )

        # Free transcription model memory
        del model
        torch.cuda.empty_cache()
        gc.collect()

        # Align whisper output
        print("\n4️⃣ Aligning timestamps...")
        model_a, metadata = whisperx.load_align_model(
            language_code="en",
            device=device
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            device,
            return_char_alignments=False
        )

        # Free alignment model
        del model_a, metadata
        torch.cuda.empty_cache()
        gc.collect()

        # Speaker diarization
        print("\n5️⃣ Identifying speakers...")
        diarize_model = whisperx.DiarizationPipeline(
            use_auth_token=hf_token,
            device=device
        )

        # Run diarization with conservative parameters
        diarize_segments = diarize_model(
            audio,
            min_speakers=2,
            max_speakers=10
        )

        # Assign speakers to words
        result = whisperx.assign_word_speakers(diarize_segments, result)

        # Free diarization model
        del diarize_model
        torch.cuda.empty_cache()
        gc.collect()

        # Save transcript
        print("\n6️⃣ Saving transcript...")
        _write_transcript(result["segments"], transcript_path, base_name, duration)

        print(f"\n✅ Successfully saved: {transcript_path}")

        # Clean up temporary WAV if created
        if wav_path != audio_path and os.path.exists(wav_path):
            os.remove(wav_path)

        return True

    except Exception as e:
        print(f"\n❌ Error processing {base_name}: {str(e)}")
        import traceback
        traceback.print_exc()

        # Emergency memory cleanup
        torch.cuda.empty_cache()
        gc.collect()

        return False

In [None]:
# ============================================
# BLOCK 8: Process Next File (Run Repeatedly)
# ============================================
# Handles exactly one recording per execution: the head of `remaining`.
# Execute the cell again for each further file until the list is empty.

if not remaining:
    print("🎉 All files have been processed!")
    print(f"\n📁 Transcripts saved in: {OUTPUT_PATH}")
else:
    next_file = remaining[0]
    print(f"\n🔄 Processing file {len(completed) + 1} of {len(all_files)}")
    print(f"File: {os.path.basename(next_file)}")

    success = process_single_file(next_file, OUTPUT_PATH, HF_TOKEN)

    if not success:
        # The failed file stays at the head of `remaining`, so a re-run retries it
        print("\n⚠️  File failed. You can:")
        print("1. Run this cell again to retry")
        print("2. Skip by removing it from 'remaining' list")
    else:
        # Refresh the finished list from disk and advance the queue
        completed = [name for name in os.listdir(OUTPUT_PATH) if name.endswith('_transcript.md')]
        remaining = remaining[1:]

        print(f"\n📊 Progress: {len(completed)}/{len(all_files)} completed")
        print(f"⏭️  {len(remaining)} files remaining")

        print("\n🔄 Run this cell again to process the next file" if remaining else "\n🎉 All files processed!")

    # Release GPU cache and collect garbage whether or not the file succeeded
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
# ============================================
# BLOCK 9: Verification and Summary
# ============================================
# Read-only report: lists the transcripts found in OUTPUT_PATH with their
# sizes and names every input recording that has no transcript yet.

print("📊 Processing Summary")
print("=" * 60)

# Every transcript currently in the output folder, in alphabetical order
transcripts = [name for name in os.listdir(OUTPUT_PATH) if name.endswith('_transcript.md')]
transcripts.sort()

print(f"\nTotal transcripts: {len(transcripts)}")
print("\nCompleted files:")
for transcript_name in transcripts:
    size_kb = os.path.getsize(os.path.join(OUTPUT_PATH, transcript_name)) / 1024
    print(f"  ✓ {transcript_name} ({size_kb:.1f} KB)")

# Compare input stems against transcript stems to find what is still missing
completed_bases = [name.replace('_transcript.md', '') for name in transcripts]
all_bases = []
missing = []
for path in all_files:
    stem = os.path.splitext(os.path.basename(path))[0]
    all_bases.append(stem)
    if stem not in completed_bases:
        missing.append(stem)

if not missing:
    print("\n✅ All files successfully transcribed!")
else:
    print("\n⚠️  Missing transcripts for:")
    for stem in missing:
        print(f"  - {stem}")

In [None]:
# ============================================
# BLOCK 10: Emergency Cleanup (If Needed)
# ============================================
# Use after a memory error: releases cached GPU memory, forces a garbage
# collection pass and deletes the temporary WAV directory.

print("🧹 Performing emergency cleanup...")

# Release cached CUDA blocks and wait for outstanding GPU work to finish
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Reclaim unreachable Python objects
gc.collect()

# Remove the converted-audio cache, if one exists
temp_dir = "/content/temp_audio"
if os.path.exists(temp_dir):
    import shutil
    shutil.rmtree(temp_dir)
    print("✓ Cleared temporary audio files")

# Report what PyTorch still holds on the GPU (bytes converted to GiB)
if torch.cuda.is_available():
    gib = 1024**3
    allocated_gb = torch.cuda.memory_allocated() / gib
    reserved_gb = torch.cuda.memory_reserved() / gib
    print(f"\nGPU Memory: {allocated_gb:.2f} GB used")
    print(f"GPU Memory: {reserved_gb:.2f} GB reserved")

print("\n✓ Cleanup complete. You can continue processing.")
Improve
Explain


# Attempt 3

In [None]:
# Clean environment first
# Removes the preinstalled ML stack so the pinned versions installed in the
# following cells do not mix with Colab's defaults. Note that `transformers`
# is removed here as well, so a later cell has to install it again.
!pip uninstall -y torch torchvision torchaudio transformers whisperx pyannote.audio pandas -q
# NOTE(review): `~orch` is presumably the leftover directory of an interrupted
# torch uninstall - confirm.
!rm -rf /usr/local/lib/python3.11/dist-packages/~orch
!pip install pandas==2.2.2 -q  # Colab requirement
# Show the attached GPU and driver/CUDA versions.
!nvidia-smi

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m109.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.19 requires torch<2.7,>=1.10, which is not installed.
fastai 2.7.19 requires torchvision>=0.11, which is not installed.[0m[31m
[0mWed Jun 11 05:18:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |               

In [None]:
# Install PyTorch first with specific CUDA version for Colab
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121

# Install core dependencies with specific versions
!pip install numpy>=1.26.4,<2.1
!pip install transformers==4.44.2
!pip install ctranslate2==4.4.0  # Critical: Colab requires 4.4.0, not 4.5.0+

# Install pyannote.audio with NumPy 2.0 support
!pip install pyannote.audio==3.3.2

# Install WhisperX
!pip install git+https://github.com/m-bain/whisperx.git

# Handle ONNX runtime conflicts
!pip uninstall -y onnxruntime onnxruntime-gpu
!pip install onnxruntime-gpu==1.16.3

In [None]:
# Install torch first with specific version
!pip install torch==2.1.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118 -q

# Install whisperx dependencies separately
!pip install faster-whisper==1.0.3 -q
!pip install pyannote.audio==3.1.1 -q

# Install whisperx without dependencies to avoid conflicts
!pip install --no-deps git+https://github.com/m-bain/whisperx.git@v3.1.1 -q

# Install remaining whisperx requirements manually
!pip install nltk>=3.8 -q
!pip install ffmpeg-python==0.2.0 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m570.7 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
peft 0.15.2 requires transformers, which is not installed.
timm 1.0.15 requires torchvision, which is not installed.
sentence-transformers 4.1.0 requires transformers<5.0.0,>=4.41.0, which is not installed.
fastai 2.7.19 requires torchvision>=0.11, which is not installed.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
import gc

import torch
import whisperx

# Prerequisites from earlier cells: `audio` (the waveform to transcribe,
# presumably loaded with whisperx.load_audio) and `HF_TOKEN`.

# Configuration for memory efficiency
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 4  # Reduced from default 16
compute_type = "int8"  # Instead of "float16"

# Load and process transcription first.
# The settings above are passed through instead of being repeated as
# literals, so changing the configuration actually changes the run.
model = whisperx.load_model("large-v2", device, compute_type=compute_type)   # why not v3?
result = model.transcribe(audio, batch_size=batch_size)

# Critical: Clear model from memory before diarization
del model
torch.cuda.empty_cache()
gc.collect()

# Then load diarization pipeline
diarize_model = whisperx.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)

In [None]:
# Sanity check after installation: report the PyTorch build and whether a
# CUDA device is visible, then try to import WhisperX.
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Test whisperx import
# The error is printed rather than raised so the cell always completes.
try:
    import whisperx
    print("✓ WhisperX imported successfully")
except Exception as e:
    print(f"❌ WhisperX import error: {e}")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

PyTorch version: 2.1.2+cu118
CUDA available: True
❌ WhisperX import error: No module named 'transformers'


## Block 2: Install Correct Versions

In [None]:
# Install working combination
# torch/torchaudio come from the CUDA 11.8 wheel index; everything is
# installed quietly (-q).
# NOTE(review): pip reports below that the preinstalled torchvision 0.21.0
# requires torch 2.6.0 and is therefore incompatible with this torch pin.
!pip install torch==2.1.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118 -q
!pip install faster-whisper==1.0.3 -q
!pip install pyannote.audio==3.1.1 -q
# NOTE(review): pydub is the only package here without a version pin.
!pip install pydub -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m706.6 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.21.0+cu124 requires torch==2.6.0, but you have torch 2.1.2+cu118 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m20.2 MB/s[0m eta [36m0

## Block 3: Mount Drive and Setup

In [None]:
import os
import gc
from pathlib import Path

import torch
from google.colab import drive

drive.mount('/content/drive')

# Setup paths - adjust to your actual path
INPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Recordings_PRUT"
OUTPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Transcripts"

# The transcript folder may not exist on a first run
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Input recordings: file names (not full paths) ending in .mp4, sorted
mp4_files = sorted(name for name in os.listdir(INPUT_PATH) if name.endswith('.mp4'))
print(f"Found {len(mp4_files)} MP4 files")

# Map each finished transcript back to the MP4 file name it was produced from
completed = []
for name in os.listdir(OUTPUT_PATH):
    if name.endswith('_transcript.md'):
        completed.append(name.replace('_transcript.md', '.mp4'))

# Whatever has no transcript yet still needs processing
remaining = []
for name in mp4_files:
    if name not in completed:
        remaining.append(name)

print(f"Already completed: {len(completed)}")
print(f"Remaining to process: {len(remaining)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found 7 MP4 files
Already completed: 0
Remaining to process: 7


In [None]:
# Mount Google Drive at /content/drive so the recordings and the transcript
# folder are reachable through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Block 4: Setup WhisperX with Diarization

In [None]:
# Get your HuggingFace token
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')  # Add your token in Colab secrets

# If no token in secrets, ask for it
if not HF_TOKEN:
    HF_TOKEN = input("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")

# Install WhisperX
!pip install git+https://github.com/m-bain/whisperx.git -q

import whisperx
device = "cuda"

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.4/37.4 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m898.7/898.7 kB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m821.2/821.2 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m3.8 MB/s[0m eta [36m0:00:

## Block 5: Process One File Function

In [None]:
def process_single_file(mp4_file, input_path, output_path, hf_token):
    """Process a single audio file with WhisperX and speaker diarization.

    Each model (transcription, alignment, diarization) is released before the
    next one is loaded, so only one of them occupies GPU memory at a time.
    Uses the notebook-level ``device`` variable.

    Args:
        mp4_file: File name of the recording inside ``input_path``.
        input_path: Directory containing the recordings.
        output_path: Directory that receives ``<stem>_transcript.md``.
        hf_token: HuggingFace token passed to the diarization pipeline.

    Returns:
        True when the transcript already existed or was written successfully,
        False when any step failed (the error is printed, not raised).
    """

    input_file = os.path.join(input_path, mp4_file)
    # splitext removes only the final extension; str.replace('.mp4', '') would
    # also delete a ".mp4" occurring in the middle of the name.
    base_name = os.path.splitext(mp4_file)[0]
    output_file = os.path.join(output_path, f"{base_name}_transcript.md")
    # Written first and renamed on success, so a failure never leaves a
    # truncated transcript that later runs would skip as "already processed".
    tmp_file = f"{output_file}.tmp"

    # Skip if already processed
    if os.path.exists(output_file):
        print(f"✓ Already processed: {mp4_file}")
        return True

    print(f"\n🎯 Processing: {mp4_file}")

    try:
        # Load audio
        audio = whisperx.load_audio(input_file)

        # 1. Transcribe with Whisper
        print("  → Transcribing...")
        model = whisperx.load_model("large-v2", device, compute_type="float16")
        result = model.transcribe(audio, batch_size=16)
        # Keep the detected language: whisperx.align() returns a new result
        language = result["language"]
        del model
        gc.collect()
        torch.cuda.empty_cache()

        # 2. Align whisper output
        print("  → Aligning...")
        model_a, metadata = whisperx.load_align_model(language_code=language, device=device)
        result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
        del model_a, metadata
        gc.collect()
        torch.cuda.empty_cache()

        # 3. Diarize with pyannote
        print("  → Speaker diarization...")
        diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
        diarize_segments = diarize_model(audio, min_speakers=2, max_speakers=10)
        result = whisperx.assign_word_speakers(diarize_segments, result)
        del diarize_model

        # 4. Save as markdown
        print("  → Saving transcript...")
        with open(tmp_file, 'w', encoding='utf-8') as f:
            f.write(f"# Transcript: {mp4_file}\n\n")

            current_speaker = None
            for segment in result["segments"]:
                # Segments without an assigned speaker carry no 'speaker' key
                speaker = segment.get('speaker', 'UNKNOWN')

                # New speaker section
                if speaker != current_speaker:
                    f.write(f"\n## {speaker}\n\n")
                    current_speaker = speaker

                # Write text with timestamp
                start = segment['start']
                end = segment['end']
                text = segment['text'].strip()
                f.write(f"[{start:.2f}s - {end:.2f}s] {text}\n\n")
        os.replace(tmp_file, output_file)

        print(f"✓ Completed: {mp4_file}")
        return True

    except Exception as e:
        print(f"❌ Error processing {mp4_file}: {str(e)}")
        return False

    finally:
        # Runs on success and on failure: drop a half-written transcript and
        # release GPU memory even when a step raised.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        gc.collect()
        torch.cuda.empty_cache()

## Block 6: Process Next File (Run Multiple Times)

In [None]:
# Process just ONE file at a time to avoid runtime crashes
if remaining:
    next_file = remaining[0]
    print(f"Processing next file: {next_file}")

    success = process_single_file(next_file, INPUT_PATH, OUTPUT_PATH, HF_TOKEN)

    if success:
        print(f"\n✅ Successfully processed {next_file}")
        print(f"⏭️  {len(remaining)-1} files remaining")
        print("\n🔄 Run this cell again to process the next file")
    else:
        print(f"\n❌ Failed to process {next_file}")
        print("Fix the error and run again")
else:
    print("🎉 All files have been processed!")

# Show progress
completed = [f for f in os.listdir(OUTPUT_PATH) if f.endswith('_transcript.md')]
print(f"\nProgress: {len(completed)}/{len(mp4_files)} files completed")

Processing next file: Call Recording - 13Mar2025 1200 BPA.mp4

🎯 Processing: Call Recording - 13Mar2025 1200 BPA.mp4
  → Transcribing...
❌ Error processing Call Recording - 13Mar2025 1200 BPA.mp4: module 'torch.utils._pytree' has no attribute 'register_pytree_node'

❌ Failed to process Call Recording - 13Mar2025 1200 BPA.mp4
Fix the error and run again

Progress: 0/7 files completed


## Block 7: Alternative - Process Without HF Token

In [None]:
# If you don't have a HuggingFace token, use this simpler version
from faster_whisper import WhisperModel

def process_simple(mp4_file, input_path, output_path):
    """Simple transcription without speaker diarization.

    Args:
        mp4_file: File name of the recording inside ``input_path``.
        input_path: Directory containing the recordings.
        output_path: Directory that receives ``<stem>_transcript.md``.

    Returns:
        True when the transcript already existed or was written successfully,
        False when transcription failed (the error is printed, not raised).
    """

    input_file = os.path.join(input_path, mp4_file)
    # splitext removes only the final extension; str.replace('.mp4', '') would
    # also delete a ".mp4" occurring in the middle of the name.
    base_name = os.path.splitext(mp4_file)[0]
    output_file = os.path.join(output_path, f"{base_name}_transcript.md")
    tmp_file = f"{output_file}.tmp"

    if os.path.exists(output_file):
        print(f"✓ Already processed: {mp4_file}")
        return True

    print(f"\n🎯 Processing: {mp4_file}")

    try:
        model = WhisperModel("large-v3", device="cuda", compute_type="float16")

        segments, info = model.transcribe(
            input_file,
            language="en",
            word_timestamps=True,
            vad_filter=True
        )

        # `segments` is a lazy generator: the audio is decoded while the loop
        # below iterates, so errors can surface mid-write. Writing to a
        # temporary file and renaming it afterwards ensures a failure never
        # leaves a partial transcript that would be skipped as "already
        # processed" on the next run.
        with open(tmp_file, 'w', encoding='utf-8') as f:
            f.write(f"# Transcript: {mp4_file}\n\n")

            for segment in segments:
                f.write(f"[{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}\n\n")
        os.replace(tmp_file, output_file)

        print(f"✓ Completed: {mp4_file}")
        return True

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return False

    finally:
        # Only still present when transcription or writing failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

# Use this if no HF token
# Transcribes only the first pending file, and only when HF_TOKEN is empty;
# the return value is ignored and `remaining` is left unchanged.
if remaining and not HF_TOKEN:
    next_file = remaining[0]
    process_simple(next_file, INPUT_PATH, OUTPUT_PATH)