<a href="https://colab.research.google.com/github/SingularitySmith/PRUT-Transcriber/blob/main/PRUT_Transcriber4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup and Imports

In [None]:
# WhisperX Transcription with Speaker Diarization
# Updated for English transcription with MP4 support

# ============================================
# STEP 1: GPU Setup and Verification
# ============================================
# First, set Runtime to GPU (T4) in Colab: Runtime > Change runtime type > GPU

import tensorflow as tf
import torch

# Verify GPU availability
# TensorFlow is used here only for this sanity check, but by default it reserves
# nearly all GPU memory as soon as it initializes the device. Switch it to
# on-demand allocation first so the memory stays available to PyTorch/WhisperX.
for _gpu in tf.config.list_physical_devices('GPU'):
    try:
        tf.config.experimental.set_memory_growth(_gpu, True)
    except RuntimeError as e:
        # Raised when the device was already initialized (e.g. this cell is re-run)
        print(f'Could not enable TensorFlow memory growth: {e}')

tf_device = tf.test.gpu_device_name()
torch_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Both frameworks must see the GPU; otherwise stop before any heavy work starts.
if tf_device != '/device:GPU:0' or torch_device.type != 'cuda':
    raise SystemError('GPU not found. Please enable GPU in Runtime settings.')

print(f'TensorFlow GPU: {tf_device}')
print(f'PyTorch GPU: {torch_device}')
print(f'CUDA available: {torch.cuda.is_available()}')

# Check GPU details
# `!` is a notebook shell escape: this runs the `nvidia-smi` command-line tool
# and shows its output in the cell.
!nvidia-smi


In [None]:

# ============================================
# STEP 2: Install Dependencies
# ============================================
# `!` lines are shell escapes executed outside the Python kernel; `-q` keeps pip quiet.
!pip install -q pydub
# NOTE(review): WhisperX is installed straight from the repository with no tag or
# commit pinned, so the API used later in this notebook may differ between runs —
# consider pinning a known-good revision.
!pip install -q git+https://github.com/m-bain/whisperx.git

# Additional dependencies for video processing
# NOTE(review): presumably pydub relies on ffmpeg to decode MP4/MP3 input — confirm.
!apt-get -qq install ffmpeg

# ============================================
# STEP 3: Import Libraries and Set Locale
# ============================================
import os
import gc
import locale
from pydub import AudioSegment
from google.colab import drive, userdata
import whisperx

# Set UTF-8 locale
# Export the environment variables first; they are set regardless of whether the
# in-process locale switch below succeeds.
os.environ['LC_ALL'] = 'C.UTF-8'
os.environ['LANG'] = 'C.UTF-8'

# 'en_US.UTF-8' is not available on every image, and locale.setlocale raises
# locale.Error for an unknown locale. Try the preferred locale first and fall
# back to 'C.UTF-8' instead of aborting the cell.
for _candidate in ('en_US.UTF-8', 'C.UTF-8'):
    try:
        locale.setlocale(locale.LC_ALL, _candidate)
        break
    except locale.Error:
        continue
else:
    # Neither locale could be applied; keep going with whatever is active.
    print("Warning: no UTF-8 locale available; keeping the current locale.")

print(f"Locale encoding: {locale.getpreferredencoding()}")


In [None]:

# ============================================
# STEP 4: Mount Google Drive
# ============================================
drive.mount('/content/drive')

# Update these paths to your actual directories
SOURCE_DIR = '/content/drive/My Drive/YourFolder/InputVideos'  # UPDATE THIS
OUTPUT_DIR = '/content/drive/My Drive/YourFolder/Transcriptions'  # UPDATE THIS
TEMP_AUDIO_DIR = '/content/temp_audio'  # Temporary directory for WAV files

# Validate the input folder before anything is created: a wrong or placeholder
# path would otherwise only surface when the files are listed, and the output
# folder would already have been created under that wrong path in Drive.
if not os.path.isdir(SOURCE_DIR):
    raise FileNotFoundError(
        f"SOURCE_DIR does not exist: {SOURCE_DIR!r}. "
        "Update SOURCE_DIR/OUTPUT_DIR above to point at your Drive folders."
    )

# Create directories if they don't exist
for _directory in (OUTPUT_DIR, TEMP_AUDIO_DIR):
    os.makedirs(_directory, exist_ok=True)


In [None]:

# ============================================
# STEP 5: Configure WhisperX with Secrets
# ============================================
# Store your HuggingFace token in Colab secrets:
# Click the key icon in the left sidebar > Add a secret named 'HF_TOKEN'

try:
    HF_TOKEN = userdata.get('HF_TOKEN')
except Exception as e:
    # `except Exception` rather than a bare `except`, so KeyboardInterrupt and
    # SystemExit still propagate. The error type is reported because a missing
    # secret and a denied-access secret need different fixes.
    print(f"Warning: could not read HF_TOKEN from secrets ({type(e).__name__}). "
          "Falling back to the HF_TOKEN environment variable or the placeholder token.")
    # Prefer an environment variable over editing a token into the notebook;
    # the placeholder is kept only as the last resort.
    HF_TOKEN = os.environ.get("HF_TOKEN", "your_huggingface_token_here")

# WhisperX configuration
device = "cuda"
batch_size = 4  # Adjust based on GPU memory (start with 4, can try 6 or 8)
compute_type = "float32"  # Options: "float32", "float16", "int8"
language = "en"  # Language code passed to the transcription and alignment models


In [None]:

# ============================================
# STEP 6: Load WhisperX Models
# ============================================
print("Loading WhisperX models...")

# Load main transcription model
model = whisperx.load_model("large-v3", device, language=language, compute_type=compute_type)

# Load alignment model (returns the model together with its metadata)
model_a, metadata = whisperx.load_align_model(language_code=language, device=device)

# Load diarization model
# WhisperX is installed from the repository head, and the location of
# DiarizationPipeline differs between releases: use the top-level attribute
# when it exists, otherwise import it from the `whisperx.diarize` submodule.
try:
    _DiarizationPipeline = whisperx.DiarizationPipeline
except AttributeError:
    from whisperx.diarize import DiarizationPipeline as _DiarizationPipeline

diarize_model = _DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)

print("All models loaded successfully!")


In [None]:

# ============================================
# STEP 7: Audio Conversion Functions
# ============================================
def convert_to_wav(input_path, output_path):
    """Decode an audio/video file and write it as a mono 16 kHz WAV.

    Args:
        input_path: Path of the file to read (any format AudioSegment can open).
        output_path: Path where the WAV file is written.

    Returns:
        True when the export finished, False when any step raised; the error
        is printed rather than propagated.
    """
    try:
        source_audio = AudioSegment.from_file(input_path)
        # Down-mix to a single channel, then resample to 16 kHz for WhisperX
        mono_audio = source_audio.set_channels(1)
        resampled_audio = mono_audio.set_frame_rate(16000)
        resampled_audio.export(output_path, format="wav")
    except Exception as e:
        print(f"Error converting {input_path}: {e}")
        return False
    else:
        return True


In [None]:

# ============================================
# STEP 8: Main Transcription Function
# ============================================
def transcribe_with_diarization(audio_path, min_speakers=2, max_speakers=10):
    """Run transcription, alignment and speaker diarization on one audio file.

    Uses the module-level `model`, `model_a`, `metadata`, `diarize_model`,
    `device` and `batch_size`.

    Args:
        audio_path: Path of the audio file to process.
        min_speakers: Lower bound passed to the diarization model.
        max_speakers: Upper bound passed to the diarization model.

    Returns:
        The value returned by `whisperx.assign_word_speakers` for the aligned
        transcript and the diarization output.
    """
    waveform = whisperx.load_audio(audio_path)

    # Speech-to-text
    print(f"Transcribing {os.path.basename(audio_path)}...")
    transcription = model.transcribe(waveform, batch_size=batch_size)

    # Alignment of the transcribed segments against the audio
    print("Aligning transcript...")
    aligned = whisperx.align(
        transcription["segments"],
        model_a,
        metadata,
        waveform,
        device,
        return_char_alignments=False,
    )

    # Speaker diarization
    print("Performing speaker diarization...")
    speaker_turns = diarize_model(
        waveform,
        min_speakers=min_speakers,
        max_speakers=max_speakers,
    )

    # Merge the speaker information into the aligned transcript
    return whisperx.assign_word_speakers(speaker_turns, aligned)

# ============================================
# STEP 9: Process All Files
# ============================================
# Maps raw diarization speaker IDs to sequential "SpeakerN" display labels.
# NOTE(review): the mapping and counter are shared by all files, so numbering
# carries over from one file to the next — confirm this is intended.
speaker_labels = {}
speaker_counter = 1

# Supported formats
supported_formats = ['.mp4', '.mp3', '.wav', '.m4a', '.flac', '.ogg']

# Process all files (sorted so the processing order is deterministic)
for filename in sorted(os.listdir(SOURCE_DIR)):
    file_ext = os.path.splitext(filename)[1].lower()
    if file_ext not in supported_formats:
        continue

    input_path = os.path.join(SOURCE_DIR, filename)
    base_name = os.path.splitext(filename)[0]

    # WAV input is used in place; everything else goes through a temporary WAV.
    needs_conversion = file_ext != '.wav'
    if needs_conversion:
        wav_path = os.path.join(TEMP_AUDIO_DIR, f"{base_name}.wav")
    else:
        wav_path = input_path

    try:
        # Convert to WAV if needed
        if needs_conversion:
            print(f"\nConverting {filename} to WAV...")
            if not convert_to_wav(input_path, wav_path):
                continue

        # Transcribe with diarization. On a CUDA out-of-memory error the same
        # file is retried with a smaller batch size instead of being skipped.
        while True:
            try:
                result = transcribe_with_diarization(wav_path)
                break
            except RuntimeError as e:
                # Anything other than a recoverable OOM goes to the outer handler.
                if "out of memory" not in str(e) or batch_size <= 1:
                    raise
            # Reached only after a recoverable OOM. The cleanup runs outside the
            # `except` block so the exception no longer keeps the failed call's
            # objects alive while memory is being released.
            print(f"⚠️  Out of memory for {filename}. Clearing cache...")
            gc.collect()
            torch.cuda.empty_cache()
            batch_size = max(1, batch_size - 1)
            print(f"Reduced batch size to {batch_size}")

        # Map speakers to sequential labels
        for segment in result["segments"]:
            if 'speaker' in segment and segment['speaker'] not in speaker_labels:
                speaker_labels[segment['speaker']] = f"Speaker{speaker_counter}"
                speaker_counter += 1

        # Save transcription, one line per segment
        output_path = os.path.join(OUTPUT_DIR, f"{base_name}_transcript.txt")
        with open(output_path, 'w', encoding='utf-8') as f:
            for segment in result["segments"]:
                # Segments without a speaker are written as 'Unknown'
                speaker = speaker_labels.get(segment.get('speaker', 'Unknown'), 'Unknown')
                start = segment['start']
                end = segment['end']
                text = segment['text']
                f.write(f"{speaker} [{start:.2f}-{end:.2f}]: {text}\n")

        print(f"✓ Saved transcript to {output_path}")

    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        print(f"❌ Error processing {filename}: {e}")

    finally:
        # Clean up the temporary WAV on every path (success, failure, skipped
        # conversion) so failed files do not accumulate in TEMP_AUDIO_DIR.
        if needs_conversion and os.path.exists(wav_path):
            os.remove(wav_path)

        # Clear GPU memory after each file: collect garbage first so the
        # memory of unreachable objects can be handed back by empty_cache().
        gc.collect()
        torch.cuda.empty_cache()

print("\n✅ Transcription complete!")
print(f"Processed files saved to: {OUTPUT_DIR}")

# ============================================
# STEP 10: Clean Up (Optional)
# ============================================
# Run this to free GPU memory when done
# Drop the model references if they exist. Unlike `del model, ...`, this does
# not raise NameError when the cell is run twice or when model loading failed.
for _name in ("model", "model_a", "diarize_model"):
    globals().pop(_name, None)

# Collect garbage first so the released objects' memory can be returned by
# empty_cache().
gc.collect()
torch.cuda.empty_cache()
print("GPU memory cleared")

# ============================================
# OPTIONAL: Advanced Configuration
# ============================================
"""
Advanced options you can modify:

1. Batch Size:
   - Start with 4
   - Increase to 6 or 8 if GPU has enough memory
   - Decrease to 2 or 1 if you get out-of-memory errors

2. Compute Type:
   - "float32": Best accuracy (default)
   - "float16": Faster, slightly less accurate
   - "int8": Fastest, least accurate

3. Model Size:
   - "large-v3": Best accuracy (current)
   - "medium": Faster, good accuracy
   - "small": Much faster, lower accuracy
   - "base": Fastest, lowest accuracy

4. Speaker Count:
   - Adjust min_speakers and max_speakers based on your audio
   - Set both to same number if you know exact speaker count

5. Language:
   - Change language parameter for other languages
   - See WhisperX documentation for supported languages
"""