<a href="https://colab.research.google.com/github/SingularitySmith/PRUT-Transcriber/blob/main/PRUT_Transcriber4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
INPUT_PATH = "/content/drive/My Drive/PRUT-Transcriptions/Recordings_PRUT_MP4"
OUTPUT_PATH = "/content/drive/My Drive/PRUT-Transcriptions/Recordings_PRUT"




In [1]:
# SIMPLE WORKING TRANSCRIPTION SYSTEM
# Based on the approach that was working

# ============================================
# CELL 1: Complete Setup and Processing
# ============================================

import os
import glob
import json
import time
import gc
import subprocess
from datetime import datetime
from google.colab import drive

# Mount Drive
# if not os.path.exists('/content/drive'):
#    drive.mount('/content/drive')

In [2]:

# Mount Google Drive
drive.mount('/content/drive')

# Define paths - adjust these to your actual locations
INPUT_PATH = "/content/drive/My Drive/PRUT-Transcriptions/Recordings_PRUT"
OUTPUT_PATH = "/content/drive/My Drive/PRUT-Transcriptions/Transcripts"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# ============================================
# BLOCK 2: File Discovery and Status
# ============================================
"""
Run this to see what files need processing
"""

# Get list of audio files
mp4_files = sorted(glob.glob(os.path.join(INPUT_PATH, "*.mp4")))
mp3_files = sorted(glob.glob(os.path.join(INPUT_PATH, "*.mp3")))
all_audio_files = mp4_files + mp3_files

print(f"\n📁 Found {len(all_audio_files)} audio files:")
for i, f in enumerate(all_audio_files, 1):
    print(f"  {i}. {os.path.basename(f)}")

# Check what's already been transcribed
completed_files = []
remaining_files = []

for audio_file in all_audio_files:
    base_name = os.path.splitext(os.path.basename(audio_file))[0]
    transcript_path = os.path.join(OUTPUT_PATH, f"{base_name}_transcript.txt")

    if os.path.exists(transcript_path):
        completed_files.append(audio_file)
    else:
        remaining_files.append(audio_file)

print(f"\n📊 Status:")
print(f"  ✓ Completed: {len(completed_files)}")
print(f"  ⏳ Remaining: {len(remaining_files)}")

if remaining_files:
    print(f"\n🎯 Next file to process: {os.path.basename(remaining_files[0])}")



📁 Found 8 audio files:
  1. Call Recording - 13Mar2025 1200 BPA.mp4
  2. Call Recording - 13Mar25 1130 BK.mp4
  3. Call Recording - 13Mar25 1300 HB.mp4
  4. Call Recording - 19Mar2025 0800 JD.mp4
  5. Call Recording - 19Mar25 0900 - AJ.mp4
  6. Call Recording - 19Mar25 1730 - MO.mp4
  7. Call Recording - 20Mar2025 1200 LN.mp4
  8. Call Recording - 26Mar2025 0830 SA.mp4

📊 Status:
  ✓ Completed: 0
  ⏳ Remaining: 8

🎯 Next file to process: Call Recording - 13Mar2025 1200 BPA.mp4


In [22]:

    # ============================================
    # INSTALL WHISPER (MINIMAL VERSION)
    # ============================================

    if remaining_files and not os.path.exists('/usr/local/bin/whisper'):
        print("\n📦 Installing OpenAI Whisper...")
        subprocess.run(['pip', 'install', '-q', 'openai-whisper'], check=True)
        print("✓ Whisper installed")


In [None]:

    # ============================================
    # PROCESS FILES ONE BY ONE
    # ============================================

    if remaining_files:
        print("\n" + "="*60)
        print("STARTING TRANSCRIPTION")
        print("="*60)

        # Process only first 3 files to avoid timeout
        files_to_process = remaining_files[:3]

        for idx, audio_file in enumerate(files_to_process):
            base_name = os.path.splitext(os.path.basename(audio_file))[0]
            output_file = os.path.join(OUTPUT_PATH, f"{base_name}_transcript.txt")

            print(f"\n[{idx+1}/{len(files_to_process)}] Processing: {os.path.basename(audio_file)}")

            try:
                # Use whisper command line (more stable than Python API)
                start_time = time.time()

                # Run whisper with minimal settings
                cmd = [
                    'whisper',
                    audio_file,
                    '--model', 'tiny',
                    '--language', 'en',
                    '--output_format', 'txt',
                    '--output_dir', OUTPUT_PATH,
                    '--verbose', 'False'
                ]

                result = subprocess.run(cmd, capture_output=True, text=True)

                if result.returncode == 0:
                    # Rename output file to our format
                    whisper_output = os.path.join(OUTPUT_PATH, f"{base_name}.txt")
                    if os.path.exists(whisper_output):
                        os.rename(whisper_output, output_file)

                    elapsed = time.time() - start_time
                    print(f"✓ Success! Processed in {elapsed:.1f} seconds")
                    print(f"  Saved to: {os.path.basename(output_file)}")
                else:
                    print(f"❌ Error: {result.stderr}")

            except Exception as e:
                print(f"❌ Failed: {e}")

            # Cool down between files
            if idx < len(files_to_process) - 1:
                print("\n⏳ Cooling down for 5 seconds...")
                time.sleep(5)

            # Force garbage collection
            gc.collect()

        print("\n" + "="*60)
        print("SESSION COMPLETE")
        print(f"Processed {len(files_to_process)} files")
        print(f"Remaining: {len(remaining_files) - len(files_to_process)} files")
        print("\nRun this cell again to continue processing!")
        print("="*60)
    else:
        print("\n✅ All files have been transcribed!")

# V9


In [3]:
# MANUAL PATH FIX - DIRECT SOLUTION
# Run this to manually set the correct path and verify it works

import os
import json
from google.colab import drive

# Mount Drive
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

print("="*60)
print("MANUAL PATH CONFIGURATION")
print("="*60)

# Based on your working code, the correct path should be:
CORRECT_BASE = "/content/drive/My Drive"  # No space in MyDrive
CORRECT_SOURCE = f"{CORRECT_BASE}/PRUT-Transcriptions/Recordings_PRUT"
CORRECT_OUTPUT = f"{CORRECT_BASE}/PRUT-Transcriptions/Transcripts"

print(f"Testing path: {CORRECT_SOURCE}")

if os.path.exists(CORRECT_SOURCE):
    files = os.listdir(CORRECT_SOURCE)
    audio_files = [f for f in files if any(f.lower().endswith(ext)
                   for ext in ['.mp4', '.mp3', '.wav', '.m4a', '.flac', '.ogg'])]

    print(f"✓ Path exists!")
    print(f"✓ Found {len(audio_files)} audio files")

    if audio_files:
        print("\nAudio files found:")
        for f in audio_files[:5]:
            print(f"  - {f}")

        # Create a fixed configuration file
        config = {
            "DRIVE_MOUNT": CORRECT_BASE,
            "SOURCE_DIR": CORRECT_SOURCE,
            "OUTPUT_DIR": CORRECT_OUTPUT,
            "verified": True,
            "file_count": len(audio_files)
        }

        config_path = f"{CORRECT_BASE}/PRUT-Transcriptions/path_config.json"
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

        print(f"\n✅ Configuration saved to: {config_path}")
        print("\nNOW DO THIS:")
        print("1. Copy this line:")
        print(f'   SOURCE_DIR = "{CORRECT_SOURCE}"')
        print("2. Paste it at the top of your main transcription code")
        print("3. Run the main code again")
else:
    print("✗ Path does not exist!")
    print("\nTrying to list what's actually in your PRUT folder...")

    prut_base = f"{CORRECT_BASE}/PRUT-Transcriptions"
    if os.path.exists(prut_base):
        print(f"\nContents of {prut_base}:")
        for item in os.listdir(prut_base):
            item_path = os.path.join(prut_base, item)
            if os.path.isdir(item_path):
                print(f"  📁 {item}/")
                # Check inside folders
                sub_items = os.listdir(item_path)
                for sub in sub_items[:3]:
                    print(f"     - {sub}")
            else:
                print(f"  📄 {item}")

print("\n" + "="*60)

MANUAL PATH CONFIGURATION
Testing path: /content/drive/My Drive/PRUT-Transcriptions/Recordings_PRUT
✓ Path exists!
✓ Found 8 audio files

Audio files found:
  - Call Recording - 19Mar25 0900 - AJ.mp4
  - Call Recording - 13Mar2025 1200 BPA.mp4
  - Call Recording - 26Mar2025 0830 SA.mp4
  - Call Recording - 19Mar25 1730 - MO.mp4
  - Call Recording - 19Mar2025 0800 JD.mp4

✅ Configuration saved to: /content/drive/My Drive/PRUT-Transcriptions/path_config.json

NOW DO THIS:
1. Copy this line:
   SOURCE_DIR = "/content/drive/My Drive/PRUT-Transcriptions/Recordings_PRUT"
2. Paste it at the top of your main transcription code
3. Run the main code again



In [5]:
# WhisperX Ultra-Resilient Transcription System
# Single-cell design with aggressive memory management

"""
CODE BLOCK 1: COMPLETE SYSTEM - RUN THIS SINGLE CELL REPEATEDLY
This cell contains the entire system and can be run after crashes
"""

import os
import json
import time
import gc
import subprocess
import sys
from datetime import datetime
from google.colab import drive

In [6]:
# ============================================
# CODE BLOCK 1.1: CONFIGURATION - FIXED
# ============================================

# HARDCODED CORRECT PATHS (based on your working code)
DRIVE_MOUNT = '/content/drive/My Drive'  # NO SPACE
BASE_DIR = f'{DRIVE_MOUNT}/PRUT-Transcriptions'
SOURCE_DIR = f'{BASE_DIR}/Recordings_PRUT'
OUTPUT_DIR = f'{BASE_DIR}/Transcripts'
WAV_DIR = f'{BASE_DIR}/WAV_Cache'
CHECKPOINT_FILE = f'{BASE_DIR}/checkpoint.json'
LOG_FILE = f'{BASE_DIR}/processing.log'

# Create directories
for dir_path in [BASE_DIR, OUTPUT_DIR, WAV_DIR]:
    os.makedirs(dir_path, exist_ok=True)

print(f"Using paths:")
print(f"  Source: {SOURCE_DIR}")
print(f"  Output: {OUTPUT_DIR}")

Using paths:
  Source: /content/drive/My Drive/PRUT-Transcriptions/Recordings_PRUT
  Output: /content/drive/My Drive/PRUT-Transcriptions/Transcripts


In [8]:

# ============================================
# CODE BLOCK 1.2: LOGGING SYSTEM
# ============================================

def log_message(message, level="INFO"):
    """Log to both console and file"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {level}: {message}"
    print(log_entry)

    try:
        with open(LOG_FILE, 'a') as f:
            f.write(log_entry + "\n")
    except:
        pass  # Don't fail if can't write log

In [9]:

# ============================================
# CODE BLOCK 1.3: CHECKPOINT MANAGEMENT
# ============================================

def load_checkpoint():
    """Load progress from checkpoint file"""
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, 'r') as f:
            return json.load(f)
    return {
        'processed_files': [],
        'failed_files': {},
        'current_mode': 'ultra_minimal',  # Start with absolute minimum
        'model_loaded': None,
        'last_update': None,
        'session_count': 0
    }

def save_checkpoint(checkpoint):
    """Save progress to checkpoint file"""
    checkpoint['last_update'] = datetime.now().isoformat()
    checkpoint['session_count'] = checkpoint.get('session_count', 0) + 1
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump(checkpoint, f, indent=2)
    log_message(f"Checkpoint saved (session #{checkpoint['session_count']})")

In [10]:

# ============================================
# CODE BLOCK 1.4: PROCESSING MODES
# ============================================

PROCESSING_MODES = {
    'ultra_minimal': {
        'method': 'whisper_api',  # Use OpenAI Whisper API directly
        'model': 'tiny',           # Smallest possible model
        'skip_vad': True,          # Skip voice activity detection
        'skip_align': True,        # Skip alignment
        'skip_diarize': True,      # Skip diarization
        'chunk_duration': 180      # 3-minute chunks
    },
    'minimal': {
        'method': 'whisperx',
        'model': 'base',
        'skip_vad': True,
        'skip_align': False,
        'skip_diarize': True,
        'chunk_duration': 300
    },
    'standard': {
        'method': 'whisperx',
        'model': 'small',
        'skip_vad': False,
        'skip_align': False,
        'skip_diarize': True,
        'chunk_duration': 600
    },
    'high': {
        'method': 'whisperx',
        'model': 'medium',
        'skip_vad': False,
        'skip_align': False,
        'skip_diarize': False,
        'chunk_duration': 900
    }
}

In [11]:

# ============================================
# CODE BLOCK 1.5: SAFE DEPENDENCY INSTALLATION
# ============================================

def ensure_dependencies(mode):
    """Install only necessary dependencies for current mode"""
    log_message(f"Checking dependencies for {mode} mode...")

    # Basic dependencies always needed
    basic_deps = ['pydub']
    for dep in basic_deps:
        try:
            __import__(dep)
        except ImportError:
            log_message(f"Installing {dep}...")
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', dep])

    # FFmpeg
    if subprocess.call(['which', 'ffmpeg'], stdout=subprocess.DEVNULL) != 0:
        log_message("Installing ffmpeg...")
        subprocess.call(['apt-get', '-qq', 'update'])
        subprocess.call(['apt-get', '-qq', 'install', 'ffmpeg'])

    # Mode-specific dependencies
    if PROCESSING_MODES[mode]['method'] == 'whisper_api':
        try:
            import whisper
        except ImportError:
            log_message("Installing OpenAI Whisper...")
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'openai-whisper'])

    elif PROCESSING_MODES[mode]['method'] == 'whisperx':
        try:
            import whisperx
        except ImportError:
            log_message("Installing WhisperX...")
            # Install with specific order to avoid conflicts
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'torch==2.0.0', 'torchaudio==2.0.0', '--index-url', 'https://download.pytorch.org/whl/cu118'])
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'git+https://github.com/m-bain/whisperx.git'])

In [12]:

# ============================================
# CODE BLOCK 1.6: ULTRA-MINIMAL TRANSCRIPTION
# ============================================

def transcribe_ultra_minimal(audio_path):
    """Use OpenAI Whisper directly - most stable option"""
    import whisper

    log_message("Loading tiny Whisper model...")
    model = whisper.load_model("tiny")

    log_message("Transcribing with OpenAI Whisper...")
    result = model.transcribe(audio_path, language='en')

    # Convert to WhisperX-like format
    segments = []
    if 'segments' in result:
        for seg in result['segments']:
            segments.append({
                'start': seg['start'],
                'end': seg['end'],
                'text': seg['text']
            })
    else:
        # Fallback if no segments
        segments.append({
            'start': 0,
            'end': 0,
            'text': result.get('text', '')
        })

    # Clean up model
    del model
    gc.collect()

    return {'segments': segments}

In [13]:

# ============================================
# CODE BLOCK 1.7: WHISPERX TRANSCRIPTION
# ============================================

def transcribe_whisperx(audio_path, mode_config):
    """Use WhisperX with configurable features"""
    import whisperx
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load only necessary models
    log_message(f"Loading {mode_config['model']} model on {device}...")

    # Load with minimal configuration
    model = whisperx.load_model(
        mode_config['model'],
        device,
        compute_type="int8",  # Always use int8 for stability
        language='en',
        asr_options={
            "suppress_numerals": True,
            "max_new_tokens": None,
            "clip_timestamps": None,
            "hallucination_silence_threshold": None,
            "hotwords": None
        } if mode_config['skip_vad'] else {}
    )

    # Load audio
    audio = whisperx.load_audio(audio_path)

    # Transcribe with minimal batch size
    log_message("Transcribing...")
    result = model.transcribe(audio, batch_size=1)

    # Optional alignment
    if not mode_config['skip_align']:
        try:
            log_message("Aligning transcript...")
            model_a, metadata = whisperx.load_align_model(language_code='en', device=device)
            result = whisperx.align(result["segments"], model_a, metadata, audio, device)
            del model_a
        except Exception as e:
            log_message(f"Alignment failed: {e}", "WARNING")

    # Clean up
    del model
    torch.cuda.empty_cache() if device == "cuda" else None
    gc.collect()

    return result

In [14]:

# ============================================
# CODE BLOCK 1.8: AUDIO PROCESSING
# ============================================

def convert_to_wav(input_path, output_path):
    """Convert audio file to WAV format"""
    try:
        from pydub import AudioSegment
        log_message(f"Converting to WAV: {os.path.basename(input_path)}")

        audio = AudioSegment.from_file(input_path)
        audio = audio.set_channels(1).set_frame_rate(16000)
        audio.export(output_path, format="wav")

        log_message(f"Saved WAV to cache: {os.path.basename(output_path)}")
        return True
    except Exception as e:
        log_message(f"Conversion failed: {e}", "ERROR")
        return False

def split_audio_for_processing(wav_path, chunk_duration):
    """Split audio into smaller chunks"""
    from pydub import AudioSegment

    audio = AudioSegment.from_wav(wav_path)
    total_duration = len(audio) / 1000  # seconds

    if total_duration <= chunk_duration:
        return [(wav_path, 0)]  # No need to split

    chunks = []
    chunk_ms = chunk_duration * 1000

    for i in range(0, len(audio), chunk_ms):
        chunk = audio[i:i + chunk_ms]
        chunk_path = wav_path.replace('.wav', f'_chunk_{i//1000}.wav')
        chunk.export(chunk_path, format="wav")
        chunks.append((chunk_path, i/1000))

    log_message(f"Split into {len(chunks)} chunks of {chunk_duration}s each")
    return chunks

In [15]:

# ============================================
# CODE BLOCK 1.9: MAIN PROCESSING FUNCTION
# ============================================

def process_single_file(filepath, checkpoint):
    """Process a single file with current mode"""
    mode = checkpoint['current_mode']
    mode_config = PROCESSING_MODES[mode]
    base_name = os.path.splitext(os.path.basename(filepath))[0]

    log_message(f"\n{'='*60}")
    log_message(f"Processing: {os.path.basename(filepath)}")
    log_message(f"Mode: {mode}")
    log_message(f"{'='*60}")

    try:
        # Get or create WAV file
        wav_path = os.path.join(WAV_DIR, f"{base_name}.wav")
        if not os.path.exists(wav_path):
            if not filepath.endswith('.wav'):
                if not convert_to_wav(filepath, wav_path):
                    raise Exception("Failed to convert to WAV")
            else:
                import shutil
                shutil.copy2(filepath, wav_path)

        # Check file size and split if needed
        file_size_mb = os.path.getsize(wav_path) / (1024*1024)
        log_message(f"File size: {file_size_mb:.1f} MB")

        chunks = split_audio_for_processing(wav_path, mode_config['chunk_duration'])
        all_segments = []

        # Process each chunk
        for chunk_idx, (chunk_path, start_offset) in enumerate(chunks):
            if len(chunks) > 1:
                log_message(f"Processing chunk {chunk_idx+1}/{len(chunks)}...")

            # Choose transcription method
            if mode_config['method'] == 'whisper_api':
                result = transcribe_ultra_minimal(chunk_path)
            else:
                result = transcribe_whisperx(chunk_path, mode_config)

            # Adjust timestamps and collect segments
            for segment in result.get('segments', []):
                segment['start'] = segment.get('start', 0) + start_offset
                segment['end'] = segment.get('end', 0) + start_offset
                all_segments.append(segment)

            # Clean up chunk if it's temporary
            if chunk_path != wav_path:
                os.remove(chunk_path)

            # Force garbage collection after each chunk
            gc.collect()
            time.sleep(1)  # Brief pause

        # Save transcript
        output_path = os.path.join(OUTPUT_DIR, f"{base_name}_transcript.txt")
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(f"# Transcription of: {os.path.basename(filepath)}\n")
            f.write(f"# Processing mode: {mode}\n")
            f.write(f"# Processed on: {datetime.now().isoformat()}\n\n")

            for seg_idx, segment in enumerate(all_segments):
                start = segment.get('start', 0)
                end = segment.get('end', 0)
                text = segment.get('text', '').strip()

                if text:  # Only write non-empty segments
                    f.write(f"[{start:.2f}-{end:.2f}] {text}\n")

        log_message(f"SUCCESS: Saved transcript to {os.path.basename(output_path)}")

        # Update checkpoint
        checkpoint['processed_files'].append(os.path.basename(filepath))
        save_checkpoint(checkpoint)

        return True

    except Exception as e:
        log_message(f"FAILED: {e}", "ERROR")

        # Log failure
        checkpoint['failed_files'][os.path.basename(filepath)] = {
            'error': str(e),
            'mode': mode,
            'timestamp': datetime.now().isoformat()
        }
        save_checkpoint(checkpoint)

        return False


In [16]:

# ============================================
# CODE BLOCK 1.10: GET PENDING FILES
# ============================================

def get_pending_files(checkpoint):
    """Get list of files not yet processed"""
    all_files = []
    supported_formats = ['.mp4', '.mp3', '.wav', '.m4a', '.flac', '.ogg']

    # Check if source directory exists
    if not os.path.exists(SOURCE_DIR):
        log_message(f"Source directory not found: {SOURCE_DIR}", "ERROR")
        log_message("Please check the path or place audio files in this directory", "ERROR")
        return None  # Return None to indicate path error

    try:
        files_in_dir = os.listdir(SOURCE_DIR)
        if not files_in_dir:
            log_message(f"Source directory is empty: {SOURCE_DIR}", "WARNING")
            return []

        for filename in files_in_dir:
            if os.path.splitext(filename)[1].lower() in supported_formats:
                all_files.append(filename)

        if not all_files:
            log_message(f"No audio files found in {SOURCE_DIR}", "WARNING")
            log_message(f"Looking for: {', '.join(supported_formats)}", "INFO")

    except Exception as e:
        log_message(f"Error reading source directory: {e}", "ERROR")
        return None

    # Filter out already processed files
    pending = []
    for f in all_files:
        if f not in checkpoint['processed_files']:
            base_name = os.path.splitext(f)[0]
            transcript_path = os.path.join(OUTPUT_DIR, f"{base_name}_transcript.txt")

            if not os.path.exists(transcript_path):
                pending.append(f)
            else:
                # File was processed but not in checkpoint
                checkpoint['processed_files'].append(f)
                log_message(f"Found existing transcript for {f}, updating checkpoint")

    return sorted(pending)  # Sort for consistent ordering

In [17]:

# ============================================
# CODE BLOCK 1.11: MAIN EXECUTION
# ============================================

def main():
    """Main execution function"""

    # Mount Google Drive
    try:
        from google.colab import drive
        if not os.path.exists('/content/drive'):
            drive.mount('/content/drive')
            log_message("Google Drive mounted")

            # Re-check mount point after mounting
            global DRIVE_MOUNT, BASE_DIR, SOURCE_DIR, OUTPUT_DIR, WAV_DIR, CHECKPOINT_FILE, LOG_FILE
            for mount in ['/content/drive/MyDrive', '/content/drive/My Drive']:
                if os.path.exists(mount):
                    DRIVE_MOUNT = mount
                    BASE_DIR = f'{DRIVE_MOUNT}/PRUT-Transcriptions'
                    SOURCE_DIR = f'{BASE_DIR}/Recordings_PRUT'
                    OUTPUT_DIR = f'{BASE_DIR}/Transcripts'
                    WAV_DIR = f'{BASE_DIR}/WAV_Cache'
                    CHECKPOINT_FILE = f'{BASE_DIR}/checkpoint.json'
                    LOG_FILE = f'{BASE_DIR}/processing.log'

                    # Create directories
                    for dir_path in [BASE_DIR, SOURCE_DIR, OUTPUT_DIR, WAV_DIR]:
                        os.makedirs(dir_path, exist_ok=True)
                    break
        else:
            log_message("Google Drive already mounted")
    except Exception as e:
        log_message(f"Could not mount Drive: {e}", "WARNING")

    # Show current configuration
    log_message(f"Configuration:")
    log_message(f"  Source: {SOURCE_DIR}")
    log_message(f"  Output: {OUTPUT_DIR}")

    # Load checkpoint
    checkpoint = load_checkpoint()

    log_message("\n" + "="*60)
    log_message(f"SESSION #{checkpoint.get('session_count', 0) + 1} STARTING")
    log_message(f"Progress: {len(checkpoint['processed_files'])} files completed")
    log_message(f"Failed: {len(checkpoint['failed_files'])} files")
    log_message(f"Current mode: {checkpoint['current_mode']}")
    log_message("="*60)

    # Get pending files
    pending_files = get_pending_files(checkpoint)

    # Check if path error
    if pending_files is None:
        log_message("\nPath configuration error. Please:")
        log_message("1. Run the path finder diagnostic tool to find your files")
        log_message("2. Update SOURCE_DIR in the configuration")
        log_message("3. Or place audio files in: " + SOURCE_DIR)
        return

    log_message(f"Files remaining: {len(pending_files)}")

    if not pending_files:
        # Only upgrade if we actually processed files (not just empty directory)
        if len(checkpoint['processed_files']) > 0:
            log_message("\nAll files processed in current mode!")

            # Check if we should upgrade mode
            modes = list(PROCESSING_MODES.keys())
            current_idx = modes.index(checkpoint['current_mode'])

            if current_idx < len(modes) - 1:
                next_mode = modes[current_idx + 1]
                log_message(f"\nUpgrading to {next_mode} mode for better quality...")
                checkpoint['current_mode'] = next_mode
                checkpoint['processed_files'] = []
                checkpoint['failed_files'] = {}
                save_checkpoint(checkpoint)
                pending_files = get_pending_files(checkpoint)
            else:
                log_message("\nALL PROCESSING COMPLETE!")
                return
        else:
            log_message("\nNo audio files found to process.")
            log_message(f"Please add audio files to: {SOURCE_DIR}")
            log_message(f"Supported formats: .mp4, .mp3, .wav, .m4a, .flac, .ogg")
            return

    # Ensure dependencies for current mode
    ensure_dependencies(checkpoint['current_mode'])

    # Process files one by one
    processed_count = 0
    max_files_per_session = 3  # Process fewer files per session for stability

    for filename in pending_files[:max_files_per_session]:
        filepath = os.path.join(SOURCE_DIR, filename)

        # Check if file exists
        if not os.path.exists(filepath):
            log_message(f"File not found: {filename}", "WARNING")
            continue

        success = process_single_file(filepath, checkpoint)
        processed_count += 1

        # Longer cooldown between files
        if processed_count < len(pending_files):
            log_message("Cooling down for 15 seconds...")
            time.sleep(15)

        # Force garbage collection
        gc.collect()

    # Summary
    log_message("\n" + "="*60)
    log_message(f"SESSION COMPLETE")
    log_message(f"Processed this session: {processed_count}")
    log_message(f"Total completed: {len(checkpoint['processed_files'])}")
    log_message(f"Still pending: {len(pending_files) - processed_count}")
    log_message("="*60)

    if len(pending_files) > processed_count:
        log_message("\nRun this cell again to continue processing!")

    # Final cleanup
    gc.collect()

In [21]:

# ============================================
# CODE BLOCK 1.12: EXECUTE MAIN
# ============================================

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        log_message("\nProcess interrupted by user", "WARNING")
    except Exception as e:
        log_message(f"\nFATAL ERROR: {e}", "ERROR")
        import traceback
        log_message(traceback.format_exc(), "ERROR")

[2025-06-18 20:41:11] INFO: Google Drive already mounted
[2025-06-18 20:41:11] INFO: Configuration:
[2025-06-18 20:41:11] INFO:   Source: /content/drive/My Drive/PRUT-Transcriptions/Recordings_PRUT
[2025-06-18 20:41:11] INFO:   Output: /content/drive/My Drive/PRUT-Transcriptions/Transcripts
[2025-06-18 20:41:11] INFO: 
[2025-06-18 20:41:11] INFO: SESSION #9 STARTING
[2025-06-18 20:41:11] INFO: Progress: 8 files completed
[2025-06-18 20:41:11] INFO: Failed: 0 files
[2025-06-18 20:41:11] INFO: Current mode: ultra_minimal
[2025-06-18 20:41:11] INFO: Files remaining: 0
[2025-06-18 20:41:11] INFO: 
All files processed in current mode!
[2025-06-18 20:41:11] INFO: 
Upgrading to minimal mode for better quality...
[2025-06-18 20:41:11] INFO: Checkpoint saved (session #9)
[2025-06-18 20:41:11] INFO: Found existing transcript for Call Recording - 19Mar25 0900 - AJ.mp4, updating checkpoint
[2025-06-18 20:41:11] INFO: Found existing transcript for Call Recording - 13Mar2025 1200 BPA.mp4, updating c

In [14]:

# ============================================
# CODE BLOCK 1.13: USAGE INSTRUCTIONS
# ============================================

"""
CRASH-PROOF TRANSCRIPTION SYSTEM

SETUP:
1. GPU Runtime: Use T4 or CPU (system adapts automatically)
2. Run the PATH FINDER tool first to locate your audio files
3. Update SOURCE_DIR if needed
4. Run this single cell - it handles everything

AFTER CRASH:
1. Reconnect to runtime
2. Run this same cell again
3. It automatically resumes from checkpoint

PROCESSING MODES (AUTOMATIC PROGRESSION):
1. ultra_minimal: OpenAI Whisper tiny model (most stable)
2. minimal: WhisperX base model, no VAD
3. standard: WhisperX small model with VAD
4. high: WhisperX medium model with diarization

FEATURES:
- Saves after EVERY file
- Logs all operations
- 3-minute chunks for stability
- Automatic mode progression
- Crash recovery built-in

MONITORING:
- Check progress: cat /content/drive/MyDrive/PRUT-Transcriptions/checkpoint.json
- View logs: cat /content/drive/MyDrive/PRUT-Transcriptions/processing.log

TROUBLESHOOTING:
- If crashes persist, manually edit checkpoint.json:
  "current_mode": "ultra_minimal"
- Delete specific files from "processed_files" array to reprocess

PATH ISSUES:
If you see "Source directory not found", try:
1. Run the path finder diagnostic tool
2. Manually set the correct path at the top of this code
3. Or create the expected directory and add files:
   !mkdir -p "/content/drive/MyDrive/PRUT-Transcriptions/Recordings_PRUT"

MANUAL PATH SETUP:
# If your files are in a different location, update the configuration:
SOURCE_DIR = '/content/drive/MyDrive/YOUR_ACTUAL_PATH/audio_files'
"""

'\nCRASH-PROOF TRANSCRIPTION SYSTEM\n\nSETUP:\n1. GPU Runtime: Use T4 or CPU (system adapts automatically)\n2. Run the PATH FINDER tool first to locate your audio files\n3. Update SOURCE_DIR if needed\n4. Run this single cell - it handles everything\n\nAFTER CRASH:\n1. Reconnect to runtime\n2. Run this same cell again\n3. It automatically resumes from checkpoint\n\nPROCESSING MODES (AUTOMATIC PROGRESSION):\n1. ultra_minimal: OpenAI Whisper tiny model (most stable)\n2. minimal: WhisperX base model, no VAD\n3. standard: WhisperX small model with VAD\n4. high: WhisperX medium model with diarization\n\nFEATURES:\n- Saves after EVERY file\n- Logs all operations\n- 3-minute chunks for stability\n- Automatic mode progression\n- Crash recovery built-in\n\nMONITORING:\n- Check progress: cat /content/drive/MyDrive/PRUT-Transcriptions/checkpoint.json\n- View logs: cat /content/drive/MyDrive/PRUT-Transcriptions/processing.log\n\nTROUBLESHOOTING:\n- If crashes persist, manually edit checkpoint.json