<a href="https://colab.research.google.com/github/SingularitySmith/PRUT-Transcriber/blob/main/PRUT-Transcriber3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================
# BLOCK 1: Minimal Setup - Run Once Per Session
# ============================================
"""
This block sets up the absolute minimum required for transcription.
No speaker diarization, no fancy features - just pure transcription.
"""

import subprocess
import os
import sys

print("🎯 Setting up minimal transcription environment...")

# Install only what we absolutely need.
# pip is invoked through the running interpreter (instead of the `!pip`
# shell magic) so the packages land in the environment this code runs in,
# the cell is valid plain Python, and a failed install raises
# CalledProcessError rather than being followed by a success message.
for requirement in ("openai-whisper==20231117", "ffmpeg-python"):
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-q", requirement]
    )

print("✓ Basic setup complete")

# ============================================
# BLOCK 2: Mount Drive and Setup Paths
# ============================================
"""
Run this once per session to mount your Google Drive.

Defines INPUT_PATH / OUTPUT_PATH, collects the .mp4 and .mp3 recordings
found in INPUT_PATH, and splits them into already-transcribed and
still-to-do lists (`completed`, `remaining_files`) used by later blocks.
"""

from google.colab import drive
drive.mount('/content/drive')

# Configure your paths here
INPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Recordings_PRUT"
OUTPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Transcripts"

# Create output directory
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Get list of files
import glob

# A missing input folder would otherwise just look like "0 files found".
if not os.path.isdir(INPUT_PATH):
    print(f"⚠️ Input folder not found: {INPUT_PATH}")

# Character classes make the match case-insensitive, so recordings saved
# as e.g. ".MP4" or ".Mp3" are picked up too. Order is unchanged: all mp4
# files (sorted) followed by all mp3 files (sorted).
mp4_files = sorted(glob.glob(os.path.join(INPUT_PATH, "*.[mM][pP]4")))
mp3_files = sorted(glob.glob(os.path.join(INPUT_PATH, "*.[mM][pP]3")))
all_files = mp4_files + mp3_files

print(f"\nFound {len(all_files)} audio files")

# Check what's already done, and build the to-do list, in a single pass.
# A file counts as completed when "<base_name>_transcript.md" already
# exists in OUTPUT_PATH; every other file still needs processing.
completed = []
remaining_files = []
for f in all_files:
    base_name = os.path.splitext(os.path.basename(f))[0]
    transcript_path = os.path.join(OUTPUT_PATH, f"{base_name}_transcript.md")
    if os.path.exists(transcript_path):
        completed.append(base_name)
    else:
        remaining_files.append(f)

print(f"Already completed: {len(completed)}")
print(f"Remaining: {len(all_files) - len(completed)}")

print("\nFiles to process:")
for i, f in enumerate(remaining_files):
    print(f"{i+1}. {os.path.basename(f)}")

# ============================================
# BLOCK 3: Process Single File
# ============================================
"""
RUN THIS BLOCK REPEATEDLY - ONCE FOR EACH FILE
It will automatically process the next unprocessed file

Takes the first entry of `remaining_files`, transcribes it with the
Whisper "base" model, writes "<base_name>_transcript.md" into OUTPUT_PATH
and removes the entry from `remaining_files` on success. On failure the
entry stays at the head of the list so the block can simply be re-run.
"""

import whisper
import datetime
import gc

if remaining_files:
    # Get the next file to process
    current_file = remaining_files[0]
    base_name = os.path.splitext(os.path.basename(current_file))[0]

    print(f"\n{'='*60}")
    print(f"Processing: {os.path.basename(current_file)}")
    print(f"{'='*60}")

    try:
        # Load Whisper model (base model for speed/memory)
        print("Loading Whisper model...")
        model = whisper.load_model("base")

        # Transcribe
        print("Transcribing (this may take a few minutes)...")
        result = model.transcribe(
            current_file,
            language="en",
            word_timestamps=True,
            verbose=True
        )

        segments = result['segments']

        # The transcribe() result carries no 'duration' key, so looking it
        # up alone always produced "Unknown". Fall back to the end time of
        # the last transcribed segment, which approximates the length of
        # the recording (trailing silence is not counted).
        duration = result.get('duration')
        if duration is None and segments:
            duration = segments[-1]['end']
        duration_text = "Unknown" if duration is None else f"{duration:.2f}"

        # Save transcript
        transcript_path = os.path.join(OUTPUT_PATH, f"{base_name}_transcript.md")

        # Write to a temporary name first and move it into place only once
        # complete: an interrupted write must not leave a partial
        # "*_transcript.md" behind, because the other blocks treat the
        # existence of that file as "this recording is done".
        partial_path = transcript_path + ".part"

        with open(partial_path, 'w', encoding='utf-8') as f:
            # Header
            f.write(f"# Transcript: {base_name}\n\n")
            f.write(f"**File**: {os.path.basename(current_file)}\n")
            f.write(f"**Processed**: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"**Duration**: {duration_text} seconds\n\n")
            f.write("---\n\n")

            # Segments with timestamps
            for segment in segments:
                start = segment['start']
                end = segment['end']
                text = segment['text'].strip()

                # Keep all text including fillers
                f.write(f"[{start:.2f}s - {end:.2f}s] {text}\n\n")

        os.replace(partial_path, transcript_path)

        print(f"\n✅ Successfully saved: {transcript_path}")

        # Update remaining files list
        remaining_files.pop(0)

        print(f"\n📊 Progress: {len(all_files) - len(remaining_files)}/{len(all_files)} completed")
        print(f"Files remaining: {len(remaining_files)}")

        if remaining_files:
            print("\n🔄 Run this block again to process the next file")
        else:
            print("\n🎉 All files processed!")

    except Exception as e:
        # Top-level boundary of the cell: report and let the user retry.
        print(f"\n❌ Error: {str(e)}")
        print("Try running this block again")

    finally:
        # Clean up memory: the model is reloaded on every run, so drop the
        # reference before the next one. `model` is unbound if loading failed.
        if 'model' in locals():
            del model
        gc.collect()

else:
    print("✅ All files have been processed!")
    print(f"Check your transcripts in: {OUTPUT_PATH}")

# ============================================
# BLOCK 4: Check Progress (Optional)
# ============================================
"""
Run this anytime to see your progress.

Lists every "*_transcript.md" in OUTPUT_PATH with its size, then lists
the recordings from `all_files` that have no matching transcript yet.
"""

print("📊 Transcription Progress Report")
print("=" * 60)

# List all completed transcripts
transcripts = sorted(glob.glob(os.path.join(OUTPUT_PATH, "*_transcript.md")))
print(f"\nCompleted transcripts: {len(transcripts)}")

for t in transcripts:
    size_kb = os.path.getsize(t) / 1024
    print(f"  ✓ {os.path.basename(t)} ({size_kb:.1f} KB)")

# Show remaining files
_suffix = '_transcript.md'
all_bases = [os.path.splitext(os.path.basename(f))[0] for f in all_files]
# Every globbed name ends with the suffix, so slice it off the end rather
# than str.replace(), which would also strip an occurrence mid-name.
completed_bases = [os.path.basename(t)[:-len(_suffix)] for t in transcripts]
# Set built once for the membership tests below.
_completed_lookup = set(completed_bases)
remaining_bases = [b for b in all_bases if b not in _completed_lookup]

if remaining_bases:
    print(f"\nRemaining files ({len(remaining_bases)}):")
    for r in remaining_bases:
        print(f"  ⏳ {r}")
else:
    print("\n✅ All files transcribed!")

# ============================================
# BLOCK 5: Convert Single Transcript to Simple Format
# ============================================
"""
Optional: Creates a simplified version without timestamps.

Asks which transcript to convert, extracts the text that follows each
"[<start>s - <end>s]" marker and writes it, space-joined, to
"<base_name>_simple.txt" next to the transcript.
"""

import re

# List available transcripts (sorted so the numbering is stable between runs)
transcripts = sorted(glob.glob(os.path.join(OUTPUT_PATH, "*_transcript.md")))
print("Available transcripts:")
for i, t in enumerate(transcripts):
    print(f"{i+1}. {os.path.basename(t)}")

# Select which one to simplify; surrounding whitespace is not significant
choice = input("\nEnter number to create simplified version (or 'skip'): ").strip()

if choice.lower() == 'skip':
    print("Skipped - no simplified version created")
elif not (choice.isdecimal() and 1 <= int(choice) <= len(transcripts)):
    # isdecimal() (unlike isdigit()) only accepts characters int() can
    # parse; anything else, or a number outside the list, is reported
    # instead of being silently ignored.
    print(f"⚠️ Invalid selection {choice!r}: enter a number from the list above or 'skip'")
else:
    source_path = transcripts[int(choice) - 1]

    # Read transcript
    with open(source_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract just the text (remove timestamps); header lines carry no
    # timestamp marker and are therefore dropped.
    pattern = r'\[\d+\.\d+s - \d+\.\d+s\] (.+)'
    matches = re.findall(pattern, content)

    # Save simplified version. Only the trailing suffix is swapped, so a
    # directory name containing "_transcript.md" cannot be rewritten.
    simple_path = source_path[:-len('_transcript.md')] + '_simple.txt'
    with open(simple_path, 'w', encoding='utf-8') as f:
        f.write(' '.join(matches))

    print(f"✓ Created simplified version: {os.path.basename(simple_path)}")