<a href="https://colab.research.google.com/github/SingularitySmith/PRUT-Transcriber/blob/main/PRUT-Transcriber3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# ============================================
# BLOCK 1: Setup and Imports
# ============================================
"""
Run this first to set up your environment and import necessary modules
"""

import os
import subprocess
import glob
from pathlib import Path
from datetime import datetime
from google.colab import drive

print("🎯 Setting up Whisper.cpp transcription environment...")

# Mount Google Drive
drive.mount('/content/drive')

# Define paths - adjust these to your actual locations
INPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Recordings_PRUT"
OUTPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Transcripts"

# Create output directory
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Install whisper.cpp if not already done
if not os.path.exists('/content/whisper.cpp'):
    print("Installing whisper.cpp...")
    !git clone https://github.com/ggerganov/whisper.cpp
    !cd whisper.cpp && make
else:
    print("✓ Whisper.cpp already installed")

# Download model if not already present
model_path = "/content/whisper.cpp/models/ggml-base.en.bin"
if not os.path.exists(model_path):
    print("Downloading base.en model...")
    !cd whisper.cpp && ./models/download-ggml-model.sh base.en
else:
    print("✓ Model already downloaded")

print("\n✅ Setup complete!")


🎯 Setting up Whisper.cpp transcription environment...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Whisper.cpp already installed
✓ Model already downloaded

✅ Setup complete!


In [12]:

# ============================================
# BLOCK 2: File Discovery and Status
# ============================================
"""
Run this to see what files need processing
"""

# Get list of audio files
mp4_files = sorted(glob.glob(os.path.join(INPUT_PATH, "*.mp4")))
mp3_files = sorted(glob.glob(os.path.join(INPUT_PATH, "*.mp3")))
all_audio_files = mp4_files + mp3_files

print(f"\n📁 Found {len(all_audio_files)} audio files:")
for i, f in enumerate(all_audio_files, 1):
    print(f"  {i}. {os.path.basename(f)}")

# Check what's already been transcribed
completed_files = []
remaining_files = []

for audio_file in all_audio_files:
    base_name = os.path.splitext(os.path.basename(audio_file))[0]
    transcript_path = os.path.join(OUTPUT_PATH, f"{base_name}_transcript.txt")

    if os.path.exists(transcript_path):
        completed_files.append(audio_file)
    else:
        remaining_files.append(audio_file)

print(f"\n📊 Status:")
print(f"  ✓ Completed: {len(completed_files)}")
print(f"  ⏳ Remaining: {len(remaining_files)}")

if remaining_files:
    print(f"\n🎯 Next file to process: {os.path.basename(remaining_files[0])}")



📁 Found 8 audio files:
  1. Call Recording - 13Mar2025 1200 BPA.mp4
  2. Call Recording - 13Mar25 1130 BK.mp4
  3. Call Recording - 13Mar25 1300 HB.mp4
  4. Call Recording - 19Mar2025 0800 JD.mp4
  5. Call Recording - 19Mar25 0900 - AJ.mp4
  6. Call Recording - 19Mar25 1730 - MO.mp4
  7. Call Recording - 20Mar2025 1200 LN.mp4
  8. Call Recording - 26Mar2025 0830 SA.mp4

📊 Status:
  ✓ Completed: 0
  ⏳ Remaining: 8

🎯 Next file to process: Call Recording - 13Mar2025 1200 BPA.mp4


In [13]:

# ============================================
# BLOCK 3: Audio Conversion Function
# ============================================
"""
Helper function to convert MP4/MP3 to WAV for whisper.cpp
"""

def convert_to_wav(input_path):
    """Convert audio file to 16kHz WAV format required by whisper.cpp"""
    output_path = f"/tmp/{os.path.basename(input_path)}.wav"

    # Skip if already WAV
    if input_path.lower().endswith('.wav'):
        return input_path

    print(f"Converting to WAV: {os.path.basename(input_path)}")

    # Use ffmpeg to convert to 16kHz mono WAV
    cmd = [
        'ffmpeg', '-i', input_path,
        '-ar', '16000',      # 16kHz sample rate
        '-ac', '1',          # Mono
        '-c:a', 'pcm_s16le', # 16-bit PCM
        '-y',                # Overwrite
        output_path
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print(f"✓ Converted successfully")
        return output_path
    else:
        print(f"❌ Conversion failed: {result.stderr}")
        return None


In [None]:

# ============================================
# BLOCK 4: Process Single File (Run Repeatedly)
# ============================================
"""
RUN THIS BLOCK REPEATEDLY - Once for each file
Processes exactly one file then stops
"""

if remaining_files:
    # Get next file to process
    current_file = remaining_files[0]
    base_name = os.path.splitext(os.path.basename(current_file))[0]

    print(f"\n{'='*60}")
    print(f"🎯 Processing: {os.path.basename(current_file)}")
    print(f"{'='*60}")

    try:
        # Convert to WAV
        wav_path = convert_to_wav(current_file)
        if not wav_path:
            raise Exception("Failed to convert audio file")

        # Prepare output path
        transcript_path = os.path.join(OUTPUT_PATH, f"{base_name}_transcript.txt")

        # Run whisper.cpp
        print("\n📝 Transcribing with whisper.cpp...")
        print("(This may take several minutes)")

        # First, let's find where the binary actually is
        possible_paths = [
            '/content/whisper.cpp/build/bin/whisper-cli',
            '/content/whisper.cpp/bin/whisper-cli',
            '/content/whisper.cpp/main'
        ]

        whisper_binary = None
        for path in possible_paths:
            if os.path.exists(path):
                whisper_binary = path
                print(f"✓ Found whisper binary at: {path}")
                break

        if not whisper_binary:
            # If still not found, let's search for it
            result = subprocess.run(['find', '/content/whisper.cpp', '-name', 'whisper-cli', '-type', 'f'],
                                  capture_output=True, text=True)
            if result.stdout:
                whisper_binary = result.stdout.strip().split('\n')[0]
                print(f"✓ Found whisper binary at: {whisper_binary}")
            else:
                raise Exception("Cannot find whisper-cli binary. Try recompiling with: !cd whisper.cpp && make clean && make")

        cmd = [
            whisper_binary,
            '-m', '/content/whisper.cpp/models/ggml-base.en.bin',
            '-f', wav_path,
            '-of', transcript_path.replace('.txt', ''),  # whisper.cpp adds extension
            '--print-colors',
            '--print-progress',
            '-l', 'en',
            '-t', '8',  # Use 8 threads
            '--no-timestamps'  # Remove if you want timestamps
        ]

        # Run transcription
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode == 0:
            print(f"\n✅ Transcription complete!")

            # Read and format the transcript
            with open(transcript_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Add header to transcript
            final_content = f"# Transcript: {base_name}\n\n"
            final_content += f"**File**: {os.path.basename(current_file)}\n"
            final_content += f"**Processed**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
            final_content += "---\n\n"
            final_content += content

            # Save formatted transcript
            with open(transcript_path, 'w', encoding='utf-8') as f:
                f.write(final_content)

            print(f"✓ Saved: {transcript_path}")

            # Update remaining files
            remaining_files.pop(0)

            print(f"\n📊 Progress: {len(all_audio_files) - len(remaining_files)}/{len(all_audio_files)}")

            if remaining_files:
                print(f"\n🔄 Next up: {os.path.basename(remaining_files[0])}")
                print("Run this block again to process the next file")
            else:
                print("\n🎉 All files processed!")

        else:
            print(f"\n❌ Transcription failed")
            print(f"Error: {result.stderr}")

        # Clean up temporary WAV file
        if wav_path != current_file and os.path.exists(wav_path):
            os.remove(wav_path)

    except Exception as e:
        print(f"\n❌ Error: {str(e)}")
        print("Try running this block again")

else:
    print("✅ All files have been processed!")
    print(f"\n📁 Transcripts saved in: {OUTPUT_PATH}")




🎯 Processing: Call Recording - 13Mar2025 1200 BPA.mp4
Converting to WAV: Call Recording - 13Mar2025 1200 BPA.mp4
✓ Converted successfully

📝 Transcribing with whisper.cpp...
(This may take several minutes)
✓ Found whisper binary at: /content/whisper.cpp/build/bin/whisper-cli


In [None]:

# ============================================
# BLOCK 5: Alternative - Use Larger Model
# ============================================
"""
Optional: Download and use a larger model for better accuracy
"""

print("Available models:")
print("1. tiny.en    (39 MB) - Fastest, least accurate")
print("2. base.en    (142 MB) - Good balance (currently using)")
print("3. small.en   (466 MB) - Better accuracy")
print("4. medium.en  (1.5 GB) - High accuracy")
print("5. large-v3   (3.1 GB) - Best accuracy")

model_choice = input("\nEnter model name to download (or 'skip'): ")

if model_choice != 'skip' and model_choice in ['tiny.en', 'base.en', 'small.en', 'medium.en', 'large-v3']:
    print(f"\nDownloading {model_choice} model...")
    !cd whisper.cpp && ./models/download-ggml-model.sh {model_choice}
    print(f"\n✓ Model downloaded. Update the model path in Block 4 to use it.")


In [None]:

# ============================================
# BLOCK 6: Check All Transcripts
# ============================================
"""
Run this anytime to see all completed transcripts
"""

print("📊 Transcription Summary")
print("=" * 60)

transcripts = glob.glob(os.path.join(OUTPUT_PATH, "*_transcript.txt"))
print(f"\nTotal transcripts: {len(transcripts)}")

total_size = 0
for t in sorted(transcripts):
    size = os.path.getsize(t) / 1024
    total_size += size
    print(f"  ✓ {os.path.basename(t)} ({size:.1f} KB)")

print(f"\nTotal size: {total_size:.1f} KB")

# Show sample from first transcript
if transcripts:
    print(f"\n📄 Sample from {os.path.basename(transcripts[0])}:")
    with open(transcripts[0], 'r', encoding='utf-8') as f:
        lines = f.readlines()
        print("".join(lines[:10]) + "...")


In [None]:
# ============================================
# BLOCK 7: Emergency Cleanup
# ============================================
"""
Run this if you need to clean up temporary files or restart
"""

print("🧹 Cleaning up temporary files...")

# Remove temporary WAV files
tmp_files = glob.glob("/tmp/*.wav")
for f in tmp_files:
    os.remove(f)
    print(f"  Removed: {os.path.basename(f)}")

print("\n✓ Cleanup complete")

# Show disk usage
!df -h /tmp

In [1]:
# ALTERNATIVE: Whisper.cpp - No Python dependencies
!git clone https://github.com/ggerganov/whisper.cpp
!cd whisper.cpp && make

# Download model
!cd whisper.cpp && ./models/download-ggml-model.sh base.en

# Transcribe
!cd whisper.cpp && ./main -m models/ggml-base.en.bin -f your_audio.wav

Cloning into 'whisper.cpp'...
remote: Enumerating objects: 20144, done.[K
remote: Counting objects: 100% (143/143), done.[K
remote: Compressing objects: 100% (59/59), done.[K
remote: Total 20144 (delta 95), reused 84 (delta 84), pack-reused 20001 (from 2)[K
Receiving objects: 100% (20144/20144), 23.61 MiB | 7.33 MiB/s, done.
Resolving deltas: 100% (14082/14082), done.
cmake -B build 
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.

[0m
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting

In [1]:
# ============================================
# BLOCK 1: Minimal Setup - Run Once Per Session
# ============================================
"""
This block sets up the absolute minimum required for transcription.
No speaker diarization, no fancy features - just pure transcription.
"""

import subprocess
import os
import sys

print("🎯 Setting up minimal transcription environment...")

# Install only what we absolutely need
!pip install -q openai-whisper==20231117
!pip install -q ffmpeg-python

print("✓ Basic setup complete")

🎯 Setting up minimal transcription environment...
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.1/168.1 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.2/779.2 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m87.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82

In [4]:
# ============================================
# BLOCK 1: Setup and Imports
# ============================================
"""
Run this first to set up your environment and import necessary modules
"""

import os
import subprocess
import glob
from pathlib import Path
from datetime import datetime
from google.colab import drive

print("🎯 Setting up Whisper.cpp transcription environment...")

# Mount Google Drive
drive.mount('/content/drive')

# Define paths - adjust these to your actual locations
INPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Recordings_PRUT"
OUTPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Transcripts"

# Create output directory
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Install whisper.cpp if not already done
if not os.path.exists('/content/whisper.cpp'):
    print("Installing whisper.cpp...")
    !git clone https://github.com/ggerganov/whisper.cpp
    !cd whisper.cpp && make
else:
    print("✓ Whisper.cpp already installed")

# Download model if not already present
model_path = "/content/whisper.cpp/models/ggml-base.en.bin"
if not os.path.exists(model_path):
    print("Downloading base.en model...")
    !cd whisper.cpp && ./models/download-ggml-model.sh base.en
else:
    print("✓ Model already downloaded")

print("\n✅ Setup complete!")

🎯 Setting up Whisper.cpp transcription environment...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Whisper.cpp already installed
✓ Model already downloaded

✅ Setup complete!


In [5]:
# ============================================
# BLOCK 2: File Discovery and Status
# ============================================
"""
Run this to see what files need processing
"""

# Get list of audio files
mp4_files = sorted(glob.glob(os.path.join(INPUT_PATH, "*.mp4")))
mp3_files = sorted(glob.glob(os.path.join(INPUT_PATH, "*.mp3")))
all_audio_files = mp4_files + mp3_files

print(f"\n📁 Found {len(all_audio_files)} audio files:")
for i, f in enumerate(all_audio_files, 1):
    print(f"  {i}. {os.path.basename(f)}")

# Check what's already been transcribed
completed_files = []
remaining_files = []

for audio_file in all_audio_files:
    base_name = os.path.splitext(os.path.basename(audio_file))[0]
    transcript_path = os.path.join(OUTPUT_PATH, f"{base_name}_transcript.txt")

    if os.path.exists(transcript_path):
        completed_files.append(audio_file)
    else:
        remaining_files.append(audio_file)

print(f"\n📊 Status:")
print(f"  ✓ Completed: {len(completed_files)}")
print(f"  ⏳ Remaining: {len(remaining_files)}")

if remaining_files:
    print(f"\n🎯 Next file to process: {os.path.basename(remaining_files[0])}")


📁 Found 8 audio files:
  1. Call Recording - 13Mar2025 1200 BPA.mp4
  2. Call Recording - 13Mar25 1130 BK.mp4
  3. Call Recording - 13Mar25 1300 HB.mp4
  4. Call Recording - 19Mar2025 0800 JD.mp4
  5. Call Recording - 19Mar25 0900 - AJ.mp4
  6. Call Recording - 19Mar25 1730 - MO.mp4
  7. Call Recording - 20Mar2025 1200 LN.mp4
  8. Call Recording - 26Mar2025 0830 SA.mp4

📊 Status:
  ✓ Completed: 0
  ⏳ Remaining: 8

🎯 Next file to process: Call Recording - 13Mar2025 1200 BPA.mp4


In [9]:

# ============================================
# BLOCK 3: Audio Conversion Function
# ============================================
"""
Helper function to convert MP4/MP3 to WAV for whisper.cpp
"""

def convert_to_wav(input_path):
    """Convert audio file to 16kHz WAV format required by whisper.cpp"""
    output_path = f"/tmp/{os.path.basename(input_path)}.wav"

    # Skip if already WAV
    if input_path.lower().endswith('.wav'):
        return input_path

    print(f"Converting to WAV: {os.path.basename(input_path)}")

    # Use ffmpeg to convert to 16kHz mono WAV
    cmd = [
        'ffmpeg', '-i', input_path,
        '-ar', '16000',      # 16kHz sample rate
        '-ac', '1',          # Mono
        '-c:a', 'pcm_s16le', # 16-bit PCM
        '-y',                # Overwrite
        output_path
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print(f"✓ Converted successfully")
        return output_path
    else:
        print(f"❌ Conversion failed: {result.stderr}")
        return None

In [10]:
# ============================================
# BLOCK 4: Process Single File (Run Repeatedly)
# ============================================
"""
RUN THIS BLOCK REPEATEDLY - Once for each file
Processes exactly one file then stops
"""

if remaining_files:
    # Get next file to process
    current_file = remaining_files[0]
    base_name = os.path.splitext(os.path.basename(current_file))[0]

    print(f"\n{'='*60}")
    print(f"🎯 Processing: {os.path.basename(current_file)}")
    print(f"{'='*60}")

    try:
        # Convert to WAV
        wav_path = convert_to_wav(current_file)
        if not wav_path:
            raise Exception("Failed to convert audio file")

        # Prepare output path
        transcript_path = os.path.join(OUTPUT_PATH, f"{base_name}_transcript.txt")

        # Run whisper.cpp
        print("\n📝 Transcribing with whisper.cpp...")
        print("(This may take several minutes)")

        cmd = [
            '/content/whisper.cpp/bin/whisper-cli',
            '-m', '/content/whisper.cpp/models/ggml-base.en.bin',
            '-f', wav_path,
            '-of', transcript_path.replace('.txt', ''),  # whisper.cpp adds extension
            '--print-colors',
            '--print-progress',
            '-l', 'en',
            '-t', '8',  # Use 8 threads
            '--no-timestamps'  # Remove if you want timestamps
        ]

        # Run transcription
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode == 0:
            print(f"\n✅ Transcription complete!")

            # Read and format the transcript
            with open(transcript_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Add header to transcript
            final_content = f"# Transcript: {base_name}\n\n"
            final_content += f"**File**: {os.path.basename(current_file)}\n"
            final_content += f"**Processed**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
            final_content += "---\n\n"
            final_content += content

            # Save formatted transcript
            with open(transcript_path, 'w', encoding='utf-8') as f:
                f.write(final_content)

            print(f"✓ Saved: {transcript_path}")

            # Update remaining files
            remaining_files.pop(0)

            print(f"\n📊 Progress: {len(all_audio_files) - len(remaining_files)}/{len(all_audio_files)}")

            if remaining_files:
                print(f"\n🔄 Next up: {os.path.basename(remaining_files[0])}")
                print("Run this block again to process the next file")
            else:
                print("\n🎉 All files processed!")

        else:
            print(f"\n❌ Transcription failed")
            print(f"Error: {result.stderr}")

        # Clean up temporary WAV file
        if wav_path != current_file and os.path.exists(wav_path):
            os.remove(wav_path)

    except Exception as e:
        print(f"\n❌ Error: {str(e)}")
        print("Try running this block again")

else:
    print("✅ All files have been processed!")
    print(f"\n📁 Transcripts saved in: {OUTPUT_PATH}")


🎯 Processing: Call Recording - 13Mar2025 1200 BPA.mp4
Converting to WAV: Call Recording - 13Mar2025 1200 BPA.mp4
✓ Converted successfully

📝 Transcribing with whisper.cpp...
(This may take several minutes)

❌ Error: [Errno 2] No such file or directory: '/content/whisper.cpp/bin/whisper-cli'
Try running this block again


In [None]:
# ============================================
# BLOCK 5: Alternative - Use Larger Model
# ============================================
"""
Optional: Download and use a larger model for better accuracy
"""

print("Available models:")
print("1. tiny.en    (39 MB) - Fastest, least accurate")
print("2. base.en    (142 MB) - Good balance (currently using)")
print("3. small.en   (466 MB) - Better accuracy")
print("4. medium.en  (1.5 GB) - High accuracy")
print("5. large-v3   (3.1 GB) - Best accuracy")

model_choice = input("\nEnter model name to download (or 'skip'): ")

if model_choice != 'skip' and model_choice in ['tiny.en', 'base.en', 'small.en', 'medium.en', 'large-v3']:
    print(f"\nDownloading {model_choice} model...")
    !cd whisper.cpp && ./models/download-ggml-model.sh {model_choice}
    print(f"\n✓ Model downloaded. Update the model path in Block 4 to use it.")

In [None]:
# ============================================
# BLOCK 6: Check All Transcripts
# ============================================
"""
Run this anytime to see all completed transcripts
"""

print("📊 Transcription Summary")
print("=" * 60)

transcripts = glob.glob(os.path.join(OUTPUT_PATH, "*_transcript.txt"))
print(f"\nTotal transcripts: {len(transcripts)}")

total_size = 0
for t in sorted(transcripts):
    size = os.path.getsize(t) / 1024
    total_size += size
    print(f"  ✓ {os.path.basename(t)} ({size:.1f} KB)")

print(f"\nTotal size: {total_size:.1f} KB")

# Show sample from first transcript
if transcripts:
    print(f"\n📄 Sample from {os.path.basename(transcripts[0])}:")
    with open(transcripts[0], 'r', encoding='utf-8') as f:
        lines = f.readlines()
        print("".join(lines[:10]) + "...")

In [None]:
# ============================================
# BLOCK 7: Emergency Cleanup
# ============================================
"""
Run this if you need to clean up temporary files or restart
"""

print("🧹 Cleaning up temporary files...")

# Remove temporary WAV files
tmp_files = glob.glob("/tmp/*.wav")
for f in tmp_files:
    os.remove(f)
    print(f"  Removed: {os.path.basename(f)}")

print("\n✓ Cleanup complete")

# Show disk usage
!df -h /tmp

In [3]:

# ============================================
# BLOCK 2: Mount Drive and Setup Paths
# ============================================
"""
Run this once per session to mount your Google Drive
"""

from google.colab import drive
drive.mount('/content/drive')

# Configure your paths here
INPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Recordings_PRUT"
OUTPUT_PATH = "/content/drive/MyDrive/PRUT-Transcriptions/Transcripts"

# Create output directory
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Get list of files
import glob
mp4_files = sorted(glob.glob(os.path.join(INPUT_PATH, "*.mp4")))
mp3_files = sorted(glob.glob(os.path.join(INPUT_PATH, "*.mp3")))
all_files = mp4_files + mp3_files

print(f"\nFound {len(all_files)} audio files")

# Check what's already done
completed = []
for f in all_files:
    base_name = os.path.splitext(os.path.basename(f))[0]
    transcript_path = os.path.join(OUTPUT_PATH, f"{base_name}_transcript.md")
    if os.path.exists(transcript_path):
        completed.append(base_name)

print(f"Already completed: {len(completed)}")
print(f"Remaining: {len(all_files) - len(completed)}")

# Create a list of files to process
remaining_files = []
for f in all_files:
    base_name = os.path.splitext(os.path.basename(f))[0]
    if base_name not in completed:
        remaining_files.append(f)

print("\nFiles to process:")
for i, f in enumerate(remaining_files):
    print(f"{i+1}. {os.path.basename(f)}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


NameError: name 'os' is not defined

In [None]:

# ============================================
# BLOCK 3: Process Single File
# ============================================
"""
RUN THIS BLOCK REPEATEDLY - ONCE FOR EACH FILE
It will automatically process the next unprocessed file
"""

import whisper
import datetime
import gc

if remaining_files:
    # Get the next file to process
    current_file = remaining_files[0]
    base_name = os.path.splitext(os.path.basename(current_file))[0]

    print(f"\n{'='*60}")
    print(f"Processing: {os.path.basename(current_file)}")
    print(f"{'='*60}")

    try:
        # Load Whisper model (base model for speed/memory)
        print("Loading Whisper model...")
        model = whisper.load_model("large")

        # Transcribe
        print("Transcribing (this may take a few minutes)...")
        result = model.transcribe(
            current_file,
            language="en",
            word_timestamps=True,
            verbose=True
        )

        # Save transcript
        transcript_path = os.path.join(OUTPUT_PATH, f"{base_name}_transcript.md")

        with open(transcript_path, 'w', encoding='utf-8') as f:
            # Header
            f.write(f"# Transcript: {base_name}\n\n")
            f.write(f"**File**: {os.path.basename(current_file)}\n")
            f.write(f"**Processed**: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"**Duration**: {result.get('duration', 'Unknown')} seconds\n\n")
            f.write("---\n\n")

            # Segments with timestamps
            for segment in result['segments']:
                start = segment['start']
                end = segment['end']
                text = segment['text'].strip()

                # Keep all text including fillers
                f.write(f"[{start:.2f}s - {end:.2f}s] {text}\n\n")

        print(f"\n✅ Successfully saved: {transcript_path}")

        # Update remaining files list
        remaining_files.pop(0)

        print(f"\n📊 Progress: {len(all_files) - len(remaining_files)}/{len(all_files)} completed")
        print(f"Files remaining: {len(remaining_files)}")

        if remaining_files:
            print("\n🔄 Run this block again to process the next file")
        else:
            print("\n🎉 All files processed!")

    except Exception as e:
        print(f"\n❌ Error: {str(e)}")
        print("Try running this block again")

    finally:
        # Clean up memory
        if 'model' in locals():
            del model
        gc.collect()

else:
    print("✅ All files have been processed!")
    print(f"Check your transcripts in: {OUTPUT_PATH}")


In [None]:

# ============================================
# BLOCK 4: Check Progress (Optional)
# ============================================
"""
Run this anytime to see your progress
"""

print("📊 Transcription Progress Report")
print("=" * 60)

# List all completed transcripts
transcripts = glob.glob(os.path.join(OUTPUT_PATH, "*_transcript.md"))
print(f"\nCompleted transcripts: {len(transcripts)}")

for t in sorted(transcripts):
    size_kb = os.path.getsize(t) / 1024
    print(f"  ✓ {os.path.basename(t)} ({size_kb:.1f} KB)")

# Show remaining files
all_bases = [os.path.splitext(os.path.basename(f))[0] for f in all_files]
completed_bases = [os.path.basename(t).replace('_transcript.md', '') for t in transcripts]
remaining_bases = [b for b in all_bases if b not in completed_bases]

if remaining_bases:
    print(f"\nRemaining files ({len(remaining_bases)}):")
    for r in remaining_bases:
        print(f"  ⏳ {r}")
else:
    print("\n✅ All files transcribed!")


In [None]:

# ============================================
# BLOCK 5: Convert Single Transcript to Simple Format
# ============================================
"""
Optional: Creates a simplified version without timestamps
"""

# List available transcripts
transcripts = glob.glob(os.path.join(OUTPUT_PATH, "*_transcript.md"))
print("Available transcripts:")
for i, t in enumerate(transcripts):
    print(f"{i+1}. {os.path.basename(t)}")

# Select which one to simplify
choice = input("\nEnter number to create simplified version (or 'skip'): ")

if choice != 'skip' and choice.isdigit():
    idx = int(choice) - 1
    if 0 <= idx < len(transcripts):
        source_path = transcripts[idx]

        # Read transcript
        with open(source_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Extract just the text (remove timestamps)
        import re
        pattern = r'\[\d+\.\d+s - \d+\.\d+s\] (.+)'
        matches = re.findall(pattern, content)

        # Save simplified version
        simple_path = source_path.replace('_transcript.md', '_simple.txt')
        with open(simple_path, 'w', encoding='utf-8') as f:
            f.write(' '.join(matches))

        print(f"✓ Created simplified version: {os.path.basename(simple_path)}")