In [None]:
import os
import subprocess
import mlx_whisper

# --- Configuration ---
# Folder whose direct entries are scanned for media files; transcripts
# (and temporary extracted audio) are written into this same folder.
AUDIO_FOLDER = "/Volumes/HezeSamsung/life/Pod-treePhil"
# Audio file extensions (matched case-insensitively against the file name);
# these files are transcribed directly.
AUDIO_EXTENSIONS = [".wav", ".mp3", ".flac", ".ogg", ".m4a"]
# Video file extensions that should be processed; their audio track is
# extracted with ffmpeg before transcription.
VIDEO_EXTENSIONS = [".mp4", ".mov", ".avi", ".mkv"]
# Spacing between the "Reached approximately ..." progress messages, compared
# against each segment's "start" value (reported as seconds).
PROGRESS_INTERVAL = 300
# Model repository identifier from Hugging Face, passed to
# mlx_whisper.transcribe as `path_or_hf_repo`.
MODEL_ID = "mlx-community/whisper-large-v3-turbo"
# ----------------------

def is_audio_file(filename: str) -> bool:
    """Return True if *filename* ends with one of AUDIO_EXTENSIONS.

    The comparison is case-insensitive: the name is lower-cased before
    its suffix is tested.
    """
    lowered = filename.lower()
    # str.endswith accepts a tuple of candidate suffixes.
    return lowered.endswith(tuple(AUDIO_EXTENSIONS))

def is_video_file(filename: str) -> bool:
    """Return True if *filename* ends with one of VIDEO_EXTENSIONS.

    The comparison is case-insensitive: the name is lower-cased before
    its suffix is tested.
    """
    lowered = filename.lower()
    # str.endswith accepts a tuple of candidate suffixes.
    return lowered.endswith(tuple(VIDEO_EXTENSIONS))

# Process each supported media file found directly in AUDIO_FOLDER.
# Audio files are transcribed as-is; for video files the audio track is first
# extracted to a temporary "<name>_extracted.mp3" next to the source, which is
# always removed afterwards. The transcript is written to "<name>.txt".
for filename in os.listdir(AUDIO_FOLDER):
    file_path = os.path.join(AUDIO_FOLDER, filename)
    base_name = os.path.splitext(filename)[0]
    output_file = os.path.join(AUDIO_FOLDER, base_name + ".txt")

    # Set only for video inputs, so the cleanup in `finally` knows whether
    # this iteration created a temporary file.
    temp_audio_path = None

    if is_audio_file(filename):
        input_path = file_path
    elif is_video_file(filename):
        temp_audio_path = os.path.join(AUDIO_FOLDER, base_name + "_extracted.mp3")
        input_path = temp_audio_path
    else:
        # Skip files that are neither audio nor video.
        continue

    try:
        if temp_audio_path is not None:
            # Extract audio from video using ffmpeg.
            print(f"\nExtracting audio from video file: {file_path}")
            # The '-y' flag overwrites the file if it exists,
            # '-q:a 0' ensures highest quality audio extraction,
            # '-map a' extracts the audio stream.
            ffmpeg_command = ["ffmpeg", "-y", "-i", file_path, "-q:a", "0", "-map", "a", temp_audio_path]
            extraction = subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if extraction.returncode != 0:
                # Without this check a failed extraction would hand a missing
                # or truncated file to the transcriber and abort the batch.
                stderr_tail = extraction.stderr.decode("utf-8", errors="replace").strip().splitlines()[-5:]
                print(f"ffmpeg failed (exit code {extraction.returncode}) for '{filename}'; skipping.")
                for line in stderr_tail:
                    print(f"    {line}")
                continue

        print(f"\nProcessing file: {input_path}")

        # Transcribe the file using mlx_whisper.
        result = mlx_whisper.transcribe(
            input_path,
            path_or_hf_repo=MODEL_ID
        )

        # Use the full transcript from the "text" key.
        full_transcript = result["text"]

        # Report the segment timeline in PROGRESS_INTERVAL steps. This runs
        # after transcribe() has returned, so it summarises how far the
        # segments reach rather than showing live progress.
        next_progress = PROGRESS_INTERVAL
        for segment in result["segments"]:
            if segment["start"] >= next_progress:
                print(f"Reached approximately {int(segment['start'])} seconds in '{filename}'")
                next_progress += PROGRESS_INTERVAL

        # Write the full transcript to a text file.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(full_transcript)

        print(f"Transcription saved to: {output_file}")
    finally:
        # Remove the temporary audio even when extraction or transcription
        # failed; a leftover "*_extracted.mp3" would otherwise be picked up
        # as a regular audio file on the next run.
        if temp_audio_path is not None and os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)


Extracting audio from video file: /Volumes/HezeSamsung/Lectures/Mec/rec/Lecture6_Dimension_part2.mp4

Processing file: /Volumes/HezeSamsung/Lectures/Mec/rec/Lecture6_Dimension_part2_extracted.mp3


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Reached approximately 301 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 602 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 900 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 1200 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 1500 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 1800 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 2100 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 2400 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 2700 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 3000 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 3300 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 3600 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 3901 seconds in 'Lecture6_Dimension_part2.mp4'
Reached approximately 4200 seconds in 'Lecture6_Dimension_part2.mp4'
Transcription saved to: /Volumes/Heze