<a href="https://colab.research.google.com/github/Vtheonly/Junk-Yard/blob/main/Get%20trasncribtions/French_to_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U openai-whisper ffmpeg-python

In [None]:
# Call Colab's files.upload() and keep its return value in `uploaded`.
# NOTE(review): `uploaded` is not referenced by any later code visible in this
# notebook (transcription reads from Google Drive) — confirm this cell is still needed.
from google.colab import files
uploaded = files.upload()

In [None]:
from IPython import get_ipython
from IPython.display import display
import os # For file and directory operations
from google.colab import drive # For Google Drive access
import whisper
import concurrent.futures # Added for parallel processing
import time # Added for timing
import threading # <<< ADDED IMPORT


# Mount Google Drive at /content/drive. A failure whose message mentions
# "already mounted" is tolerated; any other failure is re-raised.
try:
    drive.mount('/content/drive')
except Exception as mount_error:
    print(f"Error mounting Google Drive: {mount_error}")
    if "already mounted" not in str(mount_error).lower():
        raise
    print("Drive seems to be already mounted.")
else:
    print("Google Drive mounted successfully.")

# Locations inside the mounted Drive: MP3s are read from the input folder and
# transcripts are written to a sub-folder of that same folder.
base_drive_path = '/content/drive/MyDrive/'
input_folder_name = 'React 2/compressed'
output_subfolder_name = 'transcripts'

input_folder_path = os.path.join(base_drive_path, input_folder_name)
output_folder_path = os.path.join(input_folder_path, output_subfolder_name)

# The output folder lives inside the input folder, so os.makedirs() would also
# create a missing input folder as an intermediate directory. That would hide a
# mistyped input path behind a freshly created empty folder, so the output
# directory is only created when the input folder really exists.
if not os.path.isdir(input_folder_path):
    print(f"Input folder not found, output directory not created: {input_folder_path}")
elif os.path.exists(output_folder_path):
    print(f"Output directory already exists: {output_folder_path}")
else:
    os.makedirs(output_folder_path)
    print(f"Created output directory: {output_folder_path}")

# Load the "base" Whisper model, preferring the GPU and falling back to the
# CPU. `model` stays None when both attempts fail.
model = None
print("Loading Whisper model (this may take a moment)...")
try:
    model = whisper.load_model("base", device="cuda")
except Exception as gpu_error:
    print(f"Error loading Whisper model on GPU: {gpu_error}")
    print("Attempting to load on CPU...")
    try:
        model = whisper.load_model("base", device="cpu")
    except Exception as cpu_error:
        print(f"Failed to load model on CPU as well: {cpu_error}")
    else:
        print("Whisper model loaded successfully on CPU.")
else:
    print("Whisper model loaded successfully on GPU.")


def transcribe_audio_file(audio_file_path, model_instance, output_dir, use_fp16_if_cuda, lock):
    """Transcribe one audio file and save the text as ``<audio stem>.txt``.

    The model call runs while ``lock`` is held, so callers sharing a single
    model take turns; the transcript is written after the lock is released.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        model_instance: Object exposing ``device.type`` and a
            ``transcribe(path, fp16=...)`` method whose result contains ``"text"``.
        output_dir: Directory that receives the transcript file.
        use_fp16_if_cuda: fp16 is requested only when this is truthy and the
            model's device type is ``'cuda'``.
        lock: Context manager held for the duration of the model call.

    Returns:
        True when a non-blank transcript was written. False when the text was
        blank or missing, or when any exception occurred (it is printed, not
        re-raised).
    """
    mp3_filename = os.path.basename(audio_file_path)
    began_at = time.time()

    def seconds_elapsed():
        # Wall-clock time since this call started, including time spent waiting for the lock.
        return time.time() - began_at

    try:
        half_precision = bool(model_instance.device.type == 'cuda' and use_fp16_if_cuda)

        with lock:
            print(f"[Thread] Starting locked transcription for: {mp3_filename}")
            result = model_instance.transcribe(audio_file_path, fp16=half_precision)

        # Guard clause: nothing usable came back, so report and bail out.
        if not ('text' in result and result['text'].strip()):
            print(f"  [Thread] WARNING: Transcription for {mp3_filename} was empty or no text found (took {seconds_elapsed():.2f}s).")
            if result:
                print(f"    Result keys: {result.keys()}")
                if 'segments' in result and not result['segments']:
                    print("    Note: No speech segments were detected.")
            return False

        stem, _extension = os.path.splitext(mp3_filename)
        output_txt_path = os.path.join(output_dir, stem + ".txt")
        with open(output_txt_path, "w", encoding="utf-8") as transcript_file:
            transcript_file.write(result["text"])
        print(f"  [Thread] SUCCESS: Transcription for {mp3_filename} saved to {output_txt_path} (took {seconds_elapsed():.2f}s)")
        return True
    except Exception as error:
        print(f"  [Thread] ERROR processing {mp3_filename} (took {seconds_elapsed():.2f}s): {error}")
        return False

# Batch driver: find every MP3 in the input folder, transcribe each one through
# a thread pool, and print a summary. Skipped entirely when no model was loaded.
if model:
    mp3_file_paths = []
    # isdir (not exists) so that a plain file at this path is reported as a
    # missing folder instead of making os.listdir() raise.
    if os.path.isdir(input_folder_path):
        print(f"\nLooking for MP3 files in: {input_folder_path}")
        # os.listdir() yields names in arbitrary order; sorting makes the
        # submission order reproducible from run to run.
        mp3_file_paths = sorted(
            os.path.join(input_folder_path, filename)
            for filename in os.listdir(input_folder_path)
            if filename.lower().endswith(".mp3")
        )
    else:
        print(f"ERROR: Input folder not found: {input_folder_path}")
        print("Please check the 'base_drive_path' and 'input_folder_name' variables.")

    if not mp3_file_paths:
        print("No MP3 files found in the specified directory.")
    else:
        print(f"Found {len(mp3_file_paths)} MP3 file(s) to process.")

        # fp16 is disabled for every device; the flag is still passed through
        # so it can be switched on for CUDA in one place.
        USE_FP16_ON_CUDA = False
        cpu_total = os.cpu_count()  # may be None when it cannot be determined
        if model.device.type == 'cuda':
            num_workers = min(4, cpu_total or 1)  # Still can use multiple workers for I/O
            print(f"Using {num_workers} worker threads (GPU detected, fp16 on CUDA: {USE_FP16_ON_CUDA}). Lock will serialize GPU access.")
        else:
            num_workers = cpu_total or 2
            print(f"Using {num_workers} worker threads (CPU detected). Lock will serialize model access.")

        # One lock shared by all workers: only one thread calls the model at a time.
        transcription_lock = threading.Lock()

        overall_start_time = time.time()
        successful_transcriptions = 0
        failed_transcriptions = 0

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
            # Map each future back to its source path for error reporting.
            future_to_file = {
                executor.submit(
                    transcribe_audio_file,
                    audio_path,
                    model,
                    output_folder_path,
                    USE_FP16_ON_CUDA,
                    transcription_lock,
                ): audio_path
                for audio_path in mp3_file_paths
            }

            for future in concurrent.futures.as_completed(future_to_file):
                file_path = future_to_file[future]
                try:
                    success = future.result()
                except Exception as exc:
                    failed_transcriptions += 1
                    print(f"  [Main] Generated an exception for {os.path.basename(file_path)}: {exc}")
                else:
                    if success:
                        successful_transcriptions += 1
                    else:
                        failed_transcriptions += 1

        overall_elapsed_time = time.time() - overall_start_time
        # Printed after the executor has shut down, i.e. every file has
        # finished (not merely been submitted).
        print("\n--- All MP3 files processed. ---")
        print(f"Total time taken: {overall_elapsed_time:.2f} seconds.")
        print(f"Successful transcriptions: {successful_transcriptions}")
        print(f"Failed/Empty transcriptions: {failed_transcriptions}")
else:
    print("Whisper model could not be loaded. Cannot proceed with transcription.")

