In [3]:
#-----------------------------------------------------
# CELL 1: Install/Upgrade Dependencies
#-----------------------------------------------------
# Uncomment and run the commands below if the libraries are not installed yet,
# or if you want to upgrade them to the latest compatible versions.
# !pip install -U openai-whisper
# !pip install -U torch torchvision torchaudio
# On some systems (like local Linux/macOS or cloud VMs), you might also need to install ffmpeg:
# !sudo apt update && sudo apt install ffmpeg # Example for Debian/Ubuntu
# !conda install ffmpeg # Example for Conda environments

# NOTE: nothing is actually checked or installed by this cell while the commands
# above stay commented out; the message below is only a reminder to the user.
print("Dependencies checked/installed (ensure you ran the install commands if needed).")

#-----------------------------------------------------
# CELL 2: Import Libraries
#-----------------------------------------------------
import os
import whisper
import torch
import warnings

# Silence UserWarning messages attributed to torch.functional. They tend to be
# noisy here, but remember why they appear before hiding anything else.
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="torch.functional",
)

print("Libraries imported.")

#-----------------------------------------------------
# CELL 3: Define Audio File Path & Check Existence
#-----------------------------------------------------

audio_file_path = "/content/New recording 11 (1).m4a" # <<< --- CHANGE THIS!!!

# <<< --- !!! CRITICAL: CHANGE THIS LINE ABOVE !!! --- >>>

# Verify that the path points to a regular file before proceeding.
# os.path.isfile is used instead of os.path.exists so that a directory at this
# path is rejected here rather than failing later when the audio is decoded.
if not os.path.isfile(audio_file_path):
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("ERROR: Audio file not found at the specified path:")
    print(f"'{audio_file_path}'")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("Please ensure:")
    print("  1. You have uploaded the audio file correctly.")
    print("  2. The 'audio_file_path' variable in this script EXACTLY matches")
    print("     the location and name of your file.")
    # Stop execution: nothing below can work without the audio file.
    raise FileNotFoundError(f"Audio file not found at {audio_file_path}")

# Reaching this point means the file exists; report it together with its size in MB.
print(f"Audio file found: {audio_file_path}")
print(f"File size: {os.path.getsize(audio_file_path) / (1024*1024):.2f} MB")


#-----------------------------------------------------
# CELL 4: Load the Whisper Model (Using large-v3)
#-----------------------------------------------------
# Choose model size. "large-v3" is generally the most powerful public model.
# Alternatives if "large-v3" causes memory issues: "large-v2", "medium", "small", "base", "tiny"
model_size = "large-v3"

# Select the device: CUDA (NVIDIA GPU) when available, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
    print("CUDA (NVIDIA GPU) detected. Attempting to use GPU.")
    # Report GPU memory (optional but helpful). This is best-effort only: a
    # failure to query the device must not prevent the model load attempt.
    try:
        total_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        print(f"GPU Detected: {torch.cuda.get_device_name(0)} | Total Memory: {total_mem:.2f} GB")
        if total_mem < 10 and model_size.startswith("large"):
            print("WARNING: GPU memory might be insufficient for 'large' models. Consider 'medium' or 'small'.")
    except Exception as e:
        print(f"Could not get GPU details: {e}")
else:
    device = "cpu"
    print("CUDA (NVIDIA GPU) not detected. Using CPU.")
    if model_size.startswith("large"):
        print(f"WARNING: Loading model '{model_size}' on CPU will be VERY SLOW and require significant RAM.")
    elif model_size == "medium":
        print(f"WARNING: Loading model '{model_size}' on CPU will be slow.")

print(f"\nAttempting to load Whisper model '{model_size}' onto device: {device}")

model = None # Stays None unless one of the load attempts below succeeds
try:
    model = whisper.load_model(model_size, device=device)
    print(f"Whisper model '{model_size}' loaded successfully onto {device}.")
except Exception as e:
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print(f"ERROR: Failed to load model '{model_size}' onto device '{device}':")
    print(f"{e}")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    if device == "cuda":
        print("\nThis could be due to insufficient GPU memory (VRAM).")
        print("Suggestions:")
        print("  - Try a smaller model size (e.g., 'large-v2', 'medium').")
        print("  - Ensure no other processes are using significant GPU memory.")
        print("  - Try restarting your runtime/kernel.")
        print("\nAttempting to load on CPU as a fallback (will be much slower)...")
        try:
            # Later cells read `device`, so it is switched before the retry.
            device = "cpu"
            model = whisper.load_model(model_size, device=device)
            print(f"Whisper model '{model_size}' loaded successfully onto {device}.")
            print("WARNING: Transcription will proceed on CPU and will likely be very slow.")
        except Exception as e_cpu:
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(f"ERROR: Failed to load model '{model_size}' on CPU as well:")
            print(f"{e_cpu}")
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print("This might indicate insufficient RAM or other system issues.")
            # Chain explicitly so the traceback marks the CPU failure as the direct cause.
            raise RuntimeError(
                f"Could not load Whisper model '{model_size}' on either GPU or CPU."
            ) from e_cpu
    else: # Failed on CPU initially
        print("\nThis could be due to insufficient RAM or issues downloading/accessing model files.")
        # Chain explicitly so the original load error is kept as the direct cause.
        raise RuntimeError(f"Could not load Whisper model '{model_size}' on CPU.") from e

#-----------------------------------------------------
# CELL 5: Transcribe the Bengali Audio
#-----------------------------------------------------
print(f"\nStarting transcription for: {os.path.basename(audio_file_path)}")
print(f"Using model: {model_size}, Device: {device}")
print("This process can take a significant amount of time, especially with large models/long audio.")
print("Please be patient...")

# Set fp16 (16-bit floating point) precision:
# - Use fp16=True on compatible NVIDIA GPUs for faster inference (usually automatic).
# - Use fp16=False on CPU or if encountering numerical instability on GPU.
# Whisper often handles this automatically, but explicit setting can sometimes help.
use_fp16 = (device == "cuda") # Only use FP16 if we are definitely on a GPU

result = None # Stays None unless the transcription below succeeds
try:
    # language='bn' selects Bengali; task='transcribe' is passed explicitly
    # (though it's the default).
    result = model.transcribe(audio_file_path, language='bn', task='transcribe', fp16=use_fp16)
    print("\nTranscription complete!")
except Exception as e:
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("ERROR: An error occurred during transcription:")
    print(f"{e}")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("Possible causes:")
    print("  - Issues with the audio file format or corruption (try converting to WAV).")
    print("  - Insufficient memory (RAM or VRAM) during processing.")
    print("  - Internal errors within the Whisper model/library.")
    # Stop execution if transcription fails; chain explicitly so the traceback
    # marks the underlying error as the direct cause.
    raise RuntimeError(f"Transcription failed for {audio_file_path}.") from e

#-----------------------------------------------------
# CELL 6: Display the Transcription
#-----------------------------------------------------
print("\n--- Transcription Result (Bengali Text) ---")

# Handle the failure case first: no result at all, an empty one, or one
# without a "text" entry. Otherwise print the transcribed text.
if not result or "text" not in result:
    print("ERROR: Transcription result was empty or did not contain text.")
    print("Result dictionary:", result) # Dump the whole result to help debugging
else:
    transcribed_text = result["text"]
    print(transcribed_text)

# Optional: uncomment the snippet below to print each segment with its
# start/end timestamps formatted as HH:MM:SS.ss.
# print("\n--- Segments with Timestamps ---")
# if result and "segments" in result:
#     for segment in result["segments"]:
#         start = segment.get('start', 'N/A')
#         end = segment.get('end', 'N/A')
#         text = segment.get('text', '')
#         # Format timestamps nicely
#         start_str = f"{int(start // 3600):02}:{int((start % 3600) // 60):02}:{start % 60:05.2f}" if isinstance(start, (int, float)) else str(start)
#         end_str = f"{int(end // 3600):02}:{int((end % 3600) // 60):02}:{end % 60:05.2f}" if isinstance(end, (int, float)) else str(end)
#         print(f"[{start_str} -> {end_str}] {text}")
# elif result:
#     print("No segments found in the result.")
# else:
#     print("No result object available to extract segments.")

print("\n--- End of Script ---")

Dependencies checked/installed (ensure you ran the install commands if needed).
Libraries imported.
Audio file found: /content/New recording 11 (1).m4a
File size: 1.46 MB
CUDA (NVIDIA GPU) not detected. Using CPU.

Attempting to load Whisper model 'large-v3' onto device: cpu


100%|█████████████████████████████████████| 2.88G/2.88G [00:42<00:00, 72.7MiB/s]


Whisper model 'large-v3' loaded successfully onto cpu.

Starting transcription for: New recording 11 (1).m4a
Using model: large-v3, Device: cpu
This process can take a significant amount of time, especially with large models/long audio.
Please be patient...

Transcription complete!

--- Transcription Result (Bengali Text) ---
 উপরুক্ত শিল্প্র পতিষ্ঠান্টি সনাম ধন্ন রক্তানিকারো গুরুপ নমান গুরুপের একটি পতিষ্ঠান নমান গুরুপ দির্গো বছোর জাবত স� বছোর বিবিন্ন খাতে সরকারের রক্তানি ট্রাফি অর্জন করেছে আমাদের গুরুপ প্রায় আশি হাজার মানুষের কর্ম সংগস্থান করেছে নির্ভাউ করা COVID-19 পরাবোর্তি সময়ে রক্তানি করেযা দেশের সল্পতা কাছামাল সাবরারে ঘার্তি দক্ক সমিকের গ্রামে এস্থানাংতো বিবিন্ন নিযন্তন বহেবোত কারনে আমরা কাংকিত বাপ্ষা করতে বাইনি

--- End of Script ---
