In [None]:
# Install Python dependencies into the running kernel's environment (%pip rather than !pip).
# openai-whisper and noisereduce are pinned to the versions shown in this cell's recorded install log.
%pip install -q openai-whisper==20240930 noisereduce==3.0.3 librosa soundfile numpy
# Whisper needs the ffmpeg system binary to decode audio files; apt-get is the script-stable apt front end.
!apt-get -qq update && apt-get -qq install -y ffmpeg


Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_c

In [None]:
import os
import torch
import librosa
import librosa.display
import numpy as np
import soundfile as sf
import whisper
import noisereduce as nr
from google.colab import drive

In [None]:
# 🔹 Mount Google Drive
# The mount point is /content/drive; AUDIO_FOLDER and OUTPUT_FILE both point inside this mount,
# so this cell must run before any cell that reads or writes those paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 🔹 Define Paths
# Both locations sit under a single dataset folder; update the root if the dataset moves.
_DATASET_ROOT = "/content/drive/MyDrive/Captcha_Dataset"
AUDIO_FOLDER = f"{_DATASET_ROOT}/audio"  # folder scanned for audio clips
OUTPUT_FILE = f"{_DATASET_ROOT}/audio_transcriptions_whisper.txt"  # where transcriptions are written

In [None]:
# 🔹 Load Whisper Model
# Use the GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "medium" checkpoint, then move it onto the selected device.
model = whisper.load_model("medium")
model = model.to(device)

# 🔹 Preprocessing Function (Noise Reduction, Normalization, Resampling)
def preprocess_audio(audio_path):
    """Load an audio file and clean it up before transcription.

    Steps, in order:
      1. Load the file with librosa at a 16 kHz sample rate.
      2. Apply noisereduce's noise reduction to the loaded signal.
      3. Normalize the amplitude with ``librosa.util.normalize``.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        A ``(signal, sample_rate)`` tuple: the processed signal and the
        sample rate reported by ``librosa.load``.
    """
    waveform, sample_rate = librosa.load(audio_path, sr=16000)
    denoised = nr.reduce_noise(y=waveform, sr=sample_rate)
    return librosa.util.normalize(denoised), sample_rate

# 🔹 Feature Extraction (MFCCs & Spectrogram)
def extract_features(y, sr):
    """Compute MFCCs and a mel spectrogram for an audio signal.

    Args:
        y: Audio signal, passed to librosa as ``y``.
        sr: Sample rate of ``y``.

    Returns:
        A ``(mfcc, spectrogram)`` tuple: the result of
        ``librosa.feature.mfcc`` with 13 coefficients, followed by the result
        of ``librosa.feature.melspectrogram`` with librosa's defaults.
    """
    return (
        librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13),
        librosa.feature.melspectrogram(y=y, sr=sr),
    )

In [None]:


# 🔹 Transcription Function
def transcribe_audio():
    """Transcribe every supported audio file in AUDIO_FOLDER into OUTPUT_FILE.

    Files ending in .mp3, .wav or .m4a (matched case-insensitively) are
    processed in sorted name order. Each file is run through
    ``preprocess_audio``, written to a scratch WAV file, and transcribed by
    the module-level Whisper ``model``. One line per file is written to
    OUTPUT_FILE in the form ``<file name>: <transcription>``.

    OUTPUT_FILE is opened in write mode, so its previous contents are
    replaced. The input folder is listed first, so an invalid AUDIO_FOLDER
    fails before the output file is touched.

    Returns:
        None.

    Raises:
        FileNotFoundError: If AUDIO_FOLDER does not exist (from ``os.listdir``).
    """
    supported_extensions = (".mp3", ".wav", ".m4a")

    # List inputs *before* opening the output: if AUDIO_FOLDER is wrong this
    # raises here and an existing transcription file is left intact.
    audio_files = [
        name
        for name in sorted(os.listdir(AUDIO_FOLDER))
        if name.lower().endswith(supported_extensions)
    ]

    # Scratch file handed to Whisper; it is overwritten on every iteration.
    preprocessed_audio_path = "/tmp/processed_audio.wav"

    # Explicit UTF-8 so non-ASCII characters in a transcription never depend
    # on the platform's default encoding.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for audio_file in audio_files:
            audio_path = os.path.join(AUDIO_FOLDER, audio_file)
            print(f"Processing: {audio_file}")

            # Preprocess audio and save it for Whisper
            y, sr = preprocess_audio(audio_path)
            sf.write(preprocessed_audio_path, y, sr)

            # Transcribe using Whisper
            result = model.transcribe(preprocessed_audio_path)
            transcription = result["text"].strip()

            # Save transcription; flush so finished results survive an
            # interrupted run.
            f.write(f"{audio_file}: {transcription}\n")
            f.flush()
            print(f"Saved: {audio_file} -> {transcription}")

# 🔹 Run Transcription
# Transcribes the supported audio files found in AUDIO_FOLDER. OUTPUT_FILE is opened in
# write mode by transcribe_audio(), so each run replaces any previous transcription file.
transcribe_audio()
print(f"✅ Transcriptions saved to {OUTPUT_FILE}")


Processing: captcha_0001.wav
Saved: captcha_0001.wav -> capital A, capital B, capital O, small r, small z
Processing: captcha_0002.wav
Saved: captcha_0002.wav -> small x, capital L, capital M, small b, small r, capital T.
Processing: captcha_0003.wav
Saved: captcha_0003.wav -> Small v, capital V, capital R, capital G, and zero.
Processing: captcha_0004.wav
Saved: captcha_0004.wav -> small f small v 2 small h capital y small y
Processing: captcha_0005.wav
Saved: captcha_0005.wav -> Find small v, 0, capital U, small f, small s.
Processing: captcha_0006.wav
Saved: captcha_0006.wav -> 
Processing: captcha_0007.wav
Saved: captcha_0007.wav -> small h, capital H, small r, certain, capital V, small n.
Processing: captcha_0008.wav
Saved: captcha_0008.wav -> small h, small m, capital R, small f, capital Z, small v.
Processing: captcha_0009.wav
Saved: captcha_0009.wav -> small w, capital y, capital d, 0, small a, small t
Processing: captcha_0010.wav
Saved: captcha_0010.wav -> 7 small g capital y 