In [1]:
!pip install librosa torchaudio openai-whisper tqdm
!apt-get install ffmpeg
!pip install noisereduce scipy matplotlib

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cu

In [3]:
import os
import torch
import librosa
import librosa.display
import numpy as np
import torchaudio
import torchaudio.transforms as T
import whisper
import matplotlib.pyplot as plt
import noisereduce as nr
import scipy.signal
from tqdm import tqdm
from glob import glob
from google.colab import drive


In [4]:
# AUDIO_DIR and OUTPUT_FILE (defined in a later cell) both live under
# /content/drive, so this mount has to succeed before anything else runs.
drive.mount('/content/drive')  # Mount Google Drive at /content/drive

Mounted at /content/drive


In [5]:
# Define paths (both are located on the mounted Google Drive)
# Directory that the processing/transcription functions search for *.wav files.
AUDIO_DIR = '/content/drive/MyDrive/captchaDatabase/captchas/audio'
# Text file that transcribe_audio() writes its "<file name>: <transcript>" lines to.
OUTPUT_FILE = '/content/drive/MyDrive/captcha_transcriptions.txt'

In [6]:
# Load the Whisper "medium" checkpoint, on the GPU when one is available and on
# the CPU otherwise (a hard-coded "cuda" fails on a runtime without a GPU).
# NOTE(review): the transcribe_audio(...) call in the last cell passes only
# paths and sample_size, so this object is not what produces the saved
# transcriptions -- confirm which checkpoint is intended.
model = whisper.load_model(
    "medium",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

100%|█████████████████████████████████████| 1.42G/1.42G [00:19<00:00, 78.7MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [7]:
def load_audio(file_path, sr=16000):
    """Read one audio file through ``librosa.load``.

    Args:
        file_path: Path of the audio file to read.
        sr: Sampling rate forwarded to ``librosa.load`` as its ``sr``
            argument (default 16000).

    Returns:
        The ``(audio, sample_rate)`` pair returned by ``librosa.load``.
    """
    return librosa.load(file_path, sr=sr)

In [8]:
def noise_reduction(audio, sr):
    """Denoise an audio signal with ``noisereduce``.

    Args:
        audio: Signal passed to ``nr.reduce_noise`` as ``y``.
        sr: Sampling rate of ``audio``, passed through as ``sr``.

    Returns:
        Whatever ``nr.reduce_noise`` returns for the given signal, using the
        library's default settings.
    """
    return nr.reduce_noise(y=audio, sr=sr)

In [9]:
def normalize_audio(audio):
    """Rescale an audio signal with ``librosa.util.normalize``.

    Args:
        audio: Signal to rescale.

    Returns:
        The output of ``librosa.util.normalize(audio)`` with the library's
        default settings (intended to bring the signal into [-1, 1]).
    """
    scaled_audio = librosa.util.normalize(audio)
    return scaled_audio

In [10]:
def extract_mfccs(audio, sr, n_mfcc=13):
    """Compute MFCC features for an audio signal.

    Args:
        audio: Signal passed to ``librosa.feature.mfcc`` as ``y``.
        sr: Sampling rate of ``audio``.
        n_mfcc: Number of coefficients requested from librosa (default 13).

    Returns:
        The result of ``librosa.feature.mfcc`` for these arguments.
    """
    return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

In [11]:
def extract_spectrogram(audio, sr, n_fft=2048, hop_length=512):
    """Compute a dB-scaled magnitude spectrogram of an audio signal.

    Args:
        audio: Signal handed to ``librosa.stft``.
        sr: Accepted for a uniform signature with the other feature
            extractors; it is not used in the computation.
        n_fft: ``n_fft`` argument forwarded to ``librosa.stft``.
        hop_length: ``hop_length`` argument forwarded to ``librosa.stft``.

    Returns:
        ``librosa.amplitude_to_db`` applied to the magnitude of the STFT.
    """
    stft_result = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
    # The STFT is complex-valued; only its magnitude is converted to decibels.
    magnitude = abs(stft_result)
    return librosa.amplitude_to_db(magnitude)

In [12]:
def process_audio_files(audio_dir, sample_size=500):
    """Preprocess a subset of the ``*.wav`` files and extract their features.

    Each selected file is loaded, denoised and normalized, then MFCCs and a
    spectrogram are computed from the cleaned signal.

    Args:
        audio_dir: Directory searched for ``*.wav`` files.
        sample_size: Number of files taken from the start of the path-sorted
            file list (default 500).

    Returns:
        dict mapping each file's base name to
        ``{'mfccs': ..., 'spectrogram': ...}``.
    """
    wav_pattern = os.path.join(audio_dir, '*.wav')
    # Sorting before slicing makes the chosen subset the same on every run.
    selected_paths = sorted(glob(wav_pattern))[:sample_size]

    features = {}
    for wav_path in tqdm(selected_paths, desc="Processing audio files"):
        # Preprocessing chain: load -> denoise -> normalize.
        signal, rate = load_audio(wav_path)
        cleaned = normalize_audio(noise_reduction(signal, rate))

        # Both feature sets are computed from the same cleaned signal.
        features[os.path.basename(wav_path)] = {
            'mfccs': extract_mfccs(cleaned, rate),
            'spectrogram': extract_spectrogram(cleaned, rate),
        }

    return features


In [13]:
def transcribe_audio(audio_dir, output_file, sample_size=500, model=None):
    """Transcribe audio CAPTCHA files with Whisper and save one line per file.

    Args:
        audio_dir: Directory searched for ``*.wav`` files.
        output_file: Path of the text file to write; an existing file is
            overwritten.
        sample_size: Number of files taken from the start of the path-sorted
            file list (default 500).
        model: Optional, already loaded model exposing ``transcribe(path)``
            and returning a mapping with a ``'text'`` entry. When omitted, a
            Whisper "base" checkpoint is loaded for this call.

    Returns:
        None. Each line written has the form ``"<file name>: <transcript>"``.
    """
    if model is None:
        # Loading a checkpoint is expensive, so it only happens when the
        # caller did not hand in a model to reuse.
        model = whisper.load_model("base")

    audio_files = sorted(glob(os.path.join(audio_dir, '*.wav')))[:sample_size]

    # Explicit UTF-8: transcripts can contain non-ASCII characters, and the
    # platform default encoding is not guaranteed to be able to store them.
    with open(output_file, 'w', encoding='utf-8') as f:
        for file in tqdm(audio_files, desc="Transcribing audio files"):
            file_name = os.path.basename(file)
            result = model.transcribe(file)
            transcript = result['text']
            f.write(f"{file_name}: {transcript}\n")

# Run preprocessing, feature extraction, and transcription for the first 500
# *.wav files (in sorted path order) found in AUDIO_DIR.
# `features` maps each file name to its {'mfccs', 'spectrogram'} entry.
features = process_audio_files(AUDIO_DIR, sample_size=500)
# Writes "<file name>: <transcript>" lines to OUTPUT_FILE.
# NOTE(review): no model is passed here, so transcribe_audio loads the "base"
# checkpoint itself; the "medium" model loaded earlier in the notebook is not
# used for these transcriptions -- confirm that this is intended.
transcribe_audio(AUDIO_DIR, OUTPUT_FILE, sample_size=500)
print("Feature extraction and transcription completed!")

Processing audio files: 100%|██████████| 500/500 [07:32<00:00,  1.10it/s]
100%|███████████████████████████████████████| 139M/139M [00:04<00:00, 31.0MiB/s]
  checkpoint = torch.load(fp, map_location=device)
Transcribing audio files: 100%|██████████| 500/500 [10:00<00:00,  1.20s/it]

Feature extraction and transcription completed!



