In [1]:
pip install openai-whisper

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (253.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.1/253.1 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20240930-py3-none-any.

In [2]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting click>=8.1.8 (from jiwer)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading rapidfuzz-3.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz, click, jiwer
  Attempting uninstall: click
    Found existing installation: click 8.1.7
    Uninstalling click-8.1.7:
      Successfully uninstalled click-8

In [3]:
import whisper
import os
import random
import torch
import librosa
import numpy as np
import matplotlib.pyplot as plt
from pydub import AudioSegment
from easyocr import Reader
from transformers import WhisperProcessor
from torch.utils.data import DataLoader
from torch import nn
import re
import soundfile as sf
from jiwer import wer, cer  
from tqdm import tqdm 

# Dataset Path

In [4]:
# Directories
audio_dir = '/kaggle/input/captcha-dataset/captchas/audio'
images_dir = '/kaggle/input/captcha-dataset/captchas/images'

audio_files = [f for f in os.listdir(audio_dir) if f.endswith('.wav')]
image_files = [f for f in os.listdir(images_dir) if f.endswith('.png')]

print(f"Total audio files: {len(audio_files)}")
print(f"Total image files: {len(image_files)}")

Total audio files: 10000
Total image files: 10000


# Implementation of Whisper AI(small) and OCR to generate text for 10 random samples

In [21]:
audio_dir = '/kaggle/input/captcha-dataset/captchas/audio'
image_dir = '/kaggle/input/captcha-dataset/captchas/images'

# Initialize EasyOCR and Whisper
ocr_reader = Reader(['en'])  # EasyOCR setup
whisper_model = whisper.load_model("base")  # Whisper model

# Dictionary to convert number words to digits
number_map = {
    "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
    "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9"
}

# Dictionary for common misheard words
misheard_map = {
    "mall": "small", "moll": "small", "capitun": "capital", "capitan": "capital",
    "apital": "capital", "capitole": "capital", "zimro": "0", "smaller": "a"
}

# Step 1: Replace number words with digits
def replace_number_words(text):
    for word, digit in number_map.items():
        text = re.sub(rf"\b{word}\b", digit, text, flags=re.IGNORECASE)
    return text

# Step 2: Fix common misheard words
def fix_misheard_words(text):
    for wrong, correct in misheard_map.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)
    return text

# Step 3: Replace "for" with "4"
def replace_for_with_4(text):
    text = re.sub(r"\bfor\b", "4", text, flags=re.IGNORECASE)
    return text

# Step 4: Process capitalization and handle "edge" replacement
def process_capitalization(text):
    matches = re.findall(r"(capital|small) (\w+)", text, re.IGNORECASE)
    cleaned_text = ""

    i = 0
    while i < len(text):
        match_found = False
        
        for match in matches:
            marker, letter = match
            marker_index = text.lower().find(f"{marker.lower()} {letter.lower()}")
            
            if marker_index == i:
                if marker.lower() == "capital":
                    cleaned_text += letter.upper()
                else:
                    cleaned_text += letter.lower()
                i += len(marker) + 2
                match_found = True
                break

        if not match_found:
            cleaned_text += text[i]
            i += 1

    # Handle "edge" replacement based on the previous marker
    cleaned_text = re.sub(r"\bsmall\s+edge\b", "h", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"\bcapital\s+edge\b", "H", cleaned_text, flags=re.IGNORECASE)

    # Remove remaining "capital" or "small" words
    cleaned_text = re.sub(r"\b(capital|small)\b", "", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text)  # Keep only A-Z, a-z, 0-9
    return cleaned_text

# Function to get Whisper transcription
def get_whisper_transcription(audio_file):
    text = ""
    
    # Try Whisper transcription up to 3 times if length is < 6
    for _ in range(3):
        result = whisper_model.transcribe(audio_file)
        text = result["text"]
        text = replace_number_words(text)  # Replace number words with digits
        text = fix_misheard_words(text)  # Fix misheard words
        text = replace_for_with_4(text)  # Replace "for" with "4"
        text = process_capitalization(text)  # Process capitalization & "edge"
        
        if len(text) >= 6:  # Ensure minimum length of 6
            break

    return text[:6]  # Ensure the output is exactly 6 characters

# Function to get text from image using EasyOCR
def get_image_text(image_file):
    ocr_result = ocr_reader.readtext(image_file)
    text = ''.join([res[1] for res in ocr_result])
    return text

def process_random_samples(audio_dir, image_dir, num_samples=10):
    audio_files = random.sample(os.listdir(audio_dir), num_samples)
    
    for audio_file in audio_files:
        audio_path = os.path.join(audio_dir, audio_file)
        image_file = audio_file.replace('.wav', '.png')  # Assuming same filename for audio and image
        image_path = os.path.join(image_dir, image_file)

        whisper_text = get_whisper_transcription(audio_path)
        ground_truth_text = get_image_text(image_path)

        print(f"Audio file: {audio_file}")
        print(f"Whisper AI Transcription: {whisper_text}")
        print(f"Ground Truth (OCR): {ground_truth_text}")
        print("="*50)

# Run the function to process random samples
process_random_samples(audio_dir, image_dir, num_samples=10)

  checkpoint = torch.load(fp, map_location=device)


Audio file: captcha_2800.wav
Whisper AI Transcription: P0h1kQ
Ground Truth (OCR): PohzkQ
Audio file: captcha_9383.wav
Whisper AI Transcription: pK2Udc
Ground Truth (OCR): pKZUdc
Audio file: captcha_0862.wav
Whisper AI Transcription: mVv2N
Ground Truth (OCR): mBZv2N
Audio file: captcha_2692.wav
Whisper AI Transcription: F4aE13
Ground Truth (OCR): FaaE13
Audio file: captcha_5985.wav
Whisper AI Transcription: ASVKVA
Ground Truth (OCR): AsVKBE
Audio file: captcha_8862.wav
Whisper AI Transcription: apGv5v
Ground Truth (OCR): epGvsb
Audio file: captcha_2196.wav
Whisper AI Transcription: LHlzs7
Ground Truth (OCR): LHlzs7
Audio file: captcha_0318.wav
Whisper AI Transcription: PHW1Qb
Ground Truth (OCR): PHWIQb
Audio file: captcha_3554.wav
Whisper AI Transcription: wi8ciu
Ground Truth (OCR): wi8ciu
Audio file: captcha_0087.wav
Whisper AI Transcription: auIGL0
Ground Truth (OCR): aulGLO


# Whisper AI(small) Matching Accuracy Scores

In [28]:
audio_dir = '/kaggle/input/captcha-dataset/captchas/audio'
image_dir = '/kaggle/input/captcha-dataset/captchas/images'

ocr_reader = Reader(['en'])  # EasyOCR setup
whisper_model = whisper.load_model("base")  # Whisper model

# Dictionary to convert number words to digits
number_map = {
    "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
    "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9"
}

# Dictionary for common misheard words
misheard_map = {
    "mall": "small", "moll": "small", "capitun": "capital", "capitan": "capital",
    "apital": "capital", "capitole": "capital", "zimro": "0", "smaller": "a"
}

# Step 1: Replace number words with digits
def replace_number_words(text):
    for word, digit in number_map.items():
        text = re.sub(rf"\b{word}\b", digit, text, flags=re.IGNORECASE)
    return text

# Step 2: Fix common misheard words
def fix_misheard_words(text):
    for wrong, correct in misheard_map.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)
    return text

# Step 3: Replace "for" with "4"
def replace_for_with_4(text):
    text = re.sub(r"\bfor\b", "4", text, flags=re.IGNORECASE)
    return text

# Step 4: Process capitalization and handle "edge" replacement
def process_capitalization(text):
    matches = re.findall(r"(capital|small) (\w+)", text, re.IGNORECASE)
    cleaned_text = ""

    i = 0
    while i < len(text):
        match_found = False
        
        for match in matches:
            marker, letter = match
            marker_index = text.lower().find(f"{marker.lower()} {letter.lower()}")
            
            if marker_index == i:
                if marker.lower() == "capital":
                    cleaned_text += letter.upper()
                else:
                    cleaned_text += letter.lower()
                i += len(marker) + 2
                match_found = True
                break

        if not match_found:
            cleaned_text += text[i]
            i += 1

    # Handle "edge" replacement based on the previous marker
    cleaned_text = re.sub(r"\bsmall\s+edge\b", "h", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"\bcapital\s+edge\b", "H", cleaned_text, flags=re.IGNORECASE)

    # Remove remaining "capital" or "small" words
    cleaned_text = re.sub(r"\b(capital|small)\b", "", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text)  # Keep only A-Z, a-z, 0-9
    return cleaned_text

# Function to get Whisper transcription
def get_whisper_transcription(audio_file):
    text = ""
    
    # Try Whisper transcription up to 3 times if length is < 6
    for _ in range(3):
        result = whisper_model.transcribe(audio_file)
        text = result["text"]
        text = replace_number_words(text)  # Replace number words with digits
        text = fix_misheard_words(text)  # Fix misheard words
        text = replace_for_with_4(text)  # Replace "for" with "4"
        text = process_capitalization(text)  # Process capitalization & "edge"
        
        if len(text) >= 6:  # Ensure minimum length of 6
            break

    return text[:6]  # Ensure the output is exactly 6 characters

def get_image_text(image_file):
    ocr_result = ocr_reader.readtext(image_file)
    text = ''.join([res[1] for res in ocr_result])
    return text

def process_all_samples(audio_dir, image_dir):
    all_files = os.listdir(audio_dir)
    total_files = len(all_files)
    cer_scores = []
    
    for i, audio_file in enumerate(tqdm(all_files, desc="Processing")):
        audio_path = os.path.join(audio_dir, audio_file)
        image_file = audio_file.replace('.wav', '.png')  # Assuming matching names
        image_path = os.path.join(image_dir, image_file)

        whisper_text = get_whisper_transcription(audio_path)

        ground_truth_text = get_image_text(image_path)

        if not ground_truth_text or not whisper_text:
            continue 

        # Compute CER (Character Error Rate)
        error_rate = cer(ground_truth_text, whisper_text)  # CER calculation
        accuracy = max(0, 100 - (error_rate * 100))  
        cer_scores.append(accuracy)

        # Print progress every 500 samples
        if (i + 1) % 500 == 0:
            avg_accuracy = sum(cer_scores) / len(cer_scores)
            print(f"\nProgress: {i+1}/{total_files} samples processed. Current Accuracy: {avg_accuracy:.2f}%\n")

    # Final average accuracy
    avg_accuracy = sum(cer_scores) / len(cer_scores) if cer_scores else 0
    print(f"\nFinal Accuracy after processing {total_files} samples: {avg_accuracy:.2f}%\n")

# Run the function to process all samples
process_all_samples(audio_dir, image_dir)

  checkpoint = torch.load(fp, map_location=device)
Processing:  10%|█         | 1000/10000 [07:28<55:47,  2.69it/s] 


Progress: 1000/10000 samples processed. Current Accuracy: 72.88%



Processing:  15%|█▌        | 1500/10000 [11:10<57:26,  2.47it/s]  


Progress: 1500/10000 samples processed. Current Accuracy: 73.08%



Processing:  20%|██        | 2000/10000 [15:20<56:21,  2.37it/s]   


Progress: 2000/10000 samples processed. Current Accuracy: 72.77%



Processing:  25%|██▌       | 2500/10000 [19:17<49:48,  2.51it/s]  


Progress: 2500/10000 samples processed. Current Accuracy: 73.43%



Processing:  30%|███       | 3000/10000 [23:24<46:10,  2.53it/s]  


Progress: 3000/10000 samples processed. Current Accuracy: 73.33%



Processing:  35%|███▌      | 3500/10000 [27:19<1:15:09,  1.44it/s]


Progress: 3500/10000 samples processed. Current Accuracy: 73.29%



Processing:  40%|████      | 4000/10000 [31:07<47:06,  2.12it/s]  


Progress: 4000/10000 samples processed. Current Accuracy: 73.37%



Processing:  45%|████▌     | 4500/10000 [34:53<54:42,  1.68it/s]  


Progress: 4500/10000 samples processed. Current Accuracy: 73.20%



Processing:  50%|█████     | 5000/10000 [38:43<32:36,  2.56it/s]  


Progress: 5000/10000 samples processed. Current Accuracy: 73.24%



Processing:  55%|█████▌    | 5500/10000 [42:38<3:39:31,  2.93s/it]


Progress: 5500/10000 samples processed. Current Accuracy: 73.14%



Processing:  60%|██████    | 6000/10000 [46:33<26:30,  2.51it/s]  


Progress: 6000/10000 samples processed. Current Accuracy: 73.11%



Processing:  65%|██████▌   | 6500/10000 [50:21<29:47,  1.96it/s]


Progress: 6500/10000 samples processed. Current Accuracy: 73.12%



Processing:  70%|███████   | 7000/10000 [54:10<19:20,  2.59it/s]  


Progress: 7000/10000 samples processed. Current Accuracy: 73.26%



Processing:  75%|███████▌  | 7500/10000 [57:57<16:19,  2.55it/s]  


Progress: 7500/10000 samples processed. Current Accuracy: 73.26%



Processing:  80%|████████  | 8000/10000 [1:01:33<14:55,  2.23it/s]


Progress: 8000/10000 samples processed. Current Accuracy: 73.17%



Processing:  85%|████████▌ | 8500/10000 [1:05:35<17:43,  1.41it/s]  


Progress: 8500/10000 samples processed. Current Accuracy: 73.01%



Processing:  90%|█████████ | 9000/10000 [1:09:19<07:59,  2.09it/s]


Progress: 9000/10000 samples processed. Current Accuracy: 72.99%



Processing:  95%|█████████▌| 9500/10000 [1:13:19<03:09,  2.64it/s]


Progress: 9500/10000 samples processed. Current Accuracy: 72.98%



Processing: 100%|██████████| 10000/10000 [1:17:04<00:00,  2.16it/s]


Progress: 10000/10000 samples processed. Current Accuracy: 73.07%


Final Accuracy after processing 10000 samples: 73.07%






# Whisper AI(small) - No. of texts matched

In [30]:
audio_dir = '/kaggle/input/captcha-dataset/captchas/audio'
image_dir = '/kaggle/input/captcha-dataset/captchas/images'

ocr_reader = Reader(['en'])  # EasyOCR setup
whisper_model = whisper.load_model("base")  # Whisper model

# Dictionary to convert number words to digits
number_map = {
    "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
    "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9"
}

# Dictionary for common misheard words
misheard_map = {
    "mall": "small", "moll": "small", "capitun": "capital", "capitan": "capital",
    "apital": "capital", "capitole": "capital", "zimro": "0", "smaller": "a"
}

# Step 1: Replace number words with digits
def replace_number_words(text):
    for word, digit in number_map.items():
        text = re.sub(rf"\b{word}\b", digit, text, flags=re.IGNORECASE)
    return text

# Step 2: Fix common misheard words
def fix_misheard_words(text):
    for wrong, correct in misheard_map.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)
    return text

# Step 3: Replace "for" with "4"
def replace_for_with_4(text):
    text = re.sub(r"\bfor\b", "4", text, flags=re.IGNORECASE)
    return text

# Step 4: Process capitalization and handle "edge" replacement
def process_capitalization(text):
    matches = re.findall(r"(capital|small) (\w+)", text, re.IGNORECASE)
    cleaned_text = ""

    i = 0
    while i < len(text):
        match_found = False
        
        for match in matches:
            marker, letter = match
            marker_index = text.lower().find(f"{marker.lower()} {letter.lower()}")
            
            if marker_index == i:
                if marker.lower() == "capital":
                    cleaned_text += letter.upper()
                else:
                    cleaned_text += letter.lower()
                i += len(marker) + 2
                match_found = True
                break

        if not match_found:
            cleaned_text += text[i]
            i += 1

    # Handle "edge" replacement based on the previous marker
    cleaned_text = re.sub(r"\bsmall\s+edge\b", "h", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"\bcapital\s+edge\b", "H", cleaned_text, flags=re.IGNORECASE)

    # Remove remaining "capital" or "small" words
    cleaned_text = re.sub(r"\b(capital|small)\b", "", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text)  # Keep only A-Z, a-z, 0-9
    return cleaned_text

# Function to get Whisper transcription
def get_whisper_transcription(audio_file):
    text = ""
    
    # Try Whisper transcription up to 3 times if length is < 6
    for _ in range(3):
        result = whisper_model.transcribe(audio_file)
        text = result["text"]
        text = replace_number_words(text)  # Replace number words with digits
        text = fix_misheard_words(text)  # Fix misheard words
        text = replace_for_with_4(text)  # Replace "for" with "4"
        text = process_capitalization(text)  # Process capitalization & "edge"
        
        if len(text) >= 6:  # Ensure minimum length of 6
            break

    return text[:6]  # Ensure the output is exactly 6 characters

def get_image_text(image_file):
    ocr_result = ocr_reader.readtext(image_file)
    text = ''.join([res[1] for res in ocr_result])
    return text

def process_all_samples(audio_dir, image_dir):
    all_files = os.listdir(audio_dir)
    total_files = len(all_files)
    exact_match_count = 0  # Counter for exact matches
    
    for i, audio_file in enumerate(tqdm(all_files, desc="Processing")):
        audio_path = os.path.join(audio_dir, audio_file)
        image_file = audio_file.replace('.wav', '.png')  # Assuming matching names
        image_path = os.path.join(image_dir, image_file)

        # Get transcription from Whisper
        whisper_text = get_whisper_transcription(audio_path)
        
        # Get ground truth text from OCR
        ground_truth_text = get_image_text(image_path)

        if not ground_truth_text or not whisper_text:
            continue  

        if whisper_text == ground_truth_text:
            exact_match_count += 1

        if (i + 1) % 500 == 0:
            print(f"\nProgress: {i+1}/{total_files} samples processed. Exact Matches: {exact_match_count}\n")

    # Final count of exact matches
    print(f"\nFinal Exact Matches after processing {total_files} samples: {exact_match_count}\n")

# Run the function to process all samples
process_all_samples(audio_dir, image_dir)

  checkpoint = torch.load(fp, map_location=device)
Processing:  10%|█         | 1000/10000 [07:26<55:55,  2.68it/s] 


Progress: 1000/10000 samples processed. Exact Matches: 248



Processing:  15%|█▌        | 1500/10000 [11:07<59:14,  2.39it/s]  


Progress: 1500/10000 samples processed. Exact Matches: 369



Processing:  20%|██        | 2000/10000 [14:58<55:49,  2.39it/s]  


Progress: 2000/10000 samples processed. Exact Matches: 480



Processing:  25%|██▌       | 2500/10000 [18:43<47:07,  2.65it/s]  


Progress: 2500/10000 samples processed. Exact Matches: 625



Processing:  30%|███       | 3000/10000 [22:40<43:22,  2.69it/s]  


Progress: 3000/10000 samples processed. Exact Matches: 745



Processing:  35%|███▌      | 3500/10000 [26:24<1:02:44,  1.73it/s]


Progress: 3500/10000 samples processed. Exact Matches: 879



Processing:  40%|████      | 4000/10000 [30:00<44:46,  2.23it/s]  


Progress: 4000/10000 samples processed. Exact Matches: 1017



Processing:  45%|████▌     | 4500/10000 [33:40<52:52,  1.73it/s]  


Progress: 4500/10000 samples processed. Exact Matches: 1143



Processing:  50%|█████     | 5000/10000 [37:22<30:57,  2.69it/s]  


Progress: 5000/10000 samples processed. Exact Matches: 1269



Processing:  55%|█████▌    | 5500/10000 [41:12<3:39:59,  2.93s/it]


Progress: 5500/10000 samples processed. Exact Matches: 1391



Processing:  60%|██████    | 6000/10000 [45:00<24:44,  2.70it/s]  


Progress: 6000/10000 samples processed. Exact Matches: 1512



Processing:  65%|██████▌   | 6500/10000 [48:38<28:41,  2.03it/s]


Progress: 6500/10000 samples processed. Exact Matches: 1637



Processing:  70%|███████   | 7000/10000 [52:19<18:23,  2.72it/s]  


Progress: 7000/10000 samples processed. Exact Matches: 1770



Processing:  75%|███████▌  | 7500/10000 [55:59<15:38,  2.66it/s]  


Progress: 7500/10000 samples processed. Exact Matches: 1900



Processing:  80%|████████  | 8000/10000 [59:30<14:18,  2.33it/s]  


Progress: 8000/10000 samples processed. Exact Matches: 2032



Processing:  85%|████████▌ | 8500/10000 [1:03:26<17:03,  1.47it/s]  


Progress: 8500/10000 samples processed. Exact Matches: 2147



Processing:  90%|█████████ | 9000/10000 [1:06:59<07:28,  2.23it/s]


Progress: 9000/10000 samples processed. Exact Matches: 2278



Processing:  95%|█████████▌| 9500/10000 [1:10:47<03:00,  2.78it/s]


Progress: 9500/10000 samples processed. Exact Matches: 2401



Processing: 100%|██████████| 10000/10000 [1:14:20<00:00,  2.24it/s]


Progress: 10000/10000 samples processed. Exact Matches: 2542


Final Exact Matches after processing 10000 samples: 2542






# Implementation of Whisper AI(medium) and OCR to generate text for 10 random samples

In [7]:
audio_dir = '/kaggle/input/captcha-dataset/captchas/audio'
image_dir = '/kaggle/input/captcha-dataset/captchas/images'

ocr_reader = Reader(['en'])  # EasyOCR setup
whisper_model = whisper.load_model("medium")  # Whisper model

# Dictionary to convert number words to digits
number_map = {
    "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
    "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9"
}

# Dictionary for common misheard words
misheard_map = {
    "mall": "small", "moll": "small", "capitun": "capital", "capitan": "capital",
    "apital": "capital", "capitole": "capital", "zimro": "0", "smaller": "a"
}

# Step 1: Replace number words with digits
def replace_number_words(text):
    for word, digit in number_map.items():
        text = re.sub(rf"\b{word}\b", digit, text, flags=re.IGNORECASE)
    return text

# Step 2: Fix common misheard words
def fix_misheard_words(text):
    for wrong, correct in misheard_map.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)
    return text

# Step 3: Replace "for" with "4"
def replace_for_with_4(text):
    text = re.sub(r"\bfor\b", "4", text, flags=re.IGNORECASE)
    return text

# Step 4: Process capitalization and handle "edge" replacement
def process_capitalization(text):
    matches = re.findall(r"(capital|small) (\w+)", text, re.IGNORECASE)
    cleaned_text = ""

    i = 0
    while i < len(text):
        match_found = False
        
        for match in matches:
            marker, letter = match
            marker_index = text.lower().find(f"{marker.lower()} {letter.lower()}")
            
            if marker_index == i:
                if marker.lower() == "capital":
                    cleaned_text += letter.upper()
                else:
                    cleaned_text += letter.lower()
                i += len(marker) + 2
                match_found = True
                break

        if not match_found:
            cleaned_text += text[i]
            i += 1

    # Handle "edge" replacement based on the previous marker
    cleaned_text = re.sub(r"\bsmall\s+edge\b", "h", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"\bcapital\s+edge\b", "H", cleaned_text, flags=re.IGNORECASE)

    # Remove remaining "capital" or "small" words
    cleaned_text = re.sub(r"\b(capital|small)\b", "", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text)  # Keep only A-Z, a-z, 0-9
    return cleaned_text

# Function to get Whisper transcription
def get_whisper_transcription(audio_file):
    text = ""
    
    # Try Whisper transcription up to 3 times if length is < 6
    for _ in range(3):
        result = whisper_model.transcribe(audio_file)
        text = result["text"]
        text = replace_number_words(text)  # Replace number words with digits
        text = fix_misheard_words(text)  # Fix misheard words
        text = replace_for_with_4(text)  # Replace "for" with "4"
        text = process_capitalization(text)  # Process capitalization & "edge"
        
        if len(text) >= 6:  # Ensure minimum length of 6
            break

    return text[:6]  # Ensure the output is exactly 6 characters

def get_image_text(image_file):
    ocr_result = ocr_reader.readtext(image_file)
    text = ''.join([res[1] for res in ocr_result])
    return text

# Main function to run the process for random 10 files
def process_random_samples(audio_dir, image_dir, num_samples=10):
    audio_files = random.sample(os.listdir(audio_dir), num_samples)
    
    for audio_file in audio_files:
        audio_path = os.path.join(audio_dir, audio_file)
        image_file = audio_file.replace('.wav', '.png')  # Assuming same filename for audio and image
        image_path = os.path.join(image_dir, image_file)

        whisper_text = get_whisper_transcription(audio_path)

        ground_truth_text = get_image_text(image_path)

        print(f"Audio file: {audio_file}")
        print(f"Whisper AI Transcription: {whisper_text}")
        print(f"Ground Truth (OCR): {ground_truth_text}")
        print("="*50)

process_random_samples(audio_dir, image_dir, num_samples=10)

  checkpoint = torch.load(fp, map_location=device)


Audio file: captcha_4175.wav
Whisper AI Transcription: mPChvc
Ground Truth (OCR): mPChdc
Audio file: captcha_2383.wav
Whisper AI Transcription: 7X8Fqb
Ground Truth (OCR): 7XBFqb
Audio file: captcha_0550.wav
Whisper AI Transcription: FZQcBF
Ground Truth (OCR): FZQcB4
Audio file: captcha_7525.wav
Whisper AI Transcription: BYVTJD
Ground Truth (OCR): BYVTJD
Audio file: captcha_0435.wav
Whisper AI Transcription: zZWUWQ
Ground Truth (OCR): ZZWUWQ
Audio file: captcha_6497.wav
Whisper AI Transcription: 2BZ51J
Ground Truth (OCR): 2BZ51J
Audio file: captcha_2963.wav
Whisper AI Transcription: w9dKlX
Ground Truth (OCR): wgdKIX
Audio file: captcha_6342.wav
Whisper AI Transcription: RjlwWT
Ground Truth (OCR): RjlwWT
Audio file: captcha_4006.wav
Whisper AI Transcription: JQmbaG
Ground Truth (OCR): JQmbag
Audio file: captcha_3294.wav
Whisper AI Transcription: 5yhjHF
Ground Truth (OCR): SyhjHF


# Whisper AI(medium) Matching Accuracy Scores

In [6]:
audio_dir = '/kaggle/input/captcha-dataset/captchas/audio'
image_dir = '/kaggle/input/captcha-dataset/captchas/images'

ocr_reader = Reader(['en'])  # EasyOCR setup
whisper_model = whisper.load_model("medium")  # Whisper model

# Dictionary to convert number words to digits
number_map = {
    "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
    "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9"
}

# Dictionary for common misheard words
misheard_map = {
    "mall": "small", "moll": "small", "capitun": "capital", "capitan": "capital",
    "apital": "capital", "capitole": "capital", "zimro": "0", "smaller": "a"
}

# Step 1: Replace number words with digits
def replace_number_words(text):
    for word, digit in number_map.items():
        text = re.sub(rf"\b{word}\b", digit, text, flags=re.IGNORECASE)
    return text

# Step 2: Fix common misheard words
def fix_misheard_words(text):
    for wrong, correct in misheard_map.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)
    return text

# Step 3: Replace "for" with "4"
def replace_for_with_4(text):
    text = re.sub(r"\bfor\b", "4", text, flags=re.IGNORECASE)
    return text

# Step 4: Process capitalization and handle "edge" replacement
def process_capitalization(text):
    matches = re.findall(r"(capital|small) (\w+)", text, re.IGNORECASE)
    cleaned_text = ""

    i = 0
    while i < len(text):
        match_found = False
        
        for match in matches:
            marker, letter = match
            marker_index = text.lower().find(f"{marker.lower()} {letter.lower()}")
            
            if marker_index == i:
                if marker.lower() == "capital":
                    cleaned_text += letter.upper()
                else:
                    cleaned_text += letter.lower()
                i += len(marker) + 2
                match_found = True
                break

        if not match_found:
            cleaned_text += text[i]
            i += 1

    # Handle "edge" replacement based on the previous marker
    cleaned_text = re.sub(r"\bsmall\s+edge\b", "h", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"\bcapital\s+edge\b", "H", cleaned_text, flags=re.IGNORECASE)

    # Remove remaining "capital" or "small" words
    cleaned_text = re.sub(r"\b(capital|small)\b", "", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text)  # Keep only A-Z, a-z, 0-9
    return cleaned_text

# Function to get Whisper transcription
def get_whisper_transcription(audio_file):
    text = ""
    
    # Try Whisper transcription up to 3 times if length is < 6
    for _ in range(3):
        result = whisper_model.transcribe(audio_file)
        text = result["text"]
        text = replace_number_words(text)  # Replace number words with digits
        text = fix_misheard_words(text)  # Fix misheard words
        text = replace_for_with_4(text)  # Replace "for" with "4"
        text = process_capitalization(text)  # Process capitalization & "edge"
        
        if len(text) >= 6:  # Ensure minimum length of 6
            break

    return text[:6]  # Ensure the output is exactly 6 characters

def get_image_text(image_file):
    ocr_result = ocr_reader.readtext(image_file)
    text = ''.join([res[1] for res in ocr_result])
    return text

def process_all_samples(audio_dir, image_dir):
    all_files = os.listdir(audio_dir)
    total_files = len(all_files)
    cer_scores = []
    
    for i, audio_file in enumerate(tqdm(all_files, desc="Processing")):
        audio_path = os.path.join(audio_dir, audio_file)
        image_file = audio_file.replace('.wav', '.png')  # Assuming matching names
        image_path = os.path.join(image_dir, image_file)

        whisper_text = get_whisper_transcription(audio_path)

        ground_truth_text = get_image_text(image_path)

        if not ground_truth_text or not whisper_text:
            continue  

        error_rate = cer(ground_truth_text, whisper_text)  # CER calculation
        accuracy = max(0, 100 - (error_rate * 100))  # Convert to percentage accuracy
        cer_scores.append(accuracy)

        # Print progress every 500 samples
        if (i + 1) % 500 == 0:
            avg_accuracy = sum(cer_scores) / len(cer_scores)
            print(f"\nProgress: {i+1}/{total_files} samples processed. Current Accuracy: {avg_accuracy:.2f}%\n")

    avg_accuracy = sum(cer_scores) / len(cer_scores) if cer_scores else 0
    print(f"\nFinal Accuracy after processing {total_files} samples: {avg_accuracy:.2f}%\n")

# Run the function to process all samples
process_all_samples(audio_dir, image_dir)

100%|██████████████████████████████████████| 1.42G/1.42G [00:07<00:00, 195MiB/s]
  checkpoint = torch.load(fp, map_location=device)
Processing:   5%|▌         | 500/10000 [10:15<3:02:10,  1.15s/it]


Progress: 500/10000 samples processed. Current Accuracy: 79.25%



Processing:  10%|█         | 1000/10000 [20:49<2:52:04,  1.15s/it]


Progress: 1000/10000 samples processed. Current Accuracy: 79.66%



Processing:  15%|█▌        | 1500/10000 [31:28<2:47:09,  1.18s/it]


Progress: 1500/10000 samples processed. Current Accuracy: 79.42%



Processing:  20%|██        | 2000/10000 [41:45<2:34:18,  1.16s/it]


Progress: 2000/10000 samples processed. Current Accuracy: 78.96%



Processing:  25%|██▌       | 2500/10000 [52:14<2:40:49,  1.29s/it]


Progress: 2500/10000 samples processed. Current Accuracy: 79.27%



Processing:  30%|███       | 3000/10000 [1:02:28<2:07:28,  1.09s/it]


Progress: 3000/10000 samples processed. Current Accuracy: 79.47%



Processing:  35%|███▌      | 3500/10000 [1:12:55<3:18:37,  1.83s/it]


Progress: 3500/10000 samples processed. Current Accuracy: 79.47%



Processing:  40%|████      | 4000/10000 [1:23:45<2:03:55,  1.24s/it] 


Progress: 4000/10000 samples processed. Current Accuracy: 79.56%



Processing:  45%|████▌     | 4500/10000 [1:34:50<2:37:13,  1.72s/it] 


Progress: 4500/10000 samples processed. Current Accuracy: 79.46%



Processing:  50%|█████     | 5000/10000 [1:45:14<1:40:58,  1.21s/it]


Progress: 5000/10000 samples processed. Current Accuracy: 79.53%



Processing:  50%|█████     | 5035/10000 [1:45:56<1:44:27,  1.26s/it]


KeyboardInterrupt: 

# Whisper AI(medium) - No. of texts matched

In [10]:
audio_dir = '/kaggle/input/captcha-dataset/captchas/audio'
image_dir = '/kaggle/input/captcha-dataset/captchas/images'

ocr_reader = Reader(['en'])  # EasyOCR setup
whisper_model = whisper.load_model("medium")  # Whisper model

# Dictionary to convert number words to digits
number_map = {
    "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
    "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9"
}

# Dictionary for common misheard words
misheard_map = {
    "mall": "small", "moll": "small", "capitun": "capital", "capitan": "capital",
    "apital": "capital", "capitole": "capital", "zimro": "0", "smaller": "a"
}

# Step 1: Replace number words with digits
def replace_number_words(text):
    for word, digit in number_map.items():
        text = re.sub(rf"\b{word}\b", digit, text, flags=re.IGNORECASE)
    return text

# Step 2: Fix common misheard words
def fix_misheard_words(text):
    for wrong, correct in misheard_map.items():
        text = re.sub(rf"\b{wrong}\b", correct, text, flags=re.IGNORECASE)
    return text

# Step 3: Replace "for" with "4"
def replace_for_with_4(text):
    text = re.sub(r"\bfor\b", "4", text, flags=re.IGNORECASE)
    return text

# Step 4: Process capitalization and handle "edge" replacement
def process_capitalization(text):
    matches = re.findall(r"(capital|small) (\w+)", text, re.IGNORECASE)
    cleaned_text = ""

    i = 0
    while i < len(text):
        match_found = False
        
        for match in matches:
            marker, letter = match
            marker_index = text.lower().find(f"{marker.lower()} {letter.lower()}")
            
            if marker_index == i:
                if marker.lower() == "capital":
                    cleaned_text += letter.upper()
                else:
                    cleaned_text += letter.lower()
                i += len(marker) + 2
                match_found = True
                break

        if not match_found:
            cleaned_text += text[i]
            i += 1

    # Handle "edge" replacement based on the previous marker
    cleaned_text = re.sub(r"\bsmall\s+edge\b", "h", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"\bcapital\s+edge\b", "H", cleaned_text, flags=re.IGNORECASE)

    # Remove remaining "capital" or "small" words
    cleaned_text = re.sub(r"\b(capital|small)\b", "", cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text)  # Keep only A-Z, a-z, 0-9
    return cleaned_text

# Function to get Whisper transcription
def get_whisper_transcription(audio_file):
    text = ""
    
    # Try Whisper transcription up to 3 times if length is < 6
    for _ in range(3):
        result = whisper_model.transcribe(audio_file)
        text = result["text"]
        text = replace_number_words(text)  # Replace number words with digits
        text = fix_misheard_words(text)  # Fix misheard words
        text = replace_for_with_4(text)  # Replace "for" with "4"
        text = process_capitalization(text)  # Process capitalization & "edge"
        
        if len(text) >= 6:  # Ensure minimum length of 6
            break

    return text[:6]  # Ensure the output is exactly 6 characters

def get_image_text(image_file):
    ocr_result = ocr_reader.readtext(image_file)
    text = ''.join([res[1] for res in ocr_result])
    return text

def process_all_samples(audio_dir, image_dir):
    all_files = os.listdir(audio_dir)
    total_files = len(all_files)
    exact_match_count = 0  # Counter for exact matches
    
    for i, audio_file in enumerate(tqdm(all_files, desc="Processing")):
        audio_path = os.path.join(audio_dir, audio_file)
        image_file = audio_file.replace('.wav', '.png')  # Assuming matching names
        image_path = os.path.join(image_dir, image_file)

        whisper_text = get_whisper_transcription(audio_path)

        ground_truth_text = get_image_text(image_path)

        if not ground_truth_text or not whisper_text:
            continue  

        if whisper_text == ground_truth_text:
            exact_match_count += 1

        if (i + 1) % 500 == 0:
            print(f"\nProgress: {i+1}/{total_files} samples processed. Exact Matches: {exact_match_count}\n")

    # Final count of exact matches
    print(f"\nFinal Exact Matches after processing {total_files} samples: {exact_match_count}\n")

# Run the function to process all samples
process_all_samples(audio_dir, image_dir)

  checkpoint = torch.load(fp, map_location=device)
Processing:   5%|▌         | 500/10000 [10:07<2:57:02,  1.12s/it]


Progress: 500/10000 samples processed. Exact Matches: 151



Processing:  10%|█         | 1000/10000 [20:31<2:51:11,  1.14s/it]


Progress: 1000/10000 samples processed. Exact Matches: 296



Processing:  15%|█▌        | 1500/10000 [31:01<2:45:27,  1.17s/it]


Progress: 1500/10000 samples processed. Exact Matches: 442



Processing:  20%|██        | 2000/10000 [41:03<2:29:49,  1.12s/it]


Progress: 2000/10000 samples processed. Exact Matches: 569



Processing:  25%|██▌       | 2500/10000 [51:25<2:38:26,  1.27s/it]


Progress: 2500/10000 samples processed. Exact Matches: 728



Processing:  30%|███       | 3000/10000 [1:01:34<2:05:21,  1.07s/it]


Progress: 3000/10000 samples processed. Exact Matches: 885



Processing:  35%|███▌      | 3500/10000 [1:11:52<3:17:36,  1.82s/it]


Progress: 3500/10000 samples processed. Exact Matches: 1046



Processing:  40%|████      | 4000/10000 [1:22:21<2:02:06,  1.22s/it]


Progress: 4000/10000 samples processed. Exact Matches: 1199



Processing:  45%|████▌     | 4500/10000 [1:33:16<2:34:40,  1.69s/it] 


Progress: 4500/10000 samples processed. Exact Matches: 1337



Processing:  50%|█████     | 5000/10000 [1:43:28<1:38:10,  1.18s/it]


Progress: 5000/10000 samples processed. Exact Matches: 1492



Processing:  55%|█████▌    | 5500/10000 [1:53:57<1:23:35,  1.11s/it]


Progress: 5500/10000 samples processed. Exact Matches: 1637



Processing:  60%|██████    | 6000/10000 [2:04:27<1:18:03,  1.17s/it]


Progress: 6000/10000 samples processed. Exact Matches: 1775



Processing:  65%|██████▌   | 6500/10000 [2:14:58<1:08:40,  1.18s/it]


Progress: 6500/10000 samples processed. Exact Matches: 1916



Processing:  70%|███████   | 7000/10000 [2:25:28<1:00:21,  1.21s/it]


Progress: 7000/10000 samples processed. Exact Matches: 2065



Processing:  75%|███████▌  | 7500/10000 [2:35:52<47:58,  1.15s/it]  


Progress: 7500/10000 samples processed. Exact Matches: 2220



Processing:  80%|████████  | 8000/10000 [2:46:21<40:47,  1.22s/it]  


Progress: 8000/10000 samples processed. Exact Matches: 2362



Processing:  85%|████████▌ | 8500/10000 [2:56:50<28:41,  1.15s/it]  


Progress: 8500/10000 samples processed. Exact Matches: 2510



Processing:  90%|█████████ | 9000/10000 [3:07:45<25:58,  1.56s/it]


Progress: 9000/10000 samples processed. Exact Matches: 2668



Processing:  95%|█████████▌| 9500/10000 [3:18:12<09:45,  1.17s/it]


Progress: 9500/10000 samples processed. Exact Matches: 2808



Processing: 100%|██████████| 10000/10000 [3:28:28<00:00,  1.25s/it]


Progress: 10000/10000 samples processed. Exact Matches: 2962


Final Exact Matches after processing 10000 samples: 2962




