In [None]:
import os
# Folder (relative to the notebook's working directory) holding the labelled
# CAPTCHA images.
captcha_images_folder = "captcha_images"
# os.listdir returns entries in arbitrary, platform-dependent order; sorting
# makes every run visit the CAPTCHAs in the same sequence, so the numbering of
# the extracted letter images is reproducible.
captchas = [
    os.path.join(captcha_images_folder, f)
    for f in sorted(os.listdir(captcha_images_folder))
]

In [None]:
import cv2
def preprocessCAPTCHA(img):
    """Prepare a CAPTCHA image for contour detection.

    The image is converted from BGR to grayscale, padded on every side with
    an 8-pixel border that replicates the edge pixels, and then binarized
    with an inverted binary threshold whose level is chosen by Otsu's method.

    Args:
        img: Colour image in BGR channel order.

    Returns:
        A ``(padded_grayscale, thresholded)`` tuple; both images include the
        8-pixel border.
    """
    border_px = 8
    padded_gray = cv2.copyMakeBorder(
        cv2.cvtColor(img, cv2.COLOR_BGR2GRAY),
        border_px,
        border_px,
        border_px,
        border_px,
        cv2.BORDER_REPLICATE,
    )
    # With THRESH_OTSU the explicit threshold value (0) is ignored and the
    # level is computed from the image histogram.
    threshold_mode = cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    _, binarized = cv2.threshold(padded_gray, 0, 255, threshold_mode)
    return padded_gray, binarized

In [None]:
def getCAPTCHAlabel(pathToFile):
    """Derive a CAPTCHA's label from the name of its image file.

    The label is the part of the file name (directories stripped) that
    precedes the first ``"."``; a name without a dot is returned whole.

    Args:
        pathToFile: Path to the CAPTCHA image file.

    Returns:
        The label string.
    """
    _, filename = os.path.split(pathToFile)
    return filename.partition(".")[0]

In [None]:
def findBoundingRectanglesOfContours(contours, max_aspect_ratio=1.25):
    """Compute one bounding rectangle per letter from a list of contours.

    A contour whose bounding box is much wider than it is tall is treated as
    two letters that were detected as a single blob, and its box is split
    vertically into a left and a right half.

    Args:
        contours: Iterable of contours accepted by ``cv2.boundingRect``.
        max_aspect_ratio: Width/height ratio above which a bounding box is
            split in two. Defaults to 1.25.

    Returns:
        A list of ``(x, y, w, h)`` tuples, in the order the contours were
        given (the two halves of a split box are adjacent, left first).
    """
    letter_bounding_rectangles = []
    for contour in contours:
        (x, y, w, h) = cv2.boundingRect(contour)
        if w / h > max_aspect_ratio:
            left_width = w // 2
            # Give the right half whatever is left over so that an odd-width
            # box does not lose its rightmost pixel column.
            right_width = w - left_width
            letter_bounding_rectangles.append((x, y, left_width, h))
            letter_bounding_rectangles.append((x + left_width, y, right_width, h))
        else:
            letter_bounding_rectangles.append((x, y, w, h))
    return letter_bounding_rectangles

In [None]:
def CAPTCHAtoGrayscaleAndBoundingRectangles(captcha_image_file):
    """Load a CAPTCHA image and locate the bounding rectangles of its letters.

    Args:
        captcha_image_file: Path to the CAPTCHA image file.

    Returns:
        A ``(gray, letter_bounding_rectangles)`` tuple: the first value
        returned by ``preprocessCAPTCHA`` and a list of ``(x, y, w, h)``
        rectangles sorted by ascending ``x`` (left to right).

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    image = cv2.imread(captcha_image_file)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than deep inside cvtColor.
        raise ValueError(
            "Could not read image file: {}".format(captcha_image_file)
        )
    gray, preprocessed = preprocessCAPTCHA(image)
    found = cv2.findContours(
        preprocessed.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    # OpenCV 3.x returns (image, contours, hierarchy) while 2.x and 4.x return
    # (contours, hierarchy); the contours are always the second-to-last item.
    contours = found[-2]
    letter_bounding_rectangles = findBoundingRectanglesOfContours(contours)
    # Order the rectangles left to right so they line up with the label text.
    letter_bounding_rectangles = sorted(
        letter_bounding_rectangles, key=lambda rect: rect[0]
    )
    return gray, letter_bounding_rectangles

In [None]:
def boundingRectangleToLetterImage(letter_bounding_box, grayscaled, margin=2):
    """Crop a single letter, plus a small margin, out of an image.

    Args:
        letter_bounding_box: ``(x, y, w, h)`` rectangle of the letter.
        grayscaled: Image indexed as ``[row, column]`` (e.g. a 2-D NumPy
            array) to crop from.
        margin: Number of extra pixels to include on every side of the
            rectangle. Defaults to 2.

    Returns:
        The cropped region of ``grayscaled``. Where the margin would extend
        past an image edge the crop simply stops at that edge.
    """
    x, y, w, h = letter_bounding_box
    # A negative slice start would wrap around to the far side of the array
    # and produce an empty or wrong crop, so clamp the top/left edges at 0.
    # Slice ends beyond the array size are already clipped by slicing itself.
    top = max(y - margin, 0)
    left = max(x - margin, 0)
    letter_image = grayscaled[top:y + h + margin, left:x + w + margin]
    return letter_image

In [None]:
# Root folder for the extracted letter images; one sub-folder per character.
captcha_processing_output_folder = "extracted_letter_images"
# Maps each character to the number that the next saved image of that
# character will receive (numbering starts at 1).
character_counts = {}


def cropBoundingRectanglesAndSaveToFile(letter_bounding_rectangles, gray, captcha_label):
    """Crop every letter of a CAPTCHA and save it under its character's folder.

    Rectangles and label characters are paired positionally; if their counts
    differ, the extra items of the longer one are ignored. Each crop is
    written to ``<captcha_processing_output_folder>/<character>/<n>.png``,
    where ``n`` is a per-character running number kept in the module-level
    ``character_counts`` dict.

    Args:
        letter_bounding_rectangles: ``(x, y, w, h)`` rectangles, ordered to
            match the characters of ``captcha_label``.
        gray: Image the letters are cropped from.
        captcha_label: The text shown in the CAPTCHA.
    """
    for letter_bounding_rectangle, current_letter in zip(letter_bounding_rectangles, captcha_label):
        letter_image = boundingRectangleToLetterImage(letter_bounding_rectangle, gray)

        save_path = os.path.join(captcha_processing_output_folder, current_letter)
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists test.
        os.makedirs(save_path, exist_ok=True)

        character_count = character_counts.get(current_letter, 1)

        image_path = os.path.join(save_path, str(character_count) + ".png")
        cv2.imwrite(image_path, letter_image)

        character_counts[current_letter] = character_count + 1

In [None]:
import imutils
import numpy as np

# Extract and save the individual letters of every CAPTCHA in the dataset.
for captcha_image_file in captchas:
    captcha_label = getCAPTCHAlabel(captcha_image_file)
    gray, letter_bounding_rectangles = CAPTCHAtoGrayscaleAndBoundingRectangles(
        captcha_image_file
    )
    # Only keep CAPTCHAs for which exactly four letter regions were found;
    # any other count is silently skipped.
    if len(letter_bounding_rectangles) == 4:
        cropBoundingRectanglesAndSaveToFile(
            letter_bounding_rectangles, gray, captcha_label
        )