In [None]:
import os
import numpy as np
import cv2
from sklearn.model_selection import train_test_split

# Folder holding the raw CAPTCHA images. Paths are built further down with plain
# string concatenation (data_path + image_name), so the trailing slash is required.
data_path = "./../data/raw_data/"

In [None]:
# Function to preprocess raw data into segmented images.
def segmentation_preprocess():
    """
    Cleans data and splits into character by character images.

    For every readable image in ``data_path`` the image is inverse-thresholded,
    converted to grayscale, and its external contours are reduced to exactly
    four bounding boxes ordered left to right. Each box is cropped over the full
    image height with one extra pixel column on each side, brought to a width of
    24 pixels and written to ``../data/segmented_data/`` as
    ``<code>_<index>_<char>.png``, where ``<code>`` is the file name up to the
    first ``.`` and ``<char>`` is ``code[index]``.

    Skipped entirely (no files written for that entry):
      * directory entries OpenCV cannot read as an image,
      * file names whose code holds fewer than four characters,
      * images whose contours cannot be turned into four boxes.
    """
    n_chars = 4
    char_width = 24
    out_dir = "../data/segmented_data/"
    # Make sure the destination exists before any image is written to it.
    os.makedirs(out_dir, exist_ok=True)

    for image_name in os.listdir(data_path):
        img = cv2.imread(data_path + image_name)
        if img is None:
            # Hidden files, sub-directories and other non-image entries.
            continue

        code = image_name.split(".")[0]
        if len(code) < n_chars:
            # code[i] below would run past the end of the label.
            continue

        # Clean the image and find contours to try to identify letters.
        _, thresholded = cv2.threshold(img, 240, 255, cv2.THRESH_BINARY_INV)
        to_gray = cv2.cvtColor(thresholded, cv2.COLOR_BGR2GRAY)
        ctrs, _ = cv2.findContours(to_gray, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        my_rects = _fit_rect_count([cv2.boundingRect(ctr) for ctr in ctrs], n_chars)
        if my_rects is None:
            continue

        # Save each of the individual characters to image.
        for i, rect in enumerate(my_rects):
            new_image = _crop_to_width(to_gray, rect[0], rect[2], char_width)
            cv2.imwrite(out_dir + code + "_" + str(i) + "_" + code[i] + ".png", new_image)


def _fit_rect_count(rects, n_rects):
    """Turn a list of (x, y, w, h) boxes into exactly ``n_rects`` boxes sorted by x.

    Returns an ``(n_rects, 4)`` integer array, or ``None`` when there are no
    boxes at all or the widest box is too narrow to be split any further.
    """
    rects = np.asarray(rects, dtype=int).reshape(-1, 4)
    if len(rects) == 0:
        return None

    if len(rects) > n_rects:
        # Too many contours: keep only the n_rects boxes with the largest area.
        areas = rects[:, 2] * rects[:, 3]
        rects = rects[np.argsort(areas, kind="stable")[-n_rects:]]

    # Too few contours: repeatedly split the widest box into two halves.
    while len(rects) < n_rects:
        widest = int(np.argmax(rects[:, 2]))
        x, y, w, h = rects[widest]
        if w < 2:
            return None
        half = w // 2
        halves = np.asarray([[x, y, half, h], [x + half, y, w - half, h]])
        rects = np.concatenate((rects[:widest], halves, rects[widest + 1:]))

    # Order whole rows by their x coordinate. np.sort(..., axis=0) would sort
    # every column on its own and detach each x from its width.
    return rects[np.argsort(rects[:, 0], kind="stable")]


def _crop_to_width(image, x, w, width):
    """Cut columns ``x-1 .. x+w`` out of ``image`` and return them ``width`` pixels wide."""
    # Keep one extra column of the original image on each side, clamped to the
    # image so a box touching the left border does not produce a negative index.
    left = max(int(x) - 1, 0)
    right = min(int(x) + int(w) + 1, image.shape[1])
    crop = image[:, left:right]

    if crop.shape[1] > width:
        # Negative padding is not possible; squeeze over-wide crops instead.
        crop = cv2.resize(crop, (width, crop.shape[0]), interpolation=cv2.INTER_AREA)

    total_pad = width - crop.shape[1]
    left_pad = total_pad // 2
    right_pad = total_pad - left_pad
    return np.pad(crop, pad_width=((0, 0), (left_pad, right_pad)), mode="constant")

In [2]:
# Functions to split preprocessed data into train/test/val folders.
def get_data(test_size=0.3, processed_data_path="./../data/segmented_data/"):
    """
    Returns the data split into the proper train/test/val split.

    Parameters
    ----------
    test_size : float
        Fraction of the CAPTCHAs held out of the training set. The held-out part
        is then divided in half into the test and validation sets.
    processed_data_path : str
        Folder containing the per-character images. File names are read in
        sorted order and every four consecutive files are treated as one CAPTCHA.

    Returns
    -------
    X_train, X_test, X_val, Y_train, Y_test, Y_val
        Object arrays. Each X entry groups the four character images of one
        CAPTCHA; each Y entry holds the matching four labels, where a label is
        the last character of the file name before the first ``.``.
    """
    images = []
    labels = []
    captcha = []
    captcha_labels = []
    # Create images and labels in a shape such that each CAPTCHA's individual letter images are kept together
    for count, image_path in enumerate(sorted(os.listdir(processed_data_path)), start=1):
        # os.path.join works whether or not the folder path ends with a separator.
        image = cv2.imread(os.path.join(processed_data_path, image_path), cv2.IMREAD_GRAYSCALE)

        label = image_path.split('.')[0][-1]
        captcha.append(image)
        captcha_labels.append(label)

        if count % 4 == 0:
            images.append(captcha)
            labels.append(captcha_labels)
            captcha = []
            captcha_labels = []

    X_train, X_testval, Y_train, Y_testval = train_test_split(
        np.asarray(images, dtype=object),
        np.asarray(labels, dtype=object),
        test_size=test_size,
        random_state=42,
    )
    X_test, X_val, Y_test, Y_val = train_test_split(X_testval, Y_testval, test_size=0.5, random_state=42)

    return X_train, X_test, X_val, Y_train, Y_test, Y_val


def split_it_up():
    """
    Gets processed data from original folder and splits it into new folders train, test, and val.

    The split comes from ``get_data()``. It is called without arguments: the
    30 % hold-out and the ``segmented_data`` source folder that used to be passed
    positionally are the values ``get_data`` already uses on its own.

    Each character image is written to
    ``./../data/segmented_data_split/<train|test|val>/<code>_<index>_<char>.png``,
    where ``<code>`` is the CAPTCHA's labels joined together. The three
    sub-folders are created if they do not exist yet.
    """
    save_folder = "./../data/segmented_data_split/"
    X_train, X_test, X_val, Y_train, Y_test, Y_val = get_data()

    splits = (
        ("train/", X_train, Y_train),
        ("test/", X_test, Y_test),
        ("val/", X_val, Y_val),
    )

    # For each of train, test, val save the respective images to these new folders.
    for subfolder, captchas, captcha_labels in splits:
        os.makedirs(save_folder + subfolder, exist_ok=True)
        for imgs, labels in zip(captchas, captcha_labels):
            code = ''.join(labels)
            for i, img in enumerate(imgs):
                item_name = code + "_" + str(i) + "_" + labels[i]
                cv2.imwrite(save_folder + subfolder + item_name + '.png', np.asarray(img, dtype=np.uint8))


In [None]:
# Run to preprocess data and split into train/test/val folders for Segmentation & Barcoding.
# Order matters: segmentation_preprocess() writes the per-character images that
# split_it_up() then loads and distributes.
segmentation_preprocess()
split_it_up()

In [None]:
# Function to preprocess raw data into clean images for OCR
def ocr_preprocess():
    """
    Cleans and saves data.

    Every readable image in ``data_path`` is inverse-thresholded at 240,
    converted to grayscale and written under its original file name to
    ``../data/ocr_data/``. The output folder is created when missing, and
    directory entries OpenCV cannot read as an image are skipped.
    """
    out_dir = "../data/ocr_data/"
    # Make sure the destination exists before any image is written to it.
    os.makedirs(out_dir, exist_ok=True)

    for image_name in os.listdir(data_path):
        img = cv2.imread(data_path + image_name)
        if img is None:
            # Hidden files, sub-directories and other non-image entries.
            continue
        _, thresholded = cv2.threshold(img, 240, 255, cv2.THRESH_BINARY_INV)
        to_gray = cv2.cvtColor(thresholded, cv2.COLOR_BGR2GRAY)
        cv2.imwrite(out_dir + image_name, to_gray)

In [None]:
# Functions to split preprocessed OCR data into train/test/val folders
def get_data_ocr():
    """
    Loads the cleaned OCR images and returns them as a train/test/val split.

    Files in ``./../data/ocr_data/`` are read in sorted order as grayscale
    images. The label of an image is its file name up to the first ``.``,
    broken into a list of single characters. 30 % of the samples are held out
    and that hold-out is divided in half; both splits use ``random_state=42``.

    Returns X_train, X_test, X_val, Y_train, Y_test, Y_val.
    """
    source_dir = "./../data/ocr_data/"
    file_names = sorted(os.listdir(source_dir))

    captcha = [cv2.imread(source_dir + name, cv2.IMREAD_GRAYSCALE) for name in file_names]
    captcha_labels = [list(name.split('.')[0]) for name in file_names]

    features = np.asarray(captcha, dtype=object)
    targets = np.asarray(captcha_labels, dtype=object)

    X_train, X_holdout, Y_train, Y_holdout = train_test_split(
        features, targets, test_size=.3, random_state=42
    )
    X_test, X_val, Y_test, Y_val = train_test_split(
        X_holdout, Y_holdout, test_size=0.5, random_state=42
    )

    return X_train, X_test, X_val, Y_train, Y_test, Y_val


def split_it_up_ocr():
    """
    Gets processed data from original folder and splits it into new folders train, test, and val.

    The split comes from ``get_data_ocr()``. Each image is written to
    ``./../data/ocr_data_split/<train|test|val>/<label>.png``, where ``<label>``
    is the sample's label characters joined together. The three sub-folders are
    created if they do not exist yet.
    """
    save_folder = "./../data/ocr_data_split/"
    X_train, X_test, X_val, Y_train, Y_test, Y_val = get_data_ocr()

    splits = (
        ("train/", X_train, Y_train),
        ("test/", X_test, Y_test),
        ("val/", X_val, Y_val),
    )

    for subfolder, imgs, labels in splits:
        os.makedirs(save_folder + subfolder, exist_ok=True)
        for img, label in zip(imgs, labels):
            item_name = ''.join(label)
            cv2.imwrite(save_folder + subfolder + item_name + '.png', np.asarray(img, dtype=np.uint8))

# NOTE: split_it_up_ocr() is deliberately not invoked in this cell. The run cell
# below calls it right after ocr_preprocess(), which has to populate the OCR data
# folder first; calling it here would run on a missing or stale folder.

In [None]:
# Run to preprocess data and split into train/test/val folders for OCR.
# Order matters: ocr_preprocess() writes the cleaned images that
# split_it_up_ocr() then loads and distributes.
ocr_preprocess()
split_it_up_ocr()