In [1]:
# Required imports
import numpy as np
import pandas as pd
import cv2
import os

In [2]:
def load_images(folder):
    ''' Load the images of the required symbols/digits,
    find the maximum bounding rectangle for each symbol/digit images.
    Finally resize and reshape into the required dimensions'''
    data = []
    
    for file in os.listdir(folder):
        img = cv2.imread(os.path.join(folder, file), cv2.IMREAD_GRAYSCALE)

        if img is not None:
            img = ~img
            _, thresh = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)
            ctrs, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
            cnt = sorted(ctrs, key=lambda ctr: cv2.boundingRect(ctr)[0])

            max_area = 0 
            for c in cnt:
                x, y, w, h = cv2.boundingRect(c)
                max_area = max(w*h, max_area) # Get the maximum area bounding rect

                if max_area == w*h:
                    x_max, y_max = x, y
                    w_max, h_max = w, h

            cropped = thresh[y_max:y_max+h_max+10, x_max:x_max+w_max+10] # Get the digit/symbol
            resized = cv2.resize(cropped, (28,28)) # Resize the image to 28x28
            resized = np.reshape(resized, (784,1)) # Reshape to get 784 features
            data.append(resized)
            
    return data

In [3]:
DATA_PATH = "../assets/digits_symbols_dataset/"
DIRS = os.listdir(DATA_PATH)

HELPER_DICT = {
    '0': '0',
    '1': '1',
    '2': '2',
    '3': '3',
    '4': '4',
    '5': '5',
    '6': '6',
    '7': '7',
    '8': '8',
    '9': '9',
    '-': '10',
    '+': '11',
    'times': '12',
    'forward_slash': '13',
    '-': '14',
    '(': '15',
    ')': '16',
    'sin': '17',
    'tan': '18',
    'log': '19',
    'sqrt': '20',
}

In [4]:
final_data = []

In [5]:
ignore = [".DS_Store", ".ipynb_checkpoints"]
flag = True
idx = 1

for folder in DIRS:
    if folder not in ignore:
        if flag: # Append a symbol/digit data first into the final_data
            final_data = load_images(DATA_PATH + folder)
            for i in range(len(final_data)):
                final_data[i] = np.append(final_data[i], [HELPER_DICT[folder]])
            
            flag = False
            print(f"{folder} => {len(final_data)} ************* [Dir Index: {idx}]")
        else:
            # Concatenate the rest of the symbol/digit data with the final data
            data = load_images(DATA_PATH + folder)

            for i in range(len(data)):
                data[i] = np.append(data[i], [HELPER_DICT[folder]])
            final_data = np.concatenate((final_data, data))

            print(f"{folder} => {len(final_data)} ************* [Dir Index: {idx}]")
        idx += 1

forward_slash => 2587 ************* [Dir Index: 1]
times => 5587 ************* [Dir Index: 2]
sin => 8587 ************* [Dir Index: 3]
9 => 11587 ************* [Dir Index: 4]
0 => 14587 ************* [Dir Index: 5]
7 => 17496 ************* [Dir Index: 6]
+ => 20496 ************* [Dir Index: 7]
6 => 23496 ************* [Dir Index: 8]
1 => 26496 ************* [Dir Index: 9]
8 => 29496 ************* [Dir Index: 10]
- => 31996 ************* [Dir Index: 11]
sqrt => 34996 ************* [Dir Index: 12]
log => 36997 ************* [Dir Index: 13]
( => 39997 ************* [Dir Index: 14]
4 => 42997 ************* [Dir Index: 15]
3 => 45997 ************* [Dir Index: 16]
tan => 48447 ************* [Dir Index: 17]
) => 51447 ************* [Dir Index: 18]
2 => 54447 ************* [Dir Index: 19]
5 => 57447 ************* [Dir Index: 20]


In [6]:
train_data = pd.DataFrame(final_data, index=None)
train_data.to_csv("final_train_data.csv", index=False)