In [2]:
import os
import random
import shutil

from google_drive_downloader import GoogleDriveDownloader as gdd
import cv2
import numpy as np

import imgaug as ia
from imgaug import augmenters as iaa

from PIL import Image



In [3]:
dataset_id_in_cloud = "1TZF0qVN1PojR5o_nPR_cl7T9sxH37iMY"

main_dir = "."
data_dir = os.path.join(main_dir, "data")
dataset_zip_dir = os.path.join(data_dir, "roman-numbers-dataset.zip")
dataset_dir = os.path.join(data_dir, "roman-numbers-dataset")

train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
val_dir = os.path.join(data_dir, "val")

train_portion = 0.6
test_portion = 0.2
val_portion = 0.2

img_height = 64
img_width = 64


### Loading dataset

In [4]:
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

if not os.path.exists(dataset_dir):
    print("Downloading the dataset...")
    gdd.download_file_from_google_drive(
        file_id=dataset_id_in_cloud,
        dest_path=dataset_zip_dir,
        unzip=True)
    print("Completed.")


### Split data

In [5]:
classes = os.listdir(dataset_dir)
n_classes = len(classes)
print(n_classes)
print(classes)


8
['II', 'IV', 'VI', 'V', 'VIII', 'VII', 'III', 'I']


In [6]:
def create_directories(dir_name, classes_list):
    """
    Create folder for each class
    """
    if os.path.exists(dir_name):
        shutil.rmtree(dir_name)
    os.makedirs(dir_name)
    
    for class_name in classes_list:   
        os.makedirs(os.path.join(dir_name, class_name))
        
def resize_img(img_path, img_height, img_width):
    img = cv2.imread(img_path)
    img = cv2.resize(img, (img_height, img_width))
    cv2.imwrite(img_path, img)
    

In [7]:
create_directories(train_dir, classes)
create_directories(val_dir, classes)
create_directories(test_dir, classes)

In [8]:
random.seed(5)

for class_name in classes:
    image_list = os.listdir(os.path.join(dataset_dir, class_name))
    random.shuffle(image_list)
    
    n_train_img = int(len(image_list) * train_portion)
    n_valid_img = int(len(image_list) * val_portion)
    n_test_img = len(image_list) - n_train_img - n_valid_img
    
    train_list = image_list[: n_train_img]
    valid_list = image_list[n_train_img: n_train_img + n_valid_img]
    test_list = image_list[n_train_img + n_valid_img:]
    
    data_lists = [train_list, valid_list, test_list]
    dirs_list = [train_dir, val_dir, test_dir]
    
    for i, data_list in enumerate(data_lists):
        for img_name in data_list:
            shutil.copy2(os.path.join(dataset_dir, class_name, img_name), 
                        os.path.join(dirs_list[i], class_name))
            img_path = os.path.join(dirs_list[i], class_name, img_name)
            resize_img(img_path, img_height, img_width)
            
    print("\n", class_name, ":")
    print("total number of imgs =", len(image_list))
    print("number of imgs for train =", n_train_img)
    print("number of imgs for valid =", n_valid_img)
    print("number of imgs for test =", n_test_img)
    


 II :
total number of imgs = 198
number of imgs for train = 118
number of imgs for valid = 39
number of imgs for test = 41

 IV :
total number of imgs = 197
number of imgs for train = 118
number of imgs for valid = 39
number of imgs for test = 40

 VI :
total number of imgs = 195
number of imgs for train = 117
number of imgs for valid = 39
number of imgs for test = 39

 V :
total number of imgs = 193
number of imgs for train = 115
number of imgs for valid = 38
number of imgs for test = 40

 VIII :
total number of imgs = 197
number of imgs for train = 118
number of imgs for valid = 39
number of imgs for test = 40

 VII :
total number of imgs = 198
number of imgs for train = 118
number of imgs for valid = 39
number of imgs for test = 41

 III :
total number of imgs = 195
number of imgs for train = 117
number of imgs for valid = 39
number of imgs for test = 39

 I :
total number of imgs = 199
number of imgs for train = 119
number of imgs for valid = 39
number of imgs for test = 41


In [9]:
def load_train_data():
    X = list()
    y = list()
    
    for folder in classes:
        path = '{}/{}/'.format(train_dir, folder)
        files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]

        for filename in files:
            img = Image.open(path + filename)
            img = img.resize((64,64))
            img = np.array(img)
            X.append(img)
            y.append(folder)

    return X, y

In [10]:
X, y = load_train_data()

In [24]:
def augmentation(images):
    original_images = images.copy()
    result = list()

    transformations = [
        iaa.Fliplr(1),
        iaa.Flipud(1),
        iaa.Affine(rotate=10),
        iaa.Affine(rotate=22),
        iaa.Affine(rotate=45),
        iaa.Affine(rotate=67),
        iaa.Affine(rotate=90),
        iaa.GaussianBlur(sigma=(0.0, 3.0)),
        iaa.Dropout(p=(0, 0.2)),
        iaa.Affine(translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}),
        iaa.CropAndPad(percent=(-0.25, 0.25))
    ]

    for transform in transformations:
        aug_images = transform.augment_images(original_images)
        result.extend(aug_images)
    
    return images + result

In [29]:
aug_images = augmentation(X)
labels = y * (len(aug_images) // len(y))