In [1]:
import os
import random
import shutil

from google_drive_downloader import GoogleDriveDownloader as gdd
import cv2
import numpy as np


In [2]:
# Google Drive file id of the zipped dataset.
dataset_id_in_cloud = "1TZF0qVN1PojR5o_nPR_cl7T9sxH37iMY"

# All paths are built relative to main_dir, under its "data" sub-folder.
main_dir = "."
data_dir = os.path.join(main_dir, "data")
dataset_dir = os.path.join(data_dir, "roman-numbers-dataset")
# Despite the "_dir" suffix this is the path of the downloaded archive file.
dataset_zip_dir = dataset_dir + ".zip"

# One output folder per split.
train_dir, test_dir, val_dir = (
    os.path.join(data_dir, split_name) for split_name in ("train", "test", "val")
)

# Per-class share of images for each split.
train_portion, test_portion, val_portion = 0.6, 0.2, 0.2

# Target size (in pixels) passed to the resize helper.
img_height = img_width = 64


### Loading dataset

In [3]:
# Create the data folder if needed; exist_ok makes the call safe to re-run.
os.makedirs(data_dir, exist_ok=True)

# The extracted dataset folder acts as the "already downloaded" marker, so
# re-running this cell does not download the archive again.
if not os.path.exists(dataset_dir):
    print("Downloading the dataset...")
    gdd.download_file_from_google_drive(
        file_id=dataset_id_in_cloud,
        dest_path=dataset_zip_dir,
        unzip=True)
    # Fail here with a clear message instead of at the first os.listdir()
    # further down if the download/extraction did not produce the folder.
    if not os.path.isdir(dataset_dir):
        raise FileNotFoundError(
            "Expected extracted dataset at {!r} after download".format(dataset_dir))
    print("Completed.")


Downloading the dataset...
Downloading 1TZF0qVN1PojR5o_nPR_cl7T9sxH37iMY into .\data\roman-numbers-dataset.zip... Done.
Unzipping...Done.
Completed.


### Split data

In [4]:
# Every sub-folder of the dataset directory is one class.
# os.listdir() returns names in arbitrary order, so sort them to keep the
# class order stable between runs and machines; plain files that may sit next
# to the class folders are skipped so they are not treated as classes.
classes = sorted(
    entry for entry in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, entry))
)
n_classes = len(classes)
print(n_classes)
print(classes)


8
['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII']


In [5]:
def create_directories(dir_name, classes_list):
    """
    Rebuild ``dir_name`` from scratch with one empty sub-folder per class.

    An existing ``dir_name`` is removed together with its contents before the
    folder tree is created again.

    Parameters
    ----------
    dir_name : str
        Root folder to (re)create.
    classes_list : iterable of str
        Names of the sub-folders to create inside ``dir_name``.
    """
    # Drop whatever an earlier run left behind.
    if os.path.exists(dir_name):
        shutil.rmtree(dir_name)
    os.makedirs(dir_name)

    class_folders = (os.path.join(dir_name, label) for label in classes_list)
    for folder in class_folders:
        os.makedirs(folder)
        
def resize_img(img_path, img_height, img_width):
    """
    Resize the image file at ``img_path`` in place.

    The file is read with OpenCV, resized to ``img_width`` x ``img_height``
    pixels and written back to the same path.

    Parameters
    ----------
    img_path : str
        Path of the image file to overwrite with its resized version.
    img_height : int
        Target height in pixels.
    img_width : int
        Target width in pixels.

    Raises
    ------
    ValueError
        If OpenCV cannot read ``img_path`` as an image.
    OSError
        If OpenCV reports that the resized image was not written.
    """
    img = cv2.imread(img_path)
    # cv2.imread reports failure by returning None rather than raising.
    if img is None:
        raise ValueError("Cannot read image: {}".format(img_path))

    # cv2.resize expects dsize as (width, height), not (height, width).
    img = cv2.resize(img, (img_width, img_height))

    # cv2.imwrite returns a falsy value when the file could not be saved.
    if not cv2.imwrite(img_path, img):
        raise OSError("Cannot write image: {}".format(img_path))
    

In [6]:
# (Re)create the train / val / test folder trees, one sub-folder per class.
for split_root in (train_dir, val_dir, test_dir):
    create_directories(split_root, classes)


In [7]:
# Fixed seed so the shuffle below produces the same split on every run.
random.seed(5)

for class_name in classes:
    class_dir = os.path.join(dataset_dir, class_name)
    # os.listdir() returns names in arbitrary order; sort first so that the
    # seeded shuffle yields the same split on every machine and filesystem.
    image_list = sorted(os.listdir(class_dir))
    random.shuffle(image_list)

    n_train_img = int(len(image_list) * train_portion)
    n_valid_img = int(len(image_list) * val_portion)
    # The test split takes whatever is left after train and validation, so
    # rounding never drops an image.
    n_test_img = len(image_list) - n_train_img - n_valid_img

    train_list = image_list[:n_train_img]
    valid_list = image_list[n_train_img:n_train_img + n_valid_img]
    test_list = image_list[n_train_img + n_valid_img:]

    data_lists = [train_list, valid_list, test_list]
    dirs_list = [train_dir, val_dir, test_dir]

    # Copy every image into its split/class folder, then resize the copy in
    # place so the original dataset files stay untouched.
    for split_dir, data_list in zip(dirs_list, data_lists):
        for img_name in data_list:
            img_path = os.path.join(split_dir, class_name, img_name)
            shutil.copy2(os.path.join(class_dir, img_name), img_path)
            resize_img(img_path, img_height, img_width)

    print("\n", class_name, ":")
    print("total number of imgs =", len(image_list))
    print("number of imgs for train =", n_train_img)
    print("number of imgs for valid =", n_valid_img)
    print("number of imgs for test =", n_test_img)
    


 I :
total number of imgs = 199
number of imgs for train = 119
number of imgs for valid = 39
number of imgs for test = 41

 II :
total number of imgs = 198
number of imgs for train = 118
number of imgs for valid = 39
number of imgs for test = 41

 III :
total number of imgs = 195
number of imgs for train = 117
number of imgs for valid = 39
number of imgs for test = 39

 IV :
total number of imgs = 197
number of imgs for train = 118
number of imgs for valid = 39
number of imgs for test = 40

 V :
total number of imgs = 193
number of imgs for train = 115
number of imgs for valid = 38
number of imgs for test = 40

 VI :
total number of imgs = 195
number of imgs for train = 117
number of imgs for valid = 39
number of imgs for test = 39

 VII :
total number of imgs = 198
number of imgs for train = 118
number of imgs for valid = 39
number of imgs for test = 41

 VIII :
total number of imgs = 197
number of imgs for train = 118
number of imgs for valid = 39
number of imgs for test = 40
