In [1]:
import os
import random
import shutil

from google_drive_downloader import GoogleDriveDownloader as gdd
from PIL import Image


In [2]:
# Google Drive file ids passed to the downloader for the dataset and model archives.
dataset_id_in_cloud = "1Gl-YYH-1Rqrcx1htX1-6nqWDyqaRI0UT"
model_id_in_cloud = "1ug2PVwrggSQ7ro2mjGhfXVaZ--j7kLBP"

# Every path used by the notebook is derived from main_dir.
main_dir = "."
data_dir = os.path.join(main_dir, "data")

# Extracted folders, and the archive each one is downloaded as
# (the archive path is the folder path plus a ".zip" suffix).
dataset_dir = os.path.join(data_dir, "roman-numbers-dataset")
dataset_zip_dir = dataset_dir + ".zip"

saved_model_dir = os.path.join(main_dir, "saved_models")
saved_model_zip_dir = saved_model_dir + ".zip"

# Output folders for the three data splits.
train_dir, test_dir, val_dir = (
    os.path.join(data_dir, split_name) for split_name in ("train", "test", "val")
)

# Fraction of each class assigned to every split.
# NOTE(review): test_portion is not read by the splitting code visible in this
# notebook (the test split receives whatever train and val leave over).
train_portion, test_portion, val_portion = 0.6, 0.2, 0.2

# Size (in pixels) that every copied image is resized to.
img_height = img_width = 64


### Loading dataset

In [3]:
# Make sure the data folder exists. exist_ok=True replaces the separate
# "exists?" check, so there is no window between checking and creating.
os.makedirs(data_dir, exist_ok=True)

# The extracted dataset folder acts as the cache marker: the download is
# skipped whenever it is already present.
if not os.path.exists(dataset_dir):
    print("Downloading the dataset...")
    gdd.download_file_from_google_drive(
        file_id=dataset_id_in_cloud,
        dest_path=dataset_zip_dir,
        unzip=True)
    print("Completed.")


Downloading the dataset...
Downloading 1Gl-YYH-1Rqrcx1htX1-6nqWDyqaRI0UT into .\data\roman-numbers-dataset.zip... Done.
Unzipping...Done.
Completed.


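### Split data

Each class folder is split into train / validation / test subsets (proportions are set in the configuration cell), and every copied image is resized.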

In [4]:
# Each sub-folder of the dataset directory is one class.
# os.listdir() returns names in arbitrary order and also lists plain files
# (e.g. OS metadata files), so keep only directories and sort them to get a
# stable, machine-independent class order.
classes = sorted(
    entry for entry in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, entry))
)
n_classes = len(classes)
print(n_classes)
print(classes)


8
['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII']


In [5]:
def create_directories(dir_name, classes_list):
    """
    Recreate ``dir_name`` from scratch with one sub-folder per class.

    Any existing ``dir_name`` tree is deleted first, so previous contents
    are lost.

    Parameters
    ----------
    dir_name : str
        Folder to (re)create.
    classes_list : iterable of str
        Names of the sub-folders to create inside ``dir_name``.
    """
    # Wipe a leftover tree from an earlier run before rebuilding it.
    if os.path.exists(dir_name):
        shutil.rmtree(dir_name)
    os.makedirs(dir_name)

    class_paths = (os.path.join(dir_name, label) for label in classes_list)
    for class_path in class_paths:
        os.makedirs(class_path)
        
def resize_img(img_path, img_height, img_width):
    """
    Resize the image stored at ``img_path`` and overwrite the file in place.

    Parameters
    ----------
    img_path : str
        Path of the image file to resize; the result is saved to the same path.
    img_height : int
        Target height in pixels.
    img_width : int
        Target width in pixels.
    """
    # Close the source file before writing back to the same path.
    with Image.open(img_path) as img:
        # PIL expects the size as (width, height); the previous
        # (height, width) order was only correct for square targets.
        resized = img.resize((img_width, img_height))
    resized.save(img_path)
    

In [6]:
# Rebuild the train / val / test folders, each with one sub-folder per class.
for split_dir in (train_dir, val_dir, test_dir):
    create_directories(split_dir, classes)


In [7]:
# Fixed seed so the shuffle below produces the same split on every run.
random.seed(5)

total_n_train = 0
total_n_valid = 0
total_n_test = 0

for class_name in classes:
    class_src_dir = os.path.join(dataset_dir, class_name)
    # os.listdir() order is arbitrary and filesystem-dependent; sorting first
    # makes the seeded shuffle reproducible across machines.
    image_list = sorted(os.listdir(class_src_dir))
    random.shuffle(image_list)

    n_train_img = int(len(image_list) * train_portion)
    n_valid_img = int(len(image_list) * val_portion)
    # The test split takes the remainder, so images lost to int() truncation
    # above still end up in a split.
    n_test_img = len(image_list) - n_train_img - n_valid_img

    train_list = image_list[: n_train_img]
    valid_list = image_list[n_train_img: n_train_img + n_valid_img]
    test_list = image_list[n_train_img + n_valid_img:]

    data_lists = [train_list, valid_list, test_list]
    dirs_list = [train_dir, val_dir, test_dir]

    for split_dir, data_list in zip(dirs_list, data_lists):
        for img_name in data_list:
            # Copy into the split folder, then resize the copy so the
            # original dataset files are left untouched.
            img_path = os.path.join(split_dir, class_name, img_name)
            shutil.copy2(os.path.join(class_src_dir, img_name), img_path)
            resize_img(img_path, img_height, img_width)

    print("\n", class_name, ":")
    print("total number of imgs for class=", len(image_list))
    print("number of imgs for train =", n_train_img)
    print("number of imgs for valid =", n_valid_img)
    print("number of imgs for test =", n_test_img)

    total_n_train += n_train_img
    total_n_valid += n_valid_img
    total_n_test += n_test_img

print("\nTotal train images =", total_n_train )
print("Total validation images =",total_n_valid)
print("Total test images =", total_n_test)
    


 I :
total number of imgs for class= 213
number of imgs for train = 127
number of imgs for valid = 42
number of imgs for test = 44

 II :
total number of imgs for class= 211
number of imgs for train = 126
number of imgs for valid = 42
number of imgs for test = 43

 III :
total number of imgs for class= 211
number of imgs for train = 126
number of imgs for valid = 42
number of imgs for test = 43

 IV :
total number of imgs for class= 206
number of imgs for train = 123
number of imgs for valid = 41
number of imgs for test = 42

 V :
total number of imgs for class= 204
number of imgs for train = 122
number of imgs for valid = 40
number of imgs for test = 42

 VI :
total number of imgs for class= 210
number of imgs for train = 126
number of imgs for valid = 42
number of imgs for test = 42

 VII :
total number of imgs for class= 207
number of imgs for train = 124
number of imgs for valid = 41
number of imgs for test = 42

 VIII :
total number of imgs for class= 194
number of imgs for train

### Download the TensorFlow model

In [8]:

# The extracted model folder acts as the cache marker: the archive is only
# fetched (and unzipped) when that folder is not present yet.
model_missing = not os.path.exists(saved_model_dir)
if model_missing:
    print("Downloading the tensorflow model...")
    download_args = dict(
        file_id=model_id_in_cloud,
        dest_path=saved_model_zip_dir,
        unzip=True,
    )
    gdd.download_file_from_google_drive(**download_args)
    print("Completed.")


Downloading the tensorflow model...
Downloading 1zMBeehEd0rVqe5MD6a9ct4cxW3YcuthQ into .\saved_models.zip... Done.
Unzipping...Done.
Completed.
