In [None]:
# Copyright 2022 Sony Semiconductor Solutions Corp. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Imports

In [None]:
import errno
import json
import jsonschema
import os
import pathlib
import shutil

import numpy as np
import tensorflow as tf

## Load Configurations

Load the configuration file and set the variables.

In [None]:
def validate_symlink(path: pathlib.Path):
    """Reject a path that is a symbolic link.

    Args:
        path: File or directory path to check.

    Raises:
        OSError: With ``errno.ELOOP`` when ``path`` is a symbolic link. The
            offending path is reported as the error's filename.
    """
    if not path.is_symlink():
        return

    raise OSError(
        errno.ELOOP,
        "Symbolic link is not supported. Please use real folder or file",
        str(path),
    )


# Load the notebook configuration. Symbolic links are refused before reading.
configuration_path = pathlib.Path("./configuration.json")
validate_symlink(configuration_path)

# JSON is read as UTF-8 explicitly so parsing does not depend on the
# platform's locale encoding.
with open(configuration_path, "r", encoding="utf-8") as f:
    app_configuration = json.load(f)

# Load the JSON schema the configuration has to satisfy.
configuration_schema_path = pathlib.Path("./configuration_schema_convert.json")
validate_symlink(configuration_schema_path)

with open(configuration_schema_path, "r", encoding="utf-8") as f:
    json_schema = json.load(f)

# Validate configuration.
jsonschema.validate(app_configuration, json_schema)

# Path settings are normalised to forward slashes, then checked for symlinks.
dataset_conversion_base_file = app_configuration[
    "dataset_conversion_base_file"
].replace(os.path.sep, "/")
validate_symlink(pathlib.Path(dataset_conversion_base_file))

dataset_conversion_dir = app_configuration["dataset_conversion_dir"].replace(
    os.path.sep, "/"
)
validate_symlink(pathlib.Path(dataset_conversion_dir))

# Split ratio and seed are passed on unchanged to the dataset split step.
dataset_conversion_validation_split = app_configuration[
    "dataset_conversion_validation_split"
]
dataset_conversion_seed = app_configuration["dataset_conversion_seed"]

## Convert dataset for transfer learning / quantize

In [None]:
# Constant sub-directory names used under dataset_conversion_dir.
cvat_exported = "cvat_exported"  # where the exported archive is unpacked
train = "training"  # destination of the training split
validation = "validation"  # destination of the validation split

Extract the exported dataset file.

In [None]:
# Make sure the output directory exists. exist_ok avoids the race between a
# separate existence check and the creation call.
p_file = pathlib.Path(dataset_conversion_dir)
os.makedirs(p_file, exist_ok=True)

exported_path = os.path.join(dataset_conversion_dir, cvat_exported)

# Refuse to run when output of a previous conversion is still present. The
# checked names come from the same constants that are used to write the
# output, so the guard cannot drift from the real output layout.
existslist = [
    os.path.exists(p_file / "labels.json"),
    os.path.exists(p_file / train),
    os.path.exists(p_file / validation),
    os.path.exists(exported_path),
]

if any(existslist):
    msg = (
        "The dataset_conversion_dir already contains the dataset. "
        "Please remove the dataset or set another directory: "
    )
    raise FileExistsError(f"{msg}{dataset_conversion_dir}")

# NOTE(review): the archive is unpacked without inspecting its member paths;
# only use archives from a trusted source.
shutil.unpack_archive(dataset_conversion_base_file, exported_path)

print(f"extracted to {exported_path}.")

Generate the label information (`labels.json`).

In [None]:
# Every immediate sub-directory of the extracted dataset is one label. Ids are
# assigned in sorted name order, starting at 0.
# NOTE(review): presumably this matches the class order Keras infers from the
# same directory - confirm.
files = os.listdir(exported_path)
files_dir = sorted(f for f in files if os.path.isdir(os.path.join(exported_path, f)))

# Named label_to_index rather than `dict`, which would rebind the builtin for
# every cell executed afterwards in the same kernel.
label_to_index = {name: index for index, name in enumerate(files_dir)}

# ensure_ascii=False writes label names verbatim, so the file encoding is
# pinned to UTF-8 instead of the platform's locale encoding.
labels_path = os.path.join(dataset_conversion_dir, "labels.json")
with open(labels_path, "w", encoding="utf-8") as f:
    json.dump(label_to_index, f, ensure_ascii=False)

print("labels.json generated.")

Split the dataset into training and validation subsets.

In [None]:
# Loader settings: batches of 32 images, each resized to 224 x 224.
batch_size = 32
img_height = 224
img_width = 224

# Options shared by both subsets. Using the same split ratio and seed for the
# two calls is what keeps the subsets consistent with each other.
_split_options = {
    "validation_split": dataset_conversion_validation_split,
    "seed": dataset_conversion_seed,
    "image_size": (img_height, img_width),
    "batch_size": batch_size,
}

train_ds = tf.keras.utils.image_dataset_from_directory(
    exported_path, subset="training", **_split_options
)

val_ds = tf.keras.utils.image_dataset_from_directory(
    exported_path, subset="validation", **_split_options
)

Copy the dataset images into separate training and validation directories.

In [None]:
def _copy_split_files(file_paths, class_names, source_root, target_root):
    """Copy one dataset split into its own directory tree.

    Each destination is built from the file's path relative to
    ``source_root``. This replaces a textual substitution of the source
    folder name, which would also rewrite any other occurrence of that name
    in the path (base directory, class name or file name).

    Args:
        file_paths: Paths of the files to copy, all located under
            ``source_root``.
        class_names: Class folder names to create under ``target_root``,
            including classes that receive no file.
        source_root: Directory the files currently live in.
        target_root: Directory the files are copied to.
    """
    os.makedirs(target_root, exist_ok=True)
    for class_name in class_names:
        os.makedirs(os.path.join(target_root, class_name), exist_ok=True)

    for src in file_paths:
        dst = os.path.join(target_root, os.path.relpath(src, source_root))
        # Also covers files listed from nested sub-folders, if any.
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        shutil.copyfile(src, dst)


# Training split.
trn_class_names = np.array(train_ds.class_names)
trn_path = os.path.join(dataset_conversion_dir, train)
train_files = sorted(train_ds.file_paths)
_copy_split_files(train_files, trn_class_names, exported_path, trn_path)

print(f"{len(train_files)} {train} files copied.")

# Validation split.
val_class_names = np.array(val_ds.class_names)
val_path = os.path.join(dataset_conversion_dir, validation)
val_files = sorted(val_ds.file_paths)
_copy_split_files(val_files, val_class_names, exported_path, val_path)

print(f"{len(val_files)} {validation} files copied.")