In [None]:
# Good reference: https://www.tensorflow.org/guide/performance/datasets
# Another good reference: https://www.tensorflow.org/guide/datasets
# The whole dataset might fit in memory, but the first implementation assumes that it does not.

In [None]:
import os
import numpy as np
import tensorflow as tf
from PIL import Image
from dataset.severstal_steel_dataset import load_annotations, rle_to_dense


In [None]:
# Directory holding the training images; image file names are joined onto this path.
TRAIN_IMAGE_DIR = './severstal-steel-defect-detection/train_images/'
# Annotation CSV that is handed to load_annotations().
TRAIN_ANNOTATIONS_FILE = './severstal-steel-defect-detection/train.csv'
# Output directory that receives the generated .tfrecord shard files.
TFRECORD_DIR = 'severstal_steel_train'

In [None]:
# Load the training annotations. Later cells iterate `anns.items()` as
# (image file name, per-image dict) pairs and index that dict by the class
# keys '1'..'4', passing each value to rle_to_dense().
anns = load_annotations(TRAIN_ANNOTATIONS_FILE)

In [None]:
def _bytes_feature(value):
    """Wrap a single bytes value in a `tf.train.Feature` backed by a one-element `BytesList`."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

# Serialize every annotated image into TFRecord shard files, each holding at
# most `examples_per_file` examples. Every example stores two raw byte strings:
#   'image'       - the first channel of the image as uint8
#   'annotations' - the per-class dense masks stacked on the last axis, as uint8
os.makedirs(TFRECORD_DIR, exist_ok=True)

anns_list = list(anns.items())
examples_per_file = 2000
class_ids = ['1', '2', '3', '4']
batch_start = 0
file_index = 0
while batch_start < len(anns_list):
    batch_end = min(batch_start + examples_per_file, len(anns_list))
    print(f'Starting batch {batch_start}-{batch_end} out of {len(anns_list)}')
    shard_path = os.path.join(TFRECORD_DIR, f'severstal_steel_{file_index}.tfrecord')
    with tf.python_io.TFRecordWriter(shard_path) as writer:
        for img_name, annotations_dict in anns_list[batch_start:batch_end]:
            # Load image; the context manager releases the file handle as soon
            # as the pixel data has been copied into the array.
            img_path = os.path.join(TRAIN_IMAGE_DIR, img_name)
            with Image.open(img_path) as pil_img:
                img = np.array(pil_img)
            img_gray = img[:, :, 0] # All channels are the same
            height, width = img_gray.shape

            # Load annotations: one dense mask per class, stacked on the last axis
            dense_anns = [
                rle_to_dense(annotations_dict[cls], height, width)
                for cls in class_ids
            ]
            # astype() returns a new array, so the result must be kept; the
            # original code discarded it and the array was never converted.
            annotation_array = np.stack(dense_anns, axis=-1).astype(np.uint8)

            # Serialize example
            assert img_gray.dtype == np.uint8
            assert annotation_array.dtype == np.uint8
            # tobytes() is the supported spelling of the deprecated tostring()
            # and produces the same C-order byte string.
            feature = {
                'image':       _bytes_feature(tf.compat.as_bytes(img_gray.tobytes())),
                'annotations': _bytes_feature(tf.compat.as_bytes(annotation_array.tobytes()))
            }
            example_proto = tf.train.Example(features=tf.train.Features(feature=feature))

            # Write to TFRecord file
            writer.write(example_proto.SerializeToString())
    batch_start = batch_end
    file_index += 1

In [None]:
def preprocess(anns_file, out_dir):
    """Walk the annotations in `anns_file` and derive one `.npz` output path per image.

    Args:
        anns_file: Path of the annotation file handed to `load_annotations`.
        out_dir: Directory under which the per-image `<image id>.npz` paths are built.

    NOTE(review): this function is an unfinished stub. It computes each output
    path but never writes anything and returns None.
    """
    # Use the caller-supplied file rather than the global TRAIN_ANNOTATIONS_FILE,
    # which the original read, silently ignoring the `anns_file` argument.
    anns = load_annotations(anns_file)
    for img_filename, annotations_dict in anns.items():
        img_id, _ = os.path.splitext(img_filename)
        out_file = os.path.join(out_dir, f'{img_id}.npz')
    

In [None]:
# NOTE(review): this repeats the `_bytes_feature` definition from an earlier cell.
def _bytes_feature(value):
    """Wrap a single bytes value in a `tf.train.Feature` backed by a one-element `BytesList`."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

# NOTE(review): this cell looks like a leftover single-example template.
# `FILEPATH`, `image`, `label` and `_int64_feature` are not defined anywhere in
# the visible notebook, so it raises NameError on a fresh kernel. Define them
# or delete the cell.
anns = load_annotations(TRAIN_ANNOTATIONS_FILE)


# Create the file writer; the context manager closes it once the example has
# been written (the original never closed the writer).
with tf.python_io.TFRecordWriter(FILEPATH) as writer:
    # Define the features of your tfrecord
    feature = {'image':  _bytes_feature(tf.compat.as_bytes(image.tostring())),
               'label':  _int64_feature(int(label))}

    # Serialize to string and write to file
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(example.SerializeToString())