# 2.- Data Transformation

In [2]:
import os
import glob
import xml.etree.ElementTree as ET
import tensorflow as tf

In [3]:
def _child_text(parent, tag):
    """Return the stripped text of parent's child `tag`, or None if absent or empty."""
    node = parent.find(tag) if parent is not None else None
    if node is None or node.text is None:
        return None
    return node.text.strip() or None


def parse_voc_xml(xml_path):
    """
    Parses a Pascal VOC XML annotation file into a plain dictionary.

    Args:
        xml_path: Path to the Pascal VOC .xml file.

    Returns:
        A dictionary shaped like:
        {
          'filename': 'image_name.jpg',   # None if <filename> is missing or empty
          'width': 1280,                  # 0 if <size>/<width> is missing or empty
          'height': 720,                  # 0 if <size>/<height> is missing or empty
          'objects': [
            {
              'name': 'dog',              # "N/A" if <name> is missing or empty
              'xmin': 50.0, 'ymin': 30.0, 'xmax': 150.0, 'ymax': 100.0
            },
            ...
          ]
        }
        Every object always carries the four box keys as floats (pixel values
        exactly as written in the XML). A coordinate that is missing or empty,
        or a missing <bndbox> altogether, defaults to 0.0 so that consumers
        can index the keys without guarding against KeyError.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        ValueError: If a size or coordinate tag holds non-numeric text.
    """
    root = ET.parse(xml_path).getroot()

    # <size> may be absent; _child_text tolerates a None parent.
    size_node = root.find('size')
    width_text = _child_text(size_node, 'width')
    height_text = _child_text(size_node, 'height')

    data = {
        'filename': _child_text(root, 'filename'),
        'width': int(width_text) if width_text is not None else 0,
        'height': int(height_text) if height_text is not None else 0,
        'objects': [],
    }

    for obj_node in root.findall('object'):
        # An empty <name/> has text None; fall back to the same "N/A"
        # placeholder used when the tag is missing entirely.
        obj_info = {'name': _child_text(obj_node, 'name') or "N/A"}

        # Always emit all four keys, even when <bndbox> itself is absent.
        bndbox_node = obj_node.find('bndbox')
        for key in ('xmin', 'ymin', 'xmax', 'ymax'):
            coord_text = _child_text(bndbox_node, key)
            obj_info[key] = float(coord_text) if coord_text is not None else 0.0

        data['objects'].append(obj_info)

    return data


def _bytes_feature(value):
    """Wraps a single byte string in a tf.train.Feature (bytes_list)."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)


def _float_list_feature(value):
    """Wraps a list of floats in a tf.train.Feature (float_list)."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)


def _int64_feature(value):
    """Wraps a single integer in a tf.train.Feature (int64_list)."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)


def _int64_list_feature(value):
    """Wraps a list of integers in a tf.train.Feature (int64_list)."""
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)

In [4]:
def voc_dict_to_tfexample(voc_dict, images_folder):
    """
    Builds a tf.train.Example from one parsed Pascal VOC annotation.

    Args:
        voc_dict: Dictionary output by parse_voc_xml(xml_file); must contain
            'filename', 'width', 'height' and 'objects'.
        images_folder: Folder containing the image named by
            voc_dict['filename'].

    Returns:
        A tf.train.Example with:
        - image/encoded (the image file's raw bytes)
        - image/filename
        - image/format
        - image/height, image/width
        - image/object/bbox/xmin, xmax, ymin, ymax (pixel values divided by
          the image width/height)
        - image/object/class/text
        or None when the annotation has no filename or the image file does
        not exist in images_folder.
    """

    filename = voc_dict['filename']
    if filename is None:
        # If <filename> is missing in the XML, we skip
        return None

    img_path = os.path.join(images_folder, filename)
    if not os.path.isfile(img_path):
        # If the image does not exist in the images folder, skip
        return None

    # Read the image in binary; the bytes are stored as-is (no re-encoding).
    with tf.io.gfile.GFile(img_path, 'rb') as fid:
        encoded_image = fid.read()

    width = voc_dict['width']
    height = voc_dict['height']
    # Without a valid <size>, boxes cannot be normalized (zero-division).
    has_size = width > 0 and height > 0

    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    class_texts = []

    for obj in voc_dict['objects']:
        if has_size:
            # .get(..., 0.0): an object parsed without a <bndbox> may lack the
            # box keys; use a zero box instead of raising KeyError so the
            # four box lists stay aligned with class_texts.
            xmins.append(obj.get('xmin', 0.0) / width)
            xmaxs.append(obj.get('xmax', 0.0) / width)
            ymins.append(obj.get('ymin', 0.0) / height)
            ymaxs.append(obj.get('ymax', 0.0) / height)
        else:
            xmins.append(0.0)
            xmaxs.append(0.0)
            ymins.append(0.0)
            ymaxs.append(0.0)

        # The class name is stored as text (string). A missing or None name
        # falls back to the "N/A" placeholder instead of failing on .encode.
        class_texts.append((obj.get('name') or "N/A").encode('utf8'))

    feature_dict = {
        'image/encoded': _bytes_feature(encoded_image),
        'image/filename': _bytes_feature(filename.encode('utf8')),
        # NOTE(review): the format is hard-coded and not derived from the
        # file extension -- confirm all inputs really are JPEG files.
        'image/format': _bytes_feature(b'jpg'),

        'image/height': _int64_feature(height),
        'image/width': _int64_feature(width),

        'image/object/bbox/xmin': _float_list_feature(xmins),
        'image/object/bbox/xmax': _float_list_feature(xmaxs),
        'image/object/bbox/ymin': _float_list_feature(ymins),
        'image/object/bbox/ymax': _float_list_feature(ymaxs),

        # We store class text; you could also map it to class IDs if needed
        'image/object/class/text':
            tf.train.Feature(bytes_list=tf.train.BytesList(value=class_texts)),
    }

    return tf.train.Example(features=tf.train.Features(feature=feature_dict))


In [5]:
def convert_voc_to_tfrecord(annotations_folder, images_folder, output_tfrecord):
    """
    Reads all .xml files from 'annotations_folder', pairs them
    with images in 'images_folder', and writes a TFRecord to
    'output_tfrecord'.

    Annotations are processed in sorted filename order so the resulting
    TFRecord is reproducible across runs and machines.

    Args:
        annotations_folder: Folder searched (non-recursively) for "*.xml".
        images_folder: Folder holding the images referenced by the XML files.
        output_tfrecord: Path of the TFRecord file to create.

    Returns:
        A tuple (num_written, num_errors): the number of successfully written
        examples and the number of annotations that were skipped (XML that
        could not be parsed, missing <filename>, or missing image file).
    """

    # glob.glob returns matches in arbitrary order; sort for determinism.
    xml_files = sorted(glob.glob(os.path.join(annotations_folder, "*.xml")))
    num_written = 0
    num_errors = 0

    with tf.io.TFRecordWriter(output_tfrecord) as writer:
        for xml_file in xml_files:
            try:
                voc_info = parse_voc_xml(xml_file)
            except (ET.ParseError, ValueError):
                # Malformed XML or non-numeric size/box text: count it as an
                # error and keep going rather than aborting mid-write and
                # leaving a partial TFRecord behind.
                num_errors += 1
                continue

            tf_example = voc_dict_to_tfexample(voc_info, images_folder)
            if tf_example is None:
                num_errors += 1
                continue

            writer.write(tf_example.SerializeToString())
            num_written += 1

    return num_written, num_errors

In [6]:
# Dataset locations, relative to this notebook -- edit to match your layout.
annotations_dir = "../data/Annotations"
images_dir = "../data/JPEGImages"
output_record = "voc_train.record"

# Run the conversion, then report how many annotations were written/skipped.
written, errors = convert_voc_to_tfrecord(annotations_dir, images_dir, output_record)

print(f"Conversion done. Written: {written}, Errors: {errors}")
print(f"TFRecord created at: {output_record}")

Conversion done. Written: 20, Errors: 0
TFRecord created at: voc_train.record
