In [None]:
import os
from functools import reduce
import pandas as pd
import numpy as np

from glob import glob     # match specified patterns
from xml.etree import ElementTree as et    # for parsing and creating XML data.
from sklearn.model_selection import train_test_split

In [None]:
# 1. Collect the paths of every annotation (.xml) file in the data folder
DATA_DIR = "./data_images/"

# glob returns matches in arbitrary order; sort them so the order is stable
xml_list = glob(DATA_DIR + '*.xml')
xml_list.sort()
xml_list

In [None]:
# 2. Define function that extracts info from single xml
# variables: (can be checked by opening xml file with text editor)
## filename
## size(width, height): image size
## *object(name, xmin, ymin, xmax, ymax): label and bounding box for EACH object in the image

def extract_xml(xml_name):
    """Parse one annotation XML file into a list of per-object rows.

    Parameters
    ----------
    xml_name : str or file object
        Anything accepted by ``ElementTree.parse``.

    Returns
    -------
    list of list
        One row per ``<object>`` element, laid out as
        ``[filename, width, height, tag, xmin, ymin, xmax, ymax]``.
        The list is empty when the file contains no ``<object>`` element.

    Raises
    ------
    AttributeError
        If ``<size>`` (or an object's ``<bndbox>``) is missing.
    TypeError, ValueError
        If a size or coordinate field is missing or is not an integer string.
    """
    root = et.parse(xml_name).getroot()

    fname = root.findtext('filename')

    size = root.find('size')
    w = size.findtext('width')
    h = size.findtext('height')

    # Fixed output order of the four box coordinates.
    bbox_keys = ('xmin', 'ymin', 'xmax', 'ymax')

    res = []     # image:object is 1-to-n with non-fixed n
    for obj in root.iterfind('object'):
        tag = obj.findtext('name')
        bndbox = obj.find('bndbox')
        # Read each coordinate by tag name rather than by child position, so
        # the row layout does not depend on the order of the children inside
        # <bndbox> and is not affected by extra child elements.
        coords = [int(bndbox.findtext(key)) for key in bbox_keys]
        res.append([fname, int(w), int(h), tag] + coords)

    return res

In [None]:
# Sanity check: show the rows extracted from a single annotation file
sample_rows = extract_xml('./data_images/000001.xml')
sample_rows

In [None]:
# 3. Parse every annotation file and gather all object rows into one DataFrame
# (each file contributes one row per annotated object)
data_list = [row for f_xml in xml_list for row in extract_xml(f_xml)]

df = pd.DataFrame(
    data_list,
    columns=['filename', 'width', 'height', 'tag', 'x_min', 'y_min', 'x_max', 'y_max'],
)
df.head(10)

In [None]:
# 4. YOLO describes a box by its centre and its size, each divided by the image size:
## x_c, y_c: normalized centre coordinates
## box_w, box_h: normalized box width and height
img_w = df['width']
img_h = df['height']

# centre = midpoint of the two edges, scaled by the image dimension
df['x_c'] = df['x_min'].add(df['x_max']).mul(0.5).div(img_w)
df['y_c'] = df['y_min'].add(df['y_max']).mul(0.5).div(img_h)

# size = distance between the two edges, scaled by the image dimension
df['box_w'] = df['x_max'].sub(df['x_min']).div(img_w)
df['box_h'] = df['y_max'].sub(df['y_min']).div(img_h)
df.head()

In [None]:
# 5. Map every tag string to an integer id; ids follow the order in which
# the tags first appear in df
tag_array = df['tag'].unique()
tag_encode = dict(zip(tag_array, range(len(tag_array))))
print(tag_encode)

# put the encoded column at position 4, directly after the 'tag' column
encoded_tags = df['tag'].apply(lambda name: tag_encode[name])
df.insert(loc=4, column='tag_encode', value=encoded_tags)
# have a look at complete df
df 

In [None]:
# 5a. Create data.yaml file; copy the output under 'names:'
# Each printed line has the form:  <tag_encode id> : '<tag>'
# (ids are read from tag_encode; the previous lookup used an undefined name `d`)
for tag, class_id in tag_encode.items():
    print(f"{class_id} : '{tag}'")

In [None]:
# Number of annotated objects per tag
tag_counts = df['tag'].value_counts()
tag_counts

In [None]:
# 6. Yolo only needs (tag_encode, x_c, y_c, box_w, box_h) stored in txt files
# We also need filename: 000001.jpg -> 000001.txt

# Select the columns by name instead of by position (df.columns[-4:]), so the
# selection stays correct even if other columns are added to df beforehand.
cols = ['filename', 'tag_encode', 'x_c', 'y_c', 'box_w', 'box_h']
print(cols)
df = df[cols]
df

In [None]:
# 7. Split train and test data: should split filenames, not all objects (bboxes)
# i.e. all objects of each file should go into the same train/test set

# Fixed seed so that re-running the notebook reproduces the same split
SPLIT_SEED = 42

train_fnames, test_fnames = train_test_split(
    df['filename'].unique(),
    test_size=0.2,
    random_state=SPLIT_SEED,
)
train_df = df.loc[df['filename'].isin(train_fnames)]
test_df = df.loc[df['filename'].isin(test_fnames)]
train_df.head()

In [None]:
import os
from os.path import join as pjoin
from shutil import copy

In [None]:
# 8. Save info to txt files in train and test folders
# Each txt file is space-separated, with no header and no index column
TRAIN_DIR = "./train_data/"
TEST_DIR = "./test_data/"
SOURCE_DIR = "./data_images"


def save_txt(filename, data_dir, group_by):
    """Copy one image into ``data_dir`` and write its label txt file next to it.

    Parameters
    ----------
    filename : str
        Image file name, e.g. "000001.jpg". It is looked up in SOURCE_DIR and
        is also the group key used with ``group_by``.
    data_dir : str
        Destination folder (TRAIN_DIR or TEST_DIR). Created if it is missing.
    group_by : pandas groupby object
        A frame grouped by 'filename'. A groupby is passed in because, per the
        original author's note, ``group_by.get_group(img)`` is more efficient
        than ``df.loc[img]`` when called once per image.

    Side effects
    ------------
    Writes ``<data_dir>/<filename>`` (a copy of the image) and
    ``<data_dir>/<stem>.txt``, which holds one space-separated line per row of
    the group: every column except 'filename', in the frame's column order.

    Raises
    ------
    FileNotFoundError
        If the image does not exist in SOURCE_DIR.
    KeyError
        If ``filename`` is not a group in ``group_by``.
    """
    # The output folders are not created anywhere else, so make sure the
    # destination exists before copying/writing into it.
    os.makedirs(data_dir, exist_ok=True)

    # copy images; delete original later
    copy(pjoin(SOURCE_DIR, filename), pjoin(data_dir, filename))

    # create labels (txt files): same stem as the image, '.txt' extension
    label_path = pjoin(data_dir, os.path.splitext(filename)[0] + '.txt')
    labels = group_by.get_group(filename).drop(columns='filename')
    labels.to_csv(label_path, sep=' ', index=False, header=False)
    

In [None]:
# Group each split by image once, then export every image of that split
train_groupby = train_df.groupby('filename')
test_groupby = test_df.groupby('filename')

# (file names, destination folder, grouped labels) for each split; train first
export_jobs = (
    (train_fnames, TRAIN_DIR, train_groupby),
    (test_fnames, TEST_DIR, test_groupby),
)
for fnames, out_dir, grouped in export_jobs:
    for fname in fnames:
        save_txt(fname, out_dir, grouped)
