In [1]:
import os
from functools import reduce
import pandas as pd
import numpy as np

from glob import glob     # match specified patterns
from xml.etree import ElementTree as et    # for parsing and creating XML data.
from sklearn.model_selection import train_test_split

In [2]:
# 1. Collect the paths of every annotation xml file (tagged image) in the data folder
DATA_DIR = "./data_images/"

# glob returns matches in arbitrary order; sort in place for a stable file order
xml_list = glob(DATA_DIR + '*.xml')
xml_list.sort()
xml_list

['./data_images/000001.xml',
 './data_images/000002.xml',
 './data_images/000007.xml',
 './data_images/000009.xml',
 './data_images/000012.xml',
 './data_images/000016.xml',
 './data_images/000017.xml',
 './data_images/000019.xml',
 './data_images/000020.xml',
 './data_images/000021.xml',
 './data_images/000023.xml',
 './data_images/000024.xml',
 './data_images/000026.xml',
 './data_images/000030.xml',
 './data_images/000032.xml',
 './data_images/000033.xml',
 './data_images/000034.xml',
 './data_images/000035.xml',
 './data_images/000036.xml',
 './data_images/000039.xml',
 './data_images/000041.xml',
 './data_images/000042.xml',
 './data_images/000044.xml',
 './data_images/000046.xml',
 './data_images/000047.xml',
 './data_images/000048.xml',
 './data_images/000050.xml',
 './data_images/000051.xml',
 './data_images/000052.xml',
 './data_images/000060.xml',
 './data_images/000061.xml',
 './data_images/000063.xml',
 './data_images/000064.xml',
 './data_images/000065.xml',
 './data_image

In [3]:
# 2. Define function that extracts info from single xml
# variables: (can be checked by opening xml file with text editor)
## filename
## size(width, height): image size
## *object(name, xmin, ymin, xmax, ymax): bounding box and label for EACH object

def extract_xml(xml_name):
    """Parse one annotation xml file into a 2D list, one row per tagged object.

    Parameters
    ----------
    xml_name : str or file object
        Path of the annotation file (anything ``ElementTree.parse`` accepts).

    Returns
    -------
    list of list
        One row per ``<object>`` element, laid out as
        ``[filename, width, height, tag, xmin, ymin, xmax, ymax]``.
        The list is empty when the file contains no ``<object>`` element.

    Raises
    ------
    TypeError, ValueError
        If the image size or a bounding-box coordinate is missing or is not
        an integer string.
    """
    root = et.parse(xml_name).getroot()

    fname = root.findtext('filename')

    # Image size is shared by all objects of the file, so convert it once.
    size = root.find('size')
    width = int(size.findtext('width'))
    height = int(size.findtext('height'))

    res = []     # image:object is 1-to-n with non-fixed n
    for obj in root.iterfind('object'):
        bbox = obj.find('bndbox')
        # Look each coordinate up by tag name instead of relying on the order
        # of the children inside <bndbox>; this keeps the column meaning fixed
        # even if a file lists them differently or carries extra children.
        coords = [
            int(bbox.findtext(key))
            for key in ('xmin', 'ymin', 'xmax', 'ymax')
        ]
        res.append([fname, width, height, obj.findtext('name')] + coords)

    return res

In [4]:
# Sanity check: parse a single annotation file and inspect the rows it yields
# (one row per object: filename, width, height, tag, then the four bbox values)
extract_xml('./data_images/000001.xml')

[['000001.jpg', 1024, 657, 'car', 14, 335, 301, 522],
 ['000001.jpg', 1024, 657, 'car', 269, 345, 571, 489],
 ['000001.jpg', 1024, 657, 'car', 502, 342, 798, 450],
 ['000001.jpg', 1024, 657, 'car', 709, 333, 1009, 438]]

In [5]:
# 3. Parse every annotation file and gather the per-object rows into one DataFrame
data_list = []
for f_xml in xml_list:
    # each file contributes a variable number of rows (one per object)
    data_list.extend(extract_xml(f_xml))

df = pd.DataFrame(
    data_list,
    columns=[
        'filename', 'width', 'height', 'tag',
        'x_min', 'y_min', 'x_max', 'y_max',
    ],
)
df.head(10)

Unnamed: 0,filename,width,height,tag,x_min,y_min,x_max,y_max
0,000001.jpg,1024,657,car,14,335,301,522
1,000001.jpg,1024,657,car,269,345,571,489
2,000001.jpg,1024,657,car,502,342,798,450
3,000001.jpg,1024,657,car,709,333,1009,438
4,000002.jpg,800,600,car,41,240,768,497
5,000002.jpg,800,600,car,533,236,722,299
6,000007.jpg,500,333,car,141,50,500,330
7,000009.jpg,500,375,horse,69,172,270,330
8,000009.jpg,500,375,person,150,141,229,284
9,000009.jpg,500,375,person,285,201,327,331


In [6]:
# 4. Yolo describes a bbox differently from the corner coordinates above:
## x_c, y_c: bbox center, divided by the image width / height
## box_w, box_h: bbox width / height, divided by the image width / height
df['x_c'] = df['x_min'].add(df['x_max']).mul(0.5).div(df['width'])
df['y_c'] = df['y_min'].add(df['y_max']).mul(0.5).div(df['height'])

df['box_w'] = df['x_max'].sub(df['x_min']).div(df['width'])
df['box_h'] = df['y_max'].sub(df['y_min']).div(df['height'])
df.head()

Unnamed: 0,filename,width,height,tag,x_min,y_min,x_max,y_max,x_c,y_c,box_w,box_h
0,000001.jpg,1024,657,car,14,335,301,522,0.153809,0.652207,0.280273,0.284627
1,000001.jpg,1024,657,car,269,345,571,489,0.410156,0.634703,0.294922,0.219178
2,000001.jpg,1024,657,car,502,342,798,450,0.634766,0.60274,0.289062,0.164384
3,000001.jpg,1024,657,car,709,333,1009,438,0.838867,0.586758,0.292969,0.159817
4,000002.jpg,800,600,car,41,240,768,497,0.505625,0.614167,0.90875,0.428333


In [7]:
# 5. Map each distinct tag (str) to an integer id; ids are assigned in the
#    order in which the tags are returned by unique()
tag_array = df['tag'].unique()
tag_encode = {tag: idx for idx, tag in enumerate(tag_array)}
print(tag_encode)

# place the integer encoding right after the tag string column
df.insert(4, 'tag_encode', df['tag'].map(tag_encode))
# have a look at complete df
df

{'car': 0, 'horse': 1, 'person': 2, 'bicycle': 3, 'cat': 4, 'dog': 5, 'train': 6, 'aeroplane': 7, 'diningtable': 8, 'tvmonitor': 9, 'chair': 10, 'bird': 11, 'bottle': 12, 'motorbike': 13, 'pottedplant': 14, 'boat': 15, 'sofa': 16, 'sheep': 17, 'cow': 18, 'bus': 19}


Unnamed: 0,filename,width,height,tag,tag_encode,x_min,y_min,x_max,y_max,x_c,y_c,box_w,box_h
0,000001.jpg,1024,657,car,0,14,335,301,522,0.153809,0.652207,0.280273,0.284627
1,000001.jpg,1024,657,car,0,269,345,571,489,0.410156,0.634703,0.294922,0.219178
2,000001.jpg,1024,657,car,0,502,342,798,450,0.634766,0.602740,0.289062,0.164384
3,000001.jpg,1024,657,car,0,709,333,1009,438,0.838867,0.586758,0.292969,0.159817
4,000002.jpg,800,600,car,0,41,240,768,497,0.505625,0.614167,0.908750,0.428333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15658,009958.jpg,500,333,person,2,89,27,149,217,0.238000,0.366366,0.120000,0.570571
15659,009958.jpg,500,333,person,2,75,39,147,124,0.222000,0.244745,0.144000,0.255255
15660,009958.jpg,500,333,bicycle,3,77,121,178,316,0.255000,0.656156,0.202000,0.585586
15661,009959.jpg,500,375,car,0,192,142,330,187,0.522000,0.438667,0.276000,0.120000


In [10]:
# 5a. Create data.yaml file; copy the output under 'names:'
# Prints one "<id> : '<tag>'" line per class. Reads the mapping from tag_encode
# directly (the previous version indexed an undefined name and only ran when a
# leftover variable happened to exist in the kernel).
for tag, code in tag_encode.items():
    print(f"{code} : '{tag}'")

0 : 'car'
1 : 'horse'
2 : 'person'
3 : 'bicycle'
4 : 'cat'
5 : 'dog'
6 : 'train'
7 : 'aeroplane'
8 : 'diningtable'
9 : 'tvmonitor'
10 : 'chair'
11 : 'bird'
12 : 'bottle'
13 : 'motorbike'
14 : 'pottedplant'
15 : 'boat'
16 : 'sofa'
17 : 'sheep'
18 : 'cow'
19 : 'bus'


In [8]:
# Class balance: number of rows (i.e. tagged objects, not images) per tag
df['tag'].value_counts()

person         5447
car            1650
chair          1427
bottle          634
pottedplant     625
bird            599
dog             538
sofa            425
bicycle         418
horse           406
boat            398
motorbike       390
cat             389
tvmonitor       367
cow             356
sheep           353
aeroplane       331
train           328
diningtable     310
bus             272
Name: tag, dtype: int64

In [None]:
# 6. Yolo only needs (tag_encode, x_c, y_c, box_w, box_h) stored in txt files
# We also need filename: 000001.jpg -> 000001.txt

# Name the label columns explicitly rather than slicing the last four by
# position, so the selection stays correct if columns are added or reordered.
cols = ['filename', 'tag_encode', 'x_c', 'y_c', 'box_w', 'box_h']
print(cols)
df = df[cols]
df

In [None]:
# 7. Split train and test data: should split filenames, not all objects (bboxes)
# i.e. all objects of each file should go into the same train/test set

# Fixed seed so the same images land in train/test on every Restart & Run All.
SPLIT_SEED = 42

train_fnames, test_fnames = train_test_split(
    df['filename'].unique(),
    test_size=0.2,
    random_state=SPLIT_SEED,
)
train_df = df.loc[df['filename'].isin(train_fnames)]
test_df = df.loc[df['filename'].isin(test_fnames)]

# Every object row must end up in exactly one of the two sets.
assert set(train_fnames).isdisjoint(test_fnames)
assert len(train_df) + len(test_df) == len(df)
train_df.head()

In [None]:
import os
from os.path import join as pjoin
from shutil import copy

In [None]:
# 8. Save info to txt files in train and test folders
# This txt file actually has 'space_sep_format'
TRAIN_DIR = "./train_data/"
TEST_DIR = "./test_data/"
SOURCE_DIR = "./data_images"


def save_txt(filename, data_dir, group_by):
    """Copy one image into ``data_dir`` and write its label txt file next to it.

    Parameters
    ----------
    filename : str
        Image file name, e.g. "000001.jpg". The image is read from SOURCE_DIR.
    data_dir : str
        Destination folder (TRAIN_DIR or TEST_DIR). Created if it does not exist.
    group_by : pandas GroupBy
        Label rows grouped by 'filename'. A groupby object is passed instead of
        the frame because, for each image, group_by.get_group(img) is more
        efficient than filtering the whole frame again.

    The label file has the image's base name with a '.txt' extension and holds
    one space-separated line per object: every column except 'filename',
    without header or index.
    """
    # copy() and to_csv() both fail if the destination folder is missing
    os.makedirs(data_dir, exist_ok=True)

    # copy images; delete original later
    copy(pjoin(SOURCE_DIR, filename), pjoin(data_dir, filename))

    # create labels (txt files)
    label_path = pjoin(data_dir, os.path.splitext(filename)[0] + '.txt')
    group_by.get_group(filename).drop(columns='filename').to_csv(
        label_path,
        sep=' ',
        index=False,
        header=False
    )
    

In [None]:
# Group the label rows per image once, then export every image of each split
train_groupby = train_df.groupby('filename')
test_groupby = test_df.groupby('filename')

for split_fnames, split_dir, split_groupby in (
    (train_fnames, TRAIN_DIR, train_groupby),
    (test_fnames, TEST_DIR, test_groupby),
):
    for fname in split_fnames:
        save_txt(fname, split_dir, split_groupby)
