In [50]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [51]:
#Step 1
#collect the path of every annotation xml file
#glob returns matches in arbitrary (filesystem-dependent) order, so the list is
#sorted to keep the row order of everything built from it stable between runs
#backslashes are replaced with / so the paths look the same on every platform
xmlfiles = sorted(
    path.replace('\\', '/')
    for path in glob('./1_datapreparation/data_images/*.xml')
)

In [52]:
#inspect the collected paths; an empty list means the glob pattern matched nothing
xmlfiles

[]

In [45]:
#step 2
#parse one annotation xml and collect filename, size(width, height),
#and for every object its (name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    """Return one row per <object> element found in the xml file `filename`.

    Each row is the list
    [image filename, width, height, object name, xmin, xmax, ymin, ymax].
    Every value is the raw element text (a string); no numeric conversion is
    done here. A file without <object> elements yields an empty list.
    """
    root = et.parse(filename).getroot()

    #image-level fields are shared by every row of this file
    image_name = root.find('filename').text
    size = root.find('size')
    width, height = size.find('width').text, size.find('height').text

    def _row(obj):
        #object label followed by the bounding-box corners, in output column order
        label = obj.find('name').text
        box = obj.find('bndbox')
        coords = [box.find(tag).text for tag in ('xmin', 'xmax', 'ymin', 'ymax')]
        return [image_name, width, height, label] + coords

    return [_row(obj) for obj in root.findall('object')]


In [46]:
#one list of object rows per annotation file, in the same order as xmlfiles
parser_all = [extract_text(xml_path) for xml_path in xmlfiles]

In [54]:
#flatten the per-file row lists into a single list of rows
#a comprehension is used instead of reduce(lambda x, y: x + y, ...) because reduce
#without an initial value raises TypeError when parser_all is empty, and repeated
#list concatenation re-copies the accumulated rows on every step
data = [row for file_rows in parser_all for row in file_rows]

In [48]:
#one row per annotated object; the column order matches the rows built by extract_text
df = pd.DataFrame(
    data,
    columns = [
        'filename', 'width', 'height', 'name',
        'xmin', 'xmax', 'ymin', 'ymax',
    ],
)

In [49]:
#preview the first rows of the freshly built annotation table
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,1.jpg,1529,975,tree,1009,1053,340,376
1,1.jpg,1529,975,tree,1038,1089,379,415
2,1.jpg,1529,975,tree,853,880,201,227
3,1.jpg,1529,975,tree,914,955,198,232
4,1.jpg,1529,975,tree,857,899,248,285


In [10]:
# df.shape

(4179, 8)

In [11]:
# df['name'].value_counts()

In [12]:
#check column dtypes and non-null counts before any type conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4179 entries, 0 to 4178
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  4179 non-null   object
 1   width     4179 non-null   object
 2   height    4179 non-null   object
 3   name      4179 non-null   object
 4   xmin      4179 non-null   object
 5   xmax      4179 non-null   object
 6   ymin      4179 non-null   object
 7   ymax      4179 non-null   object
dtypes: object(8)
memory usage: 261.3+ KB


In [13]:
#type conversion
#image size and box corners arrive as text; cast each of them to int
cols = ['width', 'height','xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].astype(dict.fromkeys(cols, int))
#show the dtypes again to confirm the cast
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4179 entries, 0 to 4178
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  4179 non-null   object
 1   width     4179 non-null   int32 
 2   height    4179 non-null   int32 
 3   name      4179 non-null   object
 4   xmin      4179 non-null   int32 
 5   xmax      4179 non-null   int32 
 6   ymin      4179 non-null   int32 
 7   ymax      4179 non-null   int32 
dtypes: int32(6), object(2)
memory usage: 163.4+ KB


In [14]:
#Centre x, y: midpoint of the box, divided by the image width / height
df['centre_x'] = df['xmax'].add(df['xmin']).div(2).div(df['width'])
df['centre_y'] = df['ymax'].add(df['ymin']).div(2).div(df['height'])

# Width w: box width as a fraction of the image width
df['W'] = df['xmax'].sub(df['xmin']).div(df['width'])
# Height h: box height as a fraction of the image height
df['H'] = df['ymax'].sub(df['ymin']).div(df['height'])

In [15]:
#preview the table with the added centre_x, centre_y, W and H columns
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,centre_x,centre_y,W,H
0,1.jpg,1529,975,tree,1009,1053,340,376,0.674297,0.367179,0.028777,0.036923
1,1.jpg,1529,975,tree,1038,1089,379,415,0.695553,0.407179,0.033355,0.036923
2,1.jpg,1529,975,tree,853,880,201,227,0.56671,0.219487,0.017659,0.026667
3,1.jpg,1529,975,tree,914,955,198,232,0.611184,0.220513,0.026815,0.034872
4,1.jpg,1529,975,tree,857,899,248,285,0.574232,0.273333,0.027469,0.037949


In [16]:
#distinct image filenames, in order of first appearance in df
images = pd.unique(df['filename'])

In [18]:
# 80% train and 20 % test
# The split is made per image (not per box) so all boxes of one image stay together.
# A fixed random_state makes the shuffle repeatable: without it every run of this
# cell draws a different training set.
img_df = pd.DataFrame(images, columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8, random_state=42)['filename']) # shuffle and pick 80% of images


In [19]:
# show which image filenames were drawn for training
img_train

('12.jpg',
 '2.jpg',
 '13.jpg',
 '4.jpg',
 '28.jpg',
 '18.jpg',
 '9.jpg',
 '5.jpg',
 '7.jpg',
 '3.jpg',
 '20.jpg',
 '51.jpg',
 '52.jpg',
 '58.jpg',
 '44.jpg',
 '14.jpg',
 '6.jpg',
 '48.jpg',
 '57.jpg',
 '60.jpg',
 '55.jpg',
 '15.jpg',
 '53.jpg',
 '54.jpg',
 '8.jpg',
 '1.jpg',
 '59.jpg')

In [20]:
# Every image that was not drawn for training goes to the test split.
# A boolean isin mask is used rather than pasting the tuple's repr into a query
# string, which is fragile when a filename contains quote characters.
img_test = tuple(img_df.loc[~img_df['filename'].isin(img_train), 'filename'])

In [21]:
# show which image filenames were held out for testing
img_test

('10.jpg', '11.jpg', '56.jpg')

In [22]:
# Split the box table by image membership.
# isin avoids interpolating the filename tuples into a query string, and .copy()
# gives each split its own data so columns can be added to it later without
# pandas raising SettingWithCopyWarning.
train_df = df[df['filename'].isin(img_train)].copy()
test_df = df[df['filename'].isin(img_test)].copy()

In [23]:
# train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,centre_x,centre_y,W,H
0,1.jpg,1529,975,tree,1009,1053,340,376,0.674297,0.367179,0.028777,0.036923
1,1.jpg,1529,975,tree,1038,1089,379,415,0.695553,0.407179,0.033355,0.036923
2,1.jpg,1529,975,tree,853,880,201,227,0.56671,0.219487,0.017659,0.026667
3,1.jpg,1529,975,tree,914,955,198,232,0.611184,0.220513,0.026815,0.034872
4,1.jpg,1529,975,tree,857,899,248,285,0.574232,0.273333,0.027469,0.037949


In [24]:
# test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,centre_x,centre_y,W,H
137,10.jpg,1529,975,tree,113,142,239,268,0.083388,0.26,0.018967,0.029744
138,10.jpg,1529,975,tree,97,129,270,294,0.073905,0.289231,0.020929,0.024615
139,10.jpg,1529,975,tree,61,90,335,363,0.049379,0.357949,0.018967,0.028718
140,10.jpg,1529,975,tree,23,65,242,279,0.028777,0.267179,0.027469,0.037949
141,10.jpg,1529,975,tree,46,86,210,244,0.043165,0.232821,0.026161,0.034872


In [25]:
# Assign Id number to object names

In [55]:

# Every row gets class id 0 (the recorded previews only show the label 'tree' —
# TODO confirm there is a single class before reusing this for other data).
# assign returns a new frame, so this does not write into a filtered slice of df
# and does not trigger SettingWithCopyWarning.
train_df = train_df.assign(Id=0)
test_df = test_df.assign(Id=0)

In [29]:
# preview the test split to confirm the Id column is present
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,centre_x,centre_y,W,H,Id
137,10.jpg,1529,975,tree,113,142,239,268,0.083388,0.26,0.018967,0.029744,0
138,10.jpg,1529,975,tree,97,129,270,294,0.073905,0.289231,0.020929,0.024615,0
139,10.jpg,1529,975,tree,61,90,335,363,0.049379,0.357949,0.018967,0.028718,0
140,10.jpg,1529,975,tree,23,65,242,279,0.028777,0.267179,0.027469,0.037949,0
141,10.jpg,1529,975,tree,46,86,210,244,0.043165,0.232821,0.026161,0.034872,0


In [33]:
# Save image and labels in text

In [30]:
import os
from shutil import move

In [32]:
# Give folder location
train_folder = '1_datapreparation/data_images/train'
test_folder = '1_datapreparation/data_images/test'

# makedirs(..., exist_ok=True) creates missing parent directories and does nothing
# when the folder is already there, so this cell can be re-run safely
# (os.mkdir raises FileExistsError on the second run).
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

In [33]:
#keep only the label columns, then group the rows by image so each image's
#boxes can be fetched together
cols = ['filename', 'Id', 'centre_x', 'centre_y', 'W', 'H']
groupby_obj_train = train_df.loc[:, cols].groupby('filename')
groupby_obj_test = test_df.loc[:, cols].groupby('filename')

In [35]:
#Save each image in train/test folder and respective labels in .txt
def save_data(filename, folder_path, group_obj, image_dir='1_datapreparation/data_images/'):
    """Move one image into `folder_path` and write its label file next to it.

    Parameters
    ----------
    filename : str
        Image file name; also the group key looked up in `group_obj`.
    folder_path : str
        Destination folder for the image and its label file.
    group_obj : pandas groupby object
        Rows grouped by 'filename'; the group for `filename` holds the label rows.
    image_dir : str, optional
        Folder the image is moved from. Defaults to the original hard-coded location.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If the image is neither in `image_dir` nor already in `folder_path`.
    """
    src = os.path.join(image_dir, filename)
    dst = os.path.join(folder_path, filename)
    if os.path.exists(src):
        move(src, dst)  # move img to destination folder
    elif not os.path.exists(dst):
        # the image is in neither place: fail loudly, as the move would have
        raise FileNotFoundError(f'image not found in source or destination: {src}')
    # otherwise the image was already moved by an earlier run; only rewrite the labels

    #save the labels: one space-separated line per box, with no header and no
    #filename column (it becomes the index, which index=False leaves out)
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0]+'.txt')
    labels = group_obj.get_group(filename).set_index('filename')
    labels.to_csv(text_filename, sep=' ', index = False, header = False)
    

In [36]:
#one entry per training image: the group keys of the train groupby object
filename_series = pd.Series(list(groupby_obj_train.groups))

In [37]:
#move every training image and write its label file
#save_data returns nothing, so the displayed result is a Series of None
filename_series.apply(lambda image_name: save_data(image_name, train_folder, groupby_obj_train))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
dtype: object

In [38]:
#same as for the training split: collect the test image names, then move each
#image and write its label file into the test folder
filename_series_test = pd.Series(list(groupby_obj_test.groups))
filename_series_test.apply(lambda image_name: save_data(image_name, test_folder, groupby_obj_test))

0    None
1    None
2    None
dtype: object