In [34]:
import os 
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [44]:
# get path of xml file
xmlfiles = glob('./data_images/*.xml')
# replace
replace_text = lambda x:x.replace('\\', '/')
xmlfiles = list(map(replace_text, xmlfiles))

In [45]:
xmlfiles

[]

In [4]:
# read xml files 
def extract(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    # extract filename 
    image_name = root.find('filename').text
    # width and height
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    
    for obj in objs:
        
        obj = objs[0]
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])
        
    return parser    

In [5]:
parser_all = list(map(extract, xmlfiles))

In [6]:
data = reduce(lambda x, y: x+y, parser_all)

In [7]:
df = pd.DataFrame(data,columns = ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

In [8]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,2011_006135.jpg,500,375,person,391,470,144,315
1,2008_006482.jpg,500,411,chair,155,233,321,411
2,2008_006482.jpg,500,411,chair,155,233,321,411
3,2010_005054.jpg,500,111,tvmonitor,86,121,34,70
4,2010_005054.jpg,500,111,tvmonitor,86,121,34,70


In [9]:
df.shape

(40138, 8)

In [10]:
df['name'].value_counts()

name
person         13137
car             2480
chair           2207
dog             2128
diningtable     1681
bottle          1493
cat             1492
bus             1376
tvmonitor       1374
boat            1365
bicycle         1301
bird            1286
aeroplane       1267
sheep           1191
sofa            1177
horse           1168
motorbike       1145
pottedplant     1068
train            957
cow              845
Name: count, dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40138 entries, 0 to 40137
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  40138 non-null  object
 1   width     40138 non-null  object
 2   height    40138 non-null  object
 3   name      40138 non-null  object
 4   xmin      40138 non-null  object
 5   xmax      40138 non-null  object
 6   ymin      40138 non-null  object
 7   ymax      40138 non-null  object
dtypes: object(8)
memory usage: 2.4+ MB


In [12]:
#type-conversion
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce').round(0).astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40138 entries, 0 to 40137
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  40138 non-null  object
 1   width     40138 non-null  int64 
 2   height    40138 non-null  int64 
 3   name      40138 non-null  object
 4   xmin      40138 non-null  int64 
 5   xmax      40138 non-null  int64 
 6   ymin      40138 non-null  int64 
 7   ymax      40138 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 2.4+ MB


In [13]:
# center of x and y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']

df['w'] = (df['xmax']-df['xmin'])/df['width']
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [14]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,2011_006135.jpg,500,375,person,391,470,144,315,0.861,0.612,0.158,0.456
1,2008_006482.jpg,500,411,chair,155,233,321,411,0.388,0.890511,0.156,0.218978
2,2008_006482.jpg,500,411,chair,155,233,321,411,0.388,0.890511,0.156,0.218978
3,2010_005054.jpg,500,111,tvmonitor,86,121,34,70,0.207,0.468468,0.07,0.324324
4,2010_005054.jpg,500,111,tvmonitor,86,121,34,70,0.207,0.468468,0.07,0.324324


## Split the Data into test and train

In [15]:
images = df['filename'].unique()

In [16]:
len(images)

17125

In [17]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns = ['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [18]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # rest 20% of images 

In [19]:
len(img_train), len(img_test)

(13700, 3425)

In [20]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [21]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
1,2008_006482.jpg,500,411,chair,155,233,321,411,0.388,0.890511,0.156,0.218978
2,2008_006482.jpg,500,411,chair,155,233,321,411,0.388,0.890511,0.156,0.218978
3,2010_005054.jpg,500,111,tvmonitor,86,121,34,70,0.207,0.468468,0.07,0.324324
4,2010_005054.jpg,500,111,tvmonitor,86,121,34,70,0.207,0.468468,0.07,0.324324
5,2010_005054.jpg,500,111,tvmonitor,86,121,34,70,0.207,0.468468,0.07,0.324324


In [22]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,2011_006135.jpg,500,375,person,391,470,144,315,0.861,0.612,0.158,0.456
9,2010_003343.jpg,500,375,sheep,68,264,122,232,0.332,0.472,0.392,0.293333
10,2010_003343.jpg,500,375,sheep,68,264,122,232,0.332,0.472,0.392,0.293333
14,2008_000795.jpg,500,332,person,1,232,82,332,0.233,0.623494,0.462,0.753012
15,2008_000795.jpg,500,332,person,1,232,82,332,0.233,0.623494,0.462,0.753012


## Assign ID to object name 

In [23]:
# label encoding
def label_encoding(x):
    label = {'person':0, 'car':1, 'chair':2, 'dog':3, 'diningtable':4, 'bottle':5, 'cat':6, 'bus':7, 'tvmonitor':8, 'boat':9, 'bicycle':10, 'bird':11, 'aeroplane':12, 'sheep':13, 'sofa':14, 'horse':15, 'motorbike':16, 'pottedplant':17, 'train':18, 'cow':19}
    return label[x]    

In [25]:
train_df.loc[:, 'id'] = train_df['name'].apply(label_encoding)
test_df.loc[:, 'id'] = test_df['name'].apply(label_encoding)

In [26]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
1,2008_006482.jpg,500,411,chair,155,233,321,411,0.388,0.890511,0.156,0.218978,2
2,2008_006482.jpg,500,411,chair,155,233,321,411,0.388,0.890511,0.156,0.218978,2
3,2010_005054.jpg,500,111,tvmonitor,86,121,34,70,0.207,0.468468,0.07,0.324324,8
4,2010_005054.jpg,500,111,tvmonitor,86,121,34,70,0.207,0.468468,0.07,0.324324,8
5,2010_005054.jpg,500,111,tvmonitor,86,121,34,70,0.207,0.468468,0.07,0.324324,8
6,2010_005732.jpg,500,375,cat,181,500,1,375,0.681,0.501333,0.638,0.997333,6
7,2011_004044.jpg,500,375,person,220,337,143,375,0.557,0.690667,0.234,0.618667,0
8,2011_004044.jpg,500,375,person,220,337,143,375,0.557,0.690667,0.234,0.618667,0
11,2011_002435.jpg,500,375,sofa,23,500,56,375,0.523,0.574667,0.954,0.850667,14
12,2011_002435.jpg,500,375,sofa,23,500,56,375,0.523,0.574667,0.954,0.850667,14


## Save Imange and Labels in text

In [27]:
import os
from shutil import move

In [28]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

FileExistsError: [Errno 17] File exists: 'data_images/train'

In [29]:
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')
print(groupby_obj_train.groups)

{'2007_000027.jpg': [12695], '2007_000032.jpg': [10181, 10182, 10183, 10184], '2007_000033.jpg': [12585, 12586, 12587], '2007_000039.jpg': [24934], '2007_000061.jpg': [37805, 37806], '2007_000063.jpg': [37560, 37561], '2007_000068.jpg': [7529], '2007_000121.jpg': [9000, 9001], '2007_000123.jpg': [5996], '2007_000170.jpg': [23527, 23528, 23529, 23530, 23531, 23532], '2007_000175.jpg': [26120], '2007_000243.jpg': [9705], '2007_000256.jpg': [7989], '2007_000272.jpg': [30383], '2007_000323.jpg': [9330, 9331], '2007_000332.jpg': [1925], '2007_000333.jpg': [619], '2007_000364.jpg': [21772, 21773, 21774], '2007_000392.jpg': [11811, 11812], '2007_000423.jpg': [11900, 11901], '2007_000452.jpg': [6692, 6693], '2007_000480.jpg': [4194, 4195, 4196, 4197], '2007_000491.jpg': [6534], '2007_000504.jpg': [39700, 39701, 39702, 39703], '2007_000515.jpg': [32277, 32278, 32279, 32280], '2007_000528.jpg': [30496], '2007_000529.jpg': [32228, 32229], '2007_000549.jpg': [24605], '2007_000559.jpg': [25431, 254

In [31]:
def save_data(filename, folder_path, group_obj):
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst)
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)
    

In [39]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [42]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0        None
1        None
2        None
3        None
4        None
         ... 
13695    None
13696    None
13697    None
13698    None
13699    None
Length: 13700, dtype: object

In [43]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0       None
1       None
2       None
3       None
4       None
        ... 
3420    None
3421    None
3422    None
3423    None
3424    None
Length: 3425, dtype: object