In [2]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce

In [3]:
import warnings
warnings.filterwarnings(' ignore')

In [8]:
# step-1: get path of each xml file
xmlfiles = glob("./Data_Preparation/Dataset/*.xml")

# Clean the dataset names 
#replace \\ with /
replace_text = lambda x: x.replace('\\','/')
# xmlfiles = list(map(replace_text,xmlfiles))

In [9]:
xmlfiles

['./Data_Preparation/Dataset/007826.xml',
 './Data_Preparation/Dataset/002786.xml',
 './Data_Preparation/Dataset/006286.xml',
 './Data_Preparation/Dataset/002962.xml',
 './Data_Preparation/Dataset/008297.xml',
 './Data_Preparation/Dataset/009189.xml',
 './Data_Preparation/Dataset/009823.xml',
 './Data_Preparation/Dataset/002976.xml',
 './Data_Preparation/Dataset/002745.xml',
 './Data_Preparation/Dataset/006523.xml',
 './Data_Preparation/Dataset/008268.xml',
 './Data_Preparation/Dataset/004452.xml',
 './Data_Preparation/Dataset/002023.xml',
 './Data_Preparation/Dataset/005980.xml',
 './Data_Preparation/Dataset/004446.xml',
 './Data_Preparation/Dataset/002037.xml',
 './Data_Preparation/Dataset/009162.xml',
 './Data_Preparation/Dataset/006251.xml',
 './Data_Preparation/Dataset/000620.xml',
 './Data_Preparation/Dataset/000146.xml',
 './Data_Preparation/Dataset/007629.xml',
 './Data_Preparation/Dataset/001258.xml',
 './Data_Preparation/Dataset/002751.xml',
 './Data_Preparation/Dataset/00298

In [10]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)

def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
    return parser

In [11]:
parser_all = list(map(extract_text,xmlfiles))

In [12]:
data = reduce(lambda x, y : x+y,parser_all)

In [13]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [14]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,007826.jpg,500,375,diningtable,80,320,217,273
1,007826.jpg,500,375,chair,197,257,193,326
2,007826.jpg,500,375,chair,139,185,184,231
3,007826.jpg,500,375,chair,258,312,180,314
4,007826.jpg,500,375,chair,10,93,195,358


In [15]:
df.shape

(15663, 8)

In [16]:
df['name'].value_counts()

name
person         5447
car            1650
chair          1427
bottle          634
pottedplant     625
bird            599
dog             538
sofa            425
bicycle         418
horse           406
boat            398
motorbike       390
cat             389
tvmonitor       367
cow             356
sheep           353
aeroplane       331
train           328
diningtable     310
bus             272
Name: count, dtype: int64

# Conversion
![image-2.png](attachment:image-2.png)


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15663 entries, 0 to 15662
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  15663 non-null  object
 1   width     15663 non-null  object
 2   height    15663 non-null  object
 3   name      15663 non-null  object
 4   xmin      15663 non-null  object
 5   xmax      15663 non-null  object
 6   ymin      15663 non-null  object
 7   ymax      15663 non-null  object
dtypes: object(8)
memory usage: 979.1+ KB


In [18]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15663 entries, 0 to 15662
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  15663 non-null  object
 1   width     15663 non-null  int64 
 2   height    15663 non-null  int64 
 3   name      15663 non-null  object
 4   xmin      15663 non-null  int64 
 5   xmax      15663 non-null  int64 
 6   ymin      15663 non-null  int64 
 7   ymax      15663 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 979.1+ KB


In [48]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w 
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h 
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [51]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,007826.jpg,500,375,diningtable,80,320,217,273,0.4,0.653333,0.48,0.149333
1,007826.jpg,500,375,chair,197,257,193,326,0.454,0.692,0.12,0.354667
2,007826.jpg,500,375,chair,139,185,184,231,0.324,0.553333,0.092,0.125333
3,007826.jpg,500,375,chair,258,312,180,314,0.57,0.658667,0.108,0.357333
4,007826.jpg,500,375,chair,10,93,195,358,0.103,0.737333,0.166,0.434667


### split data into train and test

In [52]:
images = df['filename'].unique()

In [53]:
len(images)

5012

In [54]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [55]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [56]:
len(img_train), len(img_test)

(4010, 1002)

In [57]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [58]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,007826.jpg,500,375,diningtable,80,320,217,273,0.4,0.653333,0.48,0.149333
1,007826.jpg,500,375,chair,197,257,193,326,0.454,0.692,0.12,0.354667
2,007826.jpg,500,375,chair,139,185,184,231,0.324,0.553333,0.092,0.125333
3,007826.jpg,500,375,chair,258,312,180,314,0.57,0.658667,0.108,0.357333
4,007826.jpg,500,375,chair,10,93,195,358,0.103,0.737333,0.166,0.434667


In [59]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
33,009189.jpg,500,378,cow,464,500,241,291,0.964,0.703704,0.072,0.132275
34,009189.jpg,500,378,cow,405,449,226,281,0.854,0.670635,0.088,0.145503
35,009189.jpg,500,378,cow,304,366,224,261,0.67,0.641534,0.124,0.097884
36,009189.jpg,500,378,cow,231,287,212,245,0.518,0.604497,0.112,0.087302
37,009189.jpg,500,378,cow,123,227,186,250,0.35,0.57672,0.208,0.169312


## Assign ID Numbers to Object Names

In [60]:
# label encoding
def label_encoding(x):
    labels = {'person':0, 'car':1, 'chair':2, 'bottle':3, 'pottedplant':4, 'bird':5, 'dog':6,
       'sofa':7, 'bicycle':8, 'horse':9, 'boat':10, 'motorbike':11, 'cat':12, 'tvmonitor':13,
       'cow':14, 'sheep':15, 'aeroplane':16, 'train':17, 'diningtable':18, 'bus':19}
    return labels[x]

In [61]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

In [62]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,007826.jpg,500,375,diningtable,80,320,217,273,0.4,0.653333,0.48,0.149333,18
1,007826.jpg,500,375,chair,197,257,193,326,0.454,0.692,0.12,0.354667,2
2,007826.jpg,500,375,chair,139,185,184,231,0.324,0.553333,0.092,0.125333,2
3,007826.jpg,500,375,chair,258,312,180,314,0.57,0.658667,0.108,0.357333,2
4,007826.jpg,500,375,chair,10,93,195,358,0.103,0.737333,0.166,0.434667,2
5,007826.jpg,500,375,chair,82,243,252,372,0.325,0.832,0.322,0.32,2
6,007826.jpg,500,375,chair,43,144,319,375,0.187,0.925333,0.202,0.149333,2
7,002786.jpg,500,332,horse,80,348,97,272,0.428,0.555723,0.536,0.527108,9
8,002786.jpg,500,332,person,201,258,52,202,0.459,0.38253,0.114,0.451807,0
9,006286.jpg,500,375,person,80,405,88,375,0.485,0.617333,0.65,0.765333,0


## Save Image and Labels in Text

In [63]:
import os
from shutil import move

In [None]:
train_folder = 'Data_Preparation/Dataset/train'
test_folder = 'Data_Preparation/Dataset/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

In [66]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [74]:
#groupby_obj_train.get_group('000009.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('Data_Preparation/Dataset',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)
    

In [75]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [76]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
4005    None
4006    None
4007    None
4008    None
4009    None
Length: 4010, dtype: object

In [77]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0       None
1       None
2       None
3       None
4       None
        ... 
997     None
998     None
999     None
1000    None
1001    None
Length: 1002, dtype: object