In [None]:
import os
import pandas as pd
from glob import glob
from functools import reduce
from xml.etree import ElementTree as et

In [None]:
# Collect every XML annotation file under ./data_images as a list.
# Backslashes are replaced with '/' so paths look the same on every platform.
# glob returns matches in arbitrary, filesystem-dependent order, so the list
# is sorted to keep the row order of everything built from it stable.
xml_list = sorted(
    xml.replace('\\','/') for xml in glob('./data_images/*.xml')
)

In [None]:
# Show only the first few paths; the full list is too long to display usefully.
xml_list[:5]

In [None]:
def extract_data(filename):
    """Parse one annotation XML file into a list of bounding-box rows.

    Parameters
    ----------
    filename
        Path (or file object) handed to ``ElementTree.parse``.

    Returns
    -------
    list of list
        One row per ``<object>`` element, laid out as
        ``[image_name, width, height, name, xmin, xmax, ymin, ymax]``.
        Every value is the raw element text (a string); the list is empty
        when the file contains no ``<object>`` elements.
    """
    root = et.parse(filename).getroot()

    # Image-level fields are shared by every row produced from this file.
    image_name = root.find('filename').text
    size = root.find('size')
    width = size.find('width').text
    height = size.find('height').text

    def _row(obj):
        # Build the row for a single <object> element.
        label = obj.find('name').text
        box = obj.find('bndbox')
        xmin, xmax, ymin, ymax = (
            box.find(tag).text for tag in ('xmin', 'xmax', 'ymin', 'ymax')
        )
        return [image_name, width, height, label, xmin, xmax, ymin, ymax]

    return [_row(obj) for obj in root.findall('object')]

In [None]:
# One list of box rows per annotation file, in the same order as xml_list.
bounding_boxes = [extract_data(xml_path) for xml_path in xml_list]


In [None]:
# Show only the first few files' rows instead of dumping every parsed box.
bounding_boxes[:5]

In [None]:
# Flatten the per-file lists into a single list of rows.
# A comprehension runs in linear time and yields [] for an empty input,
# whereas reduce() with list '+' re-copies the accumulated list on every
# step and raises TypeError when there are no files.
data = [row for file_rows in bounding_boxes for row in file_rows]
data[:5]

In [None]:
# Tabulate the flattened rows; column order matches the rows built by extract_data.
df = pd.DataFrame(
    data,
    columns=[
        'filename', 'width', 'height', 'name',
        'xmin', 'xmax', 'ymin', 'ymax',
    ],
)

In [None]:
# Preview the first rows to check that values landed in the expected columns.
df.head()

In [None]:
# (rows, columns): one row per annotated object, not per image.
df.shape

In [None]:
# Number of annotated objects per class name.
df['name'].value_counts()

In [None]:
# Inspect column dtypes; every value is still the raw XML text at this point.
df.info()

In [None]:
# Image size and box corners were read as XML text; convert them to integers
# so they can be used in arithmetic.
cols = ['width','height','xmin','xmax','ymin','ymax']
df = df.astype({col: int for col in cols})
df.info()

In [None]:
# Derive box centre and size as fractions of the image width/height.
df = df.assign(
    center_x=df['xmin'].add(df['xmax']).div(2).div(df['width']),
    center_y=df['ymin'].add(df['ymax']).div(2).div(df['height']),
    w=df['xmax'].sub(df['xmin']).div(df['width']),
    h=df['ymax'].sub(df['ymin']).div(df['height']),
)
df.head()

In [None]:
# Distinct image file names; several rows (one per object) can share a name.
images = df['filename'].unique()
len(images)

5012

In [None]:
# 80% train and 20% test, split at the image level so that all boxes of one
# image end up on the same side.
img_df = pd.DataFrame(images,columns=['filename'])
# A fixed random_state makes the shuffle repeatable: re-running the notebook
# selects the same training images for the same input order.
img_train = tuple(img_df.sample(frac=0.8, random_state=42)['filename'])

In [None]:
# Show only the first few training file names rather than the whole tuple.
img_train[:5]

In [None]:
# Test images are those not sampled for training.
# isin() builds the mask directly; pasting the tuple's repr into a query
# string breaks on file names containing quotes and forces pandas to parse
# a very long expression.
img_test = tuple(img_df.loc[~img_df['filename'].isin(img_train), 'filename'])
img_test[:5]

In [None]:
# Number of images on each side of the split (train, test).
len(img_train),len(img_test)

In [None]:
# Select the box rows belonging to each image set.
# isin() avoids embedding the file-name tuples in a query string, and
# .copy() makes each result an independent frame so columns can be added
# to it later without SettingWithCopyWarning.
train_df = df[df['filename'].isin(img_train)].copy()
test_df = df[df['filename'].isin(img_test)].copy()
test_df.head()

In [None]:
# Preview the training rows.
train_df.head()

In [None]:
# Label encoding
def label_encoding(x):
    """Return the integer class id for an object class name.

    Parameters
    ----------
    x : str
        Class name, e.g. ``'person'``.

    Returns
    -------
    int
        Position of ``x`` in the fixed class list below (0-19).

    Raises
    ------
    KeyError
        If ``x`` is not one of the 20 known class names.
    """
    # The position in this tuple is the class id, so the order must not change.
    class_names = (
        'person', 'car', 'chair', 'bottle', 'pottedplant',
        'bird', 'dog', 'sofa', 'bicycle', 'horse',
        'boat', 'motorbike', 'cat', 'tvmonitor', 'cow',
        'sheep', 'aeroplane', 'train', 'diningtable', 'bus',
    )
    lookup = {label: idx for idx, label in enumerate(class_names)}
    return lookup[x]

In [None]:
# Add the integer class id for every box.
# assign() returns a new frame, so this does not write into a filtered slice
# of df (which raises SettingWithCopyWarning), and the int cast happens in
# the same step instead of a second assignment.
train_df = train_df.assign(id=train_df['name'].apply(label_encoding).astype(int))
test_df = test_df.assign(id=test_df['name'].apply(label_encoding).astype(int))

In [None]:
import warnings
# NOTE(review): with no category or module given, this silences every warning
# for the rest of the session, which can hide real problems. Consider
# narrowing the filter to the specific warning that is expected.
warnings.filterwarnings('ignore')

# Save Images and Write Labels to Text Files

In [None]:
import os
from shutil import move

In [None]:
# Destination folders for the split images and their label files.
train_folder ='data_images/train'
test_folder ='data_images/test'

# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# once the folders have been created by an earlier run.
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

In [None]:
# Keep only the columns written to the label files, grouped per image so that
# each image's boxes can be fetched by file name.
cols = ['filename','id','center_x','center_y','w','h']
groupby_obj_train, groupby_obj_test = (
    frame[cols].groupby('filename') for frame in (train_df, test_df)
)

In [None]:
def save_data(filename,folder_path,group_obj):
    """Move one image into ``folder_path`` and write its label file beside it.

    Parameters
    ----------
    filename : str
        Image file name. The image is looked up inside ``data_images`` and
        the same value is used as the key into ``group_obj``.
    folder_path : str
        Destination directory for the image and its ``.txt`` label file.
    group_obj : pandas groupby object
        Rows grouped by ``filename``; the group for ``filename`` supplies
        the label rows.

    Notes
    -----
    The label file shares the image's base name with a ``.txt`` extension
    and holds one space-separated line per row of the group, containing
    every column except ``filename``, with no header and no index.

    Raises
    ------
    FileNotFoundError
        If the image is in neither ``data_images`` nor ``folder_path``.
    KeyError
        If ``filename`` is not a group in ``group_obj``.
    """
    # moving image
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    # Skip the move only when an earlier run already relocated the image
    # (source gone, destination present). In every other case move as before,
    # so a genuinely missing image still raises instead of passing silently.
    if os.path.exists(src) or not os.path.exists(dst):
        move(src,dst)

    # saving labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    labels = group_obj.get_group(filename).drop(columns='filename')
    labels.to_csv(text_filename,sep=' ',index=False,header=False)

In [None]:
# File names of all training images (the keys of the groupby object).
train_keys = list(groupby_obj_train.groups.keys())
filename_series = pd.Series(train_keys)

In [None]:
# Call save_data once per training image; apply is used only for the
# function's side effects (moving the image and writing its label file).
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

In [None]:
# File names of all test images (the keys of the groupby object).
test_keys = list(groupby_obj_test.groups.keys())
filename_series_test = pd.Series(test_keys)

In [None]:
# Call save_data once per test image; apply is used only for the
# function's side effects (moving the image and writing its label file).
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))