In [8]:
import os
from glob import glob
from functools import reduce
from xml.etree import ElementTree as et
import pandas as pd

In [9]:
#Load all xml files and store in a list
# step1: get path of each xml file
xmlfiles = glob('./data_images/*.xml')

# data cleaning. replace \\ with /
replace_text = lambda x: x.replace('\\','/')
list(map(replace_text,xmlfiles))

[]

In [10]:
# step2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    # extract filename
    image_name = root.find('filename').text
    #width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])

    return(parser)

In [11]:
xmlfiles

[]

In [12]:
parser_all = list(map(extract_text,xmlfiles)) 

In [13]:
data = reduce(lambda x,y : x+y, parser_all)

TypeError: reduce() of empty iterable with no initial value

In [None]:
df = pd.DataFrame(data,columns = ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

NameError: name 'data' is not defined

In [None]:
df.head()

NameError: name 'df' is not defined

In [None]:
df.shape

(78, 8)

In [None]:
df['name'].value_counts()

name
car           59
motorcycle    10
bus            6
truck          3
Name: count, dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  78 non-null     object
 1   width     78 non-null     object
 2   height    78 non-null     object
 3   name      78 non-null     object
 4   xmin      78 non-null     object
 5   xmax      78 non-null     object
 6   ymin      78 non-null     object
 7   ymax      78 non-null     object
dtypes: object(8)
memory usage: 5.0+ KB


In [None]:
# type conversion
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  78 non-null     object
 1   width     78 non-null     int64 
 2   height    78 non-null     int64 
 3   name      78 non-null     object
 4   xmin      78 non-null     int64 
 5   xmax      78 non-null     int64 
 6   ymin      78 non-null     int64 
 7   ymax      78 non-null     int64 
dtypes: int64(6), object(2)
memory usage: 5.0+ KB


In [None]:
# center x, center y
df['center_x'] = ((df['xmax'] + df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax'] + df['ymin'])/2)/df['height']
# w
df['w'] = (df['xmax'] - df['xmin'])/df['width']
# h
df['h'] = (df['ymax'] - df['ymin'])/df['height']

In [None]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,pic1.jpg,3024,4032,car,946,1246,2959,3250,0.362434,0.769965,0.099206,0.072173
1,pic1.jpg,3024,4032,motorcycle,286,434,3028,3250,0.119048,0.778522,0.048942,0.05506
2,pic1.jpg,3024,4032,car,1719,2158,3314,3884,0.641038,0.892609,0.145172,0.141369
3,pic1.jpg,3024,4032,car,1597,1822,2835,3084,0.565311,0.734003,0.074405,0.061756
4,pic1.jpg,3024,4032,motorcycle,428,525,2814,2996,0.157573,0.720486,0.032077,0.045139


In [None]:
images = df['filename'].unique()

In [None]:
len(images)

13

In [None]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns = ['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [None]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [None]:
len(img_train), len(img_test)

(10, 3)

In [None]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')


In [None]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
18,pic10.jpg,3024,4032,car,637,982,2554,2808,0.267692,0.664931,0.114087,0.062996
19,pic10.jpg,3024,4032,car,1057,1496,2511,2927,0.422123,0.674355,0.145172,0.103175
20,pic11.jpg,3648,5472,car,2335,3176,3604,4134,0.755345,0.707054,0.230537,0.096857
21,pic11.jpg,3648,5472,car,2138,2622,3510,3899,0.652412,0.676992,0.132675,0.071089
22,pic12.jpg,2832,4256,car,276,960,3259,3928,0.21822,0.844337,0.241525,0.15719


In [None]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,pic1.jpg,3024,4032,car,946,1246,2959,3250,0.362434,0.769965,0.099206,0.072173
1,pic1.jpg,3024,4032,motorcycle,286,434,3028,3250,0.119048,0.778522,0.048942,0.05506
2,pic1.jpg,3024,4032,car,1719,2158,3314,3884,0.641038,0.892609,0.145172,0.141369
3,pic1.jpg,3024,4032,car,1597,1822,2835,3084,0.565311,0.734003,0.074405,0.061756
4,pic1.jpg,3024,4032,motorcycle,428,525,2814,2996,0.157573,0.720486,0.032077,0.045139


In [None]:
#Label encoding
def label_encoding(x):
    labels = {'car':0, 'motorcycle':1, 'truck':2, 'bus':3 }
    return labels [x]

In [None]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df ['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df ['name'].apply(label_encoding)
