In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [2]:
# Load all xml files and store in a list
xml_list = glob('./data_images/*.xml')

In [3]:
xml_list

['./data_images/plastic686_jpg.rf.8764f145e9900b272d16b993bc5f1999.xml',
 './data_images/paper1272_jpg.rf.dcffd5a4f241d921610b8ca394f6d92f.xml',
 './data_images/metal500_jpg.rf.2469e149e7a3791ae4cb5decfc38b13a.xml',
 './data_images/glass337_jpg.rf.c01eefce4541158fb9c4290315a0c254.xml',
 './data_images/plastic702_jpg.rf.dd409336a82c629901f220b3f90d106c.xml',
 './data_images/biodegradable849_jpg.rf.289b2ef3aa2d0caf6fde9309e9db2ead.xml',
 './data_images/glass929_jpg.rf.86b6ef1c51b052339be3e085f4170fe5.xml',
 './data_images/glass1460_jpg.rf.5423226316d0db43365c1d8fc50352d8.xml',
 './data_images/plastic682_jpg.rf.3598a8aa7feabaf40a7333484c08ede3.xml',
 './data_images/glass2228_jpg.rf.46f9d77d5a5181be803b7e8ea3e11d13.xml',
 './data_images/metal156_jpg.rf.2133a0dd65b27f3eb9f63c026a65565b.xml',
 './data_images/glass770_jpg.rf.fc3e48c4873b624dfcf9e656269fd3d6.xml',
 './data_images/paper102_jpeg.rf.0f05ffa4076c6b20139b61b8e7ca5fc7.xml',
 './data_images/plastic142_jpg.rf.8d3f1d69064a6da810a457df8

In [4]:
# step-2: read xml files
# from each xml files we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    # extract filename
    image_name = root.find('filename').text 
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name,width,height,name,xmin,xmax,ymin,ymax])
        
    return parser

In [5]:
parser_all = list(map(extract_text,xml_list))

In [6]:
parser_all 

[[['plastic686_jpg.rf.8764f145e9900b272d16b993bc5f1999.jpg',
   '416',
   '416',
   'PLASTIC',
   '131',
   '297',
   '21',
   '390'],
  ['plastic686_jpg.rf.8764f145e9900b272d16b993bc5f1999.jpg',
   '416',
   '416',
   'PLASTIC',
   '168',
   '256',
   '349',
   '390']],
 [['paper1272_jpg.rf.dcffd5a4f241d921610b8ca394f6d92f.jpg',
   '416',
   '416',
   'PAPER',
   '2',
   '416',
   '43',
   '417']],
 [['metal500_jpg.rf.2469e149e7a3791ae4cb5decfc38b13a.jpg',
   '416',
   '416',
   'METAL',
   '87',
   '416',
   '59',
   '416']],
 [['glass337_jpg.rf.c01eefce4541158fb9c4290315a0c254.jpg',
   '416',
   '416',
   'GLASS',
   '45',
   '369',
   '156',
   '255']],
 [['plastic702_jpg.rf.dd409336a82c629901f220b3f90d106c.jpg',
   '416',
   '416',
   'PLASTIC',
   '91',
   '365',
   '172',
   '416']],
 [['biodegradable849_jpg.rf.289b2ef3aa2d0caf6fde9309e9db2ead.jpg',
   '416',
   '416',
   'BIODEGRADABLE',
   '159',
   '367',
   '190',
   '322'],
  ['biodegradable849_jpg.rf.289b2ef3aa2d0caf6fde93

In [7]:
data = reduce(lambda x, y : x+y,parser_all)

In [8]:
data

[['plastic686_jpg.rf.8764f145e9900b272d16b993bc5f1999.jpg',
  '416',
  '416',
  'PLASTIC',
  '131',
  '297',
  '21',
  '390'],
 ['plastic686_jpg.rf.8764f145e9900b272d16b993bc5f1999.jpg',
  '416',
  '416',
  'PLASTIC',
  '168',
  '256',
  '349',
  '390'],
 ['paper1272_jpg.rf.dcffd5a4f241d921610b8ca394f6d92f.jpg',
  '416',
  '416',
  'PAPER',
  '2',
  '416',
  '43',
  '417'],
 ['metal500_jpg.rf.2469e149e7a3791ae4cb5decfc38b13a.jpg',
  '416',
  '416',
  'METAL',
  '87',
  '416',
  '59',
  '416'],
 ['glass337_jpg.rf.c01eefce4541158fb9c4290315a0c254.jpg',
  '416',
  '416',
  'GLASS',
  '45',
  '369',
  '156',
  '255'],
 ['plastic702_jpg.rf.dd409336a82c629901f220b3f90d106c.jpg',
  '416',
  '416',
  'PLASTIC',
  '91',
  '365',
  '172',
  '416'],
 ['biodegradable849_jpg.rf.289b2ef3aa2d0caf6fde9309e9db2ead.jpg',
  '416',
  '416',
  'BIODEGRADABLE',
  '159',
  '367',
  '190',
  '322'],
 ['biodegradable849_jpg.rf.289b2ef3aa2d0caf6fde9309e9db2ead.jpg',
  '416',
  '416',
  'BIODEGRADABLE',
  '34',


In [9]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [10]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,plastic686_jpg.rf.8764f145e9900b272d16b993bc5f...,416,416,PLASTIC,131,297,21,390
1,plastic686_jpg.rf.8764f145e9900b272d16b993bc5f...,416,416,PLASTIC,168,256,349,390
2,paper1272_jpg.rf.dcffd5a4f241d921610b8ca394f6d...,416,416,PAPER,2,416,43,417
3,metal500_jpg.rf.2469e149e7a3791ae4cb5decfc38b1...,416,416,METAL,87,416,59,416
4,glass337_jpg.rf.c01eefce4541158fb9c4290315a0c2...,416,416,GLASS,45,369,156,255


In [11]:
df.shape

(122557, 8)

In [12]:
df['name'].value_counts()

name
BIODEGRADABLE    75019
GLASS            13001
PLASTIC           9924
METAL             9610
CARDBOARD         7835
PAPER             7168
Name: count, dtype: int64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122557 entries, 0 to 122556
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   filename  122557 non-null  object
 1   width     122557 non-null  object
 2   height    122557 non-null  object
 3   name      122557 non-null  object
 4   xmin      122557 non-null  object
 5   xmax      122557 non-null  object
 6   ymin      122557 non-null  object
 7   ymax      122557 non-null  object
dtypes: object(8)
memory usage: 7.5+ MB


In [14]:
#type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122557 entries, 0 to 122556
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   filename  122557 non-null  object
 1   width     122557 non-null  int64 
 2   height    122557 non-null  int64 
 3   name      122557 non-null  object
 4   xmin      122557 non-null  int64 
 5   xmax      122557 non-null  int64 
 6   ymin      122557 non-null  int64 
 7   ymax      122557 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 7.5+ MB


In [15]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w
df['w'] = (df['xmax']-df['xmin'])/df['width']
#h
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [16]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,plastic686_jpg.rf.8764f145e9900b272d16b993bc5f...,416,416,PLASTIC,131,297,21,390,0.514423,0.49399,0.399038,0.887019
1,plastic686_jpg.rf.8764f145e9900b272d16b993bc5f...,416,416,PLASTIC,168,256,349,390,0.509615,0.888221,0.211538,0.098558
2,paper1272_jpg.rf.dcffd5a4f241d921610b8ca394f6d...,416,416,PAPER,2,416,43,417,0.502404,0.552885,0.995192,0.899038
3,metal500_jpg.rf.2469e149e7a3791ae4cb5decfc38b1...,416,416,METAL,87,416,59,416,0.604567,0.570913,0.790865,0.858173
4,glass337_jpg.rf.c01eefce4541158fb9c4290315a0c2...,416,416,GLASS,45,369,156,255,0.497596,0.49399,0.778846,0.237981


**split data into train and test**

In [18]:
images = df['filename'].unique()

In [19]:
len(images)

17365

In [20]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [21]:
img_train

('glass1813_jpg.rf.2f5197f23f5d7e20967356ed0cf8eef3.jpg',
 'biodegradable1124_jpg.rf.0c06b6cc0045d0ecba768c2ec2276f55.jpg',
 'cardboard849_jpg.rf.27b498b5213be0035acc2cd5eb4e8f47.jpg',
 'plastic700_jpg.rf.d398eefe30d10a17e1d40b087092c28c.jpg',
 'paper1288_jpg.rf.51ba6776ac28753a5d1d5cf36ad2814d.jpg',
 'plastic826_jpg.rf.fd21d8e25c70844b68c5744eba055cc7.jpg',
 'glass1773_jpg.rf.dc54666527f3d74c58362802619fcfd4.jpg',
 'biodegradable694_jpg.rf.2d630997650cce876ca09cbbba12803f.jpg',
 'paper1810_jpg.rf.1fce29a3727cf6cdb232b664e2b43d7f.jpg',
 'biodegradable473_jpg.rf.c6235557f023cf58ddd3812b27ca4e74.jpg',
 'metal1165_jpg.rf.b864bfe4c1041d36cb038ee304fb573a.jpg',
 'plastic1059_jpg.rf.2c67354341f934b1e8202edc561be1de.jpg',
 'glass3104_jpg.rf.788bc76981e5e9715f0995fe853635f1.jpg',
 'paper2054_jpg.rf.f06faee11355ab1c19ece62d2231615a.jpg',
 'biodegradable1195_jpg.rf.4445de9ba612708e19ef24c585d0a39b.jpg',
 'glass3027_jpg.rf.53f49918c1b37886c4833801f8b860a6.jpg',
 'glass1105_jpg.rf.32bac878892ec61b

In [22]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% of images

In [23]:
img_test

('glass2228_jpg.rf.46f9d77d5a5181be803b7e8ea3e11d13.jpg',
 'metal156_jpg.rf.2133a0dd65b27f3eb9f63c026a65565b.jpg',
 'glass770_jpg.rf.fc3e48c4873b624dfcf9e656269fd3d6.jpg',
 'biodegradable2205_jpeg.rf.1fde3396d4498c20296f4d863c52160a.jpg',
 'plastic234_jpg.rf.7e6799d1332d7ef1ab59f5b2983b6c5f.jpg',
 'cardboard321_jpg.rf.fd4b1c540dff5fcda983a77672e9a6f9.jpg',
 'metal105_jpg.rf.2d0372112cb93611d46acb08fb81c05e.jpg',
 'glass2976_jpg.rf.64538391bc7fb2dbcb8caed78432bd73.jpg',
 'glass2755_jpg.rf.d2ddaa31774facea8406074d8699707e.jpg',
 'cardboard1507_jpg.rf.8e8e002bf97274aa15ac591f3e1d140f.jpg',
 'cardboard1518_jpg.rf.874b852159b76264d55196e28d53c51e.jpg',
 'glass2935_jpg.rf.4505fe193bde450588256ad19524f0f7.jpg',
 'cardboard403_jpg.rf.6744d1863ab0e0297a613864b3d112dc.jpg',
 'cardboard1249_jpg.rf.11835530deda9048b865890a0f21604b.jpg',
 'biodegradable1151_jpg.rf.1804df2a21f6d2e61797e85320c736e4.jpg',
 'paper2147_jpg.rf.d662257dc82f63f4b96d23201cbc46c2.jpg',
 'biodegradable1231_jpg.rf.5122adc4710d

In [24]:
len(img_train), len(img_test)

(13892, 3473)

In [25]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [26]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,plastic686_jpg.rf.8764f145e9900b272d16b993bc5f...,416,416,PLASTIC,131,297,21,390,0.514423,0.49399,0.399038,0.887019
1,plastic686_jpg.rf.8764f145e9900b272d16b993bc5f...,416,416,PLASTIC,168,256,349,390,0.509615,0.888221,0.211538,0.098558
2,paper1272_jpg.rf.dcffd5a4f241d921610b8ca394f6d...,416,416,PAPER,2,416,43,417,0.502404,0.552885,0.995192,0.899038
3,metal500_jpg.rf.2469e149e7a3791ae4cb5decfc38b1...,416,416,METAL,87,416,59,416,0.604567,0.570913,0.790865,0.858173
4,glass337_jpg.rf.c01eefce4541158fb9c4290315a0c2...,416,416,GLASS,45,369,156,255,0.497596,0.49399,0.778846,0.237981


In [27]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
23,glass2228_jpg.rf.46f9d77d5a5181be803b7e8ea3e11...,416,416,GLASS,1,339,2,364,0.408654,0.439904,0.8125,0.870192
24,metal156_jpg.rf.2133a0dd65b27f3eb9f63c026a6556...,416,416,METAL,9,144,212,349,0.183894,0.674279,0.324519,0.329327
25,metal156_jpg.rf.2133a0dd65b27f3eb9f63c026a6556...,416,416,METAL,19,165,113,304,0.221154,0.501202,0.350962,0.459135
26,metal156_jpg.rf.2133a0dd65b27f3eb9f63c026a6556...,416,416,METAL,104,233,248,346,0.405048,0.713942,0.310096,0.235577
27,metal156_jpg.rf.2133a0dd65b27f3eb9f63c026a6556...,416,416,METAL,131,263,19,87,0.473558,0.127404,0.317308,0.163462


**Assign id number to object names**

In [28]:
# label encoding
def label_encoding(x):
    labels = {'BIODEGRADABLE':0, 'GLASS':1, 'PLASTIC':2, 'METAL':3, 'CARDBOARD':4, 'PAPER':5}
    return labels[x]

In [29]:
train_df.loc[:, 'id'] = train_df['name'].apply(label_encoding)
test_df.loc[:, 'id'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[:, 'id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:, 'id'] = test_df['name'].apply(label_encoding)


In [30]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,plastic686_jpg.rf.8764f145e9900b272d16b993bc5f...,416,416,PLASTIC,131,297,21,390,0.514423,0.49399,0.399038,0.887019,2
1,plastic686_jpg.rf.8764f145e9900b272d16b993bc5f...,416,416,PLASTIC,168,256,349,390,0.509615,0.888221,0.211538,0.098558,2
2,paper1272_jpg.rf.dcffd5a4f241d921610b8ca394f6d...,416,416,PAPER,2,416,43,417,0.502404,0.552885,0.995192,0.899038,5
3,metal500_jpg.rf.2469e149e7a3791ae4cb5decfc38b1...,416,416,METAL,87,416,59,416,0.604567,0.570913,0.790865,0.858173,3
4,glass337_jpg.rf.c01eefce4541158fb9c4290315a0c2...,416,416,GLASS,45,369,156,255,0.497596,0.49399,0.778846,0.237981,1


**Save image and labels in text**

In [31]:
import os
from shutil import move

In [32]:
import os

train_folder = 'data_images/train'
test_folder = 'data_images/test'

os.mkdir(train_folder)

os.mkdir(test_folder)

In [33]:
cols = ['filename','id','center_x', 'center_y', 'w','h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [34]:
#groupby_obj_train.get_group('biodegradable83_jpg.rf.517e4dd4daa2380c2451e3859f56ab4e.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# same each image in train/test and repective labels in.txt
def save_data(filename, folder_path, group_obj):
    #move image
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    
    move(src,dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)

In [35]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [36]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0        None
1        None
2        None
3        None
4        None
         ... 
13887    None
13888    None
13889    None
13890    None
13891    None
Length: 13892, dtype: object

In [37]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0       None
1       None
2       None
3       None
4       None
        ... 
3468    None
3469    None
3470    None
3471    None
3472    None
Length: 3473, dtype: object