In [1]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
#  get path of each xml file
xmlfiles = glob('./Number_Plates/*.xml')
# replace \\ with /
replace_text = lambda x: x.replace('\\','/')
xmlfiles = list(map(replace_text,xmlfiles))

In [3]:
xmlfiles


['./Number_Plates/Cars0.xml',
 './Number_Plates/Cars1.xml',
 './Number_Plates/Cars10.xml',
 './Number_Plates/Cars100.xml',
 './Number_Plates/Cars101.xml',
 './Number_Plates/Cars102.xml',
 './Number_Plates/Cars103.xml',
 './Number_Plates/Cars104.xml',
 './Number_Plates/Cars105.xml',
 './Number_Plates/Cars106.xml',
 './Number_Plates/Cars107.xml',
 './Number_Plates/Cars108.xml',
 './Number_Plates/Cars109.xml',
 './Number_Plates/Cars11.xml',
 './Number_Plates/Cars110.xml',
 './Number_Plates/Cars111.xml',
 './Number_Plates/Cars112.xml',
 './Number_Plates/Cars113.xml',
 './Number_Plates/Cars114.xml',
 './Number_Plates/Cars115.xml',
 './Number_Plates/Cars116.xml',
 './Number_Plates/Cars117.xml',
 './Number_Plates/Cars118.xml',
 './Number_Plates/Cars119.xml',
 './Number_Plates/Cars12.xml',
 './Number_Plates/Cars120.xml',
 './Number_Plates/Cars121.xml',
 './Number_Plates/Cars122.xml',
 './Number_Plates/Cars123.xml',
 './Number_Plates/Cars124.xml',
 './Number_Plates/Cars125.xml',
 './Number_Plat

In [4]:
#  read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    obj = root.find('object')
    parser = []
    
    name = obj.find('name').text
    bndbox = obj.find('bndbox')
    xmin = bndbox.find('xmin').text
    xmax = bndbox.find('xmax').text
    ymin = bndbox.find('ymin').text
    ymax = bndbox.find('ymax').text
    parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
    return parser

In [5]:
parser_all = list(map(extract_text,xmlfiles))

In [6]:
parser_all

[[['Cars0.png', '500', '268', 'licence', '226', '419', '125', '173']],
 [['Cars1.png', '400', '248', 'licence', '134', '262', '128', '160']],
 [['Cars10.png', '400', '225', 'licence', '140', '303', '5', '148']],
 [['Cars100.png', '400', '267', 'licence', '175', '214', '114', '131']],
 [['Cars101.png', '400', '300', 'licence', '167', '240', '202', '220']],
 [['Cars102.png', '350', '263', 'licence', '66', '322', '147', '199']],
 [['Cars103.png', '400', '196', 'licence', '230', '248', '129', '134']],
 [['Cars104.png', '500', '374', 'licence', '195', '244', '266', '282']],
 [['Cars105.png', '400', '240', 'licence', '152', '206', '147', '159']],
 [['Cars106.png', '400', '247', 'licence', '138', '177', '79', '92']],
 [['Cars107.png', '471', '270', 'licence', '141', '311', '113', '164']],
 [['Cars108.png', '442', '333', 'licence', '158', '277', '216', '248']],
 [['Cars109.png', '400', '267', 'licence', '115', '278', '116', '153']],
 [['Cars11.png', '400', '305', 'licence', '131', '273', '206'

In [7]:
data = reduce(lambda x, y : x+y,parser_all)

In [8]:
data

[['Cars0.png', '500', '268', 'licence', '226', '419', '125', '173'],
 ['Cars1.png', '400', '248', 'licence', '134', '262', '128', '160'],
 ['Cars10.png', '400', '225', 'licence', '140', '303', '5', '148'],
 ['Cars100.png', '400', '267', 'licence', '175', '214', '114', '131'],
 ['Cars101.png', '400', '300', 'licence', '167', '240', '202', '220'],
 ['Cars102.png', '350', '263', 'licence', '66', '322', '147', '199'],
 ['Cars103.png', '400', '196', 'licence', '230', '248', '129', '134'],
 ['Cars104.png', '500', '374', 'licence', '195', '244', '266', '282'],
 ['Cars105.png', '400', '240', 'licence', '152', '206', '147', '159'],
 ['Cars106.png', '400', '247', 'licence', '138', '177', '79', '92'],
 ['Cars107.png', '471', '270', 'licence', '141', '311', '113', '164'],
 ['Cars108.png', '442', '333', 'licence', '158', '277', '216', '248'],
 ['Cars109.png', '400', '267', 'licence', '115', '278', '116', '153'],
 ['Cars11.png', '400', '305', 'licence', '131', '273', '206', '234'],
 ['Cars110.png', 

In [9]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [10]:
df.head()


Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,Cars0.png,500,268,licence,226,419,125,173
1,Cars1.png,400,248,licence,134,262,128,160
2,Cars10.png,400,225,licence,140,303,5,148
3,Cars100.png,400,267,licence,175,214,114,131
4,Cars101.png,400,300,licence,167,240,202,220


In [11]:
df.shape

(433, 8)

In [12]:
df['name'].value_counts()

name
licence    433
Name: count, dtype: int64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433 entries, 0 to 432
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  433 non-null    object
 1   width     433 non-null    object
 2   height    433 non-null    object
 3   name      433 non-null    object
 4   xmin      433 non-null    object
 5   xmax      433 non-null    object
 6   ymin      433 non-null    object
 7   ymax      433 non-null    object
dtypes: object(8)
memory usage: 27.2+ KB


In [14]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433 entries, 0 to 432
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  433 non-null    object
 1   width     433 non-null    int32 
 2   height    433 non-null    int32 
 3   name      433 non-null    object
 4   xmin      433 non-null    int32 
 5   xmax      433 non-null    int32 
 6   ymin      433 non-null    int32 
 7   ymax      433 non-null    int32 
dtypes: int32(6), object(2)
memory usage: 17.0+ KB


#### Conversion
![alt text](download.png)

In [15]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w 
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h 
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [16]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,Cars0.png,500,268,licence,226,419,125,173,0.645,0.55597,0.386,0.179104
1,Cars1.png,400,248,licence,134,262,128,160,0.495,0.580645,0.32,0.129032
2,Cars10.png,400,225,licence,140,303,5,148,0.55375,0.34,0.4075,0.635556
3,Cars100.png,400,267,licence,175,214,114,131,0.48625,0.458801,0.0975,0.06367
4,Cars101.png,400,300,licence,167,240,202,220,0.50875,0.703333,0.1825,0.06


In [17]:
images = df['filename'].unique()

In [18]:
len(images)

433

### split data into train and test

In [19]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [20]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [21]:
len(img_train), len(img_test)

(346, 87)

In [22]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [23]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,Cars0.png,500,268,licence,226,419,125,173,0.645,0.55597,0.386,0.179104
1,Cars1.png,400,248,licence,134,262,128,160,0.495,0.580645,0.32,0.129032
2,Cars10.png,400,225,licence,140,303,5,148,0.55375,0.34,0.4075,0.635556
3,Cars100.png,400,267,licence,175,214,114,131,0.48625,0.458801,0.0975,0.06367
5,Cars102.png,350,263,licence,66,322,147,199,0.554286,0.657795,0.731429,0.197719


In [24]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
4,Cars101.png,400,300,licence,167,240,202,220,0.50875,0.703333,0.1825,0.06
7,Cars104.png,500,374,licence,195,244,266,282,0.439,0.73262,0.098,0.042781
10,Cars107.png,471,270,licence,141,311,113,164,0.47983,0.512963,0.360934,0.188889
19,Cars115.png,507,388,licence,315,407,287,311,0.712032,0.770619,0.18146,0.061856
23,Cars119.png,400,247,licence,187,217,140,153,0.505,0.593117,0.075,0.052632


In [25]:
def label_encoding(x):
    labels = {'licence':0}
    return labels[x]

In [26]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df['name'].apply(label_encoding)


In [27]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,Cars0.png,500,268,licence,226,419,125,173,0.645,0.55597,0.386,0.179104,0
1,Cars1.png,400,248,licence,134,262,128,160,0.495,0.580645,0.32,0.129032,0
2,Cars10.png,400,225,licence,140,303,5,148,0.55375,0.34,0.4075,0.635556,0
3,Cars100.png,400,267,licence,175,214,114,131,0.48625,0.458801,0.0975,0.06367,0
5,Cars102.png,350,263,licence,66,322,147,199,0.554286,0.657795,0.731429,0.197719,0


### Save Image and Labels in text

In [28]:
import os
from shutil import move

In [29]:
train_folder = 'Number_Plates/train'
test_folder = 'Number_Plates/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

In [30]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [31]:
#groupby_obj_train.get_group('000009.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('Number_Plates',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)

In [32]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [33]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
341    None
342    None
343    None
344    None
345    None
Length: 346, dtype: object

In [34]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0     None
1     None
2     None
3     None
4     None
      ... 
82    None
83    None
84    None
85    None
86    None
Length: 87, dtype: object