In [104]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce

In [105]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# which would also hide pandas' chained-assignment warnings when columns are
# added to frames derived from df.query(...) -- TODO confirm a blanket filter is intended.
warnings.filterwarnings('ignore')

In [106]:
def replace_text(x):
    """Return ``x`` with every backslash replaced by a forward slash."""
    return x.replace('\\', '/')


# glob() returns paths in a filesystem-dependent order; sort them so the file
# list (and everything derived from it) is the same on every machine and run.
xmlfiles = sorted(replace_text(path) for path in glob('data_images/*.xml'))

In [107]:
xmlfiles

['data_images/02__Crack_in_Between_Rotor_Seating_Areas-_Approx__Length_60mm_x_Thickness_12_5mm_jpg.rf.5f163922d1c10dca34e9c44bcf75c588.xml',
 'data_images/02__Crack_in_Between_Rotor_Seating_Areas-_Approx__Length_60mm_x_Thickness_12_5mm_jpg.rf.711baccb710cad7ae35cab6081580200.xml',
 'data_images/02__Crack_in_Between_Rotor_Seating_Areas-_Approx__Length_60mm_x_Thickness_12_5mm_jpg.rf.93b1f860ee0317b3b93aae500d775405.xml',
 'data_images/02__Crack_in_Between_Rotor_Seating_Areas-_Approx__Length_60mm_x_Thickness_12_5mm_jpg.rf.ad72faa20975e6330bd7a1ad5e58459b.xml',
 'data_images/02__Crack_in_Between_Rotor_Seating_Areas-_Approx__Length_60mm_x_Thickness_12_5mm_jpg.rf.d307d7c7e9ee5bcedecc95c70be9b0f1.xml',
 'data_images/100_0_JPG_jpg.rf.d99bd0c8e923729a30ee3bd62d846479.xml',
 'data_images/100_10_JPG_jpg.rf.a259afb73b92e1c95a75540d36901847.xml',
 'data_images/100_11_JPG_jpg.rf.523f8d891fe8436910f8a3404166bce0.xml',
 'data_images/100_12_JPG_jpg.rf.09513edd98bba40e682ea110efa8810d.xml',
 'data_image

In [108]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    """Parse one annotation XML file into a list of rows, one per object.

    Each row is
    ``[image_name, width, height, name, xmin, xmax, ymin, ymax]``
    where every value is the raw text of the corresponding XML element
    (strings, not numbers). A file with no ``<object>`` elements yields
    an empty list.
    """
    root = et.parse(filename).getroot()

    # image-level fields, shared by every row of this file
    image_name = root.find('filename').text
    size_node = root.find('size')
    width = size_node.find('width').text
    height = size_node.find('height').text

    def _row(obj):
        # label first, then the four box corners in the output column order
        label = obj.find('name').text
        box = obj.find('bndbox')
        corners = [box.find(tag).text for tag in ('xmin', 'xmax', 'ymin', 'ymax')]
        return [image_name, width, height, label] + corners

    return [_row(obj) for obj in root.findall('object')]

In [109]:
# one list of annotation rows per XML file
parser_all = [extract_text(xml_path) for xml_path in xmlfiles]

In [110]:
# Flatten the per-file row lists into one list of rows.
# A comprehension is linear in the number of rows and yields [] when there are
# no files, whereas reduce() with list concatenation copies the accumulator on
# every step and raises TypeError on an empty sequence.
data = [row for file_rows in parser_all for row in file_rows]

In [111]:
# column names follow the order of the values in each row built by extract_text
annotation_columns = ['filename','width','height','name','xmin','xmax','ymin','ymax']
df = pd.DataFrame(data, columns=annotation_columns)

In [112]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,287,487,203,429
1,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,287,487,213,439
2,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,170,366,181,401
3,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,276,472,181,401
4,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,181,401,170,366


In [113]:
df.shape

(8844, 8)

In [114]:
df['name'].value_counts()

name
crack    5417
dent     3427
Name: count, dtype: int64

In [115]:
# the values parsed from XML are text; turn the size/box columns into integers
cols = ['width','height','xmin','xmax','ymin','ymax']
for col in cols:
    df[col] = df[col].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8844 entries, 0 to 8843
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  8844 non-null   object
 1   width     8844 non-null   int32 
 2   height    8844 non-null   int32 
 3   name      8844 non-null   object
 4   xmin      8844 non-null   int32 
 5   xmax      8844 non-null   int32 
 6   ymin      8844 non-null   int32 
 7   ymax      8844 non-null   int32 
dtypes: int32(6), object(2)
memory usage: 345.6+ KB


In [116]:
# box centre, expressed as a fraction of the image width / height
df['center_x'] = (df['xmin'] + df['xmax']).div(2).div(df['width'])
df['center_y'] = (df['ymin'] + df['ymax']).div(2).div(df['height'])
# box width as a fraction of the image width
df['w'] = df['xmax'].sub(df['xmin']).div(df['width'])
# box height as a fraction of the image height
df['h'] = df['ymax'].sub(df['ymin']).div(df['height'])

In [117]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,287,487,203,429,0.604688,0.49375,0.3125,0.353125
1,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,287,487,213,439,0.604688,0.509375,0.3125,0.353125
2,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,170,366,181,401,0.41875,0.454688,0.30625,0.34375
3,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,276,472,181,401,0.584375,0.454688,0.30625,0.34375
4,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,181,401,170,366,0.454688,0.41875,0.34375,0.30625


In [118]:
# distinct image filenames; df holds one row per annotated object, so a
# filename repeats whenever an image contains several objects
images = df['filename'].unique()

In [119]:
len(images)

3748

In [120]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
# Shuffle and pick 80% of the images. The seed is fixed so the split -- and
# therefore which files get moved into which folder -- is the same on every run.
img_train = tuple(img_df.sample(frac=0.8, random_state=42)['filename'])

In [121]:
# Take the remaining 20% of images. isin() compares values directly instead of
# pasting the repr of a few-thousand-element tuple into a query string, which
# depends on every filename being safely quotable and is re-parsed as code.
img_test = tuple(img_df.loc[~img_df['filename'].isin(img_train), 'filename'])

In [122]:
len(img_train), len(img_test)

(2998, 750)

In [123]:
# Select annotation rows by image membership. isin() avoids interpolating the
# filename tuples into a query string, and .copy() makes each frame independent
# of df so columns can be added to it later without chained-assignment issues.
train_df = df[df['filename'].isin(img_train)].copy()
test_df = df[df['filename'].isin(img_test)].copy()

In [124]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,287,487,203,429,0.604688,0.49375,0.3125,0.353125
1,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,287,487,213,439,0.604688,0.509375,0.3125,0.353125
2,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,170,366,181,401,0.41875,0.454688,0.30625,0.34375
5,100_0_JPG_jpg.rf.d99bd0c8e923729a30ee3bd62d846...,640,640,dent,202,492,172,379,0.542188,0.430469,0.453125,0.323437
6,100_0_JPG_jpg.rf.d99bd0c8e923729a30ee3bd62d846...,640,640,crack,319,420,63,121,0.577344,0.14375,0.157812,0.090625


In [125]:
def label_encoding(x):
    """Return the integer class id for label ``x`` ('crack' -> 0, 'dent' -> 1).

    Raises ``KeyError`` for any other label.
    """
    return {'crack': 0, 'dent': 1}[x]

In [126]:
# Add the integer class id. assign() builds a new frame rather than writing a
# column into a frame that was derived from df, so the result does not depend
# on pandas' copy/view handling (the chained-assignment warning case).
train_df = train_df.assign(id=train_df['name'].apply(label_encoding))
test_df = test_df.assign(id=test_df['name'].apply(label_encoding))

In [127]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,287,487,203,429,0.604688,0.49375,0.3125,0.353125,0
1,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,287,487,213,439,0.604688,0.509375,0.3125,0.353125,0
2,02__Crack_in_Between_Rotor_Seating_Areas-_Appr...,640,640,crack,170,366,181,401,0.41875,0.454688,0.30625,0.34375,0
5,100_0_JPG_jpg.rf.d99bd0c8e923729a30ee3bd62d846...,640,640,dent,202,492,172,379,0.542188,0.430469,0.453125,0.323437,1
6,100_0_JPG_jpg.rf.d99bd0c8e923729a30ee3bd62d846...,640,640,crack,319,420,63,121,0.577344,0.14375,0.157812,0.090625,0
7,100_0_JPG_jpg.rf.d99bd0c8e923729a30ee3bd62d846...,640,640,crack,426,464,67,108,0.695312,0.136719,0.059375,0.064062,0
8,100_0_JPG_jpg.rf.d99bd0c8e923729a30ee3bd62d846...,640,640,crack,451,570,8,57,0.797656,0.050781,0.185938,0.076563,0
9,100_10_JPG_jpg.rf.a259afb73b92e1c95a75540d3690...,640,640,dent,203,483,172,377,0.535937,0.428906,0.4375,0.320312,1
10,100_10_JPG_jpg.rf.a259afb73b92e1c95a75540d3690...,640,640,crack,337,402,56,124,0.577344,0.140625,0.101562,0.10625,0
11,100_10_JPG_jpg.rf.a259afb73b92e1c95a75540d3690...,640,640,crack,425,478,63,115,0.705469,0.139063,0.082812,0.08125,0


In [128]:
import os
from shutil import move


train_folder = 'data_images/train'
test_folder = 'data_images/test'

# makedirs(..., exist_ok=True) lets this cell be re-run: os.mkdir raises
# FileExistsError once the folders exist and fails if the parent is missing.
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)




In [129]:
# columns kept for the label files: the grouping key plus the class id and the
# normalised box, in the order they are written out
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df.loc[:, cols].groupby('filename')
groupby_obj_test = test_df.loc[:, cols].groupby('filename')

In [136]:
#groupby_obj_train.get_group('000009.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in the train/test folder and its respective labels in a .txt file
def save_data(filename, folder_path, group_obj, src_folder='data_images'):
    """Move one image into ``folder_path`` and write its label file next to it.

    Parameters
    ----------
    filename : str
        Image file name; also the key used to look up its rows in ``group_obj``.
    folder_path : str
        Destination directory for the image and the ``.txt`` label file.
    group_obj : pandas groupby object
        Annotation rows grouped by ``'filename'``. Every column other than
        ``'filename'`` is written, space-separated, one row per line.
    src_folder : str, optional
        Directory the image currently lives in (default ``'data_images'``).

    Raises
    ------
    KeyError
        If ``filename`` has no group in ``group_obj``. The lookup happens
        before the move, so the image is left where it was.
    """
    # Fetch the labels first: a missing group must not leave behind an image
    # that was moved but has no label file.
    labels = group_obj.get_group(filename).set_index('filename')

    # move image to the destination folder
    src = os.path.join(src_folder, filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst)

    # save the labels under the image's base name with a .txt extension
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0] + '.txt')
    labels.to_csv(text_filename, sep=' ', index=False, header=False)

# groupby_obj_train.get_group(train_df['filename'][0]).set_index('filename').to_csv('sample.txt',index=False,header=False)

In [138]:
# the group keys are the distinct training filenames, one entry per image
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [139]:
# apply() is used only for save_data's side effects (moving the image and
# writing its label file); save_data returns nothing, so the result is all None
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
2993    None
2994    None
2995    None
2996    None
2997    None
Length: 2998, dtype: object

In [140]:
# same procedure for the test images: one save_data call per distinct filename,
# run for its side effects only (the resulting Series holds None values)
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
745    None
746    None
747    None
748    None
749    None
Length: 750, dtype: object