In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# step-1: get path of each xml file
xmlfiles = glob('/content/drive/MyDrive/data images/*.xml')
# replace \\ with /
replace_text = lambda x: x.replace('\\','/')
xmlfiles = list(map(replace_text,xmlfiles))

In [None]:
xmlfiles

['/content/drive/MyDrive/data images/IMG-20231106-WA0210_jpg.rf.6ae15c1ebc4ee156bce06a87ff8e8dfd.xml',
 '/content/drive/MyDrive/data images/IMG-20231106-WA0193_jpg.rf.3a1adaac90bffc1b426faf376e492982.xml',
 '/content/drive/MyDrive/data images/IMG-20231106-WA0185_jpg.rf.66f39fd678ba07483e72e8bfb857fa5b.xml',
 '/content/drive/MyDrive/data images/IMG-20231106-WA0206_jpg.rf.8fc36895f2ae1e90eea6f48befa510ea.xml',
 '/content/drive/MyDrive/data images/IMG-20231106-WA0185_jpg.rf.b317cf24b34e0a5f010b6c174ff586e3.xml',
 '/content/drive/MyDrive/data images/IMG-20231106-WA0197_jpg.rf.b8ec28df76fd22bb641dc74cfc24e4d0.xml',
 '/content/drive/MyDrive/data images/IMG-20231106-WA0228_jpg.rf.b8f15d9900ca2c829d4cf4dd546e2154.xml',
 '/content/drive/MyDrive/data images/IMG-20231106-WA0227_jpg.rf.f63d633e9d0efc2295d35d8d1c37a46e.xml',
 '/content/drive/MyDrive/data images/IMG-20231106-WA0228_jpg.rf.5dc2579dc0e54f7ac567fffcafcf2e46.xml',
 '/content/drive/MyDrive/data images/IMG-20231106-WA0231_jpg.rf.78021724e

In [None]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])

    return parser

In [None]:
parser_all = list(map(extract_text,xmlfiles))

In [None]:
data = reduce(lambda x, y : x+y,parser_all)

In [None]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [None]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,IMG-20231106-WA0210_jpg.rf.6ae15c1ebc4ee156bce...,640,640,Devakanchan tree,34,597,46,479
1,IMG-20231106-WA0193_jpg.rf.3a1adaac90bffc1b426...,640,640,Devakanchan tree,45,620,8,442
2,IMG-20231106-WA0185_jpg.rf.66f39fd678ba07483e7...,640,640,Devakanchan tree,289,447,246,334
3,IMG-20231106-WA0206_jpg.rf.8fc36895f2ae1e90eea...,640,640,Neem Tree,27,575,2,321
4,IMG-20231106-WA0185_jpg.rf.b317cf24b34e0a5f010...,640,640,Devakanchan tree,195,353,246,334


In [None]:
df.shape

(1954, 8)

In [None]:
df['name'].value_counts()

name
trees               1244
Devakanchan tree     313
Neem Tree            306
Mango Tree            91
Name: count, dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1954 entries, 0 to 1953
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1954 non-null   object
 1   width     1954 non-null   object
 2   height    1954 non-null   object
 3   name      1954 non-null   object
 4   xmin      1954 non-null   object
 5   xmax      1954 non-null   object
 6   ymin      1954 non-null   object
 7   ymax      1954 non-null   object
dtypes: object(8)
memory usage: 122.2+ KB


In [None]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1954 entries, 0 to 1953
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1954 non-null   object
 1   width     1954 non-null   int64 
 2   height    1954 non-null   int64 
 3   name      1954 non-null   object
 4   xmin      1954 non-null   int64 
 5   xmax      1954 non-null   int64 
 6   ymin      1954 non-null   int64 
 7   ymax      1954 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 122.2+ KB


In [None]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [None]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,IMG-20231106-WA0210_jpg.rf.6ae15c1ebc4ee156bce...,640,640,Devakanchan tree,34,597,46,479,0.492969,0.410156,0.879687,0.676562
1,IMG-20231106-WA0193_jpg.rf.3a1adaac90bffc1b426...,640,640,Devakanchan tree,45,620,8,442,0.519531,0.351562,0.898438,0.678125
2,IMG-20231106-WA0185_jpg.rf.66f39fd678ba07483e7...,640,640,Devakanchan tree,289,447,246,334,0.575,0.453125,0.246875,0.1375
3,IMG-20231106-WA0206_jpg.rf.8fc36895f2ae1e90eea...,640,640,Neem Tree,27,575,2,321,0.470313,0.252344,0.85625,0.498437
4,IMG-20231106-WA0185_jpg.rf.b317cf24b34e0a5f010...,640,640,Devakanchan tree,195,353,246,334,0.428125,0.453125,0.246875,0.1375


### split data into train and test

In [None]:
images = df['filename'].unique()

In [None]:
len(images)

851

In [None]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [None]:
img_train


('IMG-20231106-WA0237_jpg.rf.059458446995793b4e3c05aef79cd31c.jpg',
 '12li_136_png.rf.cefbfb423d513ed28fd0bc155e81e273.jpg',
 '1_png.rf.ebda41d23ba9de268ee6fe30f6557ff7.jpg',
 '2_png.rf.a45c3344c8c7d000e2501fb28cdbda10.jpg',
 '10_png.rf.cf58bf147fb892b392cb89ac2c341435.jpg',
 'IMG_20231106_145404_jpg.rf.c2606619eb1f9bb47fd8bd1b513a3652.jpg',
 'IMG_20231004_103137_jpg.rf.21b7511d45136e459418fb933b399152.jpg',
 'IMG-20231106-WA0034_jpg.rf.eea9d891e0c9e89243131b5fee7f9750.jpg',
 '65_png.rf.ecc48ade92889ee96bded343cc5ec1c7.jpg',
 '103_png.rf.fe9363205cdf4bab3469041374e89a1c.jpg',
 'IMG_20231106_145816_jpg.rf.701fb81af33c5043b3129cd1e88d63b1.jpg',
 '104_png.rf.a5e45ed5347413b20b8b56bec4b3c922.jpg',
 '01ad35fcb_jpg.rf.d19c325b6ff8c1348514432700cbff8d.jpg',
 'IMG_20231004_103308_jpg.rf.08ec0e69d2623ef5bf3ed8d5762b17c0.jpg',
 'IMG_20231106_150448_jpg.rf.7637b4f404ef9d9ec2f96df3855f81d3.jpg',
 '12li_136_png.rf.9134c4dedc07681c9197db9a212d4c49.jpg',
 '101_png.rf.e2b22a06b911d47d61e1632b6bb77147.

In [None]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [None]:
img_test

('IMG-20231106-WA0185_jpg.rf.66f39fd678ba07483e72e8bfb857fa5b.jpg',
 'IMG-20231106-WA0206_jpg.rf.8fc36895f2ae1e90eea6f48befa510ea.jpg',
 'IMG-20231106-WA0231_jpg.rf.78021724e6bbf3982411d23caa6cd0fc.jpg',
 'IMG-20231106-WA0214_jpg.rf.5a77125cc2019f557a19f3195069f963.jpg',
 'WhatsApp-Image-2023-11-03-at-14-06-09_9c2b01b7-Copy_jpg.rf.2947ca5b98d536fc309b1402bb5fb879.jpg',
 'WhatsApp-Image-2023-11-03-at-14-06-14_606342cb-Copy_jpg.rf.80bb43eda5f71f7c31276a642fbe4ded.jpg',
 'WhatsApp-Image-2023-11-03-at-14-06-12_0c7f0385-Copy_jpg.rf.5d7d53534b7a1c59dfc7ab73e8ecce54.jpg',
 'WhatsApp-Image-2023-11-03-at-14-06-06_7f73acf8-Copy_jpg.rf.15fbee81e08ad1d9d3b5c13d84d4d2f7.jpg',
 'WhatsApp-Image-2023-11-03-at-14-06-09_9c2b01b7-Copy_jpg.rf.59e1dfb478e0b76869412d5455980d4d.jpg',
 'WhatsApp-Image-2023-11-03-at-14-06-17_22ce8d09-Copy_jpg.rf.824de7cb7ad9a031500184d75efc4a40.jpg',
 'WhatsApp-Image-2023-11-03-at-14-06-16_d206c803-Copy_jpg.rf.5a324b58faf52cbab4c3e73e2ca9020d.jpg',
 '01ad35fcb_jpg.rf.28050c9e4

In [None]:
len(img_train), len(img_test)

(681, 170)

In [None]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [None]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,IMG-20231106-WA0210_jpg.rf.6ae15c1ebc4ee156bce...,640,640,Devakanchan tree,34,597,46,479,0.492969,0.410156,0.879687,0.676562
1,IMG-20231106-WA0193_jpg.rf.3a1adaac90bffc1b426...,640,640,Devakanchan tree,45,620,8,442,0.519531,0.351562,0.898438,0.678125
4,IMG-20231106-WA0185_jpg.rf.b317cf24b34e0a5f010...,640,640,Devakanchan tree,195,353,246,334,0.428125,0.453125,0.246875,0.1375
5,IMG-20231106-WA0197_jpg.rf.b8ec28df76fd22bb641...,640,640,Mango Tree,252,471,236,296,0.564844,0.415625,0.342187,0.09375
6,IMG-20231106-WA0228_jpg.rf.b8f15d9900ca2c829d4...,640,640,Neem Tree,40,561,24,526,0.469531,0.429688,0.814063,0.784375


In [None]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
2,IMG-20231106-WA0185_jpg.rf.66f39fd678ba07483e7...,640,640,Devakanchan tree,289,447,246,334,0.575,0.453125,0.246875,0.1375
3,IMG-20231106-WA0206_jpg.rf.8fc36895f2ae1e90eea...,640,640,Neem Tree,27,575,2,321,0.470313,0.252344,0.85625,0.498437
9,IMG-20231106-WA0231_jpg.rf.78021724e6bbf398241...,640,640,Neem Tree,150,496,76,389,0.504687,0.363281,0.540625,0.489063
18,IMG-20231106-WA0214_jpg.rf.5a77125cc2019f557a1...,640,640,Devakanchan tree,1,594,11,418,0.464844,0.335156,0.926562,0.635938
27,WhatsApp-Image-2023-11-03-at-14-06-09_9c2b01b7...,640,640,Neem Tree,101,563,40,489,0.51875,0.413281,0.721875,0.701562


### Assign id number to object names

In [None]:
# label encoding
def label_encoding(x):
    labels = {'trees':0, 'Devakanchan tree':1, 'Neem Tree':2, 'Mango Tree':3}
    return labels[x]

In [None]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

In [None]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,IMG-20231106-WA0210_jpg.rf.6ae15c1ebc4ee156bce...,640,640,Devakanchan tree,34,597,46,479,0.492969,0.410156,0.879687,0.676562,1
1,IMG-20231106-WA0193_jpg.rf.3a1adaac90bffc1b426...,640,640,Devakanchan tree,45,620,8,442,0.519531,0.351562,0.898438,0.678125,1
4,IMG-20231106-WA0185_jpg.rf.b317cf24b34e0a5f010...,640,640,Devakanchan tree,195,353,246,334,0.428125,0.453125,0.246875,0.1375,1
5,IMG-20231106-WA0197_jpg.rf.b8ec28df76fd22bb641...,640,640,Mango Tree,252,471,236,296,0.564844,0.415625,0.342187,0.09375,3
6,IMG-20231106-WA0228_jpg.rf.b8f15d9900ca2c829d4...,640,640,Neem Tree,40,561,24,526,0.469531,0.429688,0.814063,0.784375,2
7,IMG-20231106-WA0227_jpg.rf.f63d633e9d0efc2295d...,640,640,Neem Tree,91,461,63,368,0.43125,0.336719,0.578125,0.476562,2
8,IMG-20231106-WA0228_jpg.rf.5dc2579dc0e54f7ac56...,640,640,Neem Tree,81,602,24,526,0.533594,0.429688,0.814063,0.784375,2
10,IMG-20231106-WA0231_jpg.rf.c312573626e29e71509...,640,640,Neem Tree,146,492,76,389,0.498437,0.363281,0.540625,0.489063,2
11,IMG-20231106-WA0215_jpg.rf.0c34067f2db9e541a39...,640,640,Devakanchan tree,1,641,3,301,0.501563,0.2375,1.0,0.465625,1
12,IMG-20231106-WA0223_jpg.rf.8a56ac3fff7f5ba786d...,640,640,Neem Tree,21,641,1,399,0.517188,0.3125,0.96875,0.621875,2


### Save Image and Labels in text

In [None]:
import os
from shutil import move

In [None]:
train_folder = '/content/drive/MyDrive/data images/train'
test_folder = '/content/drive/MyDrive/data images/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

In [None]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [47]:
#groupby_obj_train.get_group('000009.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('/content/drive/MyDrive/data images',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder

    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)


In [48]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [49]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
676    None
677    None
678    None
679    None
680    None
Length: 681, dtype: object

In [50]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
165    None
166    None
167    None
168    None
169    None
Length: 170, dtype: object