# A script that for all images, crop boxes @ the bounding boxes coordinates given. BBx data is stored in .xml format

In [32]:
# Read images and their corresponding bounding boxes
# Crop images using their bounding boxes coordinates
# Using the label of each bounding box, save that croped box in a subfolder accordingly

In [45]:
import glob
import os 
import cv2 
import random
import matplotlib.pyplot as plt
import numpy as np
import skimage.io as io
from PIL import Image
import pandas as pd
from bs4 import BeautifulSoup


%matplotlib inline

In [40]:
train_box_dir = 'val_bbxs'

images_path = glob.glob(train_box_dir+'/*.jpg')
images_boundingboxs = glob.glob(train_box_dir+'/*.xml')

print('Number of images:', len(images_path))
print('Number of images having bounding boxes:', len(images_boundingboxs))

Number of images: 201
Number of images having bounding boxes: 182


In [46]:
images_boundingboxs = os.listdir(train_box_dir)

In [47]:
'''
Crop bounding boxes inside an image. Bounding boxes data is stored in an xml file

Params:
------
fname           string   the xml file that contains the coordinates of the bounding boxes
img_path        string   the path of the image to extract from the boxes
final_dir_path  string   the directory path to which save the charcaters subdirectories in
character_freq  dict     dictionary containing the frequency of cropped characters from the given img
verbose         bool     defualt false. If true, print some visualizations of the image and its cropped boxes

'''
def crop_img(fname, img_path, final_dir_path, character_freq={}, verbose=False):
    
    img = io.imread(img_path)
    if (verbose):
        plt.imshow(img)
        
    with open(fname, 'r', encoding="utf-8") as f:
        data = f.read()
    
    Bs_data = BeautifulSoup(data, "html.parser")
    labels = np.array(Bs_data.find_all('name'))
    xmins = np.array(Bs_data.find_all('xmin'), dtype=int)
    ymins = np.array(Bs_data.find_all('ymin'), dtype=int)
    xmaxs = np.array(Bs_data.find_all('xmax'), dtype=int)
    ymaxs = np.array(Bs_data.find_all('ymax'), dtype=int)
    bounding_boxs_data = np.hstack([xmins,ymins,xmaxs,ymaxs])

    i = 1
    if (verbose):
        fig = plt.figure(figsize=(100, 100))
        img_boxes= img.copy()

    for xmin,ymin,xmax,ymax in bounding_boxs_data:
        # For boxes out of image dimensions
        xmin = 0 if (xmin<0) else xmin
        ymin = 0 if (ymin<0) else ymin
        xmax = img.shape[1] if (xmax>img.shape[1]) else xmax
        ymax = img.shape[0] if (ymax>img.shape[0]) else ymax

        # there are some boxes where xmin > xmax
        if (xmin >=xmax):
            continue
        if (ymin >=ymax):
            continue

        # Cropping the bounding box
        box = img[ymin:ymax,xmin:xmax, : ]
        box_img = Image.fromarray(box)
        # Getting the label corresponding to this box
        char_label = labels[i-1][0]
        subdir_path = os.path.join(final_dir_path,char_label)
        
        # If a directory for this label is not created before, create it
        if not os.path.exists(subdir_path):
            os.makedirs(subdir_path)
            character_freq[char_label] = 0

        box_img.save(os.path.join(subdir_path, str(character_freq[char_label])+'.jpg'))
        character_freq[char_label]+=1
        if (verbose):
            img_with_bbxs = cv2.rectangle(img_boxes,(xmin, ymin), (xmax, ymax), (0,255,0))
            fig.add_subplot(50,50, i)
            plt.imshow(img[ymin:ymax,xmin:xmax, : ])
        i += 1
    if (verbose):
        plt.show()
        plt.imshow(img_with_bbxs)
        plt.show()

In [48]:
character_freq={}

for path in images_boundingboxs:
    
    full_path = os.path.join(train_box_dir, path)
    file_num = full_path[:-4]
    file_ext = full_path[-4:]
    if(file_ext =='.jpg'):
        xml_file_corres=path[:-4]+'.xml'
        if xml_file_corres in images_boundingboxs:
            crop_img(fname=full_path[:-4]+'.xml', img_path=full_path, verbose=False, final_dir_path='./val_cropped_characters', character_freq=character_freq)

## Cropped Characters' Frequency

In [49]:
character_freq

{'ا': 60,
 '3': 34,
 '1': 159,
 'ق': 20,
 'ن': 17,
 'و': 28,
 'ر': 42,
 '8': 40,
 'ب': 36,
 'ج': 22,
 '6': 46,
 'م': 68,
 'ص': 11,
 '5': 71,
 'ل': 29,
 '7': 55,
 '4': 53,
 'د': 25,
 '9': 49,
 'ه': 16,
 'س': 37,
 'ي': 25,
 '2': 43,
 'ع': 34,
 'ف': 12,
 'ط': 13,
 'غ': 1}