# Augment Training Data

## Libraries

In [14]:
import os
import cv2
import imutils
import git
import numpy as np
import random as rd
from xml.dom import minidom
import xml.etree.ElementTree as ET
from IPython.display import clear_output
from zipfile import ZipFile

# Locate the enclosing git repository by searching upward from the current
# directory, then make "<repository root>/src" the working directory.
# NOTE(review): the project-local `utils` import that follows presumably
# relies on this chdir to be resolvable — confirm.
repo = git.Repo('.', search_parent_directories=True)
base_path = f'{repo.working_tree_dir}/src'
os.chdir(base_path)

from utils.file_handling import download_s3_file, unzip, upload_to_s3, object_exists

## Functions

### Rotation 

In [15]:
def random_rotation(image, d_range=(0, 360)):
    """Rotate an image by a random whole-degree angle without cropping it.

    The output canvas is enlarged to the bounding box of the rotated image
    and the original image centre is mapped onto the centre of that canvas.

    input:  image (numpy_array) with at least two dimensions (h, w, ...)
            d_range (pair of int) angle range in degrees handed to
                    random.randrange: lower bound inclusive, upper exclusive
    output: result (numpy_array) rotated image on the enlarged canvas
            rot_mat (2x3 numpy_array) affine matrix that maps (x, y)
                    coordinates of the input image to the rotated image
    """
    angle = rd.randrange(d_range[0], d_range[1])

    h, w = image.shape[0:2]
    centre_x, centre_y = w // 2, h // 2

    # OpenCV expects the rotation centre as (x, y), i.e. (column, row).
    rot_mat = cv2.getRotationMatrix2D((centre_x, centre_y), angle, 1.0)

    # The matrix holds cos in [0][0] and -sin in [1][0]; only magnitudes are
    # needed to size the bounding box of the rotated image.
    abs_cos = np.abs(rot_mat[0][0])
    abs_sin = np.abs(rot_mat[1][0])
    new_width = int((h * abs_sin) + (w * abs_cos))
    new_height = int((h * abs_cos) + (w * abs_sin))

    # A rotation about the centre leaves the centre in place, so shifting the
    # translation by (new centre - old centre) re-centres the image on the
    # enlarged canvas and prevents the corners from being cut off.
    rot_mat[0][2] += (new_width / 2) - centre_x
    rot_mat[1][2] += (new_height / 2) - centre_y

    result = cv2.warpAffine(image, rot_mat, (new_width, new_height))

    return result, rot_mat

### Random Blur and Noise

In [16]:
def random_blur(image):
    """Blur an image with a randomly sized square averaging kernel.

    The kernel side is 3 or 5; bigger kernels blurred too much on testing.
    input: image numpy_array
    output: image numpy_array"""
    # randrange(1, 3) yields 1 or 2, hence an odd side of 3 or 5.
    side = 2 * rd.randrange(1, 3) + 1
    return cv2.blur(image, (side, side))


def random_noise(image, noise_type=None):
    """Add one kind of random noise to an image and rescale it to 0-255.

    input:  image (numpy_array) of any shape
            noise_type (str) optional, one of "gauss", "s&p", "poisson" or
                       "speckle"; when None (default) one is picked at random
    output: noisy image (numpy_array, dtype uint8, same shape as the input)
    raises: ValueError if noise_type is not one of the supported names
    """
    noise_names = ("gauss", "s&p", "poisson", "speckle")
    if noise_type is None:
        noise_type = noise_names[rd.randrange(0, 4)]
    elif noise_type not in noise_names:
        raise ValueError(
            "Unknown noise type {!r}, expected one of {}".format(noise_type, noise_names))

    # Scale the image to floats relative to its maximum. An image whose
    # maximum is 0 is kept as zeros instead of being divided by zero.
    image = np.asarray(image)
    peak = np.max(image)
    image = image / peak if peak != 0 else image.astype(np.float64)

    if noise_type == "gauss":
        sigma = 0.01 ** 0.5
        gauss = np.random.normal(0, sigma, image.shape)
        # Rescale the noise so that its largest sample equals 0.3.
        gauss = .3 * (gauss / np.max(gauss))
        noisy = image + gauss

    elif noise_type == "s&p":
        s_vs_p = 0.5
        amount = 0.01
        noisy = np.copy(image)
        # Salt: set random positions to 1. randint's upper bound is
        # exclusive, so `dim` (not `dim - 1`) is needed to reach the last
        # index of every axis and to support axes of length 1.
        num_salt = int(np.ceil(amount * image.size * s_vs_p))
        coords = tuple(np.random.randint(0, dim, num_salt) for dim in image.shape)
        noisy[coords] = 1
        # Pepper: set random positions to 0.
        num_pepper = int(np.ceil(amount * image.size * (1. - s_vs_p)))
        coords = tuple(np.random.randint(0, dim, num_pepper) for dim in image.shape)
        noisy[coords] = 0

    elif noise_type == "poisson":
        # Number of distinct intensities, rounded up to a power of two.
        vals = len(np.unique(image))
        vals = 2 ** np.ceil(np.log2(vals))
        noisy = np.random.poisson(image * vals) / float(vals)

    else:  # "speckle": multiplicative gaussian noise
        gauss = np.random.randn(*image.shape)
        noisy = image + .05 * image * gauss

    # Stretch the result back to the full 0-255 range. A constant result has
    # no range to stretch and is returned as an all-zero image.
    noisy = noisy - np.min(noisy)
    peak = np.max(noisy)
    if peak == 0:
        return np.zeros(noisy.shape, dtype=np.uint8)
    return np.array(255 * (noisy / peak), dtype=np.uint8)

### Transform IMG

In [17]:
def transform_img(filename, returnOriginal = False):
    """Load an image and apply a random mix of rotation, blur and noise.

    About one call in three returns the image untouched. Otherwise rotation
    (2 in 3), blur (1 in 2) and noise (1 in 2) are each applied or skipped
    independently, in that order.

    input:  filename (str) path of the image to read
            returnOriginal (bool) if True also return the image as read from
                           disk, default False
    output: image (numpy_array) transformed image
            rot_mat (2x3 numpy_array) affine matrix applied to the image;
                    the identity when no rotation took place
            original (numpy_array) only when returnOriginal is True
    raises: OSError if the image cannot be read
    """
    image = cv2.imread(filename)
    if image is None:
        # cv2.imread signals a missing or unreadable file by returning None.
        raise OSError("Could not read image: {}".format(filename))

    # Identity affine transform, used whenever the image is not rotated.
    rot_mat = np.array([[1.0, 0.0, 0.0],
                        [0.0, 1.0, 0.0]])
    img_out = image

    if rd.choice([True, True, False]):
        # Rotate
        if rd.choice([True, True, False]):
            img_out, rot_mat = random_rotation(image)

        # Blur
        if rd.choice([True, False]):
            img_out = random_blur(img_out)

        # Noise
        if rd.choice([True, False]):
            img_out = random_noise(img_out)

    if returnOriginal:
        return img_out, rot_mat, image

    return img_out, rot_mat

### Get random file

In [18]:
def get_random_file(data_dir, name = None):
    """Build the image and markup paths of a dataset sample.

    The paths are only composed, not checked: nothing here verifies that the
    files exist inside data_dir.

    input:
            data_dir (str) directory holding the 'Image' and 'Markup' folders
            name (str) if specified, sample name to use instead of a random
                 one
    output:
           file_xml (str) <data_dir>/Markup/<name>.xml
           file_img (str) <data_dir>/Image/<name>.png
           name (str) sample name used, a zero-padded 5 digit number below
                1000 when it was picked at random
    """
    if name is None:
        numb = "{:05d}".format(rd.randrange(1000))
    else:
        numb = str(name)

    file_img = os.path.join(data_dir, 'Image', numb + '.png')
    file_xml = os.path.join(data_dir, 'Markup', numb + '.xml')

    print("File: {}".format(file_img))

    return file_xml, file_img, numb

### XML things 

In [19]:
def get_xml_info(filename):
    """Read the barcode annotations stored in a markup xml file.

    input: filename (str) xml file to parse
    output:
        barcodeTypes (list with the "Type" of every Barcode element)
        bottomLines (list with the "BottomLine" of the first Polygon of every
                    barcode, "-1" when the attribute is absent)
        points_list (list of list of (x, y) float tuples, one inner list with
                    all the Point elements of each barcode)
    """
    document = minidom.parse(filename)

    barcodeTypes, bottomLines, points_list = [], [], []

    for node in document.getElementsByTagName('Barcode'):
        barcodeTypes.append(node.attributes["Type"].value)

        # Only the first polygon of a barcode carries the bottom line.
        polygon_attrs = node.getElementsByTagName("Polygon")[0].attributes
        if "BottomLine" in polygon_attrs:
            bottomLines.append(polygon_attrs["BottomLine"].value)
        else:
            bottomLines.append("-1")

        points_list.append([
            (float(pt.attributes['X'].value), float(pt.attributes['Y'].value))
            for pt in node.getElementsByTagName('Point')
        ])

    return barcodeTypes, bottomLines, points_list
    
    
def process_info(points, rot_mat):
    """Apply an affine transform to every point of every bounding box.

    input: points (list of list of tuples,
                    each list of tuples is a bounding box )
            rot_mat (2x3 array rotation matrix)
    output: list of list of tuples with the same layout as the input,
            holding the transformed (x, y) of each point
    """
    def _move(x, y):
        # Homogeneous coordinates let the 2x3 matrix rotate and translate.
        moved = np.dot(rot_mat, [x, y, 1])
        return (moved[0], moved[1])

    return [[_move(x, y) for (x, y) in box] for box in points]


def write_xml( barcode_types, bottomLines, points, filename):
    """Write xml with the extracted information and the rotated points

    One Barcode element is written per entry of bottomLines, each with a
    single Polygon holding one Point element per (x, y) tuple.

    Input:
        barcode_types (list of barcode types in the image, one per each barcode)
        bottomLines (list of bottomLines as str, one per each barcode)
        points (list of list of tuples,
                    each list of tuples is a bounding box)
        filename: output filename, missing parent directories are created
    Output: None, writes xml on filename provided
    """
    root = ET.Element("xml")

    for i, bottom_line in enumerate(bottomLines):
        barcode = ET.SubElement(root, "Barcode")
        barcode.set("Type", barcode_types[i])
        # NOTE(review): the text/tail values below are literal backslash
        # sequences, not real newlines or tabs — presumably they mimic the
        # layout of the source markup files; confirm before changing them.
        barcode.text = "b'"

        polygon = ET.SubElement(barcode, "Polygon")
        polygon.set("BottomLine", bottom_line)
        polygon.text = "\\n\\t\\t\\t\\t\\t\\t"
        polygon.tail = "\\n\\t\\t\\t\\t'"

        last = len(points[i]) - 1
        for j, (x, y) in enumerate(points[i]):
            point = ET.SubElement(polygon, "Point")
            point.set("X", str(x))
            point.set("Y", str(y))
            # The last point is followed by one indentation level less.
            if j == last:
                point.tail = "\\n\\t\\t\\t\\t\\t"
            else:
                point.tail = "\\n\\t\\t\\t\\t\\t\\t"

    # A bare file name has an empty dirname, which os.makedirs rejects.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(filename, "wb") as out_file:
        ET.ElementTree(root).write(out_file)

### Draw Boxes

In [20]:
def draw_boxes(image, points):
    """Draw every annotated polygon on a copy of the image.

    input:
        image, numpy array (left unmodified)
        points, (list of list of tuples,
                    each list of tuples is a polygon box)
    output:
        marked image (numpy array) with each polygon drawn as a closed blue
        outline, 2 px thick
    """
    blue_bgr = (255, 0, 0)
    line_px = 2
    closed = True

    canvas = np.copy(image)
    for polygon in points:
        # Coordinates are truncated to integer pixels and laid out as an
        # (n, 1, 2) array before being handed to cv2.polylines.
        corners = np.array([[int(x), int(y)] for (x, y) in polygon], np.int32)
        corners = corners.reshape((-1, 1, 2))
        canvas = cv2.polylines(canvas, [corners], closed, blue_bgr, line_px)

    return canvas
        

### Verify training dir

In [21]:
def verify_original_data(data_to_augm):
    """Look for the images and markups folders of a dataset directory.

    Folder names are matched case-insensitively against "image"/"images" and
    "markup"/"markups"; only direct sub-directories of data_to_augm count.

    input:  data_to_augm (str) dataset directory to inspect
    output: found (bool) True when both folders were found
            i_dir (str) name of the images folder, '' if not found
            m_dir (str) name of the markups folder, '' if not found
    """
    markup_names = {"MARKUP", "MARKUPS"}
    image_names = {"IMAGE", "IMAGES"}
    m_dir = ''
    i_dir = ''
    for entry in os.listdir(data_to_augm):
        # listdir returns bare names, so they have to be resolved against
        # data_to_augm (not the current working directory) before testing.
        if not os.path.isdir(os.path.join(data_to_augm, entry)):
            continue
        key = entry.upper()
        if key in markup_names:
            print("Found markups folder in {}".format(entry))
            m_dir = entry
        if key in image_names:
            print("Found images folder in {}".format(entry))
            i_dir = entry

    if not m_dir or not i_dir:
        print("Markups or Images directories not found")
        return False, i_dir, m_dir

    return True, i_dir, m_dir

### Pipeline

In [22]:
def augmentation_pipeline(from_data_dir, to_data_dir, data_mult = 1, batch = None):
    """Create an augmented copy of an annotated image dataset.

    Every selected image is transformed data_mult times. Each result is saved
    as <to_data_dir>/Image/<n>.jpg together with its transformed annotations
    in <to_data_dir>/Markup/<n>.xml, where <n> is a zero-padded number made
    of the pass index followed by the image index.

    input:  from_data_dir (str) dataset directory holding the images and
                          markups folders
            to_data_dir (str) output directory, created if missing
            data_mult (int) number of augmented versions per image
            batch (int) if given, only the first `batch` images (sorted by
                  name) are used; default None uses all of them
    output: True when the data was generated, False when the source folders
            were not found or there was nothing to augment
    raises: OSError if an augmented image cannot be written
    """
    isOk, image_dir, markup_dir = verify_original_data(from_data_dir)

    if not isOk:
        return False

    dir_xml = os.path.join(from_data_dir, markup_dir)
    dir_img = os.path.join(from_data_dir, image_dir)

    # Sorted so that the numbering of the output is reproducible.
    images = sorted(os.listdir(dir_img))
    if batch is not None:
        images = images[:max(batch, 0)]

    batch_size = len(images)
    total = batch_size * data_mult
    if total < 1:
        print("No images to augment in {}".format(dir_img))
        return False

    # write_xml creates its own folder but cv2.imwrite does not, so both
    # output folders are created up front.
    out_xml_dir = os.path.join(to_data_dir, "Markup")
    out_img_dir = os.path.join(to_data_dir, "Image")
    os.makedirs(out_xml_dir, exist_ok=True)
    os.makedirs(out_img_dir, exist_ok=True)

    # Output names: pass index in the high digits, image index in the low.
    index_width = len(str(batch_size))
    name_width = index_width + len(str(data_mult))

    report_every = max(1, total // 100)
    done = 0

    for i in range(data_mult):
        for j, file in enumerate(images):
            name = file.split(".")[0]
            file_xml = os.path.join(dir_xml, name + ".xml")
            file_img = os.path.join(dir_img, file)

            barcode_types, bottomLines, points = get_xml_info(file_xml)
            img_transformed, rot_mat = transform_img(file_img)

            rot_points = process_info(points, rot_mat)

            numb = str(j + i * 10**index_width).zfill(name_width)

            data_created_xml = os.path.join(out_xml_dir, numb + ".xml")
            data_created_image = os.path.join(out_img_dir, numb + ".jpg")

            write_xml(barcode_types, bottomLines, rot_points, data_created_xml)
            # cv2.imwrite reports failure through its return value only.
            if not cv2.imwrite(data_created_image, img_transformed):
                raise OSError("Could not write image: {}".format(data_created_image))

            # For visual debugging, draw_boxes(img_transformed, rot_points)
            # returns the image with the transformed polygons drawn on it;
            # show it with cv2.imshow / cv2.waitKey.

            done += 1
            if done % report_every == 0:
                clear_output()
                print("{}% completado con {} imagenes".format(100*done/total, done))

    return True

### Zip

In [23]:
def zip_augmentedData(fileDir):
    """Zip every file found under a directory into "<fileDir>.zip".

    Files are stored under their base name only, so the archive is flat:
    sub-folder names are not kept, and files sharing a name in different
    sub-folders end up as duplicate entries.

    input:  fileDir (str) directory to compress, walked recursively
    output: None, writes the archive next to the directory
    """
    name = fileDir + ".zip"
    with ZipFile(name, 'w') as zipObj:
        # Iterate over all the files in the directory tree
        for folderName, _subfolders, filenames in os.walk(fileDir):
            for filename in filenames:
                filePath = os.path.join(folderName, filename)
                zipObj.write(filePath, os.path.basename(filePath))

## Run all

### Set Directories

In [24]:
# Every dataset location is resolved relative to the working directory.
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, "data")
#data_synthetic = os.path.join(data_dir, 'ZVZ-synth-512', "part01")
data_to_augm = os.path.join(
    data_dir, 'barcode_detection_benchmark', "images", "train")
data_augmented = os.path.join(
    data_dir, 'barcode_detection_benchmark', 'barcode_augmented_dataset')

# Report whether the source data is where it is expected.
print(
    ("Source: \tData location Verified, {}"
     if os.path.exists(data_to_augm)
     else "Source: \tReview original data location, not found {}"
     ).format(data_to_augm)
)

# Report whether the destination already exists (nothing is created here).
print(
    ("Destination: \tData location already exist, files may be overwrite {}"
     if os.path.exists(data_augmented)
     else "Destination: \tData location not found, directories will be created, {}"
     ).format(data_augmented)
)

Source: 	Data location Verified, /home/ec2-user/SageMaker/la-comer/src/data/barcode_detection_benchmark/images/train
Destination: 	Data location not found, directories will be created, /home/ec2-user/SageMaker/la-comer/src/data/barcode_detection_benchmark/barcode_augmented_dataset


### Run pipeline

In [25]:
# Generate the augmented dataset (3 passes, batch of 1000); on success zip
# the output directory and upload an archive to S3.
success = augmentation_pipeline(data_to_augm, data_augmented, data_mult = 3, batch = 1000)
if success:
    zip_augmentedData(data_augmented)
    # NOTE(review): zip_augmentedData writes to data_augmented + ".zip", while
    # the name passed here is the literal string 'data_augmented.zip' — check
    # against upload_to_s3's expected arguments that the right file is sent.
    upload_to_s3('la-comer', 'data_augmented'+'.zip', 'data/barcode_detection_benchmark/')
    print("Uploaded to S3")
else:
    print("Failed to generate augmented data")

Markups or Images directories not found
Failed to generate augmented data
