In [1]:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image, ImageDraw, ImageEnhance, ImageOps, ImageFilter
import random
import os
from tqdm import tqdm

In [2]:
def random_crop(img, size, crops, enforce_content=True):
    """Cut ``crops`` randomly positioned ``size`` x ``size`` squares out of ``img``.

    Parameters
    ----------
    img : PIL-style image
        Must expose ``size`` as ``(width, height)``, ``crop(box)`` and, when
        ``enforce_content`` is true, ``convert("L").getextrema()``.
    size : int
        Side length of every square crop, in pixels.
    crops : int
        Number of crops to return.
    enforce_content : bool, default True
        When true, a crop whose greyscale extrema are ``(0, 0)`` or
        ``(255, 255)`` (entirely black or entirely white) is redrawn.  After
        one initial draw plus at most 21 redraws the last candidate is kept
        even if it is still blank, so the call always terminates.

    Returns
    -------
    list
        ``crops`` cropped images, in the order they were drawn.

    Raises
    ------
    ValueError
        If ``size`` is larger than either image dimension.
    """
    w, h = img.size
    if size > w or size > h:
        raise ValueError(
            f"crop size {size} does not fit into an image of size {w}x{h}"
        )

    # Inclusive upper bounds for the top-left corner.  An image that is exactly
    # `size` wide/high has the single valid offset 0.
    max_left = w - size
    max_top = h - size

    blank_extrema = ((0, 0), (255, 255))
    # One initial draw plus up to 21 redraws when content is required.
    attempts = 22 if enforce_content else 1

    ims = []
    for _ in range(crops):
        for _attempt in range(attempts):
            left = random.randint(0, max_left)
            top = random.randint(0, max_top)
            crp = img.crop((left, top, left + size, top + size))
            # Only pay for the greyscale conversion when blank crops matter.
            if not enforce_content or crp.convert("L").getextrema() not in blank_extrema:
                break
        ims.append(crp)
    return ims

def blemish(img):
    """Return a copy of ``img`` speckled with small filled dots.

    Between 5 and 19 circles with a radius of 1-5 px are drawn with ink value
    ``0`` on a copy of the input; ``img`` itself is left untouched.  Centres
    are kept one maximum radius away from every edge so that each dot lies
    completely inside the image and all regions are equally likely to be hit.
    When the image is too small for that margin, centres are drawn from the
    whole axis instead.

    Parameters
    ----------
    img : PIL image
        Needs ``copy()`` and ``size`` and must be drawable by ``ImageDraw``.

    Returns
    -------
    PIL image
        The blemished copy.
    """
    img_in = img.copy()
    width, height = img_in.size

    sizes = list(range(1, 6))
    margin = max(sizes)
    points = random.choice(list(range(5, 20)))

    def _centre(extent):
        # Keep the largest possible dot fully inside [0, extent - 1].
        low, high = margin, extent - 1 - margin
        if high < low:
            # Image is narrower than two margins: fall back to the whole axis.
            low, high = 0, max(extent - 1, 0)
        return random.randint(low, high)

    coords = [(_centre(width), _centre(height)) for _ in range(points)]

    draw = ImageDraw.Draw(img_in)
    for x, y in coords:
        size = random.choice(sizes)
        draw.ellipse([x - size, y - size, x + size, y + size],
                     fill=0, outline=0, width=1)

    return img_in

def noise(img, factor=0.1):
    """Randomly darken every sample of ``img`` by a fraction of its own value.

    Each sample ``p`` becomes ``p - p * r * factor`` where ``r`` is drawn
    independently and uniformly from ``[0, 1)``.  The result is clipped to
    ``[0, 255]`` before the cast to ``uint8`` so that out-of-range values
    (e.g. from a negative ``factor``) saturate instead of wrapping around.

    Parameters
    ----------
    img : PIL image or array-like
        Anything ``np.asarray`` can turn into a numeric array.
    factor : float, default 0.1
        Maximum fraction of a sample's value that may be subtracted.

    Returns
    -------
    PIL image
        Built with ``Image.fromarray`` from the ``uint8`` result.
    """
    pixels = np.asarray(img)
    fraction = np.random.rand(*pixels.shape)
    darkened = pixels - pixels * fraction * factor
    return Image.fromarray(np.clip(darkened, 0, 255).astype(np.uint8))

def grey_patching(img, by=20):
    """Darken each pixel of ``img`` by its own random amount in ``[0, by)``.

    One integer reduction is drawn per pixel position and subtracted from
    every channel of that pixel, so the change is a pure brightness shift.
    Results are clipped to ``[0, 255]``.  Works for single-channel (2-D)
    images as well as images with any number of channels; the same reduction
    is applied to every channel.

    Parameters
    ----------
    img : PIL image or array-like
        Anything ``np.asarray`` can turn into an ``(H, W)`` or ``(H, W, C)``
        array.
    by : int, default 20
        Exclusive upper bound of the per-pixel reduction.

    Returns
    -------
    PIL image
        Built with ``Image.fromarray`` from the ``uint8`` result.
    """
    pixels = np.asarray(img)
    reduction = np.random.randint(by, size=pixels.shape[:2])
    if pixels.ndim == 3:
        # Broadcast the per-pixel value over however many channels there are.
        reduction = reduction[:, :, np.newaxis]
    # Subtract in a signed type so that values below zero clip instead of wrap.
    patched = pixels.astype(np.int64) - reduction
    return Image.fromarray(np.clip(patched, 0, 255).astype(np.uint8))

def blend(img, dirty_masks='data/damage_examples'):
    """Average ``img`` with a randomly chosen, randomly oriented damage texture.

    A file is picked at random from the ``dirty_masks`` directory, opened as
    RGB and reduced to one random square crop whose side is ``min(img.size)``.
    The crop is rotated by 180 degrees with probability 0.5 and then always
    mirrored, either left-right or top-bottom (probability 0.5 each).  The
    result is averaged with ``img`` sample by sample and the contrast of the
    average is enhanced by a factor of 2.

    NOTE(review): the average only works when ``img`` converts to an array of
    the same shape as the cropped texture, i.e. presumably a square RGB
    image -- confirm against callers.

    Parameters
    ----------
    img : PIL image
        The clean image to damage.
    dirty_masks : str, default 'data/damage_examples'
        Directory holding the damage textures.

    Returns
    -------
    PIL image
        The blended, contrast-enhanced image.
    """
    texture_name = random.choice(os.listdir(dirty_masks))
    texture = Image.open(dirty_masks + "/" + texture_name).convert('RGB')
    side = min(img.size)
    texture = random_crop(texture, side, 1)[0]

    # First draw: optionally turn the texture upside down.
    if random.random() < 0.5:
        texture = texture.rotate(180)

    # Second draw: a mirror is always applied, only its axis is random.
    if random.random() < 0.5:
        mirror = Image.FLIP_LEFT_RIGHT
    else:
        mirror = Image.FLIP_TOP_BOTTOM
    texture = texture.transpose(mirror)

    texture_px = np.asarray(texture, dtype=np.float32)
    image_px = np.asarray(img, dtype=np.float32)
    averaged = (image_px + texture_px) / 2

    blended = Image.fromarray(np.uint8(averaged.clip(min=0, max=255)))
    return ImageEnhance.Contrast(blended).enhance(2)

def blur_image(img):
    """Return ``img`` passed through a Gaussian blur filter of radius 2."""
    gaussian = ImageFilter.GaussianBlur(radius=2)
    return img.filter(gaussian)

# Damage recipes.  Each entry is a list of functions that are applied to an
# image one after another, in the order listed.  One entry is drawn uniformly
# with random.choice per image, so a recipe that is listed more than once is
# proportionally more likely to be used.
modifications = [[blend, blemish],         # listed twice -> twice as likely
                 [blend, blemish],
                 [blend, noise],
                 [blend, grey_patching],
                 [blend, blur_image],
                 [blur_image],             # single-step recipes
                 [blemish],
                 [blend],                  # listed twice -> twice as likely
                 [blend]]

In [3]:
# Build the paired dataset: for every source image write `ncrops` clean tiles
# to `clean_dir` and a randomly damaged twin of each tile to `damaged_dir`.
# NOTE(review): despite its name, `pdf_dir` is read with PIL's Image.open, so
# it presumably holds rasterised page images -- confirm.
pdf_dir = 'data/pdfs/'
clean_dir = 'data/clean/'
damaged_dir = 'data/damaged/'

ncrops = 5
crop_size = 224
seed = 42

# Seed both generators used by the damage functions so that a
# Restart & Run All reproduces the same dataset.
random.seed(seed)
np.random.seed(seed)

# Saving fails if the target folders are missing, so create them up front.
for out_dir in (clean_dir, damaged_dir):
    os.makedirs(out_dir, exist_ok=True)

imno = 0
# os.listdir returns entries in arbitrary order; sort so that the numbering of
# the output files is stable between runs.
for file in tqdm(sorted(os.listdir(pdf_dir))):
    # convert() returns an independent image, so the source file can be
    # closed before the crops are processed.
    with Image.open(os.path.join(pdf_dir, file)) as page:
        ims = random_crop(page.convert('RGB'), crop_size, ncrops)

    for img in ims:
        img.save(os.path.join(clean_dir, f'{imno}.png'), "PNG")

        # Pick one damage recipe and run its steps in order on a copy.
        mods = random.choice(modifications)
        damage = img.copy()
        for mod in mods:
            damage = mod(damage)

        damage.save(os.path.join(damaged_dir, f'{imno}_dmg.png'), "PNG")
        imno += 1
        

100%|██████████████████████████████████████████████████████████████████████████████| 1376/1376 [05:43<00:00,  4.01it/s]
