# Imports

In [18]:
import os
import re
import sys
import time
import hashlib
import shutil
from pathlib import Path
from pprint import pprint
# thread instead of process: https://stackoverflow.com/questions/21198857/python-multiprocessing-example-not-working
from multiprocessing.dummy import Pool
import multiprocessing

from tqdm.notebook import tqdm
import numpy as np
from PIL import Image, ImageOps, ImageColor
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Settings

In [34]:
ROOT_DIR = r'.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA'  # containing shots
OUTPUT_DATASET_ROOT_DIR = r'./datasets/processed_rough2clean_data'  # output dir
TRAIN_TEST_RATIO = 0.8
RANDOM_SEED = 42

## Folder Structure
- RAW_DATA: 1
    - [SHOT_NAME]: n
        - [CHAR_NAME]: n
            - ?cleanup|spotkeys
                - ?[number]\_[type]\_[other]:   type = line / clean  --> line
                - ?[number]\_[type]\_[other]:   type = fill / col    --> color

# Utilities

In [4]:
IMG_EXTENSIONS = [
    '.jpg', '.jpeg',
    '.png', '.ppm', '.bmp',
    '.tif', '.tiff',
]

def is_image_file(filename):
    return any(filename.lower().endswith(ext) for ext in IMG_EXTENSIONS)

def find_all_image_paths(root):
    paths = []
    assert os.path.isdir(root)

    for root, _, filenames in sorted(os.walk(root)):
        for filename in filenames:
            if is_image_file(filename):
                paths.append(os.path.join(root, filename))
    
    return paths

def find_all_folders(root):
    try:
        return list(next(os.walk(root))[1])
    except StopIteration:
        return []

# Merge Images

In [24]:
def process_cleanup_dir(cleanup_dir, output_dir):
    
    part_folders = find_all_folders(cleanup_dir)
    line_folders = [folder for folder in part_folders if 'line' in folder.lower() or 'clean' in folder.lower()]
    
    merged_im_paths = []
    for i, folder in enumerate(line_folders):
        path = os.path.join(cleanup_dir, folder)
        im_paths = sorted(find_all_image_paths(path))
        while len(im_paths) > len(merged_im_paths):
            merged_im_paths.append([])
        for i, im_path in enumerate(im_paths):
            merged_im_paths[i].append(im_path)
        
    for i, paths in tqdm(enumerate(merged_im_paths), total=len(merged_im_paths)):
        merged_im = 0
        for path in paths:
            im = Image.open(path).convert('RGBA')
            im = np.asarray(im)
            merged_im = merged_im + im
        Image.fromarray(merged_im).save(os.path.join(output_dir, f'{i}.png'))

In [25]:
# process_cleanup_dir(
#     r'D:\UCL\labs\comp0122\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\Connie\cleanup',
#     r'test_out'
# )

  0%|          | 0/73 [00:00<?, ?it/s]

# Find Mapping Image Pairs

In [32]:
def process_shot_dir(shot_dir):
    print()
    print(shot_dir)
    
    rough2clean_pairs = []
    char_names = find_all_folders(shot_dir)
    for i, char in enumerate(char_names):
        
        clean_dir = os.path.join(shot_dir, char, 'cleanup')
        rough_dir = os.path.join(shot_dir, char, 'tiedown')
        
        if not os.path.isdir(clean_dir) or not os.path.isdir(rough_dir):
            continue
            
        merged_clean_dir = f'temp_out_{i}'
        Path(merged_clean_dir).mkdir(parents=True, exist_ok=True)
        
        print(f'{rough_dir} -> {clean_dir} -> {merged_clean_dir}')
        
        process_cleanup_dir(clean_dir, merged_clean_dir)
        
        rough_paths = sorted(find_all_image_paths(rough_dir))
        clean_paths = sorted(find_all_image_paths(clean_dir))
        
        for rough, clean in zip(rough_paths, clean_paths):
            rough2clean_pairs.append((rough, clean))
    
    return rough2clean_pairs

rough2clean_pairs = []

for shot_folder in tqdm(find_all_folders(ROOT_DIR)):
    rough2clean_pairs.extend(process_shot_dir(os.path.join(ROOT_DIR, shot_folder)))

  0%|          | 0/5 [00:00<?, ?it/s]


.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\Connie\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\Connie\cleanup -> temp_out_0


  0%|          | 0/73 [00:00<?, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\SleepingDude\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\SleepingDude\cleanup -> temp_out_3


  0%|          | 0/73 [00:00<?, ?it/s]


.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\BGguy\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\BGguy\cleanup -> temp_out_0


  0%|          | 0/51 [00:00<?, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\BGkid\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\BGkid\cleanup -> temp_out_1


  0%|          | 0/51 [00:00<?, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\BGmum\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\BGmum\cleanup -> temp_out_2


  0%|          | 0/51 [00:00<?, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\BGoldman\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\BGoldman\cleanup -> temp_out_3


  0%|          | 0/51 [00:00<?, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\BGwoman\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\BGwoman\cleanup -> temp_out_5


  0%|          | 0/51 [00:00<?, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\FGcharacter\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0030\FGcharacter\cleanup -> temp_out_6


  0%|          | 0/51 [00:00<?, ?it/s]


.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0040
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0040\Connie\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0040\Connie\cleanup -> temp_out_0


  0%|          | 0/50 [00:00<?, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0040\Janelle\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0040\Janelle\cleanup -> temp_out_1


  0%|          | 0/50 [00:00<?, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0040\Kurt\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0040\Kurt\cleanup -> temp_out_2


  0%|          | 0/50 [00:00<?, ?it/s]


.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0050
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0050\Connie\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0050\Connie\cleanup -> temp_out_0


  0%|          | 0/47 [00:00<?, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0050\Kurt\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0050\Kurt\cleanup -> temp_out_2


  0%|          | 0/47 [00:00<?, ?it/s]


.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0060
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0060\Background\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0060\Background\cleanup -> temp_out_0


0it [00:00, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0060\Connie\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0060\Connie\cleanup -> temp_out_3


0it [00:00, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0060\Dad\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0060\Dad\cleanup -> temp_out_4


0it [00:00, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0060\Kurt\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0060\Kurt\cleanup -> temp_out_6


0it [00:00, ?it/s]

.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0060\Yoyota\tiedown -> .\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0060\Yoyota\cleanup -> temp_out_8


0it [00:00, ?it/s]

# Process and Create Training Samples

In [42]:
def process(paths):
    line_path, fill_path, save_path = paths
    
    # read image
    im_A = Image.open(line_path)
    im_B = Image.open(fill_path)

    # crop to non empty region
    non_empty_coord = im_A.getbbox()
    
    if non_empty_coord is None:
        # empty image
        return None
    
#     im_A = im_A.crop(non_empty_coord)
#     im_B = im_B.crop(non_empty_coord)
    
    # pad and resize to square
    size = 1024
    im_A = ImageOps.pad(im_A, (size, size), method=Image.BICUBIC, color='rgba(0,0,0,0)')
    im_B = ImageOps.pad(im_B, (size, size), method=Image.BICUBIC, color='rgba(0,0,0,0)')
    
    # combine
    im = Image.new('RGBA', (size * 2, size))
    im.paste(im_A, (0, 0))
    im.paste(im_B, (size, 0))
    
    # convert to RGB, do not use .convert('RGB') as it gives black background
    rgb = Image.new("RGB", im.size, (255, 255, 255))
    rgb.paste(im, mask=im.split()[3])

    rgb.save(save_path)
    return rgb

In [43]:
input_dataset_folder = os.path.basename(os.path.normpath(ROOT_DIR))
output_dataset_root = os.path.join(OUTPUT_DATASET_ROOT_DIR, input_dataset_folder)

print(f'Output Directory: {output_dataset_root}')
Path(output_dataset_root).mkdir(parents=True, exist_ok=True)

# set up worker input
worker_inputs = []
for i, (rough, clean) in enumerate(rough2clean_pairs, 1):
    save_path = os.path.join(output_dataset_root, f'{i}.jpg')
    worker_inputs.append((rough, clean, save_path)) 
    
valid_pairs = 0
seen = set()
dups = []
remains = 0

with Pool(8) as pool:
    for i, im in tqdm(enumerate(pool.imap_unordered(process, worker_inputs)), total=len(worker_inputs)):
        if im:
            valid_pairs += 1
            
            h = hashlib.sha256(im.tobytes()).hexdigest()
            save_path = worker_inputs[i][2]
            if h in seen:
                dups.append(save_path)  # append the save_path
            else:
                remains += 1
                seen.add(h)

# remove duplicates
for p in dups:
    Path(p).unlink(missing_ok=True)

print(f'Number of            data pairs: {len(worker_inputs)}')    
print(f'Number of non empty  data pairs: {valid_pairs}')
print(f'Number of duplicates data pairs: {len(dups)}')
print(f'Number of remaining  data pairs: {remains}')

Output Directory: ./datasets/processed_rough2clean_data\RAW_DATA


  0%|          | 0/1924 [00:00<?, ?it/s]

Number of            data pairs: 1924
Number of non empty  data pairs: 1565
Number of duplicates data pairs: 901
Number of remaining  data pairs: 664


# Train Test Split

In [8]:
train_out_folder = os.path.join(output_dataset_root, 'train')
test_out_folder = os.path.join(output_dataset_root, 'test')

Path(train_out_folder).mkdir(parents=True, exist_ok=True)
Path(test_out_folder).mkdir(parents=True, exist_ok=True)

train_paths, test_paths = train_test_split(find_all_image_paths(output_dataset_root), 
                                           train_size=TRAIN_TEST_RATIO, 
                                           random_state=RANDOM_SEED, 
                                           shuffle=True, 
                                           stratify=None)

print(f'Number of training samples: {len(train_paths)}')
print(f'Number of testing  samples: {len(test_paths)}')


# do not specify the specific file so that it raise warning whe file already exists
# to avoid potential training samples.

for train_path in tqdm(train_paths):
    shutil.move(train_path, train_out_folder)
    
for test_path in tqdm(test_paths):
    shutil.move(test_path, test_out_folder)

Number of training samples: ['./datasets/processed_data\\RAW_DATA\\1899.jpg', './datasets/processed_data\\RAW_DATA\\884.jpg', './datasets/processed_data\\RAW_DATA\\1825.jpg', './datasets/processed_data\\RAW_DATA\\167.jpg', './datasets/processed_data\\RAW_DATA\\1008.jpg', './datasets/processed_data\\RAW_DATA\\1129.jpg', './datasets/processed_data\\RAW_DATA\\914.jpg', './datasets/processed_data\\RAW_DATA\\1933.jpg', './datasets/processed_data\\RAW_DATA\\986.jpg', './datasets/processed_data\\RAW_DATA\\1440.jpg', './datasets/processed_data\\RAW_DATA\\1996.jpg', './datasets/processed_data\\RAW_DATA\\1972.jpg', './datasets/processed_data\\RAW_DATA\\181.jpg', './datasets/processed_data\\RAW_DATA\\1911.jpg', './datasets/processed_data\\RAW_DATA\\794.jpg', './datasets/processed_data\\RAW_DATA\\605.jpg', './datasets/processed_data\\RAW_DATA\\1925.jpg', './datasets/processed_data\\RAW_DATA\\369.jpg', './datasets/processed_data\\RAW_DATA\\1915.jpg', './datasets/processed_data\\RAW_DATA\\2091.jpg',

  0%|          | 0/328 [00:00<?, ?it/s]

  0%|          | 0/83 [00:00<?, ?it/s]