# Imports

In [1]:
import os
import re
import sys
import time
import hashlib
import shutil
from pathlib import Path
from pprint import pprint
# thread instead of process: https://stackoverflow.com/questions/21198857/python-multiprocessing-example-not-working
from multiprocessing.dummy import Pool
import multiprocessing

from tqdm.notebook import tqdm
from PIL import Image, ImageOps, ImageColor
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Settings

In [2]:
ROOT_DIR = r'.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA'  # containing shots
DATASET_ROOT_DIR = r'./datasets/processed_data'  # output dir
TRAIN_TEST_RATIO = 0.8
RANDOM_SEED = 42

## Folder Structure
- RAW_DATA: 1
    - [SHOT_NAME]: n
        - [CHAR_NAME]: n
            - ?cleanup|spotkeys
                - ?[number]\_[type]\_[other]:   type = line / clean  --> line
                - ?[number]\_[type]\_[other]:   type = fill / col    --> color

# Utilities

In [3]:
IMG_EXTENSIONS = [
    '.jpg', '.jpeg',
    '.png', '.ppm', '.bmp',
    '.tif', '.tiff',
]

def is_image_file(filename):
    return any(filename.lower().endswith(ext) for ext in IMG_EXTENSIONS)

def find_all_image_paths(root):
    paths = []
    assert os.path.isdir(root)

    for root, _, filenames in sorted(os.walk(root)):
        for filename in filenames:
            if is_image_file(filename):
                paths.append(os.path.join(root, filename))
    
    return paths

def find_all_folders(root):
    try:
        return list(next(os.walk(root))[1])
    except StopIteration:
        return []

# Find Mapping Image Pairs

In [4]:
clean_line = re.compile(r'line|clean|_', re.IGNORECASE)
clean_fill = re.compile(r'col|fill|_', re.IGNORECASE)

def process_char_dir(char_dir):
    print(char_dir)
    
    line2fill_map = {}
    
    part_dirs = find_all_folders(char_dir)
    
    # map clean to original 
    clean_line_dirs = {}  
    clean_fill_dirs = {}
    
    # for each part folder:
    for part_dir in part_dirs:
        
        # remove seq no and convert to lower case
        clean_folder = part_dir.split('_', maxsplit=1)[1].lower()
        
        # remove line, clean, _ from the folder name
        clean_line_dirs[clean_line.sub('', clean_folder)] = part_dir  # keep track of original folder name
        
        # remove col, fill, _ from the folder name
        clean_fill_dirs[clean_fill.sub('', clean_folder)] = part_dir  # keep track of original folder name

    for clean_line_dir in clean_line_dirs:
        
        if clean_line_dir in clean_fill_dirs:
            
            line_original_dir = clean_line_dirs[clean_line_dir]
            fill_original_dir = clean_fill_dirs[clean_line_dir]
            
            if line_original_dir != fill_original_dir:  # if its not just a random folder
                # we got a matching folder, line -> fill
                print(f'{line_original_dir} -> {fill_original_dir}')
                line_path = os.path.join(char_dir, line_original_dir)
                fill_path = os.path.join(char_dir, fill_original_dir)

                line2fill_map[line_path] = fill_path
                
    return line2fill_map

def process_shot_dir(shot_dir):
    print()
    print(shot_dir)
    
    line2fill_map = {}
    char_names = find_all_folders(shot_dir)
    for char in char_names:
        
        char_dir = os.path.join(shot_dir, char, 'cleanup')
        line2fill_map.update(process_char_dir(char_dir))
        
        char_dir = os.path.join(shot_dir, char, 'spotkeys')
        line2fill_map.update(process_char_dir(char_dir))
    
    return line2fill_map


line2fill_map = {}

for shot_folder in tqdm(find_all_folders(ROOT_DIR)):
    line2fill_map.update(process_shot_dir(os.path.join(ROOT_DIR, shot_folder)))

  0%|          | 0/5 [00:00<?, ?it/s]


.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\Connie\cleanup
02_CONNIE_HAIR_LINE -> 03_COL_ConnieHair
04_CONNIE_HAND_LINE -> 05_COL_ConnieHand
06_LINE_CONNIE_UPPERBODY -> 07_COL_ConnieUpperbody
08_LINE_CONNIE_STILL -> 09_COL_ConnieStill
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\Connie\spotkeys
01_Line_ConnieHair -> 02_COL_ConnieHair
03_Line_ConnieHand -> 04_COL_ConnieHand
05_Line_ConnieUpperbody -> 06_COL_ConnieUpperbody
07_Line_ConnieStill -> 08_COL_ConnieStill
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\OldDude\cleanup
19_OLdDudeLine -> 23_OldDudeCol
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\OldDude\spotkeys
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\Ruben\cleanup
24_RubenLine -> 27_RubenCol
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\Ruben\spotkeys
.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA\SH_0010\SleepingDude\cleanup
11_SleepingDu

In [5]:
line_fill_paths = []
for line_path, fill_path in tqdm(line2fill_map.items()):
    line_im_paths = sorted(find_all_image_paths(line_path))
    fill_im_paths = sorted(find_all_image_paths(fill_path))
    line_fill_paths += list(zip(line_im_paths, fill_im_paths))

  0%|          | 0/40 [00:00<?, ?it/s]

# Process and Create Training Samples

In [6]:
def process(paths):
    line_path, fill_path, save_path = paths
    
    # read image
    im_A = Image.open(line_path)
    im_B = Image.open(fill_path)

    # crop to non empty region
    non_empty_coord = im_A.getbbox()
    
    if non_empty_coord is None:
        # empty image
        return None
    
    im_A = im_A.crop(non_empty_coord)
    im_B = im_B.crop(non_empty_coord)
    
    # pad and resize to square
    size = 512
    im_A = ImageOps.pad(im_A, (size, size), method=Image.BICUBIC, color='rgba(0,0,0,0)')
    im_B = ImageOps.pad(im_B, (size, size), method=Image.BICUBIC, color='rgba(0,0,0,0)')
    
    # combine
    im = Image.new('RGBA', (size * 2, size))
    im.paste(im_A, (0, 0))
    im.paste(im_B, (size, 0))
    
    # convert to RGB, do not use .convert('RGB') as it gives black background
    rgb = Image.new("RGB", im.size, (255, 255, 255))
    rgb.paste(im, mask=im.split()[3])

    rgb.save(save_path)
    return rgb

In [7]:
input_dataset_folder = os.path.basename(os.path.normpath(ROOT_DIR))
output_dataset_root = os.path.join(DATASET_ROOT_DIR, input_dataset_folder)

print(f'Output Directory: {output_dataset_root}')
Path(output_dataset_root).mkdir(parents=True, exist_ok=True)

# set up worker input
worker_inputs = []
for i, (line_path, fill_path) in enumerate(line_fill_paths, 1):
    save_path = os.path.join(output_dataset_root, f'{i}.jpg')
    worker_inputs.append((line_path, fill_path, save_path)) 
    
valid_pairs = 0
seen = set()
dups = []
remains = 0

with Pool(8) as pool:
    for i, im in tqdm(enumerate(pool.imap_unordered(process, worker_inputs)), total=len(worker_inputs)):
        if im:
            valid_pairs += 1
            
            h = hashlib.sha256(im.tobytes()).hexdigest()
            save_path = worker_inputs[i][2]
            if h in seen:
                dups.append(save_path)  # append the save_path
            else:
                remains += 1
                seen.add(h)

# remove duplicates
for p in dups:
    Path(p).unlink(missing_ok=True)

print(f'Number of            data pairs: {len(worker_inputs)}')    
print(f'Number of non empty  data pairs: {valid_pairs}')
print(f'Number of duplicates data pairs: {len(dups)}')
print(f'Number of remaining  data pairs: {remains}')

Output Directory: ./datasets/processed_data\RAW_DATA


  0%|          | 0/2155 [00:00<?, ?it/s]

Number of            data pairs: 2155
Number of non empty  data pairs: 1477
Number of duplicates data pairs: 1104
Number of remaining  data pairs: 373


# Train Test Split

In [8]:
train_out_folder = os.path.join(output_dataset_root, 'train')
test_out_folder = os.path.join(output_dataset_root, 'test')

Path(train_out_folder).mkdir(parents=True, exist_ok=True)
Path(test_out_folder).mkdir(parents=True, exist_ok=True)

train_paths, test_paths = train_test_split(find_all_image_paths(output_dataset_root), 
                                           train_size=TRAIN_TEST_RATIO, 
                                           random_state=RANDOM_SEED, 
                                           shuffle=True, 
                                           stratify=None)

print(f'Number of training samples: {len(train_paths)}')
print(f'Number of testing  samples: {len(test_paths)}')


# do not specify the specific file so that it raise warning whe file already exists
# to avoid potential training samples.

for train_path in tqdm(train_paths):
    shutil.move(train_path, train_out_folder)
    
for test_path in tqdm(test_paths):
    shutil.move(test_path, test_out_folder)

Number of training samples: ['./datasets/processed_data\\RAW_DATA\\1899.jpg', './datasets/processed_data\\RAW_DATA\\884.jpg', './datasets/processed_data\\RAW_DATA\\1825.jpg', './datasets/processed_data\\RAW_DATA\\167.jpg', './datasets/processed_data\\RAW_DATA\\1008.jpg', './datasets/processed_data\\RAW_DATA\\1129.jpg', './datasets/processed_data\\RAW_DATA\\914.jpg', './datasets/processed_data\\RAW_DATA\\1933.jpg', './datasets/processed_data\\RAW_DATA\\986.jpg', './datasets/processed_data\\RAW_DATA\\1440.jpg', './datasets/processed_data\\RAW_DATA\\1996.jpg', './datasets/processed_data\\RAW_DATA\\1972.jpg', './datasets/processed_data\\RAW_DATA\\181.jpg', './datasets/processed_data\\RAW_DATA\\1911.jpg', './datasets/processed_data\\RAW_DATA\\794.jpg', './datasets/processed_data\\RAW_DATA\\605.jpg', './datasets/processed_data\\RAW_DATA\\1925.jpg', './datasets/processed_data\\RAW_DATA\\369.jpg', './datasets/processed_data\\RAW_DATA\\1915.jpg', './datasets/processed_data\\RAW_DATA\\2091.jpg',

  0%|          | 0/328 [00:00<?, ?it/s]

  0%|          | 0/83 [00:00<?, ?it/s]