In [1]:
import os
import re
import sys
import time
import random
import hashlib
import shutil
from pathlib import Path
from pprint import pprint
# thread instead of process: https://stackoverflow.com/questions/21198857/python-multiprocessing-example-not-working
from multiprocessing.dummy import Pool
import multiprocessing

import imagehash
from tqdm.notebook import tqdm
import numpy as np
from PIL import Image, ImageOps, ImageColor
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Root folder of the raw dataset export. The machine-specific path below is
# only the fallback: set the INPUT_DATASET_ROOT environment variable to point
# the notebook at another location without editing this cell.
# Dataset exports used so far:
#   RAW_DATA-20220516T194619Z-001
#   RAW_DATA-20220610T141404Z-001
INPUT_DATASET_ROOT = os.environ.get(
    'INPUT_DATASET_ROOT',
    r'D:\UCL\labs\comp0122\datasets\RAW_DATA-20220610T141404Z-001',
)
# Folder that receives the processed datasets (one sub-folder per dataset name).
OUTPUT_DATASET_ROOT = r'./datasets/processed/'

# Fraction of the samples that goes into the training split.
TRAIN_TEST_RATIO = 0.8
# Seed shared by every random number generator used in the notebook.
RANDOM_SEED = 42

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [20]:
# Lower-case filename suffixes that are treated as image files.
IMG_EXTENSIONS = [
    '.jpg',
    '.jpeg',
    '.png',
    '.ppm',
    '.bmp',
    '.tif',
    '.tiff',
]


def is_image_file(filename):
    """Tell whether ``filename`` ends with one of ``IMG_EXTENSIONS``.

    The comparison is case-insensitive because the name is lower-cased first.
    """
    # str.endswith accepts a tuple of candidate suffixes.
    return filename.lower().endswith(tuple(IMG_EXTENSIONS))

def find_all_image_paths(root, recursive=True):
    """Collect the paths of all image files found under ``root``.

    Args:
        root: Directory to search; it must exist.
        recursive: When True (default) every sub-directory is searched as
            well. When False only the files directly inside ``root`` are
            considered.

    Returns:
        A list of paths, each built by joining the visited directory with an
        image filename found in it. Directories are visited in sorted order
        and filenames are sorted within each directory, so the result does
        not depend on the listing order of the file system.

    Raises:
        AssertionError: If ``root`` is not an existing directory.
    """
    assert os.path.isdir(root)

    if recursive:
        visited = sorted(os.walk(root))
    else:
        # os.walk yields the top directory first, so take just that entry
        # instead of walking and sorting the whole tree only to discard it.
        top = next(os.walk(root), None)
        visited = [] if top is None else [top]

    paths = []
    for dirpath, _, filenames in visited:
        for filename in sorted(filenames):
            if is_image_file(filename):
                paths.append(os.path.join(dirpath, filename))

    return paths

def find_innermost_folders(root, relative=True):
    """List every directory under ``root`` that contains no sub-directory.

    Args:
        root: Directory to walk; it must exist. ``root`` itself is returned
            when it has no sub-directories.
        relative: When True (default) each result is passed through
            ``os.path.relpath``, i.e. expressed relative to the current
            working directory; otherwise the paths are returned exactly as
            produced by ``os.walk``.

    Returns:
        The leaf directories, ordered by their walked path.

    Raises:
        AssertionError: If ``root`` is not an existing directory.
    """
    assert os.path.isdir(root)

    leaf_dirs = sorted(
        dirpath
        for dirpath, subdirs, _ in os.walk(root)
        if not subdirs
    )

    if relative:
        return [os.path.relpath(leaf) for leaf in leaf_dirs]
    return leaf_dirs

In [4]:
# Collect every directory under the raw dataset root that has no sub-folders.
# With the default relative=True the paths are relative to the working directory.
im_folders = find_innermost_folders(INPUT_DATASET_ROOT)

# Print the folders for a visual check.
pprint(im_folders)

['datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\01_FGCHarcterLine',
 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\04_FGCHarcter_fill',
 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\06_FGCHaracter_tiedown',
 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\08_BGkidLINE',
 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\11_BGkidFILL',
 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\13_BGkid_TIEDOWN',
 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\14_BGmum_LINE',
 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\16_BGmum_FILL',
 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\18_BGmum_TIEDOWN',
 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\19_BGguy_LINE',
 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\21_BGguy_FILL',
 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\S

In [5]:
def compute_identifier(p):
    """Build a lower-case identifier for an image folder from its path.

    The identifier starts at the first path component whose name begins with
    ``'SH'`` (the shot folder) and joins, with ``'-'``:

    1. the shot folder name,
    2. the folder directly below it (called "character name" in the original
       notebook; in the listed dataset this is a version folder such as
       ``v002``),
    3. the next folder with everything up to and including its first ``'_'``
       removed (e.g. ``21_BGguy_FILL`` -> ``BGguy_FILL``),
    4. the folder after that, stripped the same way.

    Components 2-4 are optional: whatever is missing from ``p`` is left out.
    Previously a path ending at the shot folder raised ``IndexError``.

    Args:
        p: Folder path; it is normalised with ``os.path.normpath`` first.

    Returns:
        The identifier string, lower-cased.

    Raises:
        ValueError: If no path component starts with ``'SH'``.
    """
    p_parts = os.path.normpath(p).split(os.sep)

    # index of the first component that looks like a shot folder
    sh_i = next((i for i, part in enumerate(p_parts) if part.startswith('SH')), None)
    if sh_i is None:
        raise ValueError(f'SHOT folder not found: {p_parts}')

    # up to three folders below the shot folder; slicing never raises, so
    # shallow paths simply contribute fewer components
    below_shot = p_parts[sh_i + 1:sh_i + 4]

    id_comps = [p_parts[sh_i]]
    # the folder right below the shot is used verbatim
    id_comps.extend(below_shot[:1])
    # the remaining folders lose their leading "<prefix>_" (e.g. a numeric order prefix)
    id_comps.extend(name.split('_', 1)[-1] for name in below_shot[1:])

    # combine using dash '-', should be unique
    return '-'.join(id_comps).lower()

# identifier -> folder; an identifier produced by more than one folder is
# marked with None and dropped afterwards, so only unambiguous ones survive
uni_im_folders = {}

for p in im_folders:
    ident = compute_identifier(p)

    if ident in uni_im_folders:
        # second (or later) folder with this identifier: invalidate the entry
        print(f'duplicate identifier: {ident}')
        uni_im_folders[ident] = None
    else:
        uni_im_folders[ident] = p

# discard every identifier that was flagged as ambiguous
uni_im_folders = {
    ident: folder
    for ident, folder in uni_im_folders.items()
    if folder is not None
}

In [6]:
# Show the identifier -> folder mapping that is left after dropping duplicated identifiers.
pprint(uni_im_folders)

{'sh_0030-v002-bgguy_fill': 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\21_BGguy_FILL',
 'sh_0030-v002-bgguy_line': 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\19_BGguy_LINE',
 'sh_0030-v002-bgguytiedown': 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\23_BGguyTIEDOWN',
 'sh_0030-v002-bgkid_tiedown': 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\13_BGkid_TIEDOWN',
 'sh_0030-v002-bgkidfill': 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\11_BGkidFILL',
 'sh_0030-v002-bgkidline': 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\08_BGkidLINE',
 'sh_0030-v002-bgmum_fill': 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\16_BGmum_FILL',
 'sh_0030-v002-bgmum_line': 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\14_BGmum_LINE',
 'sh_0030-v002-bgmum_tiedown': 'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\18_BGmum

In [7]:
# Each stage has two patterns: *_match decides whether the last identifier
# component belongs to the stage, *_sub strips the stage keywords (and '_')
# so that the remainder can be compared across stages.
line_match = re.compile(r'line|clean', re.IGNORECASE)
line_sub = re.compile(r'line|clean|body|_', re.IGNORECASE)

fill_match = re.compile(r'colour|col|fill', re.IGNORECASE)
fill_sub = re.compile(r'colour|col|fill|line|_', re.IGNORECASE)

tied_match = re.compile(r'td|tiedown', re.IGNORECASE)
tied_sub = re.compile(r'td|tiedown|line|_', re.IGNORECASE)

# stage-independent key -> folder, one dictionary per stage
line_paths, fill_paths, tied_paths = {}, {}, {}

stage_rules = (
    (line_match, line_sub, line_paths),
    (fill_match, fill_sub, fill_paths),
    (tied_match, tied_sub, tied_paths),
)

for ident, path in uni_im_folders.items():
    *parent_comps, last_comp = ident.split('-')
    # ensure parent folders are the same except character parts folder
    prev_folders_ident = '-'.join(parent_comps)

    # a folder may qualify for several stages, so every rule is tried
    for matcher, stripper, bucket in stage_rules:
        if matcher.search(last_comp):
            bucket[prev_folders_ident + stripper.sub('', last_comp)] = path

# pair the line folder with the fill / tiedown folder sharing the same key,
# skipping the case where both stages resolved to the very same folder
line_fill_matched_dir = {
    (line_dir, fill_paths[key])
    for key, line_dir in line_paths.items()
    if key in fill_paths and line_dir != fill_paths[key]
}
line_tied_matched_dir = {
    (line_dir, tied_paths[key])
    for key, line_dir in line_paths.items()
    if key in tied_paths and line_dir != tied_paths[key]
}

pprint(line_tied_matched_dir)

{('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\08_BGkidLINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\13_BGkid_TIEDOWN'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\14_BGmum_LINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\18_BGmum_TIEDOWN'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\19_BGguy_LINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\23_BGguyTIEDOWN'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\26_BGwomanLINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\29_BGwomanTD'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\30_BGoldmanLINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\33_BGoldmanTD'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0040\\v002\\18_Line_Kurt',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA

In [8]:
# Show the (line folder, fill folder) pairs that were matched.
pprint(line_fill_matched_dir)

{('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\01_FGCHarcterLine',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\04_FGCHarcter_fill'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\08_BGkidLINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\11_BGkidFILL'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\14_BGmum_LINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\16_BGmum_FILL'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\19_BGguy_LINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\21_BGguy_FILL'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\26_BGwomanLINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\28_BGwomanFILL'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\30_BGoldmanLINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_D

In [9]:
# Show the (line folder, tiedown folder) pairs that were matched.
pprint(line_tied_matched_dir)

{('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\08_BGkidLINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\13_BGkid_TIEDOWN'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\14_BGmum_LINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\18_BGmum_TIEDOWN'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\19_BGguy_LINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\23_BGguyTIEDOWN'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\26_BGwomanLINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\29_BGwomanTD'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\30_BGoldmanLINE',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0030\\v002\\33_BGoldmanTD'),
 ('datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA\\SH_0040\\v002\\18_Line_Kurt',
  'datasets\\RAW_DATA-20220610T141404Z-001\\RAW_DATA

In [10]:
def matched_dir_to_im_paths(matched_dir):
    """Expand matched directory pairs into matched image-path pairs.

    For every ``(dir_A, dir_B)`` pair the image paths of both directories are
    sorted and paired position by position.

    Args:
        matched_dir: Iterable of ``(dir_A, dir_B)`` directory path tuples.

    Returns:
        A list of ``(path_A, path_B)`` tuples. Directory pairs are processed
        in sorted order so that the result is the same on every run.

    Warns:
        UserWarning: When the two directories of a pair hold a different
            number of images. The surplus images of the larger directory are
            left out, and the positional pairing may be misaligned.
    """
    import warnings

    im_paths = []
    # A set of strings iterates in a hash-dependent order that changes between
    # interpreter runs; sorting keeps the pair order (and any numbering that
    # is later derived from it) reproducible.
    for dir_A, dir_B in sorted(matched_dir):
        dir_A_im_paths = sorted(find_all_image_paths(dir_A))
        dir_B_im_paths = sorted(find_all_image_paths(dir_B))

        if len(dir_A_im_paths) != len(dir_B_im_paths):
            # zip() below stops at the shorter list; make that visible
            warnings.warn(
                f'image count mismatch: {len(dir_A_im_paths)} in {dir_A} '
                f'vs {len(dir_B_im_paths)} in {dir_B}; extra images are ignored',
                stacklevel=2,
            )

        im_paths.extend(zip(dir_A_im_paths, dir_B_im_paths))

    return im_paths

In [11]:
# Expand the matched line/fill folders into per-image (line, fill) path pairs.
line_fill_matched_im_paths = matched_dir_to_im_paths(line_fill_matched_dir)

# Number of image pairs found.
print(len(line_fill_matched_im_paths))

783


In [12]:
# Expand the matched line/tiedown folders into per-image (line, tiedown) path pairs.
line_tied_matched_im_paths = matched_dir_to_im_paths(line_tied_matched_dir)

# Number of image pairs found.
print(len(line_tied_matched_im_paths))

732


In [13]:
def process_matched_images(matched_im_paths, name='line_fill'):
    """Prepare the worker inputs and the output folder for one dataset.

    Args:
        matched_im_paths: Sequence of ``(path_A, path_B)`` image path pairs.
        name: Dataset name; used as the sub-folder of ``OUTPUT_DATASET_ROOT``.

    Returns:
        ``(worker_inputs, output_path)`` where ``worker_inputs`` is a list of
        ``(path_A, path_B, save_path, index)`` tuples. ``index`` is the
        zero-based position in the list, while the number in the save file
        name starts at 1. ``output_path`` is created if it does not exist.
    """
    # the name of the input dataset folder prefixes every output file name
    dataset_tag = os.path.basename(os.path.normpath(INPUT_DATASET_ROOT))
    print(f'Identifier: {dataset_tag}')

    output_path = os.path.join(OUTPUT_DATASET_ROOT, name)
    print(f'Output Dataset Directory [{name}]: {output_path}')
    Path(output_path).mkdir(parents=True, exist_ok=True)

    worker_inputs = [
        (
            path_A,
            path_B,
            os.path.join(output_path, f'{dataset_tag}_{number}.jpg'),
            number - 1,
        )
        for number, (path_A, path_B) in enumerate(matched_im_paths, 1)
    ]
    print(f'Number of inputs: {len(worker_inputs)}')

    return worker_inputs, output_path


# Set up the worker inputs and create the output folder for both datasets:
# line -> fill pairs and line -> tiedown pairs.
line_fill_worker_inputs, line_fill_output_path = process_matched_images(line_fill_matched_im_paths, name='line_fill')
line_tied_worker_inputs, line_tied_output_path = process_matched_images(line_tied_matched_im_paths, name='line_tied')

Identifier: RAW_DATA-20220610T141404Z-001
Output Dataset Directory [line_fill]: ./datasets/processed/line_fill
Number of inputs: 783
Identifier: RAW_DATA-20220610T141404Z-001
Output Dataset Directory [line_tied]: ./datasets/processed/line_tied
Number of inputs: 732


In [14]:
# multithreading worker method
# multithreading worker method
def process(paths, size=512):
    """Build and save one side-by-side training image from a matched pair.

    Both images are cropped to the bounding box reported by ``getbbox()`` for
    the first image, padded to a ``size`` x ``size`` square, placed next to
    each other (first image left, second image right), flattened onto a white
    background and written to ``save_path``.

    Args:
        paths: ``(line_path, fill_path, save_path, i)`` tuple. ``fill_path``
            is whatever image is paired with the line image (the notebook
            also passes tiedown images here); ``i`` is handed back unchanged
            so the caller can relate the result to its input.
        size: Edge length in pixels of each square half of the output image.

    Returns:
        ``(hash, i)`` with the ``imagehash.dhash`` of the saved image, or
        ``(None, i)`` when ``getbbox()`` finds no content in the line image
        (nothing is saved in that case).
    """
    line_path, fill_path, save_path, i = paths

    # Image.open keeps the underlying file open; the context managers make
    # sure both files are closed again, including on the early return below.
    with Image.open(line_path) as im_A, Image.open(fill_path) as im_B:
        # crop to non empty region
        non_empty_coord = im_A.getbbox()

        if non_empty_coord is None:
            # empty image
            return None, i

        im_A = im_A.crop(non_empty_coord)
        im_B = im_B.crop(non_empty_coord)

        # pad and resize to square, padding with fully transparent pixels
        im_A = ImageOps.pad(im_A, (size, size), method=Image.BICUBIC, color='rgba(0,0,0,0)')
        im_B = ImageOps.pad(im_B, (size, size), method=Image.BICUBIC, color='rgba(0,0,0,0)')

        # combine
        im = Image.new('RGBA', (size * 2, size))
        im.paste(im_A, (0, 0))
        im.paste(im_B, (size, 0))

    # convert to RGB, do not use .convert('RGB') as it gives black background
    rgb = Image.new("RGB", im.size, (255, 255, 255))
    rgb.paste(im, mask=im.split()[3])  # the alpha band is the paste mask

    rgb.save(save_path)
    return imagehash.dhash(rgb), i

In [15]:
def multithreading_process(process_method, worker_inputs, num_workers=8):
    """Run ``process_method`` over all inputs in a thread pool and drop duplicates.

    Args:
        process_method: Callable taking one element of ``worker_inputs`` and
            returning ``(im_hash, i)``, where ``im_hash`` is ``None`` for an
            empty input and ``i`` is the index of that input in
            ``worker_inputs``.
        worker_inputs: Sequence of worker inputs; element ``[2]`` of each one
            is the path of the file written for it.
        num_workers: Number of pool workers.

    Side effects:
        Deletes the saved file of every input whose hash was already seen
        and prints a summary of the counts.

    Raises:
        FileNotFoundError: If a file marked as duplicate does not exist.
    """
    seen = set()
    valids = 0
    dups = []
    empties = 0

    with Pool(num_workers) as pool:
        # imap (unlike imap_unordered) yields results in input order, so the
        # first occurrence of every hash is the one that is kept; with
        # unordered results the surviving file depended on thread timing.
        results = pool.imap(process_method, worker_inputs)
        for im_hash, i in tqdm(results, total=len(worker_inputs)):
            if im_hash is None:
                empties += 1
                continue

            valids += 1

            # check duplicates
            if im_hash in seen:
                dups.append(worker_inputs[i][2])  # for later delete file
            else:
                seen.add(im_hash)

    # remove duplicates
    for p in dups:
        Path(p).unlink(missing_ok=False)

    print(f'Amount of            data: {len(worker_inputs)}')
    print(f'+-Amount of     empty  data: {empties}')
    print(f'+-Amount of non empty  data: {valids}')
    print(f'  +-Amount of duplicates data: {len(dups)}')
    print(f'  +-Amount of remaining  data: {len(seen)}')

In [16]:
# Generate the line -> fill dataset images and remove duplicated ones.
multithreading_process(process, line_fill_worker_inputs)

  0%|          | 0/783 [00:00<?, ?it/s]

Amount of            data: 783
+-Amount of     empty  data: 2
+-Amount of non empty  data: 781
  +-Amount of duplicates data: 563
  +-Amount of remaining  data: 218


In [17]:
# Generate the line -> tiedown dataset images and remove duplicated ones.
multithreading_process(process, line_tied_worker_inputs)

  0%|          | 0/732 [00:00<?, ?it/s]

Amount of            data: 732
+-Amount of     empty  data: 2
+-Amount of non empty  data: 730
  +-Amount of duplicates data: 511
  +-Amount of remaining  data: 219


## Train Test Split

In [21]:
def split_dataset(dataset_path):
    """Move the images of a dataset folder into ``train`` and ``test`` sub-folders.

    Only images located directly in ``dataset_path`` are split; the ``train``
    and ``test`` folders are created when missing. The split uses
    ``TRAIN_TEST_RATIO`` as the training fraction and ``RANDOM_SEED`` for
    shuffling.

    Args:
        dataset_path: Folder that holds the processed images.

    Raises:
        shutil.Error: If a file with the same name already exists in the
            destination folder.
    """
    train_out_folder = os.path.join(dataset_path, 'train')
    test_out_folder = os.path.join(dataset_path, 'test')

    Path(train_out_folder).mkdir(parents=True, exist_ok=True)
    Path(test_out_folder).mkdir(parents=True, exist_ok=True)

    # Sort first: the seeded shuffle only reproduces the same split when it
    # starts from the same ordering, and the listing order of a directory
    # differs between file systems.
    im_paths = sorted(find_all_image_paths(dataset_path, recursive=False))
    if len(im_paths) == 0:
        print(f'Dataset is empty: {dataset_path}')
        return

    train_paths, test_paths = train_test_split(im_paths,
                                               train_size=TRAIN_TEST_RATIO,
                                               random_state=RANDOM_SEED,
                                               shuffle=True)

    print(f'Number of training samples: {len(train_paths)}')
    print(f'Number of testing  samples: {len(test_paths)}')

    # Move into the folder without naming the destination file: shutil.move
    # then raises when a file with the same name already exists there instead
    # of silently overwriting an existing sample.

    for train_path in tqdm(train_paths):
        shutil.move(train_path, train_out_folder)

    for test_path in tqdm(test_paths):
        shutil.move(test_path, test_out_folder)
        
# Split both processed datasets into train/test sub-folders.
split_dataset(line_fill_output_path)
split_dataset(line_tied_output_path)

Number of training samples: 172
Number of testing  samples: 43


  0%|          | 0/172 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

Number of training samples: 175
Number of testing  samples: 44


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]