In [79]:
import os
import re
import sys
import time
import hashlib
from pathlib import Path
from pprint import pprint
# thread instead of process: https://stackoverflow.com/questions/21198857/python-multiprocessing-example-not-working
from multiprocessing.dummy import Pool
import multiprocessing

from tqdm.notebook import tqdm
from PIL import Image, ImageOps, ImageColor
from matplotlib import pyplot as plt

## Folder Structure
- RAW_DATA: 1
    - [SHOT_NAME]: n
        - [CHAR_NAME]: n
            - ?cleanup
                - ?[number]\_[type]\_[other]:   type = line / clean  --> line
                - ?[number]\_[type]\_[other]:   type = fill / col    --> color

In [2]:
def find_all_folders(root):
    """Return the names of the directories located directly inside ``root``.

    Only the first level is inspected; nested folders are not reported.
    An empty list is returned when ``os.walk`` yields nothing for ``root``
    (for example when the path does not exist).
    """
    for _dirpath, dirnames, _filenames in os.walk(root):
        # The first tuple produced by os.walk describes ``root`` itself,
        # so its directory list is exactly what we need.
        return list(dirnames)
    return []

In [3]:
ROOT_DIR = r'.\datasets\RAW_DATA-20220516T194619Z-001\RAW_DATA'

shot_folders = find_all_folders(ROOT_DIR)

# Maps the path of each line-art folder to the path of its matching color folder.
line2col_folder_map = {}

# Removing these tokens (and underscores) from a folder name is what lets a
# line folder and its color folder collapse to the same lookup key.
clean_line = re.compile(r'line|clean|_', re.IGNORECASE)
clean_col = re.compile(r'col|fill|_', re.IGNORECASE)

for shot_folder in tqdm(shot_folders):
    shot_dir = os.path.join(ROOT_DIR, shot_folder)
    char_names = find_all_folders(shot_dir)

    print()
    print()
    print(shot_folder)
    for char_name in char_names:
        print()
        print(char_name)

        char_dir = os.path.join(shot_dir, char_name, 'cleanup')
        char_part_folders = find_all_folders(char_dir)

        # cleaned key -> original folder name, once with the line tokens
        # stripped and once with the color tokens stripped
        clean_line_folders = {}
        clean_col_folders = {}
        for part_folder in char_part_folders:
            # Split off the leading sequence number. A folder name without
            # an underscore does not follow the "[number]_..." layout, so it
            # is skipped instead of raising IndexError.
            _seq_no, separator, remainder = part_folder.partition('_')
            if not separator:
                continue
            clean_folder = remainder.lower()
            # clean line, clean, _ from the folder name
            clean_line_folders[clean_line.sub('', clean_folder)] = part_folder
            # clean col, fill, _ from the folder name
            clean_col_folders[clean_col.sub('', clean_folder)] = part_folder

        for clean_key, line_original_folder in clean_line_folders.items():
            col_original_folder = clean_col_folders.get(clean_key)
            if col_original_folder is None:
                # no folder reduces to this key once color tokens are removed
                continue
            if line_original_folder == col_original_folder:
                # the same folder produced the key in both maps: not a pair
                continue

            # we got a matching folder, line -> col
            print(f'{line_original_folder} -> {col_original_folder}')
            line_path = os.path.join(char_dir, line_original_folder)
            col_path = os.path.join(char_dir, col_original_folder)

            line2col_folder_map[line_path] = col_path


# pprint(line2col_folder_map)

  0%|          | 0/5 [00:00<?, ?it/s]



SH_0010

Connie
02_CONNIE_HAIR_LINE -> 03_COL_ConnieHair
04_CONNIE_HAND_LINE -> 05_COL_ConnieHand
06_LINE_CONNIE_UPPERBODY -> 07_COL_ConnieUpperbody
08_LINE_CONNIE_STILL -> 09_COL_ConnieStill

OldDude
19_OLdDudeLine -> 23_OldDudeCol

Ruben
24_RubenLine -> 27_RubenCol

SleepingDude
11_SleepingDude_Clean -> 13_SLeepingDudeCol


SH_0030

BGguy
14_BGguy_LINE -> 16_BGguy_FILL

BGkid
07_BGkidLINE -> 10_BGkidFILL

BGmum
11_BGmum_LINE -> 13_BGmum_FILL

BGoldman
20_BGoldmanLINE -> 22_BGoldmanFILL

BGStall
23_BGStallCharctersLine -> 25_BGStallCharctersFIll

BGwoman
17_BGwomanLINE -> 19_BGwomanFILL

FGcharacter
01_FGCHarcterLine -> 04_FGCHarcterFill
05_FGCHarcterLegsClean -> 06_FGCHarcterLegsFill


SH_0040

Connie
22_Line_Connie_armTablet -> 24_COL_Connie_armTablet
25_Line_Connie -> 27_COL_Connie
28_Line_ConnieStill -> 30_COL_ConnieStill

Janelle
19_Line_Janelle_hands -> 21_COL_Janelle_hands
31_Line_Janelle_body -> 33_COL_Janelle_body
34_Line_JanelleStill -> 36_COL_JanelleStill

Kurt
10_Line_Ku

In [4]:
# File extensions (lower case, with leading dot) that count as images.
IMG_EXTENSIONS = [
    '.jpg', '.jpeg',
    '.png', '.ppm', '.bmp',
    '.tif', '.tiff',
]

def is_image_file(filename):
    """Return True when ``filename`` ends with one of ``IMG_EXTENSIONS``.

    The comparison is case-insensitive.
    """
    # str.endswith accepts a tuple of suffixes, so no explicit loop is needed.
    return filename.lower().endswith(tuple(IMG_EXTENSIONS))

def find_all_image_paths(root):
    """Recursively collect the paths of all image files below ``root``.

    Directories are visited in sorted order and the file names inside each
    directory are sorted as well, so the result is deterministic regardless
    of the order in which the file system lists entries.

    Raises:
        AssertionError: if ``root`` is not an existing directory.
    """
    assert os.path.isdir(root), f'{root} is not a directory'

    paths = []
    # A separate loop variable keeps the ``root`` argument from being shadowed.
    for dirpath, _, filenames in sorted(os.walk(root)):
        paths.extend(
            os.path.join(dirpath, filename)
            for filename in sorted(filenames)
            if is_image_file(filename)
        )

    return paths

In [26]:
# (line image path, fill image path) tuples, paired by sorted position
# within each matched folder pair.
line_fill_paths = []
for line_path, fill_path in tqdm(line2col_folder_map.items()):
    line_im_paths = sorted(find_all_image_paths(line_path))
    fill_im_paths = sorted(find_all_image_paths(fill_path))
    if len(line_im_paths) != len(fill_im_paths):
        # zip() stops at the shorter list, so surplus images would otherwise
        # be dropped without any notice. NOTE(review): a count mismatch may
        # also mean the remaining pairs are misaligned -- verify the folders.
        print(
            f'WARNING: {line_path} has {len(line_im_paths)} images but '
            f'{fill_path} has {len(fill_im_paths)}; unmatched images are skipped'
        )
    line_fill_paths.extend(zip(line_im_paths, fill_im_paths))

print(f'Raw image pairs: {len(line_fill_paths)}')

  0%|          | 0/27 [00:00<?, ?it/s]

Raw image pairs: 1366


In [73]:
def process(paths, size=512, out_dir=None):
    """Build one side-by-side training image from a line/fill image pair.

    Both images are cropped to the bounding box of the line image, padded to
    a ``size`` x ``size`` square, placed next to each other (line on the left,
    fill on the right), flattened onto a white background and saved as
    ``<out_dir>/<i>.jpg``.

    Args:
        paths: ``(i, (line_path, fill_path))`` where ``i`` is used as the
            output file name.
        size: edge length in pixels of each padded square.
        out_dir: directory the JPEG is written to; defaults to the
            module-level ``DATASET_ROOT_DIR``.

    Returns:
        ``0`` when the line image has no bounding box (empty image, nothing
        is written), otherwise the combined RGB image.
    """
    i, (line_path, fill_path) = paths
    if out_dir is None:
        out_dir = DATASET_ROOT_DIR

    # read image; the context manager closes both files on every path,
    # including the early return for empty images
    with Image.open(line_path) as line_im, Image.open(fill_path) as fill_im:
        # crop to non empty region
        non_empty_coord = line_im.getbbox()

        if non_empty_coord is None:
            # empty image
            return 0

        # crop() produces new images, which stay usable after the files close
        im_A = line_im.crop(non_empty_coord)
        im_B = fill_im.crop(non_empty_coord)

    # pad and resize to square
    im_A = ImageOps.pad(im_A, (size, size), method=Image.BICUBIC, color='rgba(0,0,0,0)')
    im_B = ImageOps.pad(im_B, (size, size), method=Image.BICUBIC, color='rgba(0,0,0,0)')

    # combine: line image on the left half, fill image on the right half
    im = Image.new('RGBA', (size * 2, size))
    im.paste(im_A, (0, 0))
    im.paste(im_B, (size, 0))

    # convert to RGB, do not use .convert('RGB'): pasting with the alpha
    # channel as mask puts transparent regions on the white background
    rgb = Image.new("RGB", im.size, (255, 255, 255))
    rgb.paste(im, mask=im.split()[3])

    rgb.save(os.path.join(out_dir, f'{i}.jpg'), 'JPEG')
    return rgb

In [78]:
DATASET_ROOT_DIR = r'./datasets/processed_data'

Path(DATASET_ROOT_DIR).mkdir(parents=True, exist_ok=True)

# (index, (line_path, fill_path)) tuples; the 1-based index is passed to
# process() together with the pair of paths.
worker_inputs = list(enumerate(line_fill_paths, 1))

valid_pairs = 0

with Pool(8) as pool:
    for im in tqdm(pool.imap_unordered(process, worker_inputs), total=len(worker_inputs)):
        # process() returns 0 for an empty line image and the combined image
        # otherwise, so a truthy result means one valid pair.
        if im:
            # count the pair itself; adding the stale loop index here would
            # inflate the total
            valid_pairs += 1

print(f'Number of valid data pairs: {valid_pairs}')

  0%|          | 0/1366 [00:00<?, ?it/s]

Number of valid data pairs: 1264


In [81]:
# remove duplicates: files whose bytes hash to an already-seen SHA-256 digest

seen = set()
dups = []

for image_path in tqdm(find_all_image_paths(DATASET_ROOT_DIR)):
    # hash the raw file content
    digest = hashlib.sha256(Path(image_path).read_bytes()).hexdigest()
    if digest not in seen:
        # first file with this content: remember it and keep the file
        seen.add(digest)
        continue
    dups.append(image_path)

# delete every file recorded as a duplicate
for dup_path in dups:
    Path(dup_path).unlink(missing_ok=True)

print(f'Removed: {len(dups)}')

  0%|          | 0/1264 [00:00<?, ?it/s]

['./datasets/processed_data\\1002.jpg',
 './datasets/processed_data\\1004.jpg',
 './datasets/processed_data\\1005.jpg',
 './datasets/processed_data\\1007.jpg',
 './datasets/processed_data\\1008.jpg',
 './datasets/processed_data\\101.jpg',
 './datasets/processed_data\\1010.jpg',
 './datasets/processed_data\\1012.jpg',
 './datasets/processed_data\\1014.jpg',
 './datasets/processed_data\\1016.jpg',
 './datasets/processed_data\\1018.jpg',
 './datasets/processed_data\\1019.jpg',
 './datasets/processed_data\\1021.jpg',
 './datasets/processed_data\\1022.jpg',
 './datasets/processed_data\\1024.jpg',
 './datasets/processed_data\\1025.jpg',
 './datasets/processed_data\\1027.jpg',
 './datasets/processed_data\\1028.jpg',
 './datasets/processed_data\\1029.jpg',
 './datasets/processed_data\\103.jpg',
 './datasets/processed_data\\1030.jpg',
 './datasets/processed_data\\1031.jpg',
 './datasets/processed_data\\1032.jpg',
 './datasets/processed_data\\1033.jpg',
 './datasets/processed_data\\1034.jpg',
 '