In [42]:
import os
import random
import shutil
import xml.etree.ElementTree as ET
from pathlib import Path

import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
from PIL import Image
from tqdm import tqdm


In [3]:
WORK_PATH = Path.cwd()
print(WORK_PATH)

/home/ting/Private-Projects/Tensorflow/mobilenetv3-ic


In [9]:
INTERMEDIATE_DIR_PATH = WORK_PATH / '_datasets' / 'filter'

# dataset source
FILTER_RMFD_DIR_PATH = INTERMEDIATE_DIR_PATH / 'RMFD'
FILTER_FMLD_DIR_PATH = INTERMEDIATE_DIR_PATH / 'FMLD'

# train or test
FILTER_TRAIN_RMFD_DIR_PATH = FILTER_RMFD_DIR_PATH / 'train'
FILTER_TEST_RMFD_DIR_PATH = FILTER_RMFD_DIR_PATH / 'test'

FILTER_TRAIN_FMLD_DIR_PATH = FILTER_FMLD_DIR_PATH / 'train'
FILTER_TEST_FMLD_DIR_PATH = FILTER_FMLD_DIR_PATH / 'test'

# masked or unmasked classes
FILTER_TRAIN_MASKED_RMFD_DIR_PATH = FILTER_TRAIN_RMFD_DIR_PATH / 'masked'
FILTER_TRAIN_UNMASKED_RMFD_DIR_PATH = FILTER_TRAIN_RMFD_DIR_PATH / 'unmasked'

FILTER_TEST_MASKED_RMFD_DIR_PATH = FILTER_TEST_RMFD_DIR_PATH / 'masked'
FILTER_TEST_UNMASKED_RMFD_DIR_PATH = FILTER_TEST_RMFD_DIR_PATH / 'unmasked'

FILTER_TRAIN_MASKED_FMLD_DIR_PATH = FILTER_TRAIN_FMLD_DIR_PATH / 'masked'
FILTER_TRAIN_UNMASKED_FMLD_DIR_PATH = FILTER_TRAIN_FMLD_DIR_PATH / 'unmasked'

FILTER_TEST_MASKED_FMLD_DIR_PATH = FILTER_TEST_FMLD_DIR_PATH / 'masked'
FILTER_TEST_UNMASKED_FMLD_DIR_PATH = FILTER_TEST_FMLD_DIR_PATH / 'unmasked'

# create
FILTER_TRAIN_MASKED_RMFD_DIR_PATH.mkdir(parents=True, exist_ok=True)
FILTER_TRAIN_UNMASKED_RMFD_DIR_PATH.mkdir(parents=True, exist_ok=True)
FILTER_TEST_MASKED_RMFD_DIR_PATH.mkdir(parents=True, exist_ok=True)
FILTER_TEST_UNMASKED_RMFD_DIR_PATH.mkdir(parents=True, exist_ok=True)
FILTER_TRAIN_MASKED_FMLD_DIR_PATH.mkdir(parents=True, exist_ok=True)
FILTER_TRAIN_UNMASKED_FMLD_DIR_PATH.mkdir(parents=True, exist_ok=True)
FILTER_TEST_MASKED_FMLD_DIR_PATH.mkdir(parents=True, exist_ok=True)
FILTER_TEST_UNMASKED_FMLD_DIR_PATH.mkdir(parents=True, exist_ok=True)


## RMFD

In [25]:
# RMFD
RMFD_PARENT_PATH = Path('./_datasets/raw/self-built-masked-face-recognition-dataset')
print(RMFD_PARENT_PATH.absolute())

/home/ting/Private-Projects/Tensorflow/mobilenetv3-ic/_datasets/raw/self-built-masked-face-recognition-dataset


In [26]:
# explore
MASKED_RMFD_PATH = RMFD_PARENT_PATH / 'AFDB_masked_face_dataset'
num_of_img = 0
for child_path in MASKED_RMFD_PATH.iterdir():
    for img_path in child_path.iterdir():
        num_of_img+=1

print(f'Num of MASKED img:{num_of_img}')

Num of MASKED img:2203


In [40]:
# explore
NON_MASKED_RMFD_PATH = RMFD_PARENT_PATH / 'AFDB_face_dataset'
num_of_img = 0
for child_path in NON_MASKED_RMFD_PATH.iterdir():
    for img_path in child_path.iterdir():
        num_of_img+=1

print(f'Num of NON MASKED img:{num_of_img}')

Num of NON MASKED img:90468


In [38]:
# copy masked to filter
for child_path in MASKED_RMFD_PATH.iterdir():
    label_count = 1
    for img_path in child_path.iterdir():
        dst_path = FILTER_TRAIN_MASKED_RMFD_DIR_PATH / f'{img_path.parent.name}_{label_count:02d}{img_path.suffix}'
        shutil.copy2(img_path, dst_path)
        label_count += 1
    


In [41]:
# copy unmasked to filter
for child_path in NON_MASKED_RMFD_PATH.iterdir():
    label_count = 1
    for img_path in child_path.iterdir():
        dst_path = FILTER_TRAIN_UNMASKED_RMFD_DIR_PATH / f'{img_path.parent.name}_{label_count:02d}{img_path.suffix}'
        shutil.copy2(img_path, dst_path)
        label_count += 1

## FMLD

In [18]:
# FMLD_train & test
FMLD_TRAIN_PARENT_DIR_PATH = Path('./_datasets/raw/MAFA_train/images')
FMLD_TEST_PARENT_DIR_PATH = Path('./_datasets/raw/MAFA_test/images')
FMLD_ANNOTATION_DIR_PATH = Path('./_datasets/raw/FMLD_annotations')
FMLD_TRAIN_ANNOTATION_DIR_PATH = FMLD_ANNOTATION_DIR_PATH / 'train'
FMLD_TEST_ANNOTATION_DIR_PATH = FMLD_ANNOTATION_DIR_PATH / 'test'
print(FMLD_TRAIN_PARENT_DIR_PATH.absolute())

/home/ting/Private-Projects/Tensorflow/mobilenetv3-ic/_datasets/raw/MAFA_train/images


In [19]:
# collect train & test image file name
FMLD_train_img_num = 0
FMLD_train_img_names = []
FMLD_train_img_stem_names = []
FMLD_test_img_num = 0
FMLD_test_img_names = []
FMLD_test_img_stem_names = []

for child_path in FMLD_TRAIN_PARENT_DIR_PATH.iterdir():
    FMLD_train_img_names.append(child_path.name)
    FMLD_train_img_stem_names.append(child_path.stem)

    FMLD_train_img_num+=1

for child_path in FMLD_TEST_PARENT_DIR_PATH.iterdir():
    FMLD_test_img_names.append(child_path.name)
    FMLD_test_img_stem_names.append(child_path.stem)

    FMLD_test_img_num+=1

print(f'Num of FMLD TRAIN img:{FMLD_train_img_num}')
print(f'Num of FMLD TEST img:{FMLD_test_img_num}')

Num of FMLD TRAIN img:25876
Num of FMLD TEST img:4935


In [20]:
# collect FMLD train & test annotation filename
FMLD_train_annot_file_num = 0
FMLD_train_annot_file_names = []
FMLD_train_img_stem_names_set = set(FMLD_train_img_stem_names)
FMLD_test_annot_file_num = 0
FMLD_test_annot_file_names = []
FMLD_test_img_stem_names_set = set(FMLD_test_img_stem_names)

for child_path in FMLD_TRAIN_ANNOTATION_DIR_PATH.iterdir():
    if child_path.stem in FMLD_train_img_stem_names_set:
        FMLD_train_annot_file_names.append(child_path.name)
        FMLD_train_annot_file_num += 1

for child_path in FMLD_TEST_ANNOTATION_DIR_PATH.iterdir():
    if child_path.stem in FMLD_test_img_stem_names_set:
        FMLD_test_annot_file_names.append(child_path.name)
        FMLD_test_annot_file_num += 1

print(f'Num of FMLD TRAIN annotation file:{FMLD_train_annot_file_num}')
print(f'Num of FMLD TEST annotation file:{FMLD_test_annot_file_num}')


Num of FMLD TRAIN annotation file:25876
Num of FMLD TEST annotation file:4935


In [17]:
# exp read xml file
FMLD_train_unmasked_num = 0
FMLD_train_masked_num = 0

for i in range(len(FMLD_train_annot_file_names)):
    xml_file = str(FMLD_TRAIN_ANNOTATION_DIR_PATH / FMLD_train_annot_file_names[i])
    xml_tree = ET.parse(xml_file)
    xml_root = xml_tree.getroot()
    filename = xml_root.find('filename').text
    filestem = Path(filename).stem
    img_file = str(FMLD_TRAIN_PARENT_DIR_PATH / filename)

    img_pil = Image.open(img_file)
    if img_pil.mode in ("RGBA", "P"):
        img_pil = img_pil.convert("RGB")
    img_np = np.asarray(img_pil, dtype="int32")
    h, w, _ = img_np.shape
    # print(f'xml_file: {xml_file}')
    # print(f'img_file: {img_file}')
    # print(f'w: {w}')
    # print(f'h: {h}')

    if i % 1000 == 0:
        print(f'Processed {i} files')

    label_count = 1
    for ii, boxes in enumerate(xml_root.iter('object'), start=1):
        name = boxes.find('name').text

        # filter class
        if name == 'unmasked_face':
            class_label = 'unmasked'
            FMLD_train_unmasked_num += 1
        elif name == 'masked_face':
            class_label = 'masked'
            FMLD_train_masked_num += 1

        else:
            continue

        ymin, xmin, ymax, xmax = None, None, None, None

        xmin = max(0, int(float(boxes.find("bndbox/xmin").text)))
        ymin = max(0, int(float(boxes.find("bndbox/ymin").text)))
        xmax = min(w, int(float(boxes.find("bndbox/xmax").text)))
        ymax = min(h, int(float(boxes.find("bndbox/ymax").text)))

        # (left x, top y, right x, bottom y)
        crop_img_pil = img_pil.crop((xmin, ymin, xmax, ymax))
        crop_img_pil.save(f'{str(FILTER_TRAIN_FMLD_DIR_PATH)}/{class_label}/{filestem}_{label_count:02d}.jpg', quality=100, subsampling=0)
        label_count += 1
        # print(f'name: {name}')
        # print(f'xmin: {xmin}')
        # print(f'ymin: {ymin}')
        # print(f'xmax: {xmax}')
        # print(f'ymax: {ymax}')
print(f'FMLD_train_unmasked_num: {FMLD_train_unmasked_num}')
print(f'FMLD_train_masked_num: {FMLD_train_masked_num}')


Processed 0 files
Processed 1000 files
Processed 2000 files
Processed 3000 files
Processed 4000 files
Processed 5000 files
Processed 6000 files
Processed 7000 files
Processed 8000 files
Processed 9000 files
Processed 10000 files
Processed 11000 files
Processed 12000 files
Processed 13000 files
Processed 14000 files
Processed 15000 files
Processed 16000 files
Processed 17000 files
Processed 18000 files
Processed 19000 files
Processed 20000 files
Processed 21000 files
Processed 22000 files
Processed 23000 files
Processed 24000 files
Processed 25000 files
MAFA_train_unmasked_num: 3645
MAFA_train_masked_num: 24603


In [21]:
# exp read xml file
FMLD_test_unmasked_num = 0
FMLD_test_masked_num = 0

for i in range(len(FMLD_test_annot_file_names)):
    xml_file = str(FMLD_TEST_ANNOTATION_DIR_PATH / FMLD_test_annot_file_names[i])
    xml_tree = ET.parse(xml_file)
    xml_root = xml_tree.getroot()
    filename = xml_root.find('filename').text
    filestem = Path(filename).stem
    img_file = str(FMLD_TEST_PARENT_DIR_PATH / filename)

    img_pil = Image.open(img_file)
    if img_pil.mode in ("RGBA", "P"):
        img_pil = img_pil.convert("RGB")
    img_np = np.asarray(img_pil, dtype="int32")
    h, w, _ = img_np.shape
    # print(f'xml_file: {xml_file}')
    # print(f'img_file: {img_file}')
    # print(f'w: {w}')
    # print(f'h: {h}')

    if i % 1000 == 0:
        print(f'Processed {i} files')

    label_count = 1
    for ii, boxes in enumerate(xml_root.iter('object'), start=1):
        name = boxes.find('name').text

        # filter class
        if name == 'unmasked_face':
            class_label = 'unmasked'
            FMLD_test_unmasked_num += 1
        elif name == 'masked_face':
            class_label = 'masked'
            FMLD_test_masked_num += 1

        else:
            continue

        ymin, xmin, ymax, xmax = None, None, None, None

        xmin = max(0, int(float(boxes.find("bndbox/xmin").text)))
        ymin = max(0, int(float(boxes.find("bndbox/ymin").text)))
        xmax = min(w, int(float(boxes.find("bndbox/xmax").text)))
        ymax = min(h, int(float(boxes.find("bndbox/ymax").text)))

        # (left x, top y, right x, bottom y)
        crop_img_pil = img_pil.crop((xmin, ymin, xmax, ymax))
        crop_img_pil.save(f'{str(FILTER_TEST_FMLD_DIR_PATH)}/{class_label}/{filestem}_{label_count:02d}.jpg', quality=100, subsampling=0)
        label_count += 1
        # print(f'name: {name}')
        # print(f'xmin: {xmin}')
        # print(f'ymin: {ymin}')
        # print(f'xmax: {xmax}')
        # print(f'ymax: {ymax}')
print(f'FMLD_test_unmasked_num: {FMLD_test_unmasked_num}')
print(f'FMLD_test_masked_num: {FMLD_test_masked_num}')


Processed 0 files
Processed 1000 files
Processed 2000 files
Processed 3000 files
Processed 4000 files
FMLD_test_unmasked_num: 2391
FMLD_test_masked_num: 4958


## Sampling

In [48]:
SAMPLED_DIR_PATH = WORK_PATH / '_datasets' / 'sampled'

# train or test
SAMPLED_TRAIN_DIR_PATH = SAMPLED_DIR_PATH / 'train'
SAMPLED_TEST_DIR_PATH = SAMPLED_DIR_PATH / 'test'

# masked or unmasked classes
SAMPLED_TRAIN_MASKED_DIR_PATH = SAMPLED_TRAIN_DIR_PATH / 'masked'
SAMPLED_TRAIN_UNMASKED_DIR_PATH = SAMPLED_TRAIN_DIR_PATH / 'unmasked'

SAMPLED_TEST_MASKED_DIR_PATH = SAMPLED_TEST_DIR_PATH / 'masked'
SAMPLED_TEST_UNMASKED_DIR_PATH = SAMPLED_TEST_DIR_PATH / 'unmasked'

# create
SAMPLED_TRAIN_MASKED_DIR_PATH.mkdir(parents=True, exist_ok=True)
SAMPLED_TRAIN_UNMASKED_DIR_PATH.mkdir(parents=True, exist_ok=True)
SAMPLED_TEST_MASKED_DIR_PATH.mkdir(parents=True, exist_ok=True)
SAMPLED_TEST_UNMASKED_DIR_PATH.mkdir(parents=True, exist_ok=True)

In [49]:
# collect frequency of each train test classes
filter_train_masked_RMFD_num = len(list(FILTER_TRAIN_MASKED_RMFD_DIR_PATH.iterdir()))
filter_train_unmasked_RMFD_num = len(list(FILTER_TRAIN_UNMASKED_RMFD_DIR_PATH.iterdir()))
filter_test_masked_RMFD_num = len(list(FILTER_TEST_MASKED_RMFD_DIR_PATH.iterdir()))
filter_test_unmasked_RMFD_num = len(list(FILTER_TEST_UNMASKED_RMFD_DIR_PATH.iterdir()))

filter_train_masked_FMLD_num = len(list(FILTER_TRAIN_MASKED_FMLD_DIR_PATH.iterdir()))
filter_train_unmasked_FMLD_num = len(list(FILTER_TRAIN_UNMASKED_FMLD_DIR_PATH.iterdir()))
filter_test_masked_FMLD_num = len(list(FILTER_TEST_MASKED_FMLD_DIR_PATH.iterdir()))
filter_test_unmasked_FMLD_num = len(list(FILTER_TEST_UNMASKED_FMLD_DIR_PATH.iterdir()))

print(f'filter_train_masked_RMFD_num: {filter_train_masked_RMFD_num}')
print(f'filter_train_masked_FMLD_num: {filter_train_masked_FMLD_num}')
print(f'filter_train_unmasked_RMFD_num: {filter_train_unmasked_RMFD_num}')
print(f'filter_train_unmasked_FMLD_num: {filter_train_unmasked_FMLD_num}')

print(f'filter_test_masked_RMFD_num: {filter_test_masked_RMFD_num}')
print(f'filter_test_masked_FMLD_num: {filter_test_masked_FMLD_num}')
print(f'filter_test_unmasked_RMFD_num: {filter_test_unmasked_RMFD_num}')
print(f'filter_test_unmasked_FMLD_num: {filter_test_unmasked_FMLD_num}')

filter_train_masked_RMFD_num: 2203
filter_train_masked_FMLD_num: 24603
filter_train_unmasked_RMFD_num: 90468
filter_train_unmasked_FMLD_num: 3645
filter_test_masked_RMFD_num: 0
filter_test_masked_FMLD_num: 4958
filter_test_unmasked_RMFD_num: 0
filter_test_unmasked_FMLD_num: 2391


- Target train
  - masked: 26806 (just combine)
  - unmasked: 26806 (combine then shuffle then prune)
- Target test
  - masked: 2391 (shuffle then prune)
  - unmasked: 2391

In [51]:
random.seed(42)
ts_train_masked_imgs = [path.absolute() for path in FILTER_TRAIN_MASKED_RMFD_DIR_PATH.iterdir()] \
    + [path.absolute() for path in FILTER_TRAIN_MASKED_FMLD_DIR_PATH.iterdir()]


ts_train_unmasked_imgs = [path.absolute() for path in FILTER_TRAIN_UNMASKED_RMFD_DIR_PATH.iterdir()] \
    + [path.absolute() for path in FILTER_TRAIN_UNMASKED_FMLD_DIR_PATH.iterdir()]
random.shuffle(ts_train_unmasked_imgs)
ts_train_unmasked_imgs = ts_train_unmasked_imgs[:26806]

ts_test_masked_imgs = [path.absolute() for path in FILTER_TEST_MASKED_FMLD_DIR_PATH.iterdir()]
random.shuffle(ts_test_masked_imgs)
ts_test_masked_imgs = ts_test_masked_imgs[:2391]

ts_test_unmasked_imgs = [path.absolute() for path in FILTER_TEST_UNMASKED_FMLD_DIR_PATH.iterdir()]

print(f'ts_train_masked_imgs: {len(ts_train_masked_imgs)}')
print(f'ts_train_unmasked_imgs: {len(ts_train_unmasked_imgs)}')
print(f'ts_test_masked_imgs: {len(ts_test_masked_imgs)}')
print(f'ts_test_unmasked_imgs: {len(ts_test_unmasked_imgs)}')


ts_train_masked_imgs: 26806
ts_train_unmasked_imgs: 26806
ts_test_masked_imgs: 2391
ts_test_unmasked_imgs: 2391


In [53]:
print(ts_test_unmasked_imgs[0])
print(type(ts_test_unmasked_imgs[0]))

/home/ting/Private-Projects/Tensorflow/mobilenetv3-ic/_datasets/filter/FMLD/test/unmasked/test_00003747_02.jpg
<class 'pathlib.PosixPath'>


In [55]:
# copy to sampled folder
for src_path in ts_train_masked_imgs:
    dst_path = SAMPLED_TRAIN_MASKED_DIR_PATH / src_path.name
    shutil.copy2(src_path, dst_path)

for src_path in ts_train_unmasked_imgs:
    dst_path = SAMPLED_TRAIN_UNMASKED_DIR_PATH / src_path.name
    shutil.copy2(src_path, dst_path)

for src_path in ts_test_masked_imgs:
    dst_path = SAMPLED_TEST_MASKED_DIR_PATH / src_path.name
    shutil.copy2(src_path, dst_path)

for src_path in ts_test_unmasked_imgs:
    dst_path = SAMPLED_TEST_UNMASKED_DIR_PATH / src_path.name
    shutil.copy2(src_path, dst_path)


In [None]:
# zip folder
## cd mobilenetv3-ic/_datasets/sampled
## zip -r face_mask_datasets.zip ./