# Crop & Resize Extreme Images to Make Dataset More Uniform

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# Configure the process environment before TensorFlow is imported.
# NOTE(review): '2' is understood to hide TensorFlow's native INFO and WARNING
# log messages - confirm against the TensorFlow version in use.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Restrict CUDA to the device with index 1; TensorFlow then reports it as GPU:0.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import tensorflow as tf
# Notebook display: list the GPUs TensorFlow can see.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Root of the original dataset; images are read from here.
SRC_DIR = './Data/XRay/'
# Root of the processed copy; images are written here.
DST_DIR = './Data/XRay_/'
# Split sub-directories, appended to either root (note the trailing slash).
TEST_DIR = 'test/'
TRAIN_DIR = 'train/'

In [4]:
# Aspect ratio here means height / width. The two percentiles are evaluated on
# the training-split distribution and the resulting values are used as the
# crop bounds; every output image is then resized to IMG_TGT_H rows.
AR_PCT_LO = 5   # Aspect-Ratio Percentile Lower-Bound
AR_PCT_HI = 90  # Aspect-Ratio Percentile Upper-Bound
IMG_TGT_H = 800 # Resizing Target Height

In [5]:
# Create the destination tree. makedirs(exist_ok=True) keeps this cell
# re-runnable (os.mkdir raises FileExistsError on a second run) and also
# creates any missing parent directory of DST_DIR.
os.makedirs(DST_DIR, exist_ok=True)
os.makedirs(DST_DIR + TEST_DIR, exist_ok=True)
os.makedirs(DST_DIR + TRAIN_DIR, exist_ok=True)

In [6]:
# Collect every training image as a path relative to the dataset root,
# printing the number of files found for each class directory.
classes = os.listdir(SRC_DIR + TRAIN_DIR)
train_files = []
for klass in classes:
    class_files = [f'{TRAIN_DIR}{klass}{os.sep}{fname}'
                   for fname in os.listdir(f'{SRC_DIR}{TRAIN_DIR}{klass}')]
    print(f'{klass}: {len(class_files)}')
    train_files += class_files
# Notebook display: total number of training files.
len(train_files)

NORMAL: 1349
PNEUMONIA: 3883


5232

In [7]:
# Collect every test image as a path relative to the dataset root,
# printing the number of files found for each class directory.
classes = os.listdir(SRC_DIR + TEST_DIR)
test_files = []
for klass in classes:
    class_files = [f'{TEST_DIR}{klass}{os.sep}{fname}'
                   for fname in os.listdir(f'{SRC_DIR}{TEST_DIR}{klass}')]
    print(f'{klass}: {len(class_files)}')
    test_files += class_files
# Notebook display: total number of test files.
len(test_files)

NORMAL: 234
PNEUMONIA: 389


623

In [8]:
# Mirror the class sub-directories under both destination splits.
# `classes` is whatever the most recently executed listing cell assigned.
# makedirs(exist_ok=True) lets the cell be re-run without FileExistsError.
for klass in classes:
    os.makedirs(DST_DIR + TEST_DIR + klass, exist_ok=True)
    os.makedirs(DST_DIR + TRAIN_DIR + klass, exist_ok=True)

In [9]:
def get_ars(fpaths):
    """Return the height/width ratios of the given images, ascending.

    Args:
        fpaths: Iterable of image paths relative to SRC_DIR.

    Returns:
        A sorted list with one ``shape[0] / shape[1]`` value per image.
    """
    shapes = (plt.imread(SRC_DIR + fpath).shape for fpath in fpaths)
    return sorted(shape[0] / shape[1] for shape in shapes)

In [10]:
def crop_to_ar(img, ar_lo, ar_hi):
    """Center-crop `img` towards the height/width range [ar_lo, ar_hi].

    Args:
        img: Array whose first two axes are height and width.
        ar_lo: Smallest accepted height/width ratio.
        ar_hi: Largest accepted height/width ratio.

    Returns:
        `img` itself when its ratio is already inside the range, or when the
        computed margin is not positive; otherwise a slice of `img` with the
        same margin removed from both sides of the offending axis.
    """
    height, width = img.shape[0], img.shape[1]
    ratio = height / width

    if ratio < ar_lo:
        # Too wide: trim equal margins from the left and right edges.
        target_width = height / ar_lo
        margin = int((width - target_width) // 2)
        return img[:, margin:-margin, ...] if margin > 0 else img

    if ratio > ar_hi:
        # Too tall: trim equal margins from the top and bottom edges.
        target_height = ar_hi * width
        margin = int((height - target_height) // 2)
        return img[margin:-margin, :, ...] if margin > 0 else img

    return img

In [11]:
def process_img(fpath, ar_lo, ar_hi):
    """Load one image as a single-channel array and crop it by aspect ratio.

    Args:
        fpath: Image path relative to SRC_DIR.
        ar_lo: Lower height/width bound passed to crop_to_ar.
        ar_hi: Upper height/width bound passed to crop_to_ar.

    Returns:
        The 2-D array returned by crop_to_ar.

    Raises:
        AssertionError: If the loaded array is neither 2-D nor 3-D with
            exactly three channels.
    """
    pixels = plt.imread(SRC_DIR + fpath)
    if pixels.ndim > 2:
        assert pixels.ndim == 3
        assert pixels.shape[2] == 3
        # Keep only the first channel.
        # NOTE(review): presumably all three channels carry the same
        # grayscale data - confirm.
        pixels = pixels[..., 0]
    assert pixels.ndim == 2
    return crop_to_ar(pixels, ar_lo, ar_hi)

In [12]:
def process_files(fpaths, ar_lo, ar_hi):
    """Crop, resize and re-encode each image, writing it under DST_DIR.

    Each file is loaded and cropped by process_img, resized to IMG_TGT_H rows
    with the width scaled by the same factor, and saved as a grayscale JPEG
    at the same relative path under DST_DIR.

    Args:
        fpaths: Iterable of image paths relative to SRC_DIR (and DST_DIR).
        ar_lo: Lower height/width bound forwarded to process_img.
        ar_hi: Upper height/width bound forwarded to process_img.
    """
    for fpath in fpaths:
        img = process_img(fpath, ar_lo, ar_hi)
        # Add a trailing channel axis: (H, W) -> (H, W, 1).
        img = np.expand_dims(img, axis=-1)
        # Scale the width by the same factor as the height (truncated).
        img_tgt_width = int(IMG_TGT_H / img.shape[0] * img.shape[1])
        img = tf.image.resize(img, (IMG_TGT_H, img_tgt_width),
                              method='bicubic', antialias=True)
        # Bicubic interpolation can overshoot the input range; a plain cast
        # to uint8 does not saturate, so round and clamp to [0, 255] instead.
        # NOTE(review): assumes the source pixels are on a 0-255 scale
        # (e.g. JPEG input) - confirm for other formats.
        img = tf.saturate_cast(tf.round(img), tf.uint8)
        tf.io.write_file(DST_DIR + fpath,
                         tf.io.encode_jpeg(img, format='grayscale'))

In [13]:
# Aspect-ratio (height / width) summary of the training split:
# [min, AR_PCT_LO percentile, median, AR_PCT_HI percentile, max].
train_ars = get_ars(train_files)
train_ar_lmh = np.percentile(train_ars, [0, AR_PCT_LO, 50, AR_PCT_HI, 100])
# Notebook display of the five values.
train_ar_lmh

array([0.29596413, 0.53543233, 0.70850311, 0.87016367, 1.19704433])

In [14]:
# Same five-value aspect-ratio summary for the test split. In the visible
# cells this result is only displayed; it is not passed to process_files.
test_ars = get_ars(test_files)
test_ar_lmh = np.percentile(test_ars, [0, AR_PCT_LO, 50, AR_PCT_HI, 100])
test_ar_lmh

array([0.38738739, 0.54218626, 0.70588235, 0.87590194, 1.09018987])

In [15]:
# Crop the training split to its own AR_PCT_LO / AR_PCT_HI percentile bounds
# (indices 1 and 3 of train_ar_lmh), then resize and write it out.
process_files(train_files, train_ar_lmh[1], train_ar_lmh[3])

In [16]:
# The test split is cropped with the TRAINING-split bounds, not test_ar_lmh.
# NOTE(review): presumably intentional so both splits share one
# preprocessing rule - confirm.
process_files(test_files, train_ar_lmh[1], train_ar_lmh[3])