### Image Augmentation

In [4]:
import pandas as pd
import random
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import sklearn
import matplotlib.pyplot as plt

In [5]:
train = pd.read_csv('../fashionmnist/fashion-mnist_train.csv')

In [6]:
augmented_train = train.copy()
X_train = augmented_train.drop(['label'],axis = 1)
X_label = augmented_train['label']

X_train = X_train.astype('float32')

In [7]:
label_dict = {
    "T-shirt/top":0,
    "Trouser":1,
    "Pullover":2,
    "Dress":3,
    "Coat":4,
    "Sandal":5,
    "Shirt":6,
    "Sneaker":7,
    "Bag":8,
    "Ankle_boot":9
}

In [8]:
# 원하는 Label number 리스트로 담기
is_target = [False for _ in range(10)]
target_labels = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Shirt", "Sneaker", "Ankle_boot"] #증강할 옷 종류 담기
for label in target_labels:
    is_target[label_dict[label]] = True
target_res = [[] for _ in range(10)]

train_count = X_train.shape[0]
for ind in range(train_count):
    if is_target[X_label[ind]]:
        target_res[X_label[ind]].append(ind)

In [21]:
sample_ind = []
with open(f"결과약간통합.txt", 'r') as file:
    for ind in file.readlines():
        sample_ind.append(int(ind))
print(len(sample_ind))

7456


In [10]:
origin_train = X_train.values # pandas.DF -> numpy.ARRAY
origin_train = origin_train.reshape(-1, 28, 28, 1) # 60000x28x28로 변환

In [11]:
augmented_train = train.copy()
# plt.imshow(origin_train[59999], cmap='gray')
# plt.axis('off')
# plt.show()

print(f"Before data shape: {origin_train.shape}")
image_generator = ImageDataGenerator(
            width_shift_range=0.01, # 세로 이동 범위
            height_shift_range=0.05,# 가로 이동 범위
            dtype="int64",
            fill_mode="constant",
            cval=0,
            brightness_range=(0.3, 1.15)
            )


Before data shape: (60000, 28, 28, 1)


- 여기는 인덱스 직접 지정했을 때

In [25]:
augment_size = 6 # 각 표본별 증강할 개수
print(len(sample_ind))
print(f"Estimated Augmented shape: ({(len(sample_ind)*augment_size)}, 785)")


for ind in sample_ind:
    labels = [X_label[ind] for _ in range(augment_size)]
    images = np.array([origin_train[ind] for _ in range(augment_size)])
    aug_imgs, aug_labels = image_generator.flow(images, labels, batch_size=augment_size, shuffle=False, seed=1127).next() #save_prefix='augmented', save_to_dir="./augment", 
    aug_imgs = aug_imgs.reshape(augment_size, 784)
    aug_imgs = np.insert(aug_imgs, 0, aug_labels, axis=1)
    df_augmented = pd.DataFrame(aug_imgs, columns=augmented_train.columns)
    augmented_train = pd.concat([augmented_train, df_augmented], ignore_index=True)

augmented_train = augmented_train.fillna(0.0)
print(f"After data shape: {augmented_train.shape}")
augmented_train = sklearn.utils.shuffle(augmented_train)

7456
Estimated Augmented shape: (104736, 785)
After data shape: (111769, 785)


- 여기는 랜덤으로 샘플링 할 때

In [33]:
sample_size = 200 # 각 label마다 추출할 표본 개수
augment_size = 40 # 각 표본별 증강할 개수

print(f"Estimated Augmented shape: ({(sample_size*len(target_labels)*augment_size) + origin_train.shape[0]}, 785)")

for label in target_labels:
    abstract_class  = random.sample(target_res[label_dict[label]], sample_size) #옷을 종류별로 sample_size만큼 무작위 선택
    labels = [label_dict[label] for _ in range(augment_size)]
    for ind in abstract_class:
        images = np.array([origin_train[ind] for _ in range(augment_size)])
        aug_imgs, aug_labels = image_generator.flow(images, labels, batch_size=augment_size, shuffle=False, seed=1127).next() #save_prefix='augmented', save_to_dir="./augment", 
        aug_imgs = aug_imgs.reshape(augment_size, 784)
        aug_imgs = np.insert(aug_imgs, 0, aug_labels, axis=1)
        df_augmented = pd.DataFrame(aug_imgs, columns=augmented_train.columns)
        augmented_train = pd.concat([augmented_train, df_augmented], ignore_index=True)

augmented_train = augmented_train.fillna(0.0)
print(f"After data shape: {augmented_train.shape}")
augmented_train = sklearn.utils.shuffle(augmented_train)

Before data shape: (60000, 28, 28, 1)
Estimated Augmented shape: (124000, 785)
After data shape: (124000, 785)


In [26]:
augmented_train.to_csv('../Preprocessing/104736_only_sample.csv', index=False)