### Scaling Preprocessing Task

#### 표정 분류
- **flow_from_dataframe**을 사용한다.

In [3]:
from glob import glob
import os

# original 폴더 안 이미지들의 이름을 'directory명+숫자.png'로 변경
root = './datasets/face/original/'

directories = glob(os.path.join(root,'*'))
directory_names = []

# root 경로 안 directory들의 이름 출력
for directory in directories:
    directory_names.append(directory[directory.rindex('\\') + 1:])

# 각 directory 내 이미지 파일의 이름을 'directory명+숫자(01, 02, ...).png'로 변경
for name in directory_names:
    for i, file_name in enumerate(os.listdir(os.path.join(root, name))):
        old_file = os.path.join(root + name + '/', file_name)
        new_file = os.path.join(root + name + '/', name + str(i + 1) + '.png')

        os.rename(old_file, new_file)

FileExistsError: [WinError 183] 파일이 이미 있으므로 만들 수 없습니다: './datasets/face/original/angry/angry10.png' -> './datasets/face/original/angry/angry2.png'

In [62]:
# 이미지와 배치 사이즈를 상수로 선언 
IMAGE_SIZE = 64
BATCH_SIZE = 64

In [63]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# ImageDataGenerator 선언 - RGB 값을 0 ~ 1사이로 스케일링만
image_data_generator = ImageDataGenerator(rescale=1./255)

# original 폴더 안 클래스들로 ImageDataGenerator 객체 생성
generator = image_data_generator.flow_from_directory(root,
                                                     target_size=(IMAGE_SIZE, IMAGE_SIZE),
                                                     batch_size=BATCH_SIZE,
                                                     class_mode='categorical')

print(generator.class_indices)

Found 28709 images belonging to 7 classes.
{'angry': 0, 'disgust': 1, 'fear': 2, 'happy': 3, 'neutral': 4, 'sad': 5, 'surprise': 6}


In [64]:
# class_indices의 key: value 순서를 반대로 한 새로운 dict 생성
target_name = {v: k for k, v in generator.class_indices.items()}
target_name

{0: 'angry',
 1: 'disgust',
 2: 'fear',
 3: 'happy',
 4: 'neutral',
 5: 'sad',
 6: 'surprise'}

In [65]:
target_names = []

# 반복문으로 target_name 안 문자열 데이터(value)를 target_names에 추가
for target in generator.classes:
    target_names.append(target_name[target])

In [66]:
import pandas as pd

# 경로, target_names(list) 및 target 클래스로 데이터프레임 생성
f_df = pd.DataFrame({'file_paths': generator.filepaths, 'target_names': target_names ,'targets': generator.classes})
f_df

Unnamed: 0,file_paths,target_names,targets
0,./datasets/face/original/angry\angry1.png,angry,0
1,./datasets/face/original/angry\angry10.png,angry,0
2,./datasets/face/original/angry\angry100.png,angry,0
3,./datasets/face/original/angry\angry1000.png,angry,0
4,./datasets/face/original/angry\angry1001.png,angry,0
...,...,...,...
28704,./datasets/face/original/surprise\surprise995.png,surprise,6
28705,./datasets/face/original/surprise\surprise996.png,surprise,6
28706,./datasets/face/original/surprise\surprise997.png,surprise,6
28707,./datasets/face/original/surprise\surprise998.png,surprise,6


In [67]:
# file_path의 '\\'를 '/'로 대체
f_df.loc[:, 'file_paths'] = a_df.file_paths.apply(lambda x: x.replace('\\', '/'))
f_df

Unnamed: 0,file_paths,target_names,targets
0,./datasets/face/original/angry/angry1.png,angry,0
1,./datasets/face/original/angry/angry10.png,angry,0
2,./datasets/face/original/angry/angry100.png,angry,0
3,./datasets/face/original/angry/angry1000.png,angry,0
4,./datasets/face/original/angry/angry1001.png,angry,0
...,...,...,...
28704,./datasets/face/original/surprise/surprise995.png,surprise,6
28705,./datasets/face/original/surprise/surprise996.png,surprise,6
28706,./datasets/face/original/surprise/surprise997.png,surprise,6
28707,./datasets/face/original/surprise/surprise998.png,surprise,6


In [68]:
from sklearn.model_selection import train_test_split

# 데이터 세트 분할
train_images, test_images, train_targets, test_targets = \
            train_test_split(f_df.file_paths, f_df.targets, stratify=a_df.targets, test_size=0.2, random_state=124)

print(train_targets.value_counts())
print(test_targets.value_counts())

targets
3    5772
4    3972
5    3864
2    3277
0    3196
6    2537
1     349
Name: count, dtype: int64
targets
3    1443
4     993
5     966
2     820
0     799
6     634
1      87
Name: count, dtype: int64


In [69]:
from sklearn.model_selection import train_test_split

# validation 데이터 세트 분할
train_images, validation_images, train_targets, validation_targets = \
train_test_split(train_images, train_targets, stratify=train_targets, test_size=0.2, random_state=124)

print(train_targets.value_counts())
print(validation_targets.value_counts())

targets
3    4617
4    3177
5    3091
2    2622
0    2557
6    2030
1     279
Name: count, dtype: int64
targets
3    1155
4     795
5     773
2     655
0     639
6     507
1      70
Name: count, dtype: int64


In [70]:
# train, validation, test Dataframe 생성
train_df = f_df.iloc[train_images.index].reset_index(drop=True)
validation_df = f_df.iloc[validation_images.index].reset_index(drop=True)
test_df = f_df.iloc[test_images.index].reset_index(drop=True)

print(train_df.shape)
print(validation_df.shape)
print(test_df.shape)

(18373, 3)
(4594, 3)
(5742, 3)


In [71]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import albumentations as A

IMAGE_SIZE = 64
BATCH_SIZE = 64

# train 데이터의 전처리 함수 - Augmentation + scaling(tf / torch)
def preprocessing_scaling_for_train(image, mode='tf'):
    # train 데이터는 Augmentation 실행
    aug = A.HorizontalFlip(p=0.5)
    image = aug(image=image)['image']

    # TensorFLow로 스케일링(VGG16, Xception 등)하는 모델 사용 시, RGB 값을 -1 ~ 1 사이로 변환
    if mode == 'tf': # -1 ~ 1 scale
        image = image / 127.5
        image -= 1.

    # PyTorch로 스케일링하는 모델(ResNet, DenseNet 등) 사용 시, RGB 값을 z-score로 변환
    elif mode == 'torch':
        image = image / 255.
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]
        
        image[:, :, 0] = (image[:, :, 0] - mean[0])/std[0]
        image[:, :, 1] = (image[:, :, 1] - mean[1])/std[1]
        image[:, :, 2] = (image[:, :, 2] - mean[2])/std[2]
        
    return image

# validation, test 데이터의 전처리 함수 - scaling
def preprocessing_scaling(image, mode='tf'):
    if mode == 'tf':
        image = image / 127.5
        image -= 1.
    
    elif mode == 'torch':
        image = image / 255.
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]
        
        image[:, :, 0] = (image[:, :, 0] - mean[0])/std[0]
        image[:, :, 1] = (image[:, :, 1] - mean[1])/std[1]
        image[:, :, 2] = (image[:, :, 2] - mean[2])/std[2]
        
    return image

# VGG16 모델을 사용하기 때문에, 해당 모델의 스케일링 방식인 tf 사용 (tf가 default라 따로 기재 안 함)
train_generator = ImageDataGenerator(preprocessing_function=preprocessing_scaling_for_train)
validation_generator = ImageDataGenerator(preprocessing_function=preprocessing_scaling)
test_generator = ImageDataGenerator(preprocessing_function=preprocessing_scaling)

# 각 dataframe으로부터 flow한 train, validation, test flow 생성 - 모델 fit 및 evalutaion 용도
train_flow = train_generator.flow_from_dataframe(dataframe=train_df, 
                                                 x_col='file_paths', 
                                                 y_col='target_names',
                                                 target_size=(IMAGE_SIZE, IMAGE_SIZE),
                                                 class_mode='categorical',
                                                 shuffle=True)

validation_flow = validation_generator.flow_from_dataframe(dataframe=validation_df, 
                                                           x_col='file_paths', 
                                                           y_col='target_names',
                                                           target_size=(IMAGE_SIZE, IMAGE_SIZE),
                                                           class_mode='categorical')

test_flow = test_generator.flow_from_dataframe(dataframe=test_df, 
                                               x_col='file_paths', 
                                               y_col='target_names',
                                               target_size=(IMAGE_SIZE, IMAGE_SIZE),
                                               class_mode='categorical')

# 각 flow의 클래스 종류 출력 (제대로 생성되었는지 확인)
print(train_flow.class_indices)
print(validation_flow.class_indices)
print(test_flow.class_indices)

Found 18373 validated image filenames belonging to 7 classes.
Found 4594 validated image filenames belonging to 7 classes.
Found 5742 validated image filenames belonging to 7 classes.
{'angry': 0, 'disgust': 1, 'fear': 2, 'happy': 3, 'neutral': 4, 'sad': 5, 'surprise': 6}
{'angry': 0, 'disgust': 1, 'fear': 2, 'happy': 3, 'neutral': 4, 'sad': 5, 'surprise': 6}
{'angry': 0, 'disgust': 1, 'fear': 2, 'happy': 3, 'neutral': 4, 'sad': 5, 'surprise': 6}


In [72]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense , Conv2D , Dropout , Flatten , Activation, MaxPooling2D , GlobalAveragePooling2D
from tensorflow.keras.layers import BatchNormalization

from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.applications import Xception

# 모델 생성 함수(VGG16, ResNet50V2, Xception)
def create_model(model_name='vgg16', verbose=False):
    input_tensor = Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 3))

    # 입력받은 model_name에 따라 다른 모델 사용
    # VGG16
    if model_name == 'vgg16':
        model = VGG16(input_tensor=input_tensor, include_top=False, weights='imagenet')

    # ResNet50V2
    elif model_name == 'resnet50':
        model = ResNet50V2(input_tensor=input_tensor, include_top=False, weights='imagenet')

    # Xception (Inception 기반)
    elif model_name == 'xception':
        model = Xception(input_tensor=input_tensor, include_top=False, weights='imagenet')

    # 모델의 출력값을 변수 x에 할당
    x = model.output

    # Classifier
    # VGG16 이외의 모델은 층 구조가 깊기 때문에 Dropout 사용 
    x = GlobalAveragePooling2D()(x)
    if model_name != 'vgg16':
        x = Dropout(rate=0.5)(x)
        
    x = Dense(50, activation='relu')(x)
    if model_name != 'vgg16':
        x = Dropout(rate=0.5)(x)

    # 이미지 데이터의 클래스가 7가지였기 때문에 Dense도 7로 지정
    output = Dense(7, activation='softmax', name='output')(x)
    
    model = Model(inputs=input_tensor, outputs=output)

    # 함수의 verbose 파라미터를 True로 설정 시, 모델 객체 생성과 함께 구조도 같이 출력
    if verbose:
        model.summary()

    # 모델 객체 반환
    return model

In [86]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy

# VGG16 모델 생성 (+ 모델 구조 출력)
model = create_model(model_name='vgg16', verbose=True)

# 모델 컴파일링
model.compile(optimizer=Adam(), loss=CategoricalCrossentropy, metrics=['acc'])

In [87]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

# callback 선언
mcp_cb = ModelCheckpoint(
    filepath="./callback_files/weights.{epoch:03d}-{val_loss:.4f}-{acc:.4f}.weights.h5",
    monitor='val_loss',
    save_best_only=False,
    save_weights_only=True,
    mode='min'
)

rlr_cb = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=2,
    mode='min'
)

ely_cb = EarlyStopping(
    monitor='val_loss',
    patience=4,
    mode='min'
)

In [88]:
import gc

# Garbage Collecting
gc.collect()

10348

In [89]:
N_EPOCHS = 10

# 모델 학습
history = model.fit(train_flow,
                    batch_size=BATCH_SIZE,
                    epochs=N_EPOCHS,
                    validation_data=validation_flow,
                    callbacks=[mcp_cb, rlr_cb, ely_cb])

Epoch 1/10
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m695s[0m 1s/step - acc: 0.2406 - loss: 1.8872 - val_acc: 0.2514 - val_loss: 1.8101 - learning_rate: 0.0010
Epoch 2/10
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m743s[0m 1s/step - acc: 0.2558 - loss: 1.8130 - val_acc: 0.2514 - val_loss: 1.8118 - learning_rate: 0.0010
Epoch 3/10
[1m318/575[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m5:04[0m 1s/step - acc: 0.2498 - loss: 1.8063

KeyboardInterrupt: 

In [90]:
# 모델 평가
model.evaluate(test_flow)

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 315ms/step - acc: 0.2475 - loss: 1.8099


[1.814647912979126, 0.2513061761856079]

In [92]:
import matplotlib.pyplot as plt

# 각 epoch 별 train과 validation의 정확도 변화를 시각화하는 함수
def show_history(history):
    plt.figure(figsize=(6, 6))
    plt.yticks(np.arange(0, 1, 0.05))
    plt.plot(history.history['acc'], label='train')
    plt.plot(history.history['val_acc'], label='validation')
    plt.legend()

# 위 함수로 epoch 별 정확도 변화 시각화
show_history(history)

NameError: name 'history' is not defined

### 💡 특정 사전 훈련 모델의 전처리 방식을 그대로 가져다 쓰는 방법

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.densenet import preprocess_input

# 특정 모델의 preprocess_input을 preprocessing_function 파라미터 값으로 할당하면, 해당 모델의 전처리 방식을 가져다 쓸 수 있다
# 단, 이 경우 Augmentation은 아래 코드처럼 별도의 파라미터로 설정해줘야 한다
idg = ImageDataGenerator(preprocessing_function=preprocess_input, horizontal_flip=True)