### Scaling Preprocess
- 픽셀 값을 0 ~ 1, -1 ~ 1, z-score 변환 중 하나의 방식으로 변환하여 값의 범위를 축소하는 작업을 의미한다.
- 사전 훈련 모델을 사용할 때는 주로 TensorFlow 방식 또는 PyTorch 방식의 스케일링을 적용한다.
- TensorFlow는 -1 ~ 1, PyTorch는 z-score 방식으로 변환하는 것이 각 프레임워크의 관례이다.

<img src='./images/scaling.png' width='400px' style='margin-top: 20px;'>

In [1]:
# Load the name-translation dict stored in translate.py.
import ast

with open('./datasets/animals/translate.py', encoding='utf-8') as f:
    # Read the whole file so the dict literal is found even when it is not
    # entirely on the first line.
    content = f.read()

# Parse the text from the first '{' through the first '}' as a dict literal.
# ast.literal_eval accepts only Python literals, so nothing in the file is
# executed (eval would run any expression found there).
contents1 = ast.literal_eval(content[content.index('{'):content.index('}') + 1])

# Same mapping with keys and values swapped.
contents2 = {v: k for k, v in contents1.items()}

print(contents1, contents2, sep='\n')

{'cane': 'dog', 'cavallo': 'horse', 'elefante': 'elephant', 'farfalla': 'butterfly', 'gallina': 'chicken', 'gatto': 'cat', 'mucca': 'cow', 'pecora': 'sheep', 'scoiattolo': 'squirrel', 'dog': 'cane', 'elephant': 'elefante', 'butterfly': 'farfalla', 'chicken': 'gallina', 'cat': 'gatto', 'cow': 'mucca', 'spider': 'ragno', 'squirrel': 'scoiattolo'}
{'dog': 'cane', 'horse': 'cavallo', 'elephant': 'elefante', 'butterfly': 'farfalla', 'chicken': 'gallina', 'cat': 'gatto', 'cow': 'mucca', 'sheep': 'pecora', 'squirrel': 'scoiattolo', 'cane': 'dog', 'elefante': 'elephant', 'farfalla': 'butterfly', 'gallina': 'chicken', 'gatto': 'cat', 'mucca': 'cow', 'ragno': 'spider', 'scoiattolo': 'squirrel'}


In [2]:
from glob import glob
import os

root = './datasets/animals/original/'

# Every entry directly under root: one folder per animal, named in English or
# in another language.
directories = glob(os.path.join(root, '*'))

# NOTE(review): the printed contents1 holds entries in both directions
# (e.g. 'cane' -> 'dog' and 'dog' -> 'cane'), so running this cell a second
# time looks like it would rename folders back - confirm before re-running.
for directory in directories:
    # os.path.basename splits on the platform's own separators, so the folder
    # name is extracted whether glob returned a '\\' or a '/' separated path;
    # normpath drops any trailing separator first.
    folder_name = os.path.basename(os.path.normpath(directory))

    # Translate the folder name with contents1, falling back to the reversed
    # mapping contents2.
    try:
        new_name = contents1[folder_name]
    except KeyError:
        # A name present in neither mapping still raises KeyError here.
        new_name = contents2[folder_name]

    os.rename(directory, os.path.join(root, new_name))

In [3]:
root = './datasets/animals/original/'

directories = glob(os.path.join(root, '*'))

# Names of the entries under root, printed to check that the folders were
# renamed to English. os.path.basename is used instead of searching for '\\'
# so the name is extracted with either path separator.
directory_names = [
    os.path.basename(os.path.normpath(directory)) for directory in directories
]

print(directory_names)

['butterfly', 'cat', 'chicken', 'cow', 'dog', 'elephant', 'horse', 'sheep', 'spider', 'squirrel']


In [4]:
root = './datasets/animals/original/'

# Rename every file in each directory to '<directory name><number>.png',
# numbering from 1 without zero padding (cat1.png, cat2.png, ...).
# Only the file name changes; os.rename does not convert the image data.
# NOTE(review): if the files already follow this naming scheme, a target name
# may already exist (os.rename then overwrites on POSIX and raises on
# Windows) - confirm the cell is only run once on freshly extracted data.
for name in directory_names:
    directory = os.path.join(root, name)

    # os.listdir returns a list, so renaming inside the loop does not disturb
    # the iteration.
    for i, file_name in enumerate(os.listdir(directory), start=1):
        old_file = os.path.join(directory, file_name)
        new_file = os.path.join(directory, f'{name}{i}.png')

        os.rename(old_file, new_file)

In [5]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Directory that holds one sub-folder per class.
root = './datasets/animals/original/'

# Generator that multiplies every pixel value by 1/255.
idg = ImageDataGenerator(rescale=1 / 255)

# Iterate over the images found under root in batches of 32, resized to
# 150x150, with one-hot ('categorical') labels taken from the sub-folder names.
generator = idg.flow_from_directory(
    directory=root,
    class_mode='categorical',
    batch_size=32,
    target_size=(150, 150),
)

# Mapping of class name -> class index.
print(generator.class_indices)

Found 26179 images belonging to 10 classes.
{'butterfly': 0, 'cat': 1, 'chicken': 2, 'cow': 3, 'dog': 4, 'elephant': 5, 'horse': 6, 'sheep': 7, 'spider': 8, 'squirrel': 9}


In [6]:
# Invert class_indices (class name -> index) into a new dict of index -> class name.
target_name = dict(zip(generator.class_indices.values(), generator.class_indices.keys()))
target_name

{0: 'butterfly',
 1: 'cat',
 2: 'chicken',
 3: 'cow',
 4: 'dog',
 5: 'elephant',
 6: 'horse',
 7: 'sheep',
 8: 'spider',
 9: 'squirrel'}

In [7]:
# Class name of every sample, in the same order as generator.classes.
target_names = [target_name[target] for target in generator.classes]

In [14]:
import pandas as pd

# Build a DataFrame with one row per image: file path, class name and class index.
a_df = pd.DataFrame(
    data={
        'file_paths': generator.filepaths,
        'target_names': target_names,
        'targets': generator.classes,
    },
)
a_df

Unnamed: 0,file_paths,target_names,targets
0,./datasets/animals/original/butterfly\butterfl...,butterfly,0
1,./datasets/animals/original/butterfly\butterfl...,butterfly,0
2,./datasets/animals/original/butterfly\butterfl...,butterfly,0
3,./datasets/animals/original/butterfly\butterfl...,butterfly,0
4,./datasets/animals/original/butterfly\butterfl...,butterfly,0
...,...,...,...
26174,./datasets/animals/original/squirrel\squirrel9...,squirrel,9
26175,./datasets/animals/original/squirrel\squirrel9...,squirrel,9
26176,./datasets/animals/original/squirrel\squirrel9...,squirrel,9
26177,./datasets/animals/original/squirrel\squirrel9...,squirrel,9


In [15]:
# Replace every backslash in file_paths with a forward slash.
a_df.loc[:, 'file_paths'] = a_df['file_paths'].apply(lambda path: '/'.join(path.split('\\')))
a_df

Unnamed: 0,file_paths,target_names,targets
0,./datasets/animals/original/butterfly/butterfl...,butterfly,0
1,./datasets/animals/original/butterfly/butterfl...,butterfly,0
2,./datasets/animals/original/butterfly/butterfl...,butterfly,0
3,./datasets/animals/original/butterfly/butterfl...,butterfly,0
4,./datasets/animals/original/butterfly/butterfl...,butterfly,0
...,...,...,...
26174,./datasets/animals/original/squirrel/squirrel9...,squirrel,9
26175,./datasets/animals/original/squirrel/squirrel9...,squirrel,9
26176,./datasets/animals/original/squirrel/squirrel9...,squirrel,9
26177,./datasets/animals/original/squirrel/squirrel9...,squirrel,9


In [17]:
from sklearn.model_selection import train_test_split

# Split the paths and class indices into train and test sets (test share 0.2),
# stratified on the class index so every split keeps the class proportions.
train_images, test_images, train_targets, test_targets = train_test_split(
    a_df['file_paths'],
    a_df['targets'],
    test_size=0.2,
    stratify=a_df['targets'],
    random_state=124,
)

# Per-class sample counts of each split.
print(train_targets.value_counts())
print(test_targets.value_counts())

targets
4    3890
8    3857
2    2478
6    2098
0    1690
3    1493
9    1490
7    1456
1    1334
5    1157
Name: count, dtype: int64
targets
4    973
8    964
2    620
6    525
0    422
3    373
9    372
7    364
1    334
5    289
Name: count, dtype: int64


In [18]:
from sklearn.model_selection import train_test_split

# Split the current train set again to carve out a validation set
# (validation share 0.2 of the train set), stratified on the class index.
train_images, validation_images, train_targets, validation_targets = train_test_split(
    train_images,
    train_targets,
    test_size=0.2,
    stratify=train_targets,
    random_state=124,
)

# Per-class sample counts of each split.
print(train_targets.value_counts())
print(validation_targets.value_counts())

targets
4    3112
8    3086
2    1982
6    1678
0    1352
3    1194
9    1192
7    1165
1    1067
5     926
Name: count, dtype: int64
targets
4    778
8    771
2    496
6    420
0    338
3    299
9    298
7    291
1    267
5    231
Name: count, dtype: int64


In [None]:
# Build the train, validation and test DataFrames.
# train_test_split keeps the original a_df index labels on the Series it
# returns, so selecting those labels from a_df recovers the full rows
# (file_paths, target_names, targets) that belong to each split.
train_df = a_df.loc[train_images.index].reset_index(drop=True)
validation_df = a_df.loc[validation_images.index].reset_index(drop=True)
test_df = a_df.loc[test_images.index].reset_index(drop=True)

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import albumentations as A

IMAGE_SIZE = 64
BATCH_SIZE = 64

# Built once so the augmenter is not re-created for every image.
_horizontal_flip = A.HorizontalFlip(p=0.5)


def transform(image):
    """Return `image` after a horizontal flip applied with probability 0.5.

    Passed to ImageDataGenerator as preprocessing_function. The augmenter is
    called with the image as a keyword argument and its result is read from
    the 'image' entry of the returned dict.
    """
    return _horizontal_flip(image=image)['image']


# Only the train generator augments; all three multiply pixel values by 1/255.
train_generator = ImageDataGenerator(preprocessing_function=transform, rescale=1./255)
validation_generator = ImageDataGenerator(rescale=1./255)
test_generator = ImageDataGenerator(rescale=1./255)

# Flow from the DataFrames; train_df, validation_df and test_df must already
# exist with 'file_paths' and 'target_names' columns.
# With class_mode='categorical' the y_col column must hold string labels, so
# 'target_names' is used rather than the integer 'targets' column.
# batch_size is passed explicitly so BATCH_SIZE takes effect.
train_flow = train_generator.flow_from_dataframe(
    dataframe=train_df,
    x_col='file_paths',
    y_col='target_names',
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=True,
)

validation_flow = validation_generator.flow_from_dataframe(
    dataframe=validation_df,
    x_col='file_paths',
    y_col='target_names',
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)

test_flow = test_generator.flow_from_dataframe(
    dataframe=test_df,
    x_col='file_paths',
    y_col='target_names',
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)