In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import os 
from glob import glob 
from PIL import Image
import albumentations as A
from albumentations import ImageOnlyTransform
from augraphy import *
from tqdm import tqdm

INFO:albumentations.check_version:A new version of Albumentations is available: 1.4.13 (you have 1.4.12). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.


# 1. Data 증강 

In [2]:
# --- Locations on disk -------------------------------------------------------
# Project root; every other path below is this prefix plus a relative part.
PRE_PATH = '/upstage-cv-classification-cv2/'
TRAIN_IMAGE_PATH = f"{PRE_PATH}data/train"
# Folder that will receive the augmented images (name fixed up front).
TRAIN_AUG_IMAGE_PATH = f"{PRE_PATH}data/train_base_aug"
META_CSV_PATH = f"{PRE_PATH}data/meta.csv"
TRAIN_CSV_PATH = f"{PRE_PATH}data/train.csv"

# --- Tables ------------------------------------------------------------------
META_DF = pd.read_csv(META_CSV_PATH)
TRAIN_DF = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
def _rotate_by(angle):
    """Build a Rotate step whose angle range is the single value `angle` (degrees), always applied."""
    return A.Rotate(limit=(angle, angle), p=1)


# No transformation at all
original = A.Compose([])

# Horizontal flip, optionally followed by a fixed rotation
hf_rotate_000 = A.Compose([A.HorizontalFlip(p=1)])
hf_rotate_045 = A.Compose([A.HorizontalFlip(p=1), _rotate_by(45)])
hf_rotate_090 = A.Compose([A.HorizontalFlip(p=1), _rotate_by(90)])
hf_rotate_135 = A.Compose([A.HorizontalFlip(p=1), _rotate_by(135)])
hf_rotate_180 = A.Compose([A.HorizontalFlip(p=1), _rotate_by(180)])
hf_rotate_225 = A.Compose([A.HorizontalFlip(p=1), _rotate_by(225)])
hf_rotate_270 = A.Compose([A.HorizontalFlip(p=1), _rotate_by(270)])
hf_rotate_315 = A.Compose([A.HorizontalFlip(p=1), _rotate_by(315)])

# Fixed rotation only
rotate_045 = A.Compose([_rotate_by(45)])
rotate_090 = A.Compose([_rotate_by(90)])
rotate_135 = A.Compose([_rotate_by(135)])
rotate_180 = A.Compose([_rotate_by(180)])
rotate_225 = A.Compose([_rotate_by(225)])
rotate_270 = A.Compose([_rotate_by(270)])
rotate_315 = A.Compose([_rotate_by(315)])

# Every pipeline to run, as (prefix, transform) tuples: the first element is the
# prefix prepended to the image ID, the second is the transform object itself.
base_aug_types = [
    ("original_", original),
    ("hf_r000_", hf_rotate_000),
    ("hf_r045_", hf_rotate_045),
    ("hf_r090_", hf_rotate_090),
    ("hf_r135_", hf_rotate_135),
    ("hf_r180_", hf_rotate_180),
    ("hf_r225_", hf_rotate_225),
    ("hf_r270_", hf_rotate_270),
    ("hf_r315_", hf_rotate_315),
    ("r045_", rotate_045),
    ("r090_", rotate_090),
    ("r135_", rotate_135),
    ("r180_", rotate_180),
    ("r225_", rotate_225),
    ("r270_", rotate_270),
    ("r315_", rotate_315),
]

In [4]:
# Image.save() does not create missing parent folders, so make sure the output
# directory exists before the first augmented image is written.
os.makedirs(TRAIN_AUG_IMAGE_PATH, exist_ok=True)

# Parallel lists: file name and label of every augmented image that gets written.
ids = []
targets = []

# itertuples() yields (index, <col 0>, <col 1>); the unpacking below therefore
# expects TRAIN_DF to have exactly two columns, the image ID and its target.
for index, ID, target in tqdm(TRAIN_DF.itertuples(), total=TRAIN_DF.shape[0], desc='Image augmentation'):
    image_path = os.path.join(TRAIN_IMAGE_PATH, ID)
    # Read the pixels inside a context manager so the underlying file is closed
    # as soon as the array has been materialised.
    with Image.open(image_path) as source_image:
        image = np.array(source_image)

    # Run every pipeline listed in `base_aug_types` on this image.
    for prefix, aug_function in base_aug_types:
        # Albumentations pipelines take and return a dict keyed by 'image'.
        transformed_image = aug_function(image=image)['image']
        # The augmented file keeps the original ID, prefixed with the pipeline tag.
        new_ID = prefix + ID

        ids.append(new_ID)
        targets.append(target)
        Image.fromarray(transformed_image).save(os.path.join(TRAIN_AUG_IMAGE_PATH, new_ID))

# One row per augmented image, with the same column names the later cells read.
aug_data = pd.DataFrame({'ID': ids, 'target': targets})

Image augmentation: 100%|██████████| 1570/1570 [00:51<00:00, 30.31it/s]


## 1-1. 원본 데이터 복사

In [5]:
import shutil

def copy_files(source_dir, dest_dir):
    """Copy every regular file directly inside `source_dir` into `dest_dir`.

    The copy is not recursive: sub-directories of `source_dir` are skipped.
    `dest_dir` is created (including missing parents) when it does not exist.
    Failures are reported on stdout instead of being raised, so a notebook run
    continues past this cell.

    Args:
        source_dir: Folder to copy from. If it does not exist, an error message
            is printed and nothing is copied.
        dest_dir: Folder to copy into. Files with the same name are overwritten.

    Returns:
        None.
    """
    # Bail out early when there is nothing to copy from.
    if not os.path.exists(source_dir):
        print(f"Error: Source directory '{source_dir}' does not exist.")
        return

    try:
        entries = os.listdir(source_dir)
        # shutil.copy2 needs the target folder to exist already.
        os.makedirs(dest_dir, exist_ok=True)

        for filename in entries:
            source_file = os.path.join(source_dir, filename)
            # copy2 cannot copy a directory; skip it rather than abort the
            # remaining files.
            if not os.path.isfile(source_file):
                continue
            dest_file = os.path.join(dest_dir, filename)

            # copy2 copies the file contents and tries to preserve metadata.
            shutil.copy2(source_file, dest_file)
        print("All files copied successfully.")

    # File-system failures (including shutil.Error) are all OSError subclasses.
    except OSError as e:
        print(f"An error occurred: {e}")

# Copy the files from the original training-image folder into the augmented-image folder.
copy_files(TRAIN_IMAGE_PATH, TRAIN_AUG_IMAGE_PATH)

All files copied successfully.


## 1-2 train_base_aug.csv 만들기

In [6]:
# Stack the original training rows (`TRAIN_DF`) and the augmented rows (`aug_data`)
# into a single DataFrame `df`. ignore_index=True renumbers the rows so the
# combined frame does not carry duplicate index labels from its two inputs.
df = pd.concat([TRAIN_DF, aug_data], ignore_index=True)

# Where the combined label file is written (the index is not saved).
TRAIN_AUG_CSV_PATH = PRE_PATH + 'data/train_base_aug.csv'
df.to_csv(TRAIN_AUG_CSV_PATH, index=False)

## 1-3 라벨링이 잘못된 데이터 수정

In [7]:
# Load the label file at TRAIN_AUG_CSV_PATH into `df`.
df = pd.read_csv(TRAIN_AUG_CSV_PATH)

# Replacement labels, keyed by a fragment of the image ID. Checked in this order.
_RELABELED_IDS = {
    "45f0d2dfc7e47c03": 7,
    "aec62dced7af97cd": 14,
    "8646f2c3280a4f49": 3,
    "1ec14a14bbe633db": 7,
    "7100c5c67aecadc5": 7,
    "c5182ab809478f12": 14,
    "38d1796b6ad99ddd": 10,
    "0583254a73b48ece": 10,
}


def update_target(row):
    """Return the replacement label for `row`, or its current label if none applies.

    Matching is by substring on `row['ID']`, so an ID that merely contains one
    of the listed fragments (for example with a prefix in front) is relabelled
    too.

    Args:
        row: Mapping-like object exposing 'ID' (str) and 'target'.

    Returns:
        The label from the lookup table for the first fragment found in the ID,
        otherwise `row['target']` unchanged.
    """
    image_id = row['ID']
    for id_fragment, fixed_label in _RELABELED_IDS.items():
        if id_fragment in image_id:
            return fixed_label
    return row['target']

# Compute the label for every row with update_target, store it back into
# 'target', then overwrite the CSV with the result.
corrected_targets = df.apply(update_target, axis=1)
df['target'] = corrected_targets
df.to_csv(TRAIN_AUG_CSV_PATH, index=False)