In [None]:
import numpy as np
import albumentations as at
import cv2
import os
import random
import pandas as pd
import matplotlib.pyplot as plt

# Path configuration
input_folder = '/root/data/home/data/train'
output_folder = '/root/data/home/data/V6_train'
os.makedirs(output_folder, exist_ok=True)  # no error if the folder already exists

train_csv_path = '/root/data/home/data/train.csv'
output_csv_path = '/root/data/home/data/V6_train_labels.csv'

# Load the training labels
df = pd.read_csv(train_csv_path)
# Map every value of the 'ID' column to the 'target' value on the same row
label_dict = dict(zip(df['ID'], df['target']))

# Define the augmentation pipeline
def _some_of_per_call(build_transforms, n_choices):
    """Return a transform that applies ``SomeOf`` with ``n`` drawn on every call.

    Writing ``at.SomeOf(..., n=random.randint(a, b))`` evaluates ``randint``
    exactly once, while the pipeline is being built, so every image processed
    afterwards uses the same ``n``. Building one ``SomeOf`` per candidate ``n``
    and letting ``at.OneOf`` choose between them moves that draw to call time.

    Args:
        build_transforms: Zero-argument callable returning a new list of
            transforms, so each ``SomeOf`` owns its own transform instances.
        n_choices: Candidate values for ``n``. Every ``SomeOf`` is created
            with ``p=1``, so ``OneOf`` weights them equally.

    Returns:
        An ``at.OneOf`` wrapping one ``at.SomeOf`` per value in ``n_choices``.
    """
    return at.OneOf(
        [at.SomeOf(build_transforms(), n=n, p=1) for n in n_choices],
        p=1,
    )


def _noise_and_flip_transforms():
    """Build the first transform group: noise, brightness/contrast and flips."""
    return [
        at.GaussNoise(p=1, var_limit=(500, 1000)),
        at.RandomBrightnessContrast(brightness_limit=(-0.3, 0.3), contrast_limit=(-0.3, 0.3), p=1),
        at.HorizontalFlip(p=1),
        at.VerticalFlip(p=1),
    ]


def _distortion_and_color_transforms():
    """Build the second transform group: dropout, distortions and color changes."""
    return [
        at.CoarseDropout(max_holes=8, max_height=32, max_width=32, p=1),  # Cutout
        at.GridDistortion(num_steps=5, distort_limit=0.3, p=1),  # Grid distortion
        at.ElasticTransform(alpha=1, sigma=50, alpha_affine=50, p=1),  # Elastic transform
        at.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=45, p=1),  # Random shift, scale, rotate
        at.CLAHE(clip_limit=2, p=1),  # Contrast Limited Adaptive Histogram Equalization
        at.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=1),  # Hue/saturation shift
        at.MultiplicativeNoise(multiplier=(0.5, 1.5), p=0.5),  # Multiplicative noise for color variation
    ]


augmentation_pipeline = at.Compose([
    # 2 or 3 transforms from the first group, re-drawn for every image
    _some_of_per_call(_noise_and_flip_transforms, (2, 3)),
    # 1 or 2 transforms from the second group, re-drawn for every image
    _some_of_per_call(_distortion_and_color_transforms, (1, 2)),
    # Every range below stays at least 10 degrees away from 0/180/360; the
    # original note says this margin is left because the image would otherwise
    # rotate back to an upright (axis-aligned) orientation.
    at.OneOf([
        at.Rotate(limit=(10, 30), border_mode=cv2.BORDER_CONSTANT, value=(255, 255, 255), p=1),
        at.Rotate(limit=(150, 170), border_mode=cv2.BORDER_CONSTANT, value=(255, 255, 255), p=1),
        at.Rotate(limit=(190, 210), border_mode=cv2.BORDER_CONSTANT, value=(255, 255, 255), p=1),
        at.Rotate(limit=(330, 350), border_mode=cv2.BORDER_CONSTANT, value=(255, 255, 255), p=1),
    ], p=1),
])
# MixUp helper (applied at random)
def mixup(image_path, label, labels_df, alpha=0.5, apply_prob=0.5):
    """Load an image and, with some probability, blend it with a same-class image.

    Args:
        image_path: Path of the base image; read with ``cv2.imread``.
        label: Value compared against the 'target' column to find partners.
        labels_df: DataFrame providing the 'ID' and 'target' columns.
        alpha: Blend weight of the base image; the partner gets ``1 - alpha``.
        apply_prob: A uniform draw above this value skips the blend.

    Returns:
        The blended image when a partner was found and both files loaded,
        otherwise whatever ``cv2.imread(image_path)`` returns.

    Note:
        The partner file is looked up in the module-level ``input_folder``.
    """
    skip_blend = np.random.rand() > apply_prob
    if skip_blend:
        return cv2.imread(image_path)

    # Collect the IDs sharing this label, minus the base image itself
    own_name = os.path.basename(image_path)
    class_rows = labels_df[labels_df['target'] == label]
    candidates = class_rows['ID'].tolist()
    if own_name in candidates:
        candidates.remove(own_name)

    if not candidates:
        return cv2.imread(image_path)

    partner_name = random.choice(candidates)
    partner_path = os.path.join(input_folder, partner_name)

    base = cv2.imread(image_path)
    partner = cv2.imread(partner_path)
    if base is None or partner is None:
        return cv2.imread(image_path)

    # cv2.resize takes (width, height); match the partner to the base image
    target_size = (base.shape[1], base.shape[0])
    partner_fitted = cv2.resize(partner, target_size)
    return cv2.addWeighted(base, alpha, partner_fitted, 1 - alpha, 0)

# Image augmentation and saving
# Collects one [file name, label] row per image that augment_and_save wrote.
output_data = []
def augment_and_save(image_path, output_folder, augmentations, label, labels_df, num_augments=60):
    """Write the original image plus ``num_augments`` augmented variants.

    Each file that is written successfully gets a ``[file name, label]`` row
    appended to the module-level ``output_data`` list. A failed write is
    reported and gets no row, so the label list only names files that
    ``cv2.imwrite`` reported as written.

    Args:
        image_path: Path of the source image; read with ``cv2.imread``.
        output_folder: Directory the ``.jpg`` files are written to.
        augmentations: Callable invoked as ``augmentations(image=...)`` whose
            result provides the augmented image under the ``'image'`` key.
        label: Label stored next to every file name in ``output_data``.
        labels_df: Passed through to ``mixup`` to find same-class partners.
        num_augments: Number of augmented variants to generate.

    Returns:
        None. Returns early (after printing a message) if the source image
        cannot be loaded.
    """
    filename = os.path.splitext(os.path.basename(image_path))[0]
    image = cv2.imread(image_path)
    if image is None:
        print(f"이미지 로드 실패: {image_path}")
        return

    def _write_and_record(output_name, pixels):
        # cv2.imwrite returns False when the file was not written; only
        # record a label row for files that were actually saved.
        output_path = os.path.join(output_folder, output_name)
        if cv2.imwrite(output_path, pixels):
            output_data.append([output_name, label])
        else:
            print(f"이미지 저장 실패: {output_path}")

    # Save the unmodified original
    _write_and_record(f"{filename}_origin.jpg", image)

    # Save the MixUp / augmented variants
    for i in range(num_augments):
        mixed_image = mixup(image_path, label, labels_df, apply_prob=0.2)
        if mixed_image is None:
            # mixup re-reads the file; skip this variant if that read failed
            print(f"이미지 로드 실패: {image_path}")
            continue

        # Decide the file-name tag before running the pipeline, so the
        # comparison cannot be affected by anything the pipeline does to
        # its input array. Equal pixels mean MixUp was not applied.
        if np.array_equal(mixed_image, image):
            augment_type = "original"
        else:
            augment_type = "mixup"

        # NOTE(review): images stay in OpenCV's BGR channel order here;
        # confirm whether the pipeline expects RGB input.
        augmented_image = augmentations(image=mixed_image)['image']

        _write_and_record(f"{filename}_{augment_type}_aug_{i+1}.jpg", augmented_image)

# Augment and save every image found in the input folder
for img_file in os.listdir(input_folder):
    img_path = os.path.join(input_folder, img_file)
    label = label_dict.get(img_file)
    # Files with no entry in label_dict are skipped
    if label is not None:
        augment_and_save(img_path, output_folder, augmentation_pipeline, label, df)

# Save the collected [ID, target] rows as a CSV file
output_df = pd.DataFrame(output_data, columns=['ID', 'target'])
output_df.to_csv(output_csv_path, index=False)



In [15]:
import os
import pandas as pd
import shutil

# Path configuration
v1_folder = '/root/data/home/data/prcd_train'
v2_folder = '/root/data/home/data/V2_train'
v6_folder = '/root/data/home/data/V6_train'
v7_folder = '/root/data/home/data/V7_train'
v7_csv_path = '/root/data/home/data/V7_train_labels.csv'

# Create the V7 directory
os.makedirs(v7_folder, exist_ok=True)


def _copy_labeled_images(labels, src_folder, dst_folder):
    """Copy every file listed in *labels* and return its label rows.

    Args:
        labels: DataFrame with an 'ID' column (file name) and a 'target' column.
        src_folder: Directory the files are copied from.
        dst_folder: Directory the files are copied to, under the same name.

    Returns:
        A list of ``[ID, target]`` pairs, one per row of *labels*, in row order.

    Raises:
        Whatever ``shutil.copy`` raises, e.g. ``FileNotFoundError`` when a
        listed file does not exist in *src_folder*.
    """
    rows = []
    for image_id, target in zip(labels['ID'].tolist(), labels['target'].tolist()):
        shutil.copy(os.path.join(src_folder, image_id), os.path.join(dst_folder, image_id))
        rows.append([image_id, target])
    return rows


# Load all label files up front so a missing CSV fails before anything is copied
v1_labels = pd.read_csv('/root/data/home/data/prcd_train_labels.csv')
v2_labels = pd.read_csv('/root/data/home/data/V2_train_labels.csv')
v6_labels = pd.read_csv('/root/data/home/data/V6_train_labels.csv')

# Copy the V1, V2 and V6 files into V7 and collect their labels.
# NOTE(review): if two sources list the same ID, the later copy overwrites the
# earlier file while both label rows are kept — confirm IDs are unique.
label_data = []
for source_labels, source_folder in (
    (v1_labels, v1_folder),
    (v2_labels, v2_folder),
    (v6_labels, v6_folder),
):
    label_data.extend(_copy_labeled_images(source_labels, source_folder, v7_folder))

# Convert the collected labels to a DataFrame and save them as CSV
v7_labels_df = pd.DataFrame(label_data, columns=['ID', 'target'])
v7_labels_df.to_csv(v7_csv_path, index=False)

print(f"V7 데이터셋이 {v7_folder}에 생성되었고, 라벨 CSV 파일이 {v7_csv_path}에 저장되었습니다.")


V7 데이터셋이 /root/data/home/data/V7_train에 생성되었고, 라벨 CSV 파일이 /root/data/home/data/V7_train_labels.csv에 저장되었습니다.


In [None]:
def _some_of_per_call(build_transforms, n_choices):
    """Return a transform that applies ``SomeOf`` with ``n`` drawn on every call.

    Writing ``at.SomeOf(..., n=random.randint(a, b))`` evaluates ``randint``
    exactly once, while the pipeline is being built, so every image processed
    afterwards uses the same ``n``. Building one ``SomeOf`` per candidate ``n``
    and letting ``at.OneOf`` choose between them moves that draw to call time.

    Args:
        build_transforms: Zero-argument callable returning a new list of
            transforms, so each ``SomeOf`` owns its own transform instances.
        n_choices: Candidate values for ``n``. Every ``SomeOf`` is created
            with ``p=1``, so ``OneOf`` weights them equally.

    Returns:
        An ``at.OneOf`` wrapping one ``at.SomeOf`` per value in ``n_choices``.
    """
    return at.OneOf(
        [at.SomeOf(build_transforms(), n=n, p=1) for n in n_choices],
        p=1,
    )


def _noise_and_flip_transforms():
    """Build the first transform group: noise, brightness/contrast and flips."""
    return [
        at.GaussNoise(p=1, var_limit=(500, 1000)),
        at.RandomBrightnessContrast(brightness_limit=(-0.3, 0.3), contrast_limit=(-0.3, 0.3), p=1),
        at.HorizontalFlip(p=1),
        at.VerticalFlip(p=1),
    ]


def _distortion_and_color_transforms():
    """Build the second transform group: dropout, distortions and color changes."""
    return [
        at.CoarseDropout(max_holes=8, max_height=32, max_width=32, p=1),  # Cutout
        at.GridDistortion(num_steps=5, distort_limit=0.3, p=1),  # Grid distortion
        at.ElasticTransform(alpha=1, sigma=50, alpha_affine=50, p=1),  # Elastic transform
        at.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=45, p=1),  # Random shift, scale, rotate
        at.CLAHE(clip_limit=2, p=1),  # Contrast Limited Adaptive Histogram Equalization
        at.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=1),  # Hue/saturation shift
        at.MultiplicativeNoise(multiplier=(0.5, 1.5), p=0.5),  # Multiplicative noise for color variation
    ]


# Variant of the pipeline that ends with an unrestricted rotation
augmentation_pipeline = at.Compose([
    # 2 or 3 transforms from the first group, re-drawn for every image
    _some_of_per_call(_noise_and_flip_transforms, (2, 3)),
    # 1 or 2 transforms from the second group, re-drawn for every image
    _some_of_per_call(_distortion_and_color_transforms, (1, 2)),
    # Rotate by any angle in [-180, 180], filling exposed borders with white
    at.Rotate(limit=(-180, 180), border_mode=cv2.BORDER_CONSTANT, value=(255, 255, 255), p=1),
])