In [None]:
import os
import time
import random
import shutil
import glob
import cv2

import timm
import torch
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

# Augraphy 증강

In [None]:
import os
import cv2
from augraphy import AugraphyPipeline, NoiseTexturize, DirtyDrum, InkBleed, LightingGradient, SubtleNoise, BleedThrough,BadPhotoCopy
from concurrent.futures import ThreadPoolExecutor,as_completed
from tqdm import tqdm

# Original training label table (loaded here; the augmentation code in this cell does not read it).
origin_df = pd.read_csv('../data/train.csv')

# 1. Augraphy pipeline: a single NoiseTexturize step.
noise_texture = NoiseTexturize(
    sigma_range=(10, 15),            # high noise strength (author's note: gives a distinct texture)
    turbulence_range=(2, 4),         # slight irregularity (author's note: natural-looking pattern)
    texture_width_range=(150, 200),  # medium texture size
    texture_height_range=(150, 200),
    p=1.0,                           # always apply the effect
)
augraphy_pipeline = AugraphyPipeline([noise_texture])

# 2. Where the source images are read from and where augmented images are written.
input_dir = '../data/train'
output_dir = "../data/train_augmented"

# Make sure the destination folder exists before any worker tries to write into it.
os.makedirs(output_dir, exist_ok=True)

def process_and_save_image(filename):
    """Apply the Augraphy pipeline to one image and save the result.

    Reads ``filename`` from ``input_dir``, passes it through
    ``augraphy_pipeline`` and writes the output into ``output_dir`` under the
    name ``aug_texture_<filename>``.

    Args:
        filename: File name (not a full path) of an image inside ``input_dir``.

    Returns:
        The path the augmented image was written to.

    Raises:
        FileNotFoundError: If OpenCV could not read the input image.
        OSError: If OpenCV reported that the output image was not written.
    """
    img_path = os.path.join(input_dir, filename)
    image = cv2.imread(img_path)
    # cv2.imread signals a missing/unreadable file by returning None instead of raising,
    # so fail here with a clear message rather than deep inside the pipeline.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")

    augmented_image = augraphy_pipeline(image)
    output_path = os.path.join(output_dir, f"aug_texture_{filename}")
    # cv2.imwrite reports failure through its boolean return value.
    if not cv2.imwrite(output_path, augmented_image):
        raise OSError(f"Could not write image: {output_path}")

    return output_path
# Augment every image in input_dir on a thread pool, with a tqdm progress bar.
filenames = []
for entry in os.listdir(input_dir):
    if entry.endswith(('.jpg', '.jpeg', '.png')):
        filenames.append(entry)

with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    # Remember which file each Future belongs to so failures can be reported by name.
    futures = {}
    for filename in filenames:
        futures[executor.submit(process_and_save_image, filename)] = filename

    # Advance the progress bar as each task finishes, in completion order.
    progress = tqdm(as_completed(futures), total=len(futures), desc='Processing Images')
    for future in progress:
        filename = futures[future]
        try:
            result = future.result()  # re-raises any exception raised in the worker
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Albumentations 증강

In [None]:
# Every transform below uses p=1.0, i.e. it is applied on each call.

# --- Blur ---
median_blur = A.MedianBlur(blur_limit=17, p=1.0)
motion_blur = A.MotionBlur(blur_limit=17, p=1.0)
defocus = A.Defocus(p=1.0)

# --- Brightness / colour ---
random_brightness_contrast = A.RandomBrightnessContrast(
    brightness_limit=(-0.8, 0.8),
    contrast_limit=0.2,
    p=1.0,
)
hue_saturation = A.HueSaturationValue(
    hue_shift_limit=60,
    sat_shift_limit=60,
    val_shift_limit=60,
    p=1.0,
)
gamma = A.RandomGamma(p=1.0)

# --- Noise ---
max_gauss_noise = A.GaussNoise(var_limit=(800, 1500), p=1.0)

# --- Fixed-angle rotations: limit=(a, a) leaves a single possible angle ---
# NOTE(review): A.Rotate appears to keep the input canvas size, so non-square
# images may lose content at these angles - confirm this is intended.
rot90 = A.Rotate(limit=(90, 90), p=1.0)
rot180 = A.Rotate(limit=(180, 180), p=1.0)
rot270 = A.Rotate(limit=(270, 270), p=1.0)

# --- Flips ---
horizontal_flip = A.HorizontalFlip(p=1.0)
vertical_flip = A.VerticalFlip(p=1.0)

# --- Paths ---
input_dir = "../data/train"  # folder holding the original images
output_dir = "../data/train_augmented/"  # folder the augmented images are written to
os.makedirs(output_dir, exist_ok=True)  # create the output folder if it does not exist

# The Augraphy stage already wrote its results into output_dir, and those results are
# the inputs of this stage, so files are read from output_dir rather than input_dir.
# Only files carrying the Augraphy output prefix are selected: this cell writes into the
# same folder, and without the filter a re-run would re-augment its own outputs.
SOURCE_PREFIXES = ("aug_texture_",)  # extend if the Augraphy stage writes other prefixes
files_with_keyword = [f for f in os.listdir(output_dir) if f.startswith(SOURCE_PREFIXES)]

# (output file prefix, transform, apply to the horizontally flipped image?)
# The order matches the order in which the variants were originally generated.
augmentations = [
    ("rot90_nt_", rot90, False),
    ("rot180_nt_", rot180, False),
    ("rot270_nt_", rot270, False),
    ("frot90_nt_", rot90, True),
    ("frot180_nt_", rot180, True),
    ("frot270_nt_", rot270, True),
    ("med_blur_", median_blur, False),
    ("mot_blur_", motion_blur, False),
    ("bright_contrast_", random_brightness_contrast, False),
    ("hue_sat_", hue_saturation, False),
    ("defocus_", defocus, False),
    ("gamma_", gamma, False),
    ("very_strong_gauss_noise_", max_gauss_noise, False),
]

for img_name in tqdm(files_with_keyword):
    img_path = os.path.join(output_dir, img_name)
    image = cv2.imread(img_path)

    # cv2.imread returns None for unreadable files (e.g. non-image entries in the folder).
    if image is None:
        print(f"Could not read image {img_name}. Skipping.")
        continue
    image_flip = horizontal_flip(image=image)["image"]

    for prefix, transform, use_flipped in augmentations:
        source = image_flip if use_flipped else image
        augmented = transform(image=source)["image"]
        out_path = os.path.join(output_dir, f"{prefix}{img_name}")
        # cv2.imwrite reports failure through its return value; surface it instead of
        # silently ending up with a missing file.
        if not cv2.imwrite(out_path, augmented):
            print(f"Could not write image {out_path}.")


# Add a horizontally and a vertically flipped copy of every image in output_dir.
# Files that already carry a flip prefix are skipped, so re-running this cell does not
# stack flips on flips (hf_hf_..., vf_hf_...) and keep doubling the folder.
FLIP_PREFIXES = ("hf_", "vf_")
list_dir = [name for name in os.listdir(output_dir) if not name.startswith(FLIP_PREFIXES)]

# (output file prefix, flip transform)
flips = [("hf_", horizontal_flip), ("vf_", vertical_flip)]

for img_name in tqdm(list_dir):
    img_path = os.path.join(output_dir, img_name)
    image = cv2.imread(img_path)

    # cv2.imread returns None for unreadable files.
    if image is None:
        print(f"Could not read image {img_name}. Skipping.")
        continue

    for prefix, flip in flips:
        flipped = flip(image=image)["image"]
        cv2.imwrite(os.path.join(output_dir, f"{prefix}{img_name}"), flipped)

## 기타 작업코드

In [None]:
# Copy the original training images listed in train.csv into train_augmented, so the
# originals sit in the same folder as their augmented variants.
df = pd.read_csv('../data/train.csv')
# A plain loop is used because the copy is a pure side effect; DataFrame.apply would
# build and display a throwaway Series of destination paths.
for image_id in df['ID']:
    shutil.copy(os.path.join('../data/train', image_id), '../data/train_augmented')

In [None]:
# Number of .jpg files currently in output_dir (shown as the cell output).
sum(1 for entry in os.listdir(output_dir) if entry.endswith('.jpg'))

## 증강파일 labeling한 train_augmented dataframe 생성 후 저장

In [None]:
# Load the original label table (the cells below read its 'ID' and 'target' columns).
train_df = pd.read_csv('../data/train.csv')

In [None]:
# Folder whose file names are scanned below to find the augmented variants of each ID.
train_dir = '../data/train_augmented'

In [None]:
# Build label rows for the augmented files: every file in train_dir whose name ends with
# an original ID (but is not the original itself) inherits that ID's target.
# NOTE(review): suffix matching assumes no ID is itself a suffix of another ID - confirm.

# List the directory once; the listing does not change while labels are being built,
# so there is no need to hit the filesystem again for every row of train_df.
augmented_files = os.listdir(train_dir)

# Rows to append, collected in a list and turned into a DataFrame in one go.
additional_rows = []
for base_id, target in zip(train_df['ID'], train_df['target']):
    for filename in augmented_files:
        # Keep only prefixed variants of this ID, not the original file itself.
        if filename != base_id and filename.endswith(base_id):
            additional_rows.append({'ID': filename, 'target': target})

# Original rows first, followed by the rows for the augmented files.
new_train_df = pd.concat([train_df, pd.DataFrame(additional_rows)], ignore_index=True)

In [None]:
# Save the combined (original + augmented) label table, without the index column.
new_train_df.to_csv('../data/train_augmented.csv', index=False)