## Contents
- Import Library & Define Functions
- Hyper-parameters
- Load Data
- Train Model
- Inference & Save File


## Import Library & Define Functions
* 학습 및 추론에 필요한 라이브러리를 로드합니다.
* 학습 및 추론에 필요한 함수와 클래스를 정의합니다.

In [None]:
import os
import random

import timm
import torch
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from timm.scheduler import CosineLRScheduler

from datetime import datetime
import time
from zoneinfo import ZoneInfo
import wandb

In [None]:
# Timestamp (Asia/Seoul) identifying this run; it is reused below for the W&B
# run name and for the checkpoint / prediction / submission file names.
train_time = datetime.fromtimestamp(time.time(), tz=ZoneInfo("Asia/Seoul")).strftime("%Y%m%d-%H%M%S")
train_time

# Start the Weights & Biases run that later wandb.config / wandb.log calls write to.
wandb.init(project="document-classification-renew", name=f"run-{train_time}")

In [None]:
# Fix the random seeds of Python, NumPy and PyTorch (CPU and every CUDA device).
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# NOTE(review): benchmark=True lets cuDNN auto-tune its convolution algorithms,
# which the PyTorch reproducibility notes list as a source of run-to-run
# variation. If exact reproducibility matters more than speed, confirm whether
# this should be False (optionally with cudnn.deterministic = True).
torch.backends.cudnn.benchmark = True

In [None]:
# Dataset class used for the train, validation and test splits.
class ImageDataset(Dataset):
    """Serve ``(image, label)`` pairs listed in a DataFrame.

    Columns are read by position: column 0 is the image file name (joined
    onto ``img_dir``) and column 1 is the label.

    Args:
        df: DataFrame describing the samples.
        img_dir: Directory that contains the image files.
        transform: Optional callable invoked as ``transform(image=array)``
            whose result is indexed with ``'image'``.
    """

    def __init__(self, df, img_dir, transform=None):
        self.data, self.img_dir, self.transform = df, img_dir, transform

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load the ``idx``-th image as an RGB array and return it with its label."""
        file_name = self.data.iloc[idx, 0]
        full_path = os.path.join(self.img_dir, file_name)
        pixels = np.array(Image.open(full_path).convert('RGB'))
        label = self.data.iloc[idx, 1]

        if not self.transform:
            return pixels, label

        # The transform returns a dict; only its 'image' entry is used.
        return self.transform(image=pixels)['image'], label

## Hyper-parameters
* 학습 및 추론에 필요한 하이퍼파라미터들을 정의합니다.

In [None]:
# device: use the GPU when one is available, otherwise the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model config: timm model name passed to timm.create_model
MODEL_NAME = 'tf_efficientnetv2_s.in21k_ft_in1k'

# training config
IMG_SIZE = 384  # side length used by the resize / padding transforms
LR = 1e-3  # AdamW learning rate
WD = 1e-4  # AdamW weight decay
EPOCHS = 25  # maximum number of epochs (also the cosine schedule length)
PATIENCE = 5  # early-stopping patience, in epochs
WARM_UP_EPOCHS = 3  # scheduler warm-up length
WARM_UP_LR = 1e-5  # learning rate at the start of warm-up
MIN_LR = 1e-6  # lower bound of the cosine schedule
BATCH_SIZE = 32

NUM_WORKERS = 16  # DataLoader worker processes
DELTA = 1e-4  # early-stopping tolerance on the validation loss
SHAPE = 'RESIZE' # 'RESIZE' or 'PADDING'
DROP_RATE=0.2  # drop_rate passed to timm.create_model

# prediction smoothing applied at inference time (passed to pred() as alpha / threshold)
SA = 0.05 # smoothing strength (alpha)
ST = 0.9 # max-probability threshold above which smoothing is applied

# Record the hyper-parameters in the W&B run configuration.
wandb.config.update({
    "model": MODEL_NAME,
    "img_size": IMG_SIZE,
    "learning_rate": LR,
    'weight_decay': WD,
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "num_workers": NUM_WORKERS,
    'patience': PATIENCE,
    'delta': DELTA,
    'shape': SHAPE,
    'drop_rate': DROP_RATE,
    'smoothing_alpha': SA,
    'smoothing_threshold': ST,
})

# dataset locations (resolved relative to the data/ directory further below)
TRAIN_CSV_FILE = 'train_temp_fixed_augmented_balanced_150.csv'
TRAIN_IMG_DIR = 'train_augmented_balanced_150'
TEST_IMG_DIR = "test"
# NOTE(review): these are static strings sent through wandb.log rather than
# wandb.config like the hyper-parameters above — confirm this is intended.
wandb.log({
    'train_csv_file' : TRAIN_CSV_FILE,
    "train_dir" : TRAIN_IMG_DIR,
    "test_dir" : TEST_IMG_DIR,
})

# Fraction of original images used for training; with a value >= 1.0,
# split_dataset uses every original image for both the train and the valid set.
TRAIN_RATIO = 1.0

## Load Data
* 학습, 테스트 데이터셋과 로더를 정의합니다.

In [None]:
# Channel-wise normalization with the commonly used ImageNet mean / std values.
normalize = A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

if SHAPE == 'PADDING':
    # Keep the aspect ratio: scale the longest side to IMG_SIZE, then pad the
    # rest to a square with a constant colour.
    transform = A.Compose([
        A.LongestMaxSize(max_size=IMG_SIZE),
        A.PadIfNeeded(
            min_height=IMG_SIZE,
            min_width=IMG_SIZE,
            # 0 is the value of cv2.BORDER_CONSTANT. The literal is used because
            # cv2 is not imported in this notebook, so the symbolic name raised
            # a NameError as soon as SHAPE was set to 'PADDING'.
            border_mode=0,
            # Pad with (roughly) the normalization mean colour, i.e. mean * 255.
            value=[124, 116, 104],
        ),
        normalize,
        ToTensorV2(),
    ])
else:
    # Plain resize to a square, ignoring the aspect ratio.
    transform = A.Compose([
        A.Resize(IMG_SIZE, IMG_SIZE),
        normalize,
        ToTensorV2(),
    ])

In [None]:
def split_dataset(csv_file, img_dir, train_ratio=0.8):
    """Split the images listed in ``csv_file`` into train / valid datasets.

    Only original (non-augmented) rows are split, stratified by ``target``.
    Augmented rows are then attached to whichever side holds an original
    whose ID (file name up to its first dot) occurs in the augmented row's
    ``ID``.

    When ``train_ratio >= 1.0`` no split is made: every original image is used
    for both train and valid, so validation metrics are then measured on
    training data.

    Args:
        csv_file: CSV with at least ``ID``, ``target`` and ``is_augmented``
            columns.
        img_dir: Directory containing the images named in ``ID``.
        train_ratio: Fraction of original images assigned to the train split.

    Returns:
        ``(train_dataset, valid_dataset)`` as ``ImageDataset`` objects that use
        the module-level ``transform``.
    """
    import re  # local import: only needed to escape IDs for the regex below

    df = pd.read_csv(csv_file)

    # Original images and their ID without the extension.
    original_df = df[df['is_augmented'] == False].copy()
    original_df['original_id'] = original_df['ID'].apply(lambda x: os.path.splitext(x)[0].split('.')[0])

    if train_ratio >= 1.0:
        train_df, valid_df = original_df, original_df
    else:
        # Stratified split over the original images only.
        train_df, valid_df = train_test_split(
            original_df,
            train_size=train_ratio,
            stratify=original_df['target'],
            random_state=SEED,
        )

    augmented_df = df[df['is_augmented'] == True]

    def _augmented_for(originals):
        """Return the augmented rows whose ID contains one of ``originals``' IDs."""
        ids = originals['original_id']
        if ids.empty:
            # An empty alternation would be the pattern '' and match every row.
            return augmented_df.iloc[0:0]
        # Escape each ID so regex metacharacters in file names match literally.
        pattern = '|'.join(re.escape(str(original_id)) for original_id in ids)
        # na=False keeps the mask boolean when an ID is missing.
        return augmented_df[augmented_df['ID'].str.contains(pattern, regex=True, na=False)]

    # NOTE(review): matching is by substring, so this assumes augmented file
    # names embed the original ID and that no ID is a substring of another —
    # confirm against the augmentation script.
    train = pd.concat([train_df, _augmented_for(train_df)])
    valid = pd.concat([valid_df, _augmented_for(valid_df)])

    train_count = len(train)
    valid_count = len(valid)
    wandb.log({
        "train_ratio" : train_ratio,
        "train_count" : train_count,
        "valid_count" : valid_count
    })

    print(f"Train set: {train_count} images (Original: {len(train_df)}, Augmented: {train_count - len(train_df)})")
    print(f"Valid set: {valid_count} images (Original: {len(valid_df)}, Augmented: {valid_count - len(valid_df)})")

    print("\nClass distribution:")
    print("Train:", train['target'].value_counts(normalize=True))
    print("Valid:", valid['target'].value_counts(normalize=True))

    # Wrap both splits in ImageDataset objects.
    train_dataset = ImageDataset(train, img_dir, transform=transform)
    valid_dataset = ImageDataset(valid, img_dir, transform=transform)

    return train_dataset, valid_dataset

# Build the train / validation datasets from the configured CSV and image directory.
train_dataset, val_dataset = split_dataset(csv_file=f"data/{TRAIN_CSV_FILE}", img_dir=f"data/{TRAIN_IMG_DIR}", train_ratio=TRAIN_RATIO)

In [None]:
# Test dataset: file names come from the sample submission CSV. ImageDataset
# returns that file's second column as the label (presumably a placeholder
# value — confirm).
test_dataset = ImageDataset(
    pd.read_csv("data/sample_submission.csv"),
    f"data/{TEST_IMG_DIR}",
    transform
)
print(f"Test set: {len(test_dataset)} images")

In [None]:
# DataLoader definitions. Only the training loader shuffles and drops the last batch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, drop_last=True) # drop the last batch if it is smaller than batch_size
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

## Train Model
* 모델을 로드하고, 학습을 진행합니다.

In [None]:
def evaluate(loader, model, loss_fn, device):
    """Score ``model`` on ``loader`` with gradients disabled.

    Args:
        loader: Iterable of ``(inputs, labels)`` batches.
        model: Model to evaluate; it is switched to eval mode.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        device: Device the batches are moved to.

    Returns:
        ``(loss, accuracy, macro_f1)`` where ``loss`` is the mean of the
        per-batch losses (sum of batch losses divided by the batch count).
    """
    model.eval()

    loss_sum = 0
    predicted = []
    expected = []

    with torch.no_grad():
        for batch_inputs, batch_labels in tqdm(loader, desc="Evaluating"):
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            loss_sum += loss_fn(logits, batch_labels).item()

            # Index of the largest logit per sample is the predicted class.
            batch_preds = torch.max(logits, 1)[1]
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(batch_labels.cpu().numpy())

    mean_loss = loss_sum / len(loader)
    accuracy = accuracy_score(expected, predicted)
    macro_f1 = f1_score(expected, predicted, average='macro')
    return mean_loss, accuracy, macro_f1

In [None]:
class EarlyStopping:
    """Stop training once the validation loss has stopped improving.

    Call the instance once per epoch with the validation loss and the model.
    The model's ``state_dict`` is saved to ``path`` whenever the loss improves;
    ``early_stop`` becomes True after ``patience`` consecutive calls without
    improvement.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        verbose: Print a message each time a checkpoint is saved.
        delta: Absolute tolerance; a loss within ``delta`` of the best one
            counts as "no improvement". ``np.isclose`` also applies its default
            relative tolerance on top of this.
        path: Destination of the checkpoint file.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Update the stopping state with this epoch's validation loss."""
        # Higher score == lower loss.
        score = -val_loss

        if self.best_score is None:
            # First call: everything is an improvement.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif np.isclose(score, self.best_score, atol=self.delta) or score < self.best_score:
            # Not better than the best score by more than the tolerance.
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Save ``model.state_dict()`` to ``self.path`` and remember ``val_loss``."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...')

        # Create the target directory first; torch.save does not create it and
        # would otherwise fail on a fresh checkout without e.g. 'model/'.
        if isinstance(self.path, (str, os.PathLike)):
            parent_dir = os.path.dirname(self.path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
# Train the model for a single epoch.
def train_one_epoch(loader, model, optimizer, loss_fn, device):
    """Run one optimisation pass over ``loader`` and report the train metrics.

    The metrics dict is also sent to W&B from inside this function. The caller
    ``train`` logs the same dict again after adding validation metrics, so the
    train metrics are logged twice per epoch.

    Returns:
        Dict with ``train_loss`` (mean of per-batch losses), ``train_acc`` and
        ``train_f1`` (macro F1).
    """
    model.train()

    running_loss = 0
    epoch_preds = []
    epoch_targets = []

    progress = tqdm(loader)
    for batch_images, batch_targets in progress:
        batch_images = batch_images.to(device)
        batch_targets = batch_targets.to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad(set_to_none=True)

        logits = model(batch_images)
        batch_loss = loss_fn(logits, batch_targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        epoch_preds.extend(logits.argmax(dim=1).detach().cpu().numpy())
        epoch_targets.extend(batch_targets.detach().cpu().numpy())

        progress.set_description(f"Loss: {batch_loss.item():.4f}")

    ret = {
        "train_loss": running_loss / len(loader),
        "train_acc": accuracy_score(epoch_targets, epoch_preds),
        "train_f1": f1_score(epoch_targets, epoch_preds, average='macro'),
    }

    # Log the training metrics to wandb.
    wandb.log(ret)

    return ret

In [None]:
# Loss shared by training and evaluation.
loss_fn = nn.CrossEntropyLoss()


def genOptimizer(model):
    """Create an AdamW optimizer over all of ``model``'s parameters (LR / WD)."""
    optimizer = AdamW(params=model.parameters(), lr=LR, weight_decay=WD)
    return optimizer

def genScheduler(optimizer):
    """Create the epoch-based cosine schedule with warm-up for ``optimizer``.

    A single cosine cycle of ``EPOCHS`` epochs decays towards ``MIN_LR``; the
    first ``WARM_UP_EPOCHS`` epochs ramp up from ``WARM_UP_LR``.
    """
    schedule_options = {
        "t_initial": EPOCHS,
        "lr_min": MIN_LR,
        "warmup_lr_init": WARM_UP_LR,
        "warmup_t": WARM_UP_EPOCHS,
        "cycle_limit": 1,
        "t_in_epochs": True,
    }
    return CosineLRScheduler(optimizer, **schedule_options)

def train(loader, model, optimizer, scheduler):
    """Train ``model`` for up to ``EPOCHS`` epochs with early stopping.

    After each epoch the model is evaluated on the module-level ``val_loader``,
    the combined metrics are logged to W&B, and ``EarlyStopping`` saves the
    best weights to ``model/{train_time}_best.pt``. When the loop ends (either
    normally or by early stopping) the last weights are saved to
    ``model/{train_time}_final.pt``.

    Args:
        loader: Training DataLoader.
        model: Model to train (already on ``DEVICE``).
        optimizer: Optimizer updating ``model``.
        scheduler: timm scheduler stepped once per epoch.
    """
    model_file_path = f'model/{train_time}_best.pt'
    # torch.save does not create directories; make sure 'model/' exists.
    os.makedirs(os.path.dirname(model_file_path), exist_ok=True)

    # Early-stopping setup.
    early_stopping = EarlyStopping(PATIENCE, verbose=True, delta=DELTA, path=model_file_path)

    for epoch in range(EPOCHS):
        ret = train_one_epoch(loader, model, optimizer, loss_fn, device=DEVICE)
        val_loss, val_acc, val_f1 = evaluate(val_loader, model, loss_fn, DEVICE)
        # timm schedulers take the index of the *upcoming* epoch at the end of
        # an epoch. Passing `epoch` re-applied the LR of the epoch that just
        # finished, so the first LR was used twice and the schedule lagged by one.
        scheduler.step(epoch + 1)

        ret['epoch_count'] = epoch+1
        ret['val_loss'] = val_loss
        ret['val_acc'] = val_acc
        ret['val_f1'] = val_f1

        # Log the epoch metrics to wandb.
        wandb.log(ret)

        print(f"Epoch {epoch+1}/{EPOCHS}")
        print(f"Train - Loss: {ret['train_loss']:.4f}, Acc: {ret['train_acc']:.4f}, Macro F1: {ret['train_f1']:.4f}")
        print(f"Val - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, Macro F1: {val_f1:.4f}")

        # Early-stopping check (also saves the best checkpoint).
        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    # Save the weights of the last trained epoch.
    model_file_path = f'model/{train_time}_final.pt'
    torch.save(model.state_dict(), model_file_path)

In [None]:
# Load the pretrained backbone with a 17-class classification head.
model = timm.create_model(
    MODEL_NAME,
    pretrained=True,
    num_classes=17,
    drop_rate=DROP_RATE
).to(DEVICE)

train_optimizer = genOptimizer(model)
# Log the optimizer class name as a plain string. Wrapping the name in braces
# created a one-element set, which is not a JSON-serializable value.
wandb.log({"optimizer": train_optimizer.__class__.__name__})
train(train_loader, model, train_optimizer, genScheduler(train_optimizer))

## 평가

In [None]:
# Restore the best checkpoint written by EarlyStopping during training.
model.load_state_dict(torch.load(f'model/{train_time}_best.pt'))
model.to(DEVICE)

# Evaluate the restored model on each dataset after training.
# train_loader shuffles and drops its last incomplete batch, so the "train"
# figures below do not cover the samples in that dropped batch.
train_results = evaluate(train_loader, model, loss_fn, DEVICE)
valid_results = evaluate(val_loader, model, loss_fn, DEVICE)

# evaluate() returns (loss, accuracy, macro F1) in that order.
total_results = {
    "final_train_loss": train_results[0],
    "final_train_accuracy": train_results[1],
    "final_train_f1": train_results[2],
    "final_valid_loss": valid_results[0],
    "final_valid_accuracy": valid_results[1],
    "final_valid_f1": valid_results[2]
}

# Print and log the evaluation results.
print(total_results)
wandb.log(total_results)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate_and_visualize_errors(model, dataloader, device, num_samples=10):
    """Evaluate ``model``, show some misclassified images and a confusion matrix.

    Args:
        model: Model to evaluate; it is switched to eval mode.
        dataloader: Iterable of ``(images, labels)`` batches.
        device: Device the batches are moved to.
        num_samples: Maximum number of misclassified images to display.

    Returns:
        ``(error_images, error_preds, error_labels)``: the misclassified image
        tensors (on CPU, exactly as produced by the loader) with their
        predicted and true labels.
    """
    model.eval()
    all_preds = []
    all_labels = []
    error_images = []
    error_preds = []
    error_labels = []

    with torch.no_grad():
        for images, labels in tqdm(dataloader, desc="Evaluating"):
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            # Collect the misclassified samples of this batch.
            errors = preds != labels
            error_images.extend(images[errors].cpu())
            error_preds.extend(preds[errors].cpu().numpy())
            error_labels.extend(labels[errors].cpu().numpy())

    # Accuracy over the whole loader.
    accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
    print(f"Accuracy: {accuracy:.4f}")

    # Show up to num_samples misclassified images.
    shown = min(num_samples, len(error_images))
    if shown > 0:
        # squeeze=False always yields a 2-D axes array, so a single error no
        # longer produces a bare Axes object that cannot be indexed.
        _, axes = plt.subplots(1, shown, figsize=(20, 4), squeeze=False)
        samples = zip(axes[0], error_images, error_preds, error_labels)
        for ax, image, predicted, actual in samples:
            # NOTE(review): tensors are displayed as produced by the loader; if
            # the loader normalizes its images the colours will look off.
            ax.imshow(image.permute(1, 2, 0).numpy())
            ax.set_title(f"Pred: {predicted}, True: {actual}")
            ax.axis('off')
        plt.tight_layout()
        plt.show()
    else:
        # plt.subplots rejects zero columns, so skip the figure entirely.
        print("No misclassified samples to display.")

    # Build and plot the confusion matrix.
    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    return error_images, error_preds, error_labels

# Error analysis
def analyze_errors(error_preds, error_labels):
    """Print the five most frequent (predicted, true) confusion pairs.

    Pairs with equal counts keep the order in which they were first seen.
    """
    tally = {}
    for predicted, actual in zip(error_preds, error_labels):
        description = f"Pred: {predicted}, True: {actual}"
        tally.setdefault(description, 0)
        tally[description] += 1

    ranked = sorted(tally.items(), key=lambda entry: entry[1], reverse=True)

    print("Most common errors:")
    for description, occurrences in ranked[:5]:
        print(f"{description}: {occurrences} times")

# Evaluate the validation set and visualise its misclassified samples.
print("Valid Set Errors:")
val_errors = evaluate_and_visualize_errors(model, val_loader, DEVICE)

# val_errors is (error_images, error_preds, error_labels).
print("Valid Set Error Analysis:")
analyze_errors(val_errors[1], val_errors[2])

## Inference

In [None]:
def smooth_predictions(predictions, alpha=0.1):
    """Blend each row of class probabilities with the uniform distribution.

    Args:
        predictions: 2-D array; the size of axis 1 is taken as the class count.
        alpha: Weight given to the uniform distribution.

    Returns:
        ``alpha / num_classes + (1 - alpha) * predictions``.
    """
    uniform_share = alpha / predictions.shape[1]
    keep_weight = 1 - alpha
    return uniform_share + keep_weight * predictions

def pred(model_file_name, alpha=0.1, threshold=0.5):
    """Load a checkpoint and predict the test set, with and without smoothing.

    The weights in ``model/{model_file_name}`` are loaded into the module-level
    ``model``, which is then run over ``test_loader``. Rows whose maximum
    probability is at least ``threshold`` are replaced by their smoothed
    version.

    Note that for ``alpha < 1`` the smoothing is an increasing affine map of
    each probability, so the smoothed argmax can normally only equal the
    original one.

    Returns:
        ``(preds, smooth_preds, probs, smooth_probs)`` as NumPy arrays.
    """
    print(f"Generating predictions for submission...{model_file_name}")

    model.load_state_dict(torch.load(f'model/{model_file_name}'))
    model.eval()

    hard_labels = []
    prob_rows = []
    for batch_images, _ in tqdm(test_loader):
        batch_images = batch_images.to(DEVICE)

        with torch.no_grad():
            logits = model(batch_images)

        # Class probabilities and their argmax for this batch.
        batch_probs = torch.softmax(logits, dim=1).detach().cpu().numpy()
        hard_labels.extend(np.argmax(batch_probs, axis=1))
        prob_rows.extend(batch_probs)

    probs_array = np.array(prob_rows)
    confidence = np.max(probs_array, axis=1)

    # Apply the smoothing only to predictions at or above the threshold.
    is_confident = confidence[:, np.newaxis] >= threshold
    smooth_probs = np.where(is_confident, smooth_predictions(probs_array, alpha), probs_array)

    smooth_preds_list = np.argmax(smooth_probs, axis=1)
    normal_preds_list = np.array(hard_labels)

    # Show the first ten predictions side by side.
    preview = zip(normal_preds_list[:10], smooth_preds_list[:10], probs_array[:10], smooth_probs[:10])
    for orig, smooth, orig_prob, smooth_prob in preview:
        print(f"Original: {orig} (max prob: {np.max(orig_prob):.4f}), Smoothed: {smooth} (max prob: {np.max(smooth_prob):.4f})")

    return normal_preds_list, smooth_preds_list, probs_array, smooth_probs

# Predict the test set with both saved checkpoints: the best one (lowest
# validation loss, saved by EarlyStopping) and the final one (last epoch).
best_preds, best_smooth_preds, best_probs, best_smooth_probs = pred(f'{train_time}_best.pt', alpha=SA, threshold=ST)
final_preds, final_smooth_preds, final_probs, final_smooth_probs = pred(f'{train_time}_final.pt', alpha=SA, threshold=ST)

In [None]:
import import_ipynb
import evaluator

def calc(path, reference_path="output/9766.csv"):
    """Compare the predictions in ``path`` against a reference prediction file.

    Args:
        path: CSV file with the predictions to score.
        reference_path: CSV file treated as ground truth. The default keeps the
            file that was previously hard-coded.
            NOTE(review): this looks like an earlier prediction file rather
            than true labels, so the score is only a proxy — confirm.

    Returns:
        ``[macro_f1, different_predictions, ground_truth_name,
        prediction_name, ground_truth, prediction]``.
    """
    file_paths = [
        reference_path,
        path
    ]

    predictions = evaluator.load_predictions(file_paths)

    # Find the samples on which the two files disagree.
    different_predictions = evaluator.find_different_predictions(predictions)

    # Print the disagreements.
    print(different_predictions)

    # Macro F1 of `path` against the reference. The result of load_predictions
    # is indexed by each file's base name up to its first dot.
    ground_truth_name = os.path.basename(file_paths[0]).split('.')[0]
    prediction_name = os.path.basename(file_paths[-1]).split('.')[0]
    ground_truth = predictions[ground_truth_name][ground_truth_name]
    prediction = predictions[prediction_name][prediction_name]
    macro_f1 = evaluator.calculate_macro_f1(ground_truth, prediction)
    print(f"Macro F1 Score: {macro_f1}")

    return [macro_f1, different_predictions, ground_truth_name, prediction_name, ground_truth, prediction]

In [None]:
def gen_pred_csv(preds, file_name):
    """Write ``preds`` into a copy of the sample submission and save it.

    Args:
        preds: Predicted labels, one per row of ``data/sample_submission.csv``
            and in the same order.
        file_name: Name of the CSV file created inside ``output_temp``.

    Returns:
        ``(path, df)``: the path of the written CSV and the DataFrame it holds.
    """
    df = pd.read_csv("data/sample_submission.csv")
    df['target'] = preds

    out_dir = 'output_temp'
    # to_csv does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    path = os.path.join(out_dir, file_name)
    df.to_csv(path, index=False)
    return path, df

# Write one CSV per prediction variant; each *_datas value is (path, DataFrame).
best_datas = gen_pred_csv(best_preds, f'{train_time}_best.csv')
best_smooth_datas = gen_pred_csv(best_smooth_preds, f'{train_time}_best_smooth.csv')
final_datas = gen_pred_csv(final_preds, f'{train_time}_final.csv')
final_smooth_datas = gen_pred_csv(final_smooth_preds, f'{train_time}_final_smooth.csv')

# Score each CSV against the reference file; each *_predictions value is
# [calc() result list, prediction DataFrame].
best_predictions = [calc(best_datas[0]), best_datas[1]]
best_smooth_predictions = [calc(best_smooth_datas[0]), best_smooth_datas[1]]
final_predictions = [calc(final_datas[0]), final_datas[1]]
final_smooth_predictions = [calc(final_smooth_datas[0]), final_smooth_datas[1]]

In [None]:
def log_results(best, final, best_smooth, final_smooth):
    """Print the four macro-F1 scores and log them to W&B.

    Each argument is a sequence whose first element is the macro-F1 score.
    """
    best_f1, final_f1 = best[0], final[0]
    print(f"Best F1: {best_f1:.4f}, Final F1: {final_f1:.4f}")

    best_smooth_f1, final_smooth_f1 = best_smooth[0], final_smooth[0]
    print(f"Best Smooth F1: {best_smooth_f1:.4f}, Final Smooth F1: {final_smooth_f1:.4f}")

    scores = {
        "best f1": best_f1,
        "final f1": final_f1,
        "best smooth f1": best_smooth_f1,
        "final smooth f1": final_smooth_f1,
    }
    wandb.log(scores)

def select_best_prediction(best, final, best_smooth, final_smooth):
    """Pick the candidate with the highest macro-F1 score.

    Each argument is indexed as ``candidate[0][0]`` to obtain its score. On a
    tie the earliest candidate in argument order wins.

    Returns:
        ``(best_f1, candidate)`` for the selected candidate.
    """
    labelled = (
        ("Best", best),
        ("Final", final),
        ("Best Smooth", best_smooth),
        ("Final Smooth", final_smooth),
    )

    # max() keeps the first of several equal maxima.
    winner_name, winner = max(labelled, key=lambda pair: pair[1][0][0])
    winner_f1 = winner[0][0]

    print(f"Selected prediction: {winner_name} (F1: {winner_f1:.4f})")
    return winner_f1, winner

# Log the four macro-F1 scores.
log_results(best_predictions[0], final_predictions[0], best_smooth_predictions[0], final_smooth_predictions[0])

# Select the variant with the highest score. selected_prediction is
# (best_f1, [calc() result list, prediction DataFrame]).
selected_prediction = select_best_prediction(best_predictions, final_predictions, best_smooth_predictions, final_smooth_predictions)
predictions = selected_prediction[1][0]
pred_df = selected_prediction[1][1]

## Save File

In [None]:
# Sanity check: the selected predictions must list the same IDs, in the same
# order, as the sample submission.
sample_submission_df = pd.read_csv("data/sample_submission.csv")
assert (sample_submission_df['ID'] == pred_df['ID']).all()

In [None]:
# Save the selected predictions as the submission file. Writing
# sample_submission_df here would save the untouched sample file, with its
# original target column, instead of the model's predictions in pred_df.
os.makedirs('output', exist_ok=True)  # to_csv does not create the directory
submission_file_path = os.path.join('output', f'{train_time}.csv')
pred_df.to_csv(submission_file_path, index=False)

In [None]:
import import_ipynb
import evaluator

# Directory containing the test images.
image_dir = test_dataset.img_dir

# Unpack the calc() result of the selected prediction.
macro_f1 = predictions[0]
different_predictions = predictions[1]
ground_truth_name = predictions[2]
prediction_name = predictions[3]
ground_truth = predictions[4]
prediction = predictions[5]

# Plot the error distribution.
evaluator.plot_error_distribution(ground_truth, prediction)

# Alternative: show images for every class (no class filter).
# evaluator.display_images_and_predictions(different_predictions, image_dir, ground_truth_name, prediction_name, class_filter=None, max_images_per_class=2)

# Show up to 3 images per class for selected classes only (here 3, 7 and 14).
evaluator.display_images_and_predictions(different_predictions, image_dir, ground_truth_name, prediction_name, class_filter=[3, 7, 14], max_images_per_class=3)

In [None]:
# Print and log the macro F1 of the selected prediction, as computed by calc()
# against its reference file.
print(f"prov macro_f1 : {macro_f1}")
wandb.log({"prov f1": macro_f1})

In [None]:
# Close the W&B run started at the top of the notebook.
wandb.finish()