# Transformer를 활용한 부도 예측 모델링
1. 트랜스포머 인코더 구조를 활용하기 위하여, 수치형 데이터와 범주형 데이터 모두 32차원 벡터로 변환
2. multi-head-self attention 통과 후 FC layer 통과
3. 리소스 한계로 epoch 10에 머물렀기에 성능이 부족할 수 있지만, epoch가 10에 도달하는 과정에서 성능 개선의 폭이 줄어들지 않았기에, 추가 성능 향상 가능성 있음
4. 추가적인 대량 데이터가 들어오면, 성능 향상이 이루어질 가능성이 있고, 추후에는 텍스트 데이터를 추가하는 등의 확장성이 뛰어난 점에 착안하여 트랜스포머 아키텍쳐 선택

# 컬럼 명에 Flag가 들어가거나, 유니크 값이 N개 미만인 경우, 사실상 범주형 정보로 분류

In [1]:
import pandas as pd
Data_EDA = pd.read_csv('../../Data/home-credit-default-risk/application_train.csv',index_col='SK_ID_CURR')

In [2]:
import pandas as pd
import numpy as np
import pandas.api.types as ptypes

threshold = 10

cat_cols_by_cardinality = []
num_cols_by_cardinality = []

for col in Data_EDA.columns:

    # 고유값 개수 확인
    unique_count = Data_EDA[col].nunique()
    
    if 'flag' in col.lower():
        cat_cols_by_cardinality.append(col)
    elif 'amt' in col.lower():
        num_cols_by_cardinality.append(col)

    # 실수형/정수형이면 일단 수치형으로 분류하되, 
    # 만약 유니크 값이 작은 범주 느낌이라면 cat_cols_by_cardinality 로 옮길 수도 있음
    elif ptypes.is_numeric_dtype(Data_EDA[col]):
        if unique_count < threshold:
            cat_cols_by_cardinality.append(col)
        else:
            num_cols_by_cardinality.append(col)
    else:
        # 문자인 경우 범주형으로 분류
        cat_cols_by_cardinality.append(col)

print("범주형(유니크값 < 10):", cat_cols_by_cardinality)
print("수치형(유니크값 >= 10):", num_cols_by_cardinality)


범주형(유니크값 < 10): ['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16

## 결측치도 의미가 있을 수 있으므로, 보간하지 않고, MISSING으로 처리

In [None]:
for c in cat_cols_by_cardinality:
    Data_EDA[c] = Data_EDA[c].fillna("MISSING")

## 범주형 정보 레이블 인코딩

In [3]:
# 범주형 컬럼 List에서 Target 컬럼을 제외함
cat_cols_by_cardinality_less_target = [x for x in cat_cols_by_cardinality if x != 'TARGET']

print(cat_cols_by_cardinality_less_target) 

['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FL

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# LabelEncoder를 보관할 딕셔너리 (각 컬럼별로 학습된 encoder를 저장)
Data_EDA_labeled = Data_EDA.copy()
encoders = {}
# 각 범주형 컬럼에 대해 LabelEncoder로 변환
for col in cat_cols_by_cardinality_less_target:
    le = LabelEncoder()
    Data_EDA_labeled[col] = le.fit_transform(Data_EDA[col])
    encoders[col] = le

## 수치형 데이터 보간 및 스케일링
- Iterative imputer
- robust scaler

In [5]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

num_data = Data_EDA[num_cols_by_cardinality].values

imp = IterativeImputer(max_iter=10, random_state=42) #하이퍼 파라미터에 대하여 추가적으로 시험해볼 것
num_data_mice = imp.fit_transform(num_data)

Data_mice = Data_EDA_labeled.copy()
Data_mice[num_cols_by_cardinality] = num_data_mice

scaler = RobustScaler()

Data_EDA_scaled_mice = Data_mice.copy()
Data_EDA_scaled_mice[num_cols_by_cardinality] = scaler.fit_transform(Data_mice[num_cols_by_cardinality])

# Transformer 기반 모델링

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer



In [9]:
train_df, test_df = train_test_split(Data_EDA_scaled, test_size=0.2, random_state=42, stratify=Data_EDA_scaled["TARGET"])

X_train_cat = train_df[cat_cols_by_cardinality_less_target].values  # (행, 50)
X_train_num = train_df[num_cols_by_cardinality].values  # (행, 45)
y_train = train_df["TARGET"].values

X_test_cat = test_df[cat_cols_by_cardinality_less_target].values
X_test_num = test_df[num_cols_by_cardinality].values
y_test = test_df["TARGET"].values

print("Train size:", len(train_df), "Test size:", len(test_df))

Train size: 246008 Test size: 61503


In [10]:
print(X_train_cat.shape)
print(X_train_num.shape)
print(y_train.shape)

(246008, 51)
(246008, 69)
(246008,)


In [11]:
class TabularDataset(Dataset):
    def __init__(self, X_cat, X_num, y):
        super().__init__()
        self.X_cat = X_cat
        self.X_num = X_num
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        cat_feats = self.X_cat[idx]  # shape: (51,)
        num_feats = self.X_num[idx]  # shape: (69,)
        label = self.y[idx]
        return {
            "cat": torch.tensor(cat_feats, dtype=torch.long),
            "num": torch.tensor(num_feats, dtype=torch.float),
            "label": torch.tensor(label, dtype=torch.long)
        }

train_dataset = TabularDataset(X_train_cat, X_train_num, y_train)
test_dataset = TabularDataset(X_test_cat, X_test_num, y_test)

batch_size = 32  # 미니 배치 사이즈를 32로 설정
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [13]:
cat_max_dict = {}
for i, c in enumerate(cat_cols_by_cardinality_less_target):
    num_cats = Data_EDA_scaled[c].max() + 1  # 최대 인덱스 + 1
    cat_max_dict[i] = num_cats


In [14]:
print(type(cat_max_dict))
print(cat_max_dict)

<class 'dict'>
{0: 2, 1: 3, 2: 2, 3: 2, 4: 8, 5: 8, 6: 5, 7: 6, 8: 6, 9: 2, 10: 2, 11: 2, 12: 2, 13: 2, 14: 2, 15: 19, 16: 3, 17: 3, 18: 7, 19: 2, 20: 2, 21: 2, 22: 2, 23: 2, 24: 2, 25: 58, 26: 5, 27: 4, 28: 8, 29: 3, 30: 10, 31: 2, 32: 2, 33: 2, 34: 2, 35: 2, 36: 2, 37: 2, 38: 2, 39: 2, 40: 2, 41: 2, 42: 2, 43: 2, 44: 2, 45: 2, 46: 2, 47: 2, 48: 2, 49: 2, 50: 2}


In [22]:
from sklearn.utils.class_weight import compute_class_weight

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class_weights = compute_class_weight(class_weight="balanced", classes=np.array([0,1]), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [25]:
print(type(class_weights))
class_weights

<class 'torch.Tensor'>


tensor([0.5439, 6.1936])

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

class TabTransformerModel(nn.Module):
    def __init__(self,
                 num_cat_features: int,
                 cat_max_dict: dict,
                 num_num_features: int,
                 d_model: int = 32,
                 nhead: int = 4,
                 num_layers: int = 2,
                 dim_feedforward: int = 64,
                 final_hidden: int = 128,
                 dropout_rate: float = 0.3):
        super().__init__()
        
        self.d_model = d_model
        self.num_cat_features = num_cat_features
        self.num_num_features = num_num_features
        
        # 범주형 임베딩
        self.embeddings = nn.ModuleList([
            nn.Embedding(int(cat_max_dict[i]), d_model) for i in range(num_cat_features)
        ])
        
        # 수치형 컬럼별 임베딩 (각 컬럼을 d_model 차원으로 변환)
        self.numeric_embeddings = nn.ModuleList([
            nn.Linear(1, d_model) for _ in range(num_num_features)
        ])
        
        # Position embedding
        self.position_embedding = nn.Embedding(num_cat_features + num_num_features, d_model)
        
        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout_rate,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # 최종 Fully Connected
        self.fc = nn.Sequential(
            nn.Linear(d_model, final_hidden),
            nn.BatchNorm1d(final_hidden),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(final_hidden, final_hidden),
            nn.BatchNorm1d(final_hidden),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(final_hidden, 2)  # 이진 분류 (로짓 출력)
        )
        
    def forward(self, x_cat: torch.Tensor, x_num: torch.Tensor) -> torch.Tensor:
        # 범주형 임베딩
        cat_emb = torch.cat([self.embeddings[i](x_cat[:, i]).unsqueeze(1) for i in range(self.num_cat_features)], dim=1)
        
        # 수치형 데이터 임베딩 (각 컬럼별로 d_model 차원 변환)
        num_emb = torch.cat([self.numeric_embeddings[i](x_num[:, i].unsqueeze(1)).unsqueeze(1) for i in range(self.num_num_features)], dim=1)
        
        # 범주형 + 수치형 결합 (멀티 토큰 형태)
        combined_emb = torch.cat([cat_emb, num_emb], dim=1)
      
        
        # Transformer 인코딩
        encoded = self.transformer(combined_emb)
        summary = torch.mean(encoded, dim=1)  # 평균 풀링
        
        # 최종 분류
        logits = self.fc(summary)
        return logits


# 학습 및 평가

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TabTransformerModel(
    num_cat_features=len(cat_cols_by_cardinality_less_target),
    cat_max_dict=cat_max_dict,
    num_num_features=len(num_cols_by_cardinality),
    d_model=32
).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for batch in train_loader:
        cat_feats = batch["cat"].to(device)
        num_feats = batch["num"].to(device)
        labels = batch["label"].to(device)
        
        optimizer.zero_grad()
        logits = model(cat_feats, num_feats)
        loss = criterion(logits, labels)
        
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
    epoch_loss = running_loss / len(train_loader)
    lr_scheduler.step(epoch_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")
    
    # 평가 모드
    model.eval()
    y_true, y_pred, y_prob = [], [], []
    
    with torch.no_grad():
        for batch in test_loader:
            cat_feats = batch["cat"].to(device)
            num_feats = batch["num"].to(device)
            labels = batch["label"].to(device)
            
            logits = model(cat_feats, num_feats)
            probs = F.softmax(logits, dim=1)
            preds = torch.argmax(probs, dim=1)
            
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
            y_prob.extend(probs[:, 1].cpu().numpy())
    
    acc = accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
    f1 = f1_score(y_true, y_pred)
    
    print(f"   -> Test Accuracy: {acc:.4f}, ROC-AUC: {roc_auc:.4f}, F1-score: {f1:.4f}")


Epoch 1/10, Loss: 0.6630
   -> Test Accuracy: 0.7089, ROC-AUC: 0.6885, F1-score: 0.2366
Epoch 2/10, Loss: 0.6325
   -> Test Accuracy: 0.7356, ROC-AUC: 0.6905, F1-score: 0.2525
Epoch 3/10, Loss: 0.6254
   -> Test Accuracy: 0.7192, ROC-AUC: 0.7172, F1-score: 0.2519
Epoch 4/10, Loss: 0.6177
   -> Test Accuracy: 0.7674, ROC-AUC: 0.7259, F1-score: 0.2651
Epoch 5/10, Loss: 0.6120
   -> Test Accuracy: 0.7040, ROC-AUC: 0.7269, F1-score: 0.2551
Epoch 6/10, Loss: 0.6114
   -> Test Accuracy: 0.7198, ROC-AUC: 0.7247, F1-score: 0.2586
Epoch 7/10, Loss: 0.6095
   -> Test Accuracy: 0.7703, ROC-AUC: 0.7306, F1-score: 0.2715
Epoch 8/10, Loss: 0.6077
   -> Test Accuracy: 0.7243, ROC-AUC: 0.7347, F1-score: 0.2656
Epoch 9/10, Loss: 0.6056
   -> Test Accuracy: 0.7897, ROC-AUC: 0.7370, F1-score: 0.2796
Epoch 10/10, Loss: 0.6048
   -> Test Accuracy: 0.7535, ROC-AUC: 0.7436, F1-score: 0.2744


# Auto ML 추가
- 위에서의 학습량이 부족하여, 하이퍼파라미터 최적화와 epoch를 늘리기 위하여 AutoML 기획하였지만,
- 리소스 한계로 최종까지 진행 못함

In [6]:
from sklearn.model_selection import train_test_split

# 전체 데이터셋 (X, y)
X = Data_EDA_scaled_mice.drop(columns=["TARGET"])  # 특징 데이터
y = Data_EDA_scaled_mice["TARGET"]  # 레이블

Train: (215257, 120), Validation: (30751, 120), Test: (61503, 120)


In [27]:
cat_max_dict = {}
for i, c in enumerate(cat_cols_by_cardinality_less_target):
    num_cats = Data_EDA_scaled_mice[c].max() + 1  # 최대 인덱스 + 1
    cat_max_dict[i] = num_cats


In [28]:
print(type(cat_max_dict))
print(cat_max_dict)

<class 'dict'>
{0: 2, 1: 3, 2: 2, 3: 2, 4: 8, 5: 8, 6: 5, 7: 6, 8: 6, 9: 2, 10: 2, 11: 2, 12: 2, 13: 2, 14: 2, 15: 19, 16: 3, 17: 3, 18: 7, 19: 2, 20: 2, 21: 2, 22: 2, 23: 2, 24: 2, 25: 58, 26: 5, 27: 4, 28: 8, 29: 3, 30: 10, 31: 2, 32: 2, 33: 2, 34: 2, 35: 2, 36: 2, 37: 2, 38: 2, 39: 2, 40: 2, 41: 2, 42: 2, 43: 2, 44: 2, 45: 2, 46: 2, 47: 2, 48: 2, 49: 2, 50: 2}


In [30]:
import torch
from sklearn.utils.class_weight import compute_class_weight

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class_weights = compute_class_weight(class_weight="balanced", classes=np.array([0,1]), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [31]:
print(type(class_weights))
class_weights

<class 'torch.Tensor'>


tensor([0.5439, 6.1934])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold
import optuna
import numpy as np
from torch.utils.data import TensorDataset
from torch.utils.data import Dataset, DataLoader

class TabTransformerModel(nn.Module):
    def __init__(self,
                 num_cat_features: int,
                 cat_max_dict: dict,
                 num_num_features: int,
                 d_model: int = 32,
                 nhead: int = 4,
                 num_layers: int = 2,
                 dim_feedforward: int = 64,
                 final_hidden: int = 128,
                 dropout_rate: float = 0.3):
        super().__init__()
        
        self.d_model = d_model
        self.num_cat_features = num_cat_features
        self.num_num_features = num_num_features
        
        # 범주형 임베딩
        self.embeddings = nn.ModuleList([
            nn.Embedding(int(cat_max_dict[i]), d_model) for i in range(num_cat_features)
        ])
        
        # 수치형 컬럼별 임베딩 (각 컬럼을 d_model 차원으로 변환)
        self.numeric_embeddings = nn.ModuleList([
            nn.Linear(1, d_model) for _ in range(num_num_features)
        ])
        
        # Position embedding
        self.position_embedding = nn.Embedding(num_cat_features + num_num_features, d_model)
        
        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout_rate,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # 최종 Fully Connected
        self.fc = nn.Sequential(
            nn.Linear(d_model, final_hidden),
            nn.BatchNorm1d(final_hidden),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(final_hidden, final_hidden),
            nn.BatchNorm1d(final_hidden),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(final_hidden, 2)  # 이진 분류 (로짓 출력)
        )
        
    def forward(self, x_cat: torch.Tensor, x_num: torch.Tensor) -> torch.Tensor:
        # 범주형 임베딩
        cat_emb = torch.cat([self.embeddings[i](x_cat[:, i].long()).unsqueeze(1) for i in range(self.num_cat_features)], dim=1)
        
        # 수치형 데이터 임베딩 (각 컬럼별로 d_model 차원 변환)
        num_emb = torch.cat([self.numeric_embeddings[i](x_num[:, i].unsqueeze(1)).unsqueeze(1) for i in range(self.num_num_features)], dim=1)
        
        # 범주형 + 수치형 결합 (멀티 토큰 형태)
        combined_emb = torch.cat([cat_emb, num_emb], dim=1)
        
        # Position embedding 추가
        positions = torch.arange(self.num_cat_features + self.num_num_features, device=x_cat.device).unsqueeze(0)
        pos_emb = self.position_embedding(positions)
        combined_emb = combined_emb + pos_emb
        
        # Transformer 인코딩
        encoded = self.transformer(combined_emb)
        summary = torch.mean(encoded, dim=1)  # 평균 풀링
        
        # 최종 분류
        logits = self.fc(summary)
        return logits


# Optuna를 사용한 K-Fold Cross Validation 적용

def objective(trial):
    d_model = trial.suggest_categorical("d_model", [16, 32, 64])
    num_layers = trial.suggest_int("num_layers", 1, 4)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-2)
    weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-2)
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # K-Fold Cross Validation 적용
    avg_val_loss = 0
    
    for train_idx, valid_idx in skf.split(X, y):  # K-Fold 분할 적용
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]
        
        train_dataset = TensorDataset(torch.tensor(X_train_fold[cat_cols_by_cardinality_less_target].to_numpy(), dtype=torch.long), torch.tensor(X_train_fold[num_cols_by_cardinality].to_numpy(), dtype=torch.float32),torch.tensor(y_train_fold.to_numpy(), dtype=torch.long))
        valid_dataset = TensorDataset(torch.tensor(X_valid_fold[cat_cols_by_cardinality_less_target].to_numpy(), dtype=torch.long), torch.tensor(X_valid_fold[num_cols_by_cardinality].to_numpy(), dtype=torch.float32), torch.tensor(y_valid_fold.to_numpy(), dtype=torch.long))
        
        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
        
        model = TabTransformerModel(
            num_cat_features=len(cat_cols_by_cardinality_less_target),
            cat_max_dict=cat_max_dict,
            num_num_features=len(num_cols_by_cardinality),
            d_model=d_model,
            num_layers=num_layers,
            dropout_rate=dropout_rate
        ).to(device)
        
        class_weights = compute_class_weight(class_weight="balanced", classes=np.array([0, 1]), y=y_train_fold)
        class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
        
        criterion = nn.CrossEntropyLoss(weight=class_weights)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        
        num_epochs = 10
        for epoch in range(num_epochs):
            model.train()
            for batch in train_loader:
                cat_feats, num_feats, labels = batch
                optimizer.zero_grad()
                logits = model(cat_feats.to(device), num_feats.to(device))
                loss = criterion(logits, labels.to(device))
                loss.backward()
                optimizer.step()
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in valid_loader:
                cat_feats, num_feats, labels = batch
                logits = model(cat_feats.to(device), num_feats.to(device))
                val_loss += criterion(logits, labels.to(device)).item()
        
        avg_val_loss += val_loss / len(valid_loader)
    
    return avg_val_loss / 5  # 평균 Validation Loss 반환

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

print("Best hyperparameters:", study.best_params)


[I 2025-03-10 23:34:36,401] A new study created in memory with name: no-name-74a65536-8e9a-46ec-bf8d-a0e8923c01d3
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-2)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-2)
