In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score

from xgboost import XGBClassifier
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from pytorch_tabnet.tab_model import TabNetClassifier

df = pd.read_csv("./spea_data_nvme/spea_pass_fail_data.csv")
X = df.iloc[:, :-1].values.astype(np.float32)
y = df.iloc[:, -1].values.astype(np.int64)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

print(X_train.shape,X_val.shape)

(8799999, 28) (2200000, 28)


In [None]:
xgb = XGBClassifier(
    n_estimators=1000, # early stopping 걸면 1000도 충분
    max_depth=4, # 휴리스틱하게 결정.
    learning_rate=0.03,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=10,
    gamma=1.0,
    reg_alpha=0.1,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="gpu_hist",
    random_state=42
)

xgb.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=50,
    verbose=10,
)

xgb.save_model('./save/XGB/xgb.model')
xgb_pred = (xgb.predict_proba(X_val)[:, 1] > 0.5).astype(int)
print("XGBoost: Accuracy:", accuracy_score(y_val, xgb_pred))
print("XGBoost: ROC-AUC :", roc_auc_score(y_val, xgb.predict_proba(X_val)[:, 1]))

[0]	validation_0-auc:0.73445
[1]	validation_0-auc:0.74109
[2]	validation_0-auc:0.75071



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)


[3]	validation_0-auc:0.75422
[4]	validation_0-auc:0.75720
[5]	validation_0-auc:0.76507
[6]	validation_0-auc:0.76461
[7]	validation_0-auc:0.76595
[8]	validation_0-auc:0.76714
[9]	validation_0-auc:0.76779
[10]	validation_0-auc:0.76817
[11]	validation_0-auc:0.76846
[12]	validation_0-auc:0.77100
[13]	validation_0-auc:0.77147
[14]	validation_0-auc:0.77278
[15]	validation_0-auc:0.77446
[16]	validation_0-auc:0.77478
[17]	validation_0-auc:0.77574
[18]	validation_0-auc:0.77665
[19]	validation_0-auc:0.77722
[20]	validation_0-auc:0.77742
[21]	validation_0-auc:0.77899
[22]	validation_0-auc:0.77962
[23]	validation_0-auc:0.78094
[24]	validation_0-auc:0.78142
[25]	validation_0-auc:0.78189
[26]	validation_0-auc:0.78247
[27]	validation_0-auc:0.78268
[28]	validation_0-auc:0.78327
[29]	validation_0-auc:0.78387
[30]	validation_0-auc:0.78431
[31]	validation_0-auc:0.78538
[32]	validation_0-auc:0.78578
[33]	validation_0-auc:0.78621
[34]	validation_0-auc:0.78681
[35]	validation_0-auc:0.78706
[36]	validation_0


    E.g. tree_method = "hist", device = "cuda"

  self.get_booster().save_model(fname)
  self.get_booster().save_model(fname)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


XGBoost: Accuracy: 0.7479072727272728
XGBoost: ROC-AUC : 0.8302976379457191


In [None]:
class MLP(nn.Module):
    def __init__(self, in_dim=28):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
            nn.Sigmoid(), # BCE Loss.
        )
    def forward(self, x):
        return self.net(x)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = MLP(in_dim=X_train.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train_ds = TensorDataset(torch.tensor(X_train), torch.tensor(y_train, dtype=torch.float32))
val_ds   = TensorDataset(torch.tensor(X_val),   torch.tensor(y_val,   dtype=torch.float32))
train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=1024, shuffle=False)

best_auc, patience, patience_limit = 0, 0, 10
for epoch in range(100):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device).unsqueeze(1)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
    model.eval()
    with torch.no_grad():
        preds = []
        for xb, _ in val_loader:
            xb = xb.to(device)
            preds.append(model(xb).cpu().numpy())
        preds = np.vstack(preds).ravel()
        auc = roc_auc_score(y_val, preds)
        acc = accuracy_score(y_val,preds)
        #print(f"MLP AUC:{auc} ACC : {acc},{epoch}epoch.")
    if auc > best_auc:
        best_auc, patience = auc, 0
        torch.save(model.state_dict(), "./save/MLP/mlp.pt")
    else:
        patience += 1
    if patience >= patience_limit:
        print(f"MLP trian Early Stop : {epoch}epoch.")
        break
print("MLP best ROC-AUC:", best_auc)

MLP best ROC-AUC: 0.8423877616119122


In [7]:
tabnet = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-3),
    scheduler_params={"step_size":50, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    verbose=1,
    device_name=device, 
)

tabnet.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_val, y_val)],
    eval_name=["val"],
    eval_metric=["auc"],
    max_epochs=200,
    patience=20,
    batch_size=1024, virtual_batch_size=128,
)

tab_pred = (tabnet.predict_proba(X_val)[:, 1] > 0.5).astype(int)
print("TabNet Accuracy:", accuracy_score(y_val, tab_pred))
print("TabNet ROC-AUC :", roc_auc_score(y_val, tabnet.predict_proba(X_val)[:, 1]))



epoch 0  | loss: 0.61265 | val_auc: 0.76576 |  0:03:09s
epoch 1  | loss: 0.56799 | val_auc: 0.79087 |  0:06:18s
epoch 2  | loss: 0.54789 | val_auc: 0.80576 |  0:09:22s
epoch 3  | loss: 0.52922 | val_auc: 0.82323 |  0:12:27s
epoch 4  | loss: 0.51327 | val_auc: 0.83274 |  0:15:40s
epoch 5  | loss: 0.5034  | val_auc: 0.83898 |  0:18:49s
epoch 6  | loss: 0.49586 | val_auc: 0.84358 |  0:21:59s
epoch 7  | loss: 0.49134 | val_auc: 0.84721 |  0:25:05s
epoch 8  | loss: 0.4874  | val_auc: 0.84907 |  0:28:09s
epoch 9  | loss: 0.4847  | val_auc: 0.85158 |  0:31:20s
epoch 10 | loss: 0.48295 | val_auc: 0.852   |  0:34:27s
epoch 11 | loss: 0.48126 | val_auc: 0.85219 |  0:37:32s
epoch 12 | loss: 0.47945 | val_auc: 0.85435 |  0:40:40s
epoch 13 | loss: 0.47792 | val_auc: 0.85537 |  0:43:47s
epoch 14 | loss: 0.47701 | val_auc: 0.85638 |  0:46:52s
epoch 15 | loss: 0.47509 | val_auc: 0.85749 |  0:49:56s
epoch 16 | loss: 0.47414 | val_auc: 0.85759 |  0:53:02s
epoch 17 | loss: 0.4736  | val_auc: 0.85759 |  0



TabNet Accuracy: 0.7832968181818182
TabNet ROC-AUC : 0.8688608324332339


In [8]:
tabnet.save_model("./save/TABNET/tabnet_best_model") # zip 으로 저장됨.

Successfully saved model at ./save/TABNET/tabnet_best_model.zip


'./save/TABNET/tabnet_best_model.zip'

In [10]:
from pytorch_tabnet.tab_model import TabNetClassifier

loaded_tabnet = TabNetClassifier()
loaded_tabnet.load_model("./save/TABNET/tabnet_best_model.zip")

loaded_pred = (loaded_tabnet.predict_proba(X_val)[:, 1] > 0.5).astype(int)
print("Loaded Accuracy:", accuracy_score(y_val, loaded_pred))
print("Loaded ROC-AUC :", roc_auc_score(y_val, loaded_tabnet.predict_proba(X_val)[:, 1]))



Loaded Accuracy: 0.7832968181818182
Loaded ROC-AUC : 0.8688608324332339
