In [22]:
import pandas as pd

In [7]:
train_df = pd.read_csv("C:\\Users\\solba\\dacon-project\\data\\processing\\train_processed.csv")
test_df = pd.read_csv("C:\\Users\\solba\\dacon-project\\data\\processing\\test_processed.csv")

In [23]:
import numpy as np
import pandas as pd

train_df = pd.read_csv(r"C:\Users\solba\dacon-project\data\processing\train_processed.csv")
test_df  = pd.read_csv(r"C:\Users\solba\dacon-project\data\processing\test_processed.csv")

TARGET_COL = "completed"
ID_COL = "ID"  # 다르면 바꿔

# --- y ---
y = train_df[TARGET_COL].astype(int).values

# --- X (train/test 동일 컬럼 맞추기) ---
drop_cols = [c for c in [TARGET_COL, ID_COL] if c in train_df.columns]
X = train_df.drop(columns=drop_cols)

# test도 ID 제외하고 피처만
test_drop_cols = [c for c in [ID_COL] if c in test_df.columns]
X_test = test_df.drop(columns=test_drop_cols)

# 컬럼 정렬/교집합 맞추기 (혹시라도 전처리 과정에서 차이 생겼을 때 대비)
common_cols = X.columns.intersection(X_test.columns)
X = X[common_cols]
X_test = X_test[common_cols]

# 타입 정리 (CatBoost에 안전)
X = X.astype(np.float32)
X_test = X_test.astype(np.float32)

print("train X shape:", X.shape, " / test X shape:", X_test.shape)
print("class balance:", np.bincount(y))


train X shape: (748, 308)  / test X shape: (814, 308)
class balance: [525 223]


In [28]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score

X_all = X.values
y_all = y

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def cv_catboost(class_w1=1.0, task_type="GPU"):
    rocs, prs, best_iters = [], [], []
    for tr_idx, va_idx in skf.split(X_all, y_all):
        Xtr, Xva = X_all[tr_idx], X_all[va_idx]
        ytr, yva = y_all[tr_idx], y_all[va_idx]

        model = CatBoostClassifier(
            iterations=5000,
            learning_rate=0.03,
            depth=6,
            loss_function="Logloss",
            eval_metric="AUC",
            random_seed=42,
            verbose=0,
            task_type=task_type,
            devices="0" if task_type == "GPU" else None,
            class_weights=[1.0, class_w1],
            od_type="Iter",
            od_wait=200,
        )
        model.fit(Xtr, ytr, eval_set=(Xva, yva), use_best_model=True)
        proba = model.predict_proba(Xva)[:, 1]

        rocs.append(roc_auc_score(yva, proba))
        prs.append(average_precision_score(yva, proba))
        best_iters.append(model.get_best_iteration())

    return (np.mean(rocs), np.std(rocs),
            np.mean(prs), np.std(prs),
            int(np.median(best_iters)))

for w in [1.0, 1.2, 1.5, 2.0]:
    mroc, sroc, mpr, spr, med_iter = cv_catboost(class_w1=w, task_type="GPU")
    print(f"w={w} | ROC {mroc:.4f}±{sroc:.4f} | PR {mpr:.4f}±{spr:.4f} | med_best_iter={med_iter}")


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


w=1.0 | ROC 0.6071±0.0465 | PR 0.3889±0.0203 | med_best_iter=27


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


w=1.2 | ROC 0.6144±0.0427 | PR 0.3976±0.0159 | med_best_iter=10


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


w=1.5 | ROC 0.6079±0.0534 | PR 0.4026±0.0406 | med_best_iter=26


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


w=2.0 | ROC 0.6142±0.0534 | PR 0.4101±0.0445 | med_best_iter=22


In [29]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score

X_all = X.values
y_all = y
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def cv_once(params):
    rocs, prs, iters = [], [], []
    for tr_idx, va_idx in skf.split(X_all, y_all):
        Xtr, Xva = X_all[tr_idx], X_all[va_idx]
        ytr, yva = y_all[tr_idx], y_all[va_idx]

        model = CatBoostClassifier(**params)
        model.fit(Xtr, ytr, eval_set=(Xva, yva), use_best_model=True)
        proba = model.predict_proba(Xva)[:, 1]

        rocs.append(roc_auc_score(yva, proba))
        prs.append(average_precision_score(yva, proba))
        iters.append(model.get_best_iteration())

    return (np.mean(rocs), np.std(rocs),
            np.mean(prs), np.std(prs),
            int(np.median(iters)))

base = dict(
    iterations=8000,
    learning_rate=0.02,
    depth=6,
    loss_function="Logloss",
    eval_metric="Logloss",     # ✅ GPU에서도 안정적
    random_seed=42,
    verbose=0,
    task_type="GPU",
    devices="0",
    class_weights=[1.0, 2.0],  # ✅ 네 CV에서 최고
    od_type="Iter",
    od_wait=300
)

cands = [
    {**base, "l2_leaf_reg": 6, "random_strength": 1, "min_data_in_leaf": 10},
    {**base, "l2_leaf_reg": 10, "random_strength": 2, "min_data_in_leaf": 20},
    {**base, "l2_leaf_reg": 14, "random_strength": 3, "min_data_in_leaf": 30},
]

for i, p in enumerate(cands):
    mroc, sroc, mpr, spr, med_iter = cv_once(p)
    print(f"[{i}] ROC {mroc:.4f}±{sroc:.4f} | PR {mpr:.4f}±{spr:.4f} | med_best_iter={med_iter} | {p['l2_leaf_reg']=}, {p['random_strength']=}, {p['min_data_in_leaf']=}")


[0] ROC 0.5879±0.0379 | PR 0.3886±0.0357 | med_best_iter=51 | p['l2_leaf_reg']=6, p['random_strength']=1, p['min_data_in_leaf']=10
[1] ROC 0.6013±0.0463 | PR 0.3859±0.0363 | med_best_iter=93 | p['l2_leaf_reg']=10, p['random_strength']=2, p['min_data_in_leaf']=20
[2] ROC 0.5897±0.0389 | PR 0.3766±0.0358 | med_best_iter=183 | p['l2_leaf_reg']=14, p['random_strength']=3, p['min_data_in_leaf']=30


In [30]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score

X_all = X.values
y_all = y
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def cv_cat(params):
    rocs, prs, iters = [], [], []
    for tr_idx, va_idx in skf.split(X_all, y_all):
        Xtr, Xva = X_all[tr_idx], X_all[va_idx]
        ytr, yva = y_all[tr_idx], y_all[va_idx]

        m = CatBoostClassifier(**params)
        m.fit(Xtr, ytr, eval_set=(Xva, yva), use_best_model=True)
        p = m.predict_proba(Xva)[:, 1]
        rocs.append(roc_auc_score(yva, p))
        prs.append(average_precision_score(yva, p))
        iters.append(m.get_best_iteration())
    return np.mean(rocs), np.std(rocs), np.mean(prs), np.std(prs), int(np.median(iters))

base = dict(
    iterations=5000,
    learning_rate=0.03,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=0,
    task_type="GPU",
    devices="0",
    class_weights=[1.0, 2.0],
    od_type="Iter",
    od_wait=200,
)

cands = [
    {**base, "depth": 4},
    {**base, "depth": 6},
    {**base, "depth": 8},
    # 샘플링/피처샘플링 추가 (이게 꽤 잘 먹는 경우 많음)
    {**base, "depth": 6, "subsample": 0.8, "rsm": 0.8},
    {**base, "depth": 8, "subsample": 0.8, "rsm": 0.8},
]

for i, p in enumerate(cands):
    mroc, sroc, mpr, spr, med_iter = cv_cat(p)
    print(f"[{i}] ROC {mroc:.4f}±{sroc:.4f} | PR {mpr:.4f}±{spr:.4f} | med_iter={med_iter} | depth={p.get('depth')}, subs={p.get('subsample', None)}, rsm={p.get('rsm', None)}")


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[0] ROC 0.6031±0.0410 | PR 0.3894±0.0190 | med_iter=24 | depth=4, subs=None, rsm=None


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[1] ROC 0.6142±0.0534 | PR 0.4101±0.0445 | med_iter=22 | depth=6, subs=None, rsm=None


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[2] ROC 0.6077±0.0405 | PR 0.3963±0.0203 | med_iter=38 | depth=8, subs=None, rsm=None


CatBoostError: catboost/private/libs/options/catboost_options.cpp:794: Error: default bootstrap type (bayesian) doesn't support 'subsample' option

In [31]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np

X_all = X.values
y_all = y
X_test_all = X_test.values

final_model = CatBoostClassifier(
    iterations=5000,
    learning_rate=0.03,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=200,
    task_type="GPU",
    devices="0",
    class_weights=[1.0, 2.0],
    od_type="Iter",
    od_wait=200,
)

final_model.fit(X_all, y_all, use_best_model=False)
test_proba = final_model.predict_proba(X_test_all)[:, 1]

sub = pd.DataFrame({
    "ID": test_df["ID"] if "ID" in test_df.columns else np.arange(len(test_df)),
    "completed": test_proba
})
sub.to_csv("submission.csv", index=False)
print("Saved submission.csv")


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 14.8ms	remaining: 1m 14s
200:	total: 2.94s	remaining: 1m 10s
400:	total: 5.91s	remaining: 1m 7s
600:	total: 8.84s	remaining: 1m 4s
800:	total: 11.8s	remaining: 1m 1s
1000:	total: 14.8s	remaining: 59.2s
1200:	total: 17.8s	remaining: 56.4s
1400:	total: 20.8s	remaining: 53.4s
1600:	total: 23.7s	remaining: 50.4s
1800:	total: 26.7s	remaining: 47.5s
2000:	total: 29.7s	remaining: 44.6s
2200:	total: 32.7s	remaining: 41.6s
2400:	total: 35.6s	remaining: 38.5s
2600:	total: 38.5s	remaining: 35.5s
2800:	total: 41.5s	remaining: 32.5s
3000:	total: 44.4s	remaining: 29.6s
3200:	total: 47.4s	remaining: 26.6s
3400:	total: 50.5s	remaining: 23.7s
3600:	total: 53.5s	remaining: 20.8s
3800:	total: 56.5s	remaining: 17.8s
4000:	total: 59.5s	remaining: 14.9s
4200:	total: 1m 2s	remaining: 11.9s
4400:	total: 1m 5s	remaining: 8.91s
4600:	total: 1m 8s	remaining: 5.93s
4800:	total: 1m 11s	remaining: 2.96s
4999:	total: 1m 14s	remaining: 0us
Saved submission.csv


In [33]:
sub.head(50)

Unnamed: 0,ID,completed
0,TEST_000,0.052819
1,TEST_001,0.726685
2,TEST_002,0.244911
3,TEST_003,0.299103
4,TEST_004,0.103613
5,TEST_005,0.462054
6,TEST_006,0.015988
7,TEST_007,0.092544
8,TEST_008,0.467571
9,TEST_009,0.049187


In [35]:
from pathlib import Path

# Ensure output directory exists
output_dir = Path(r"C:\Users\solba\dacon-project\output")
output_dir.mkdir(parents=True, exist_ok=True)

# Threshold completed at 0.45 and save
if 'sub' not in globals():
    raise NameError("sub DataFrame is not defined")
if 'completed' not in sub.columns:
    raise KeyError("'completed' column not found in sub")

sub['completed'] = (sub['completed'] >= 0.3).astype(int)
save_path = output_dir / "submission.csv"
sub.to_csv(save_path, index=False)
print(f"Saved submission to {save_path}")


Saved submission to C:\Users\solba\dacon-project\output\submission.csv
