In [None]:
import sys
from dataclasses import dataclass

import optuna
import pandas as pd
from catboost import CatBoostClassifier, Pool
from simple_parsing import parse
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit

@dataclass
class Args:
    """File locations used by this notebook; handed to ``parse`` to build ``args``."""

    # Parquet file read into the `train` frame.
    train_path: str = "data/train.parquet"
    # Parquet file read into the `test` frame.
    test_path: str = "data/test.parquet"
    # CSV file read as the submission template.
    ssub_path: str = "data/sample_submission.csv"
    # CSV path the finished submission is written to.
    sub_path: str = "submission.csv"

# Inside a Jupyter kernel, sys.argv carries the kernel launcher's own flags
# (e.g. `-f <connection file>`), which the argument parser rejects and exits
# on. Parse an empty argument list there so the dataclass defaults apply;
# keep reading the real command line when this runs as a plain script.
args = parse(Args, args=[] if "ipykernel" in sys.modules else None)

# Single seed reused for CatBoost's `random_seed` and Optuna's TPE sampler.
SEED = 228

In [None]:
# Read the labelled training table from the configured parquet path.
train = pd.read_parquet(path=args.train_path, engine="fastparquet")
# Bare trailing expression so the notebook renders the frame.
train

In [None]:
# Read the table to be scored from the configured parquet path.
test = pd.read_parquet(path=args.test_path, engine="fastparquet")
# Bare trailing expression so the notebook renders the frame.
test

In [None]:
# Positional hold-out: the leading 80% of rows are used for fitting and the
# trailing 20% for validation.
# NOTE(review): this is only a temporal hold-out if `train` is already ordered
# by `month_dt` — confirm against the data source.
split_idx = int(0.8 * len(train))

train_df, val_df = train.iloc[:split_idx], train.iloc[split_idx:]

In [None]:
def _build_pool(frame, labelled=True):
    """Wrap ``frame`` in a CatBoost ``Pool``.

    ``product`` is declared categorical and ``month_dt`` is passed as the
    per-row timestamp. When ``labelled`` is true, ``a6_flg`` is removed from
    the features and used as the label; otherwise the frame is passed through
    unchanged and no label is set.
    """
    if labelled:
        features = frame.drop(columns=["a6_flg"])
        target = frame["a6_flg"]
    else:
        features = frame
        target = None
    return Pool(
        data=features,
        label=target,
        cat_features=["product"],
        timestamp=frame["month_dt"],
    )


# Fit / validation pools for the hyper-parameter search.
train_pool = _build_pool(train_df)
val_pool = _build_pool(val_df)
# Every labelled row, for the final refit.
full_train_pool = _build_pool(train)
# Unlabelled rows to score.
test_pool = _build_pool(test, labelled=False)

In [None]:
def objective(trial):
    """Optuna objective: fit one CatBoost candidate and score it on the hold-out pool.

    Samples a hyper-parameter configuration from ``trial``, trains a
    ``CatBoostClassifier`` on the module-level ``train_pool`` with early
    stopping against ``val_pool``, and returns the best validation AUC.

    Args:
        trial: Object exposing Optuna's ``suggest_int``, ``suggest_float``,
            ``suggest_categorical`` and ``set_user_attr`` methods.

    Returns:
        The ``["validation"]["AUC"]`` entry of ``model.get_best_score()``.

    Side effects:
        Stores ``model.get_best_iteration()`` in the trial user attribute
        ``"best_iteration"``. The sampled ``iterations`` value is only an
        upper bound once early stopping and ``use_best_model`` are active, so
        this records the point at which the returned score was reached.
    """
    param = {
        # Fixed settings shared by every trial.
        "eval_metric": "AUC",
        "task_type": "GPU",
        "verbose": False,
        "random_seed": SEED,
        "loss_function": "Logloss",
        "use_best_model": True,
        "boosting_type": "Plain",
        # Searched settings that apply to every tree-growing policy.
        "iterations": trial.suggest_int("iterations", 2000, 5000),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 10.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10.0, log=True),
        "border_count": trial.suggest_int("border_count", 32, 254),
    }

    grow_policy = trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"])
    param["grow_policy"] = grow_policy

    if grow_policy == "SymmetricTree":
        param["depth"] = trial.suggest_int("depth", 2, 10)
    elif grow_policy == "Depthwise":
        param["depth"] = trial.suggest_int("depth", 4, 12)
    elif grow_policy == "Lossguide":
        param["max_leaves"] = trial.suggest_int("max_leaves", 16, 64)
        param["depth"] = trial.suggest_int("depth", 4, 12)

    # CatBoost documents min_data_in_leaf as usable only with the Depthwise
    # and Lossguide policies, so it is neither sampled nor passed for
    # SymmetricTree.
    if grow_policy in ("Depthwise", "Lossguide"):
        param["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 1, 100)

    bootstrap_type = trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli"])
    param["bootstrap_type"] = bootstrap_type

    # Each bootstrap type has its own strength parameter.
    if bootstrap_type == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif bootstrap_type == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1.0)

    # The string "None" is a search-space sentinel meaning "leave the
    # parameter unset"; it is never forwarded to CatBoost.
    auto_class_weights = trial.suggest_categorical("auto_class_weights", ["None", "Balanced", "SqrtBalanced"])
    if auto_class_weights != "None":
        param["auto_class_weights"] = auto_class_weights

    model = CatBoostClassifier(**param)

    model.fit(
        train_pool,
        eval_set=val_pool,
        verbose=0,
        early_stopping_rounds=200
    )

    # Keep the early-stopped iteration so a later refit can reproduce the
    # model size that produced the score returned below.
    trial.set_user_attr("best_iteration", model.get_best_iteration())

    return model.get_best_score()["validation"]["AUC"]

In [None]:
# Seeded TPE sampler, so the sequence of suggested configurations is tied to SEED.
tpe_sampler = optuna.samplers.TPESampler(seed=SEED)
study = optuna.create_study(sampler=tpe_sampler, direction="maximize")

# Search budget: at most 50 trials and a 600-second timeout.
study.optimize(objective, timeout=600, n_trials=50)

# Report the best objective value and the parameters that produced it.
print(
    f"Best trial found: {study.best_value}",
    f"Params: {study.best_params}",
    sep="\n",
)

In [None]:
# Rebuild the configuration the winning trial was actually trained with.
best_trial = study.best_trial
best_params = dict(best_trial.params)

# The search encodes "no class weighting" as the string "None" and never
# forwards it to CatBoost; mirror that here instead of passing the sentinel.
if best_params.get("auto_class_weights") == "None":
    del best_params["auto_class_weights"]

# During the search `iterations` was only an upper bound: early stopping with
# use_best_model kept the model at its best iteration. If the objective
# recorded that iteration, refit with the same number of trees (the iteration
# index is zero-based, hence +1); otherwise keep the sampled `iterations`.
best_iteration = best_trial.user_attrs.get("best_iteration")
if best_iteration is not None:
    best_params["iterations"] = best_iteration + 1

# Fixed settings the objective used for every trial. They are not part of
# `study.best_params`, so they must be restored explicitly. `use_best_model`
# is left out because this fit has no eval_set.
best_params.update(
    {
        "eval_metric": "AUC",
        "task_type": "GPU",
        "random_seed": SEED,
        "loss_function": "Logloss",
        "boosting_type": "Plain",
    }
)

final_model = CatBoostClassifier(**best_params)

# Refit on every labelled row, logging progress every 100 iterations.
final_model.fit(full_train_pool, verbose=100, plot=False)

In [None]:
# Load the sample submission; it serves as the template for the output file.
ss_sub = pd.read_csv(filepath_or_buffer=args.ssub_path)

In [None]:
# Score the test pool and store column index 1 of the probability matrix
# (presumably the positive class — confirm against `final_model.classes_`).
# NOTE(review): assignment is positional, so `test` and the template must
# list rows in the same order — confirm.
test_proba = final_model.predict_proba(test_pool)
ss_sub["a6_flg"] = test_proba[:, 1]

In [None]:
# Write the filled-in template to the configured path, without the index column.
ss_sub.to_csv(path_or_buf=args.sub_path, index=False)