In [None]:
from model import preprocess_data_lgbm as preprocess_data
import os
import pandas as pd
from utils import create_train_test_group
import lightgbm as lgb


from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback


import shutil

In [2]:
"""
The following part is the important parameter for the LightGBM training
"""
csv_file_name = "./Data/mro_daily_clean.csv"
target_mro = ["mro"]

maintain_repair_mro = "full"

add_mro_prev = True
add_purchase_time = True
add_driver_behavior = True
agg_weeks = 1
agg_fun = ["mean", "sum", "max", "min", "std", "skew"]
# time window could be 4, 8, 12
time_window = 8

# ------------------------------------------
# LightGBM Parameters
metric: list = ["binary_logloss", "binary_error", "auc", "average_precision"]
learning_rate: float = 0.05
num_leaves: int = 64
max_depth: int = 8
is_unbalance: bool = True
# boosting could be "gbdt", "rf" (random forest) and "dart"
boosting: str = "gbdt"


# ------------------------------------------
# data record folder
data_lgbm_file_name = f"data_lgbm_db{int(add_driver_behavior)}_mp{int(add_mro_prev)}_pt{int(add_purchase_time)}_aw{agg_weeks}_tw{time_window}.gzip"
data_lgbm_path = os.path.join("./Data", data_lgbm_file_name)
data_lgbm_path = os.path.abspath(data_lgbm_path)

# ------------------------------------------
# model record folder
model_name = f"model_lgbm_{boosting}_db{int(add_driver_behavior)}_mp{int(add_mro_prev)}_pt{int(add_purchase_time)}_aw{agg_weeks}_tw{time_window}.txt"
model_output_dir = "./output/lgbm"
os.makedirs(model_output_dir, exist_ok=True)
model_path = os.path.join(model_output_dir, model_name)


# ------------------------------------------
# train control and scaling control parameters
num_workers = 4
num_boost_round = 1000
early_stopping_round = 10


"""
End of the Parameter Config
"""

'\nEnd of the Parameter Config\n'

In [3]:
if os.path.isfile(data_lgbm_path):
    print(f"Data file {data_lgbm_path} exists.")
    data_lgbm = pd.read_parquet(data_lgbm_path)
else:
    print(f"{data_lgbm_path} does not exist.")
    # control parameter: data preparation
    data = preprocess_data(
        file_name=csv_file_name,
        target_mro=target_mro,
        maintain_repair_mro=maintain_repair_mro,
        add_mro_prev=add_mro_prev,
        add_purchase_time=add_purchase_time,
        add_driver_behavior=add_driver_behavior,
        agg_weeks=agg_weeks,
        agg_fun=agg_fun,
        time_window=time_window,
    )

    data_lgbm = create_train_test_group(
        data=data,
        sample_frac=1.0,
        test_size=0.1,
        valid_size=0.1,
        random_state=42,
    )

    data_lgbm.to_parquet(data_lgbm_path, compression="gzip", engine="pyarrow")

Data file /home/user14/Cyber/MroPred/Data/data_lgbm_db1_mp1_pt1_aw1_tw8.gzip exists.


In [4]:
def prepare_data(data: pd.DataFrame):
    """Load and split the dataset into train, validation, and test sets."""

    train_dataset = data[data["group"] == "train"]
    valid_dataset = data[data["group"] == "valid"]
    test_dataset = data[data["group"] == "test"]

    train_dataset = train_dataset.drop(["group", "id"], axis=1)
    valid_dataset = valid_dataset.drop(["group", "id"], axis=1)
    test_dataset = test_dataset.drop(["group", "id"], axis=1)

    return train_dataset, valid_dataset, test_dataset


train_dataset, valid_dataset, test_dataset = prepare_data(data_lgbm)

In [None]:
def train_lgbm(config, train_dataset: pd.DataFrame, valid_dataset: pd.DataFrame):
    # train_dataset, valid_dataset, _ = prepare_data(data_lgbm)
    # train set
    train_set = lgb.Dataset(
        train_dataset.drop(["target_mro"], axis=1), label=train_dataset["target_mro"]
    )
    # valid set
    valid_X = valid_dataset.drop(["target_mro"], axis=1)
    valid_y = valid_dataset["target_mro"]
    valid_set = lgb.Dataset(valid_X, label=valid_y)

    gbm = lgb.train(
        config,
        train_set,
        valid_sets=[valid_set],
        valid_names=["eval"],
        callbacks=[
            TuneReportCheckpointCallback(
                {
                    "binary_error": "eval-binary_error",
                    "auc": "eval-auc"
                    # "binary_logloss": "eval-binary_logloss",
                }
            )
        ],
    )

In [6]:
tune_result_storage_path = './output/lgbm/lgbm_tuning_results'

if os.path.exists(tune_result_storage_path):
    shutil.rmtree(tune_result_storage_path)
    print(f"Already exists {tune_result_storage_path}, remove it and create a new one.")
    
os.makedirs(tune_result_storage_path, exist_ok=True)
tune_result_storage_path = os.path.abspath(tune_result_storage_path)

Already exists ./output/lgbm/lgbm_tuning_results, remove it and create a new one.


In [7]:
if __name__ == "__main__":
    config = {
        "objective": "binary",
        "metric": ["binary_logloss", "binary_error", "auc", "average_precision"],
        "verbose": 1,
        "is_unbalance": True,
        # "max_depth": 8,
        "max_depth": tune.randint(4, 20),
        "boosting_type": "gbdt",
        "device_type": "cpu",
        "num_leaves": tune.randint(10, 1000),
        "learning_rate": tune.loguniform(1e-8, 1e-1),
        # "learning_rate": 0.05,
    }

    tuner = tune.Tuner(
        # train_lgbm,
        # tune.with_parameters(
        #     train_lgbm, train_dataset=train_dataset, valid_dataset=valid_dataset
        # ),
        tune.with_resources(
            tune.with_parameters(
                train_lgbm, train_dataset=train_dataset, valid_dataset=valid_dataset
            ),
            {"cpu": 16},
        ),
        tune_config=tune.TuneConfig(
            # metric="binary_error",
            # mode="min",
            metric="auc",
            mode="max",
            scheduler=ASHAScheduler(),
            num_samples=20,
        ),
        run_config=tune.RunConfig(
            name="lgbm_tuning_experiment",
            storage_path=tune_result_storage_path,
        ),
        param_space=config,
    )
    results = tuner.fit()
    # print(f"Best hyperparameters found were: {results.get_best_result().config}")

0,1
Current time:,2025-06-08 19:08:37
Running for:,00:09:52.57
Memory:,374.1/1007.5 GiB

Trial name,status,loc,learning_rate,max_depth,num_leaves,iter,total time (s),binary_error,auc
train_lgbm_8c17a_00000,TERMINATED,144.214.55.187:2735290,1.59272e-07,14,452,100,183.682,0.0477882,0.617553
train_lgbm_8c17a_00001,TERMINATED,144.214.55.187:2735291,6.59116e-05,7,81,100,84.82,0.0477882,0.616567
train_lgbm_8c17a_00002,TERMINATED,144.214.55.187:2735741,3.92115e-08,6,416,1,23.2963,0.0477882,0.610633
train_lgbm_8c17a_00003,TERMINATED,144.214.55.187:2735905,1.4909e-05,14,987,1,25.195,0.0477882,0.61138
train_lgbm_8c17a_00004,TERMINATED,144.214.55.187:2736048,0.0109627,16,815,1,26.62,0.0477882,0.613278
train_lgbm_8c17a_00005,TERMINATED,144.214.55.187:2736185,1.92851e-06,10,432,4,26.7715,0.0477882,0.615617
train_lgbm_8c17a_00006,TERMINATED,144.214.55.187:2736294,5.64033e-06,11,245,100,138.529,0.0477882,0.618459
train_lgbm_8c17a_00007,TERMINATED,144.214.55.187:2736436,6.1235e-06,6,204,1,22.4241,0.0477882,0.610633
train_lgbm_8c17a_00008,TERMINATED,144.214.55.187:2736588,4.66871e-06,4,880,1,23.4354,0.0477882,0.596727
train_lgbm_8c17a_00009,TERMINATED,144.214.55.187:2736744,0.000688985,14,911,1,27.179,0.0477882,0.611934


[36m(train_lgbm pid=2735290)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2735290)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.753888 seconds.
[36m(train_lgbm pid=2735290)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2735290)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2735290)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2735290)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2735290)[0m [LightGBM] [Info] Start training from score -2.996206


[36m(train_lgbm pid=2735291)[0m   if ray.train.get_context().get_world_rank() in (0, None):
[36m(train_lgbm pid=2735291)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/user14/Cyber/MroPred/output/lgbm/lgbm_tuning_results/lgbm_tuning_experiment/train_lgbm_8c17a_00001_1_learning_rate=0.0001,max_depth=7,num_leaves=81_2025-06-08_18-58-45/checkpoint_000000)


[36m(train_lgbm pid=2735741)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730[32m [repeated 2x across cluster][0m
[36m(train_lgbm pid=2735291)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.834430 seconds.
[36m(train_lgbm pid=2735291)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2735291)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2735291)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2735291)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2735291)[0m [LightGBM] [Info] Start training from score -2.996206
[36m(train_lgbm pid=2735741)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.538750 seconds.
[36m(train_lgbm pid=2735741)[0m You can set `force_col_wise=true` to remove the ove



[36m(train_lgbm pid=2735905)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2735905)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.566027 seconds.
[36m(train_lgbm pid=2735905)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2735905)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2735905)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2735905)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2735905)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2736048)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2736048)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.969611 seconds.
[36m(train_lgbm pid=2736048)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2736048)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2736048)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2736048)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2736048)[0m [LightGBM] [Info] Start training from score -2.996206


[36m(train_lgbm pid=2735290)[0m   if ray.train.get_context().get_world_rank() in (0, None):
[36m(train_lgbm pid=2735290)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/user14/Cyber/MroPred/output/lgbm/lgbm_tuning_results/lgbm_tuning_experiment/train_lgbm_8c17a_00000_0_learning_rate=0.0000,max_depth=14,num_leaves=452_2025-06-08_18-58-45/checkpoint_000000)


[36m(train_lgbm pid=2736185)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2736185)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.484257 seconds.
[36m(train_lgbm pid=2736185)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2736185)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2736185)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2736185)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2736185)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2736294)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2736294)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.305977 seconds.
[36m(train_lgbm pid=2736294)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2736294)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2736294)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2736294)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2736294)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2736436)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2736436)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.404782 seconds.
[36m(train_lgbm pid=2736436)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2736436)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2736436)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2736436)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2736436)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2736588)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2736588)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.694520 seconds.
[36m(train_lgbm pid=2736588)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2736588)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2736588)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2736588)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2736588)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2736744)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2736744)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.869453 seconds.
[36m(train_lgbm pid=2736744)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2736744)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2736744)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2736744)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2736744)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2736889)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2736889)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.916674 seconds.
[36m(train_lgbm pid=2736889)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2736889)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2736889)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2736889)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2736889)[0m [LightGBM] [Info] Start training from score -2.996206


[36m(train_lgbm pid=2736294)[0m   if ray.train.get_context().get_world_rank() in (0, None):
[36m(train_lgbm pid=2736294)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/user14/Cyber/MroPred/output/lgbm/lgbm_tuning_results/lgbm_tuning_experiment/train_lgbm_8c17a_00006_6_learning_rate=0.0000,max_depth=11,num_leaves=245_2025-06-08_18-58-45/checkpoint_000000)


[36m(train_lgbm pid=2737034)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2737034)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.974849 seconds.
[36m(train_lgbm pid=2737034)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2737034)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2737034)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2737034)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2737034)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2737143)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2737143)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.225610 seconds.
[36m(train_lgbm pid=2737143)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2737143)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2737143)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2737143)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2737143)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2737312)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2737312)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.830783 seconds.
[36m(train_lgbm pid=2737312)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2737312)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2737312)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2737312)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2737312)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2737479)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2737479)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.356479 seconds.
[36m(train_lgbm pid=2737479)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2737479)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2737479)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2737479)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2737479)[0m [LightGBM] [Info] Start training from score -2.996206


[36m(train_lgbm pid=2737034)[0m   if ray.train.get_context().get_world_rank() in (0, None):
[36m(train_lgbm pid=2737034)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/user14/Cyber/MroPred/output/lgbm/lgbm_tuning_results/lgbm_tuning_experiment/train_lgbm_8c17a_00011_11_learning_rate=0.0000,max_depth=18,num_leaves=197_2025-06-08_18-58-45/checkpoint_000000)


[36m(train_lgbm pid=2737711)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2737711)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.096398 seconds.
[36m(train_lgbm pid=2737711)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2737711)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2737711)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2737711)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2737711)[0m [LightGBM] [Info] Start training from score -2.996206






[36m(train_lgbm pid=2737479)[0m   if ray.train.get_context().get_world_rank() in (0, None):
[36m(train_lgbm pid=2737479)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/user14/Cyber/MroPred/output/lgbm/lgbm_tuning_results/lgbm_tuning_experiment/train_lgbm_8c17a_00014_14_learning_rate=0.0148,max_depth=19,num_leaves=213_2025-06-08_18-58-45/checkpoint_000000)


[36m(train_lgbm pid=2737853)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2737853)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.341887 seconds.
[36m(train_lgbm pid=2737853)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2737853)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2737853)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2737853)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2737853)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2737985)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2737985)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.432897 seconds.
[36m(train_lgbm pid=2737985)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2737985)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2737985)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2737985)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2737985)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2738094)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2738094)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.347181 seconds.
[36m(train_lgbm pid=2738094)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2738094)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2738094)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2738094)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2738094)[0m [LightGBM] [Info] Start training from score -2.996206




[36m(train_lgbm pid=2738246)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019730
[36m(train_lgbm pid=2738246)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.292767 seconds.
[36m(train_lgbm pid=2738246)[0m You can set `force_col_wise=true` to remove the overhead.
[36m(train_lgbm pid=2738246)[0m [LightGBM] [Info] Total Bins 56850
[36m(train_lgbm pid=2738246)[0m [LightGBM] [Info] Number of data points in the train set: 3170645, number of used features: 296
[36m(train_lgbm pid=2738246)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206
[36m(train_lgbm pid=2738246)[0m [LightGBM] [Info] Start training from score -2.996206


2025-06-08 19:08:37,757	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/user14/Cyber/MroPred/output/lgbm/lgbm_tuning_results/lgbm_tuning_experiment' in 0.0129s.
2025-06-08 19:08:37,773	INFO tune.py:1041 -- Total run time: 592.61 seconds (592.56 seconds for the tuning loop).


In [8]:
best_model_path = os.path.join(results.get_best_result().checkpoint.path, "model.txt")
booster = lgb.Booster(model_file=best_model_path)

In [21]:
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
    roc_auc_score,
)


def get_X_y(df):
    X = df.drop("target_mro", axis=1)
    y = df["target_mro"]
    return X, y


X_train, y_train = get_X_y(train_dataset)
X_valid, y_valid = get_X_y(valid_dataset)
X_test, y_test = get_X_y(test_dataset)


def predict_and_eval(booster, X, y_true: pd.DataFrame, dataset_name="dataset"):
    y_prob = booster.predict(X)
    y_pred = (y_prob >= 0.5).astype(int)

    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_prob)

    print(f"\nEvaluation on {dataset_name}:")
    print(f"Accuracy:  {acc:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall:    {recall:.5f}")
    print(f"F1 Score:  {f1:.5f}")
    print(f"AUC:       {auc:.5f}")

    result_df = pd.DataFrame(
        {"y_true": y_true.values, "y_prob": y_prob, "y_pred": y_pred}
    )

    # return acc, precision, recall, f1, auc, result_df
    return {
        "auc": auc,
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "result_df": result_df,
    }


# predict_and_eval(booster, X_train, y_train, "Train Set")
# predict_and_eval(booster, X_valid, y_valid, "Validation Set")
# predict_and_eval(booster, X_test, y_test, "Test Set")
train_results = predict_and_eval(booster, X_train, y_train, "Train Set")
valid_results = predict_and_eval(booster, X_valid, y_valid, "Validation Set")
test_results = predict_and_eval(booster, X_test, y_test, "Test Set")

results_df = pd.DataFrame(
    {
        "Best Model Path": best_model_path,
        "Train Accuracy": [train_results["accuracy"]],
        "Train Precision": [train_results["precision"]],
        "Train Recall": [train_results["recall"]],
        "Train F1 Score": [train_results["f1_score"]],
        "Train AUC": [train_results["auc"]],
        "Validation Accuracy": [valid_results["accuracy"]],
        "Validation Precision": [valid_results["precision"]],
        "Validation Recall": [valid_results["recall"]],
        "Validation F1 Score": [valid_results["f1_score"]],
        "Validation AUC": [valid_results["auc"]],
        "Test Accuracy": [test_results["accuracy"]],
        "Test Precision": [test_results["precision"]],
        "Test Recall": [test_results["recall"]],
        "Test F1 Score": [test_results["f1_score"]],
        "Test AUC": [test_results["auc"]],
    }
)


Evaluation on Train Set:
Accuracy:  0.91292
Precision: 0.13650
Recall:    0.15574
F1 Score:  0.14549
AUC:       0.67276

Evaluation on Validation Set:
Accuracy:  0.91303
Precision: 0.13003
Recall:    0.14409
F1 Score:  0.13670
AUC:       0.63689

Evaluation on Test Set:
Accuracy:  0.91381
Precision: 0.13321
Recall:    0.14744
F1 Score:  0.13996
AUC:       0.63668


In [22]:
results_df

Unnamed: 0,Best Model Path,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Train AUC,Validation Accuracy,Validation Precision,Validation Recall,Validation F1 Score,Validation AUC,Test Accuracy,Test Precision,Test Recall,Test F1 Score,Test AUC
0,/home/user14/Cyber/MroPred/output/lgbm/lgbm_tu...,0.912921,0.136501,0.155743,0.145489,0.672759,0.913027,0.13003,0.144092,0.1367,0.636893,0.913811,0.133214,0.147436,0.139964,0.636675
