In [1]:
from model import preprocess_data_lgbm as preprocess_data
import os
import pandas as pd
import ray
from ray.train import  CheckpointConfig, RunConfig, ScalingConfig
from ray.train.lightgbm import LightGBMTrainer
from utils import create_train_test_group
# Restrict this process to physical GPUs 2-7. This must run before Ray/CUDA is
# initialised for it to take effect.
# The ids are joined with a bare "," (no padding): consumers that split this
# variable on "," without stripping would otherwise see ids such as " 3".
# NOTE(review): Ray's GPU detection appears to split this way — confirm.
GPU_IDS = (2, 3, 4, 5, 6, 7)
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu_id) for gpu_id in GPU_IDS)

In [2]:
# --- Data-preparation settings (passed to the preprocessing call) ---

# Input file and prediction target.
csv_file_name = "./Data/mro_daily_clean.csv"
target_mro = ["mro"]
maintain_repair_mro = "full"

# Aggregation: window length in weeks and, presumably, the statistics
# computed for each window (verify against the preprocessing module).
agg_weeks = 1
agg_fun = ["mean", "sum", "max", "min", "std", "skew"]

# Optional feature groups to include.
add_mro_prev = True
add_purchase_time = True
add_driver_behavior = True

In [3]:
# Build the modelling table from the raw CSV using the notebook-level settings.
# time_window is fixed here rather than in the settings cell; its unit is not
# visible from this notebook — TODO confirm against the preprocessing module.
preprocess_kwargs = dict(
    file_name=csv_file_name,
    target_mro=target_mro,
    maintain_repair_mro=maintain_repair_mro,
    add_mro_prev=add_mro_prev,
    add_purchase_time=add_purchase_time,
    add_driver_behavior=add_driver_behavior,
    agg_weeks=agg_weeks,
    agg_fun=agg_fun,
    time_window=8,
)
data = preprocess_data(**preprocess_kwargs)

Load the dataset ./Data/mro_daily_clean.csv successfully.
The MRO choosen is: ['mro']
No need to know the maintenance or repair.
Use the Target MRO.
Add Purchase Time: True
Add behavior of Driver: True
Aggregate the data into 1 week.
Add Previous MRO: True


In [4]:
# Assign every row to a split. The returned frame carries the `group` column
# that the split helper filters on ("train" / "valid" / "test").
split_options = {
    "sample_frac": 1.0,
    "test_size": 0.1,
    "valid_size": 0.1,
    "random_state": 42,  # fixed seed so the split is reproducible
}
data_lgbm = create_train_test_group(data=data, **split_options)

In [5]:
# Persist the split table as gzip-compressed Parquet so it can be reloaded
# later without re-running preprocessing.
data_lgbm.to_parquet(
    path="./Data/data_lgbm.gzip",
    engine="pyarrow",
    compression="gzip",
)

In [6]:
# Optional: uncomment to reload the cached split from disk instead of rebuilding it.
# data_lgbm = pd.read_parquet('./Data/data_lgbm.gzip')
# data_lgbm

In [8]:
def prepare_data(data: pd.DataFrame, drop_columns=("group", "id")):
    """Split an already-labelled frame into train, validation, and test sets.

    Rows are routed by the value of the ``group`` column ("train", "valid" or
    "test"). Nothing is loaded from disk here; the caller supplies the frame.

    Args:
        data: Frame containing a ``group`` column plus every column named in
            ``drop_columns``.
        drop_columns: Bookkeeping columns removed from each split. The default
            keeps ``mro_prev`` as a feature; pass
            ``("group", "id", "mro_prev")`` to exclude it as well.

    Returns:
        ``(train_dataset, valid_dataset, test_dataset)`` as new DataFrames;
        ``data`` itself is not modified.

    Raises:
        KeyError: If ``group`` or any of ``drop_columns`` is missing.
    """
    columns_to_drop = list(drop_columns)

    def _select(group_name: str) -> pd.DataFrame:
        # Filter by split label first, then drop; ``drop`` returns a new frame.
        return data[data["group"] == group_name].drop(columns_to_drop, axis=1)

    return _select("train"), _select("valid"), _select("test")

In [None]:
# Materialise the three pandas splits, then wrap train and valid as Ray
# Datasets for the trainer (the test split stays in pandas).
train_dataset, valid_dataset, test_dataset = prepare_data(data_lgbm)
ray_train_dataset, ray_valid_dataset = (
    ray.data.from_pandas(split_frame)
    for split_frame in (train_dataset, valid_dataset)
)

2025-06-05 18:56:41,282	INFO worker.py:1888 -- Started a local Ray instance.


[36m(LightGBMTrainer pid=2370263)[0m Started distributed worker processes: 
[36m(LightGBMTrainer pid=2370263)[0m - (node_id=daa43a78680ec46f01f27bf939cc9db6d30bbd142c355a84aff54580, ip=144.214.55.187, pid=2370384) world_rank=0, local_rank=0, node_rank=0
[36m(LightGBMTrainer pid=2370263)[0m - (node_id=daa43a78680ec46f01f27bf939cc9db6d30bbd142c355a84aff54580, ip=144.214.55.187, pid=2370382) world_rank=1, local_rank=1, node_rank=0
[36m(LightGBMTrainer pid=2370263)[0m - (node_id=daa43a78680ec46f01f27bf939cc9db6d30bbd142c355a84aff54580, ip=144.214.55.187, pid=2370383) world_rank=2, local_rank=2, node_rank=0
[36m(LightGBMTrainer pid=2370263)[0m - (node_id=daa43a78680ec46f01f27bf939cc9db6d30bbd142c355a84aff54580, ip=144.214.55.187, pid=2370385) world_rank=3, local_rank=3, node_rank=0
[36m(RayTrainWorker pid=2370384)[0m Registered dataset logger for dataset dataset_4_0
[36m(SplitCoordinator pid=2370664)[0m Starting execution of Dataset train_2_0. Full logs are in /tmp/ray/session

[36m(RayTrainWorker pid=2370384)[0m [LightGBM] [Info] Trying to bind port 54223...
[36m(RayTrainWorker pid=2370384)[0m [LightGBM] [Info] Binding port 54223 succeeded
[36m(RayTrainWorker pid=2370384)[0m [LightGBM] [Info] Listening...
[36m(RayTrainWorker pid=2370383)[0m [LightGBM] [Info] Connected to rank 0
[36m(RayTrainWorker pid=2370383)[0m [LightGBM] [Info] Connected to rank 1
[36m(RayTrainWorker pid=2370383)[0m [LightGBM] [Info] Connected to rank 3
[36m(RayTrainWorker pid=2370383)[0m [LightGBM] [Info] Local rank: 2, total number of machines: 4
[36m(RayTrainWorker pid=2370383)[0m [LightGBM] [Info] Number of positive: 150915, number of negative: 3019729
[36m(RayTrainWorker pid=2370385)[0m [LightGBM] [Info] Trying to bind port 56457...[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2370385)[0m [LightGBM] [Info] Binding port 56457 succeeded[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2370385)[0m [LightGBM] [Info] Listening...[32m [r

[36m(RayTrainWorker pid=2370384)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/user14/ray_results/LightGBMTrainer_2025-06-05_18-56-46/LightGBMTrainer_c604d_00000_0_2025-06-05_18-56-46/checkpoint_000000)
[36m(SplitCoordinator pid=2370663)[0m Registered dataset logger for dataset valid_3_0[32m [repeated 3x across cluster][0m


[36m(RayTrainWorker pid=2370384)[0m [LightGBM] [Info] Finished linking network in 0.359163 seconds
[36m(RayTrainWorker pid=2370382)[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 4.325641 seconds.[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2370382)[0m You can set `force_col_wise=true` to remove the overhead.[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2370382)[0m [LightGBM] [Info] Total Bins 56794[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2370382)[0m [LightGBM] [Info] Number of data points in the train set: 792661, number of used features: 296[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2370382)[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047598 -> initscore=-2.996206[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2370382)[0m [LightGBM] [Info] Start training from score -2.996206[32m [repeated 3x across cluster][0m


[36m(RayTrainWorker pid=2370384)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/user14/ray_results/LightGBMTrainer_2025-06-05_18-56-46/LightGBMTrainer_c604d_00000_0_2025-06-05_18-56-46/checkpoint_000001)
[36m(SplitCoordinator pid=2370664)[0m Fatal Python error: PyGILState_Release: auto-releasing thread-state, but no thread-state for this thread
[36m(SplitCoordinator pid=2370664)[0m Python runtime state: initialized
[36m(SplitCoordinator pid=2370664)[0m 
[36m(SplitCoordinator pid=2370664)[0m Thread 0x00007f86af672700 (most recent call first):
[36m(SplitCoordinator pid=2370664)[0m   <no Python frame>
[36m(SplitCoordinator pid=2370664)[0m 
[36m(SplitCoordinator pid=2370664)[0m Thread 0x00007fb6328a0740 (most recent call first):
[36m(SplitCoordinator pid=2370664)[0m   File "/home/user14/data/anaconda3/envs/mro/lib/python3.11/site-packages/ray/_private/worker.py", line 946 in main_loop
[36m(SplitCoordinator pid=2370664)[0m   File "/home/us

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffc613e55ff7f8537640db982101000000 Worker ID: 09d590c63a2a80dc5cfe604768b6ed76dc3b982ea2ee5ecce42bf099 Node ID: daa43a78680ec46f01f27bf939cc9db6d30bbd142c355a84aff54580 Worker IP address: 144.214.55.187 Worker port: 33117 Worker PID: 2370664 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


In [10]:
# Checkpointing: save every 10 boosting iterations and retain up to 20 checkpoints.
checkpoint_config = CheckpointConfig(checkpoint_frequency=10, num_to_keep=20)
run_config = RunConfig(checkpoint_config=checkpoint_config)

# Data-parallel layout: 4 workers, each scheduled with a GPU.
# NOTE(review): "device": "gpu" is disabled in the LightGBM params below, so the
# reserved GPUs look unused by training — confirm whether use_gpu should stay True.
scaling_config = ScalingConfig(num_workers=4, use_gpu=True)

# LightGBM parameters for the binary target (see the `lightgbm.train` parameter docs).
lgbm_params = {
    "objective": "binary",
    "metric": ["binary_logloss", "binary_error", "auc", "average_precision"],
    # "device": "gpu",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "max_depth": 8,
    "is_unbalance": True,
}

# Assemble the distributed LightGBM trainer and run it.
trainer = LightGBMTrainer(
    scaling_config=scaling_config,
    label_column="target_mro",
    num_boost_round=20,
    params=lgbm_params,
    datasets={"train": ray_train_dataset, "valid": ray_valid_dataset},
    run_config=run_config,
)
result = trainer.fit()

2025-06-05 18:56:46,325	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2025-06-05 18:56:46 (running for 00:00:00.12)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 4.0/6 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-06-05_18-56-35_222224_2367005/artifacts/2025-06-05_18-56-46/LightGBMTrainer_2025-06-05_18-56-46/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-06-05 18:56:51 (running for 00:00:05.16)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 4.0/6 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-06-05_18-56-35_222224_2367005/artifacts/2025-06-05_18-56-46/LightGBMTrainer_2025-06-05_18-56-46/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




(pid=2370664) Running 0: 0.00 row [00:00, ? row/s]

(pid=2370664) - split(4, equal=True) 1: 0.00 row [00:00, ? row/s]

== Status ==
Current time: 2025-06-05 18:56:56 (running for 00:00:10.20)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 4.0/6 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-06-05_18-56-35_222224_2367005/artifacts/2025-06-05_18-56-46/LightGBMTrainer_2025-06-05_18-56-46/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-06-05 18:57:01 (running for 00:00:15.24)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 4.0/6 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-06-05_18-56-35_222224_2367005/artifacts/2025-06-05_18-56-46/LightGBMTrainer_2025-06-05_18-56-46/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-06-05 18:57:06 (running for 00:00:20.28)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 4.0/6 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-06-05_18-56-35_222224_2367005/artifacts/20

(pid=2370663) Running 0: 0.00 row [00:00, ? row/s]

(pid=2370663) - split(4, equal=True) 1: 0.00 row [00:00, ? row/s]

== Status ==
Current time: 2025-06-05 18:57:56 (running for 00:01:10.64)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 4.0/6 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-06-05_18-56-35_222224_2367005/artifacts/2025-06-05_18-56-46/LightGBMTrainer_2025-06-05_18-56-46/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-06-05 18:58:02 (running for 00:01:15.67)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 4.0/6 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-06-05_18-56-35_222224_2367005/artifacts/2025-06-05_18-56-46/LightGBMTrainer_2025-06-05_18-56-46/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-06-05 18:58:07 (running for 00:01:20.72)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 4.0/6 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-06-05_18-56-35_222224_2367005/artifacts/20

2025-06-05 18:59:08,333	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/user14/ray_results/LightGBMTrainer_2025-06-05_18-56-46' in 0.0035s.
2025-06-05 18:59:08,336	INFO tune.py:1041 -- Total run time: 142.01 seconds (141.98 seconds for the tuning loop).


== Status ==
Current time: 2025-06-05 18:59:08 (running for 00:02:21.99)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 4.0/6 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-06-05_18-56-35_222224_2367005/artifacts/2025-06-05_18-56-46/LightGBMTrainer_2025-06-05_18-56-46/driver_artifacts
Number of trials: 1/1 (1 TERMINATED)




In [11]:
import os
# Where the trained model is written; create the directory up front so the
# later save cannot fail on a missing folder.
output_dir = "./output/lgbm"
os.makedirs(output_dir, exist_ok=True)
model_path = os.path.join(output_dir, "model_lgbm.txt")

In [12]:
# Rebuild the trained LightGBM booster from the checkpoint attached to the run result.
latest_checkpoint = result.checkpoint
booster = trainer.get_model(latest_checkpoint)

In [13]:
# Write the booster to the text model file under the output directory.
booster.save_model(filename=model_path)

<lightgbm.basic.Booster at 0x7f1ad9157290>

In [17]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score


def get_X_y(df):
    """Separate the ``target_mro`` label column from the remaining feature columns.

    Returns a ``(features, labels)`` pair; ``df`` itself is left untouched.
    """
    labels = df["target_mro"]
    features = df.drop(columns="target_mro")
    return features, labels


# Feature/label pairs for each split, in train -> valid -> test order.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = map(
    get_X_y, (train_dataset, valid_dataset, test_dataset)
)


def predict_and_eval(booster, X, y_true, dataset_name="dataset", threshold=0.5):
    """Score ``X`` with ``booster``, print classification metrics, and return them.

    Args:
        booster: Model whose ``predict(X)`` returns one score per row as an
            array supporting ``>=`` and ``.astype`` (assumed to be the
            positive-class probability — confirm for non-binary objectives).
        X: Feature matrix passed straight to ``booster.predict``.
        y_true: Ground-truth binary labels; a pandas Series, NumPy array, or
            list with one entry per row of ``X``.
        dataset_name: Label used in the printed report header.
        threshold: Cut-off applied to the scores; rows with
            ``y_prob >= threshold`` are predicted positive. The default of 0.5
            reproduces the previous fixed behaviour.

    Returns:
        ``(acc, precision, recall, f1, result_df)`` where ``result_df`` has the
        columns ``y_true``, ``y_prob`` and ``y_pred``.
    """
    y_prob = booster.predict(X)
    y_pred = (y_prob >= threshold).astype(int)

    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 reports 0 (instead of warning) when a class is never predicted.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"\nEvaluation on {dataset_name}:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")

    # Series expose ``.values`` (which discards the index so rows align by
    # position); plain arrays and lists are used as-is.
    y_true_values = getattr(y_true, "values", y_true)
    result_df = pd.DataFrame(
        {"y_true": y_true_values, "y_prob": y_prob, "y_pred": y_pred}
    )

    return acc, precision, recall, f1, result_df


# Only the held-out test split is evaluated; the train/validation calls are
# kept below for reference.
# predict_and_eval(booster, X_train, y_train, "Train Set")
# predict_and_eval(booster, X_valid, y_valid, "Validation Set")
# predict_and_eval(booster, X_test, y_test, "Test Set")
acc, precision, recall, f1, df_result = predict_and_eval(
    booster=booster,
    X=X_test,
    y_true=y_test,
    dataset_name="Test Set",
)


Evaluation on Test Set:
Accuracy:  0.9427
Precision: 0.1905
Recall:    0.0629
F1 Score:  0.0946
