In [1]:
import pandas as pd
import numpy as np  # noqa
import optuna
import time
import logging  # noqa
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# Enable logging
optuna.logging.set_verbosity(optuna.logging.INFO)


class TimingCallback:
    def __init__(self):
        self.start_time = None

    def __call__(self, study, trial):
        if self.start_time is None:
            self.start_time = time.time()
        else:
            elapsed_time = time.time() - self.start_time
            print(f"Trial {trial.number} finished in {elapsed_time:.2f} seconds.")
            self.start_time = time.time()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_dir = "/data3/lsf/Pein/Power-Prediction/data/"
train_data_selected = pd.read_csv(data_dir + "train_farm_92.csv")
test_data_selected = pd.read_csv(data_dir + "test_farm_92.csv")

# Define features and target
features = [
    col
    for col in train_data_selected.columns
    if col not in ["time", "lead_hour", "power"]
]
X_train = train_data_selected[features]
y_train = train_data_selected["power"]
X_test = test_data_selected[features]
y_test = test_data_selected["power"]

### Adding time features

In [3]:
# Convert 'time' column to datetime
train_data_selected["time"] = pd.to_datetime(train_data_selected["time"])
test_data_selected["time"] = pd.to_datetime(test_data_selected["time"])

In [4]:
# Function to add time features
def add_time_features(df):
    df["time"] = pd.to_datetime(df["time"])

    # Existing time features
    df["hour"] = df["time"].dt.hour
    df["quarter_hour"] = df["time"].dt.minute // 15

    # New time features
    df["day"] = df["time"].dt.day
    df["day_in_week"] = df["time"].dt.weekday

    # Sine and cosine transformations
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

    df["quarter_hour_sin"] = np.sin(2 * np.pi * df["quarter_hour"] / 4)
    df["quarter_hour_cos"] = np.cos(2 * np.pi * df["quarter_hour"] / 4)

    df["day_sin"] = np.sin(2 * np.pi * df["day"] / 31)
    df["day_cos"] = np.cos(2 * np.pi * df["day"] / 31)

    df["day_in_week_sin"] = np.sin(2 * np.pi * df["day_in_week"] / 7)
    df["day_in_week_cos"] = np.cos(2 * np.pi * df["day_in_week"] / 7)

    return df

In [5]:
# Add time features to both train and test data
train_data_selected = add_time_features(train_data_selected)
test_data_selected = add_time_features(test_data_selected)

In [6]:
# Define features including the new time features
time_features = [
    "hour",
    "quarter_hour",
    "day",
    "day_in_week",
    "hour_sin",
    "hour_cos",
    "quarter_hour_sin",
    "quarter_hour_cos",
    "day_sin",
    "day_cos",
    "day_in_week_sin",
    "day_in_week_cos",
]

In [7]:
all_features = features + time_features

X_train = train_data_selected[all_features]
X_test = test_data_selected[all_features]

In [8]:
def objective(
    trial,
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    use_test_for_validation_flag=False,
):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 10, 80),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "n_estimators": trial.suggest_int("n_estimators", 500, 1500),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "num_threads": 16,
        "seed": 42,
    }

    model = LGBMRegressor(**params)

    if use_test_for_validation_flag and X_test is not None and y_test is not None:
        X_val_split, y_val_split = X_test, y_test
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val_split, y_val_split)],
            eval_metric="rmse",
            callbacks=[early_stopping(stopping_rounds=15), log_evaluation(period=500)],
        )
        preds = model.predict(X_val_split)
    else:
        X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )
        model.fit(
            X_train_split,
            y_train_split,
            eval_set=[(X_val_split, y_val_split)],
            eval_metric="rmse",
            callbacks=[early_stopping(stopping_rounds=15), log_evaluation(period=500)],
        )
        preds = model.predict(X_val_split)

    # Calculate RMSE
    rmse = root_mean_squared_error(y_val_split, preds)
    return rmse

In [9]:
# Create the study and optimize
use_test_for_validation_flag = False
study = optuna.create_study(direction="minimize")
study.optimize(
    lambda trial: objective(
        trial,
        X_train,
        y_train,
        X_test,
        y_test,
        use_test_for_validation_flag=use_test_for_validation_flag,
    ),
    n_trials=50,
    callbacks=[TimingCallback()],
)

# Get the best parameters
best_params = study.best_params
print("Best parameters found: ", best_params)

[I 2024-08-02 15:26:13,776] A new study created in memory with name: no-name-3638510d-4c6e-4ca8-aa46-0683768eebbd


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002369 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13049
[1000]	valid_0's rmse: 12589.1
Did not meet early stopping. Best iteration is:
[1050]	valid_0's rmse: 12569.1


[I 2024-08-02 15:26:17,490] Trial 0 finished with value: 12569.131255407501 and parameters: {'num_leaves': 70, 'max_depth': 10, 'learning_rate': 0.11089815188227202, 'n_estimators': 1050, 'min_child_samples': 19, 'subsample': 0.8658201320230681, 'colsample_bytree': 0.5497136054577744}. Best is trial 0 with value: 12569.131255407501.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002803 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 12895.6
[1000]	valid_0's rmse: 12526.6


[I 2024-08-02 15:26:23,255] Trial 1 finished with value: 12467.48954665738 and parameters: {'num_leaves': 77, 'max_depth': 40, 'learning_rate': 0.09744597713054322, 'n_estimators': 1401, 'min_child_samples': 27, 'subsample': 0.9853734683934467, 'colsample_bytree': 0.6710221845532004}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1401]	valid_0's rmse: 12467.5
Trial 1 finished in 5.77 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002898 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds


[I 2024-08-02 15:26:24,278] Trial 2 finished with value: 14300.647643453474 and parameters: {'num_leaves': 22, 'max_depth': 36, 'learning_rate': 0.15217879957993058, 'n_estimators': 595, 'min_child_samples': 15, 'subsample': 0.30786406666671096, 'colsample_bytree': 0.8082653324335398}. Best is trial 1 with value: 12467.48954665738.


[500]	valid_0's rmse: 14682.1
Did not meet early stopping. Best iteration is:
[595]	valid_0's rmse: 14300.6
Trial 2 finished in 1.02 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002553 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 14355.8


[I 2024-08-02 15:26:25,639] Trial 3 finished with value: 13611.947108142134 and parameters: {'num_leaves': 25, 'max_depth': 28, 'learning_rate': 0.15869235757004366, 'n_estimators': 808, 'min_child_samples': 27, 'subsample': 0.8968001172764388, 'colsample_bytree': 0.6718080246525302}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[808]	valid_0's rmse: 13611.9
Trial 3 finished in 1.36 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 20141.5
[1000]	valid_0's rmse: 16309.1


[I 2024-08-02 15:26:28,069] Trial 4 finished with value: 15444.537806686747 and parameters: {'num_leaves': 24, 'max_depth': 23, 'learning_rate': 0.03194850747601882, 'n_estimators': 1237, 'min_child_samples': 10, 'subsample': 0.40592980870301965, 'colsample_bytree': 0.9923223590736706}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1237]	valid_0's rmse: 15444.5
Trial 4 finished in 2.43 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002971 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 29051.3


[I 2024-08-02 15:26:29,616] Trial 5 finished with value: 25116.26672555796 and parameters: {'num_leaves': 20, 'max_depth': 20, 'learning_rate': 0.012384827378516707, 'n_estimators': 839, 'min_child_samples': 44, 'subsample': 0.7441613954735145, 'colsample_bytree': 0.9153920157961146}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[839]	valid_0's rmse: 25116.3
Trial 5 finished in 1.55 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds


[I 2024-08-02 15:26:31,724] Trial 6 finished with value: 13185.789648544514 and parameters: {'num_leaves': 68, 'max_depth': 27, 'learning_rate': 0.09645686646573812, 'n_estimators': 540, 'min_child_samples': 39, 'subsample': 0.9567761677327997, 'colsample_bytree': 0.6625550389143045}. Best is trial 1 with value: 12467.48954665738.


[500]	valid_0's rmse: 13280.9
Did not meet early stopping. Best iteration is:
[540]	valid_0's rmse: 13185.8
Trial 6 finished in 2.11 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13445.2


[I 2024-08-02 15:26:34,245] Trial 7 finished with value: 13120.516955997648 and parameters: {'num_leaves': 67, 'max_depth': 38, 'learning_rate': 0.08500850453012872, 'n_estimators': 657, 'min_child_samples': 50, 'subsample': 0.4847857596768431, 'colsample_bytree': 0.6648835886130118}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[657]	valid_0's rmse: 13120.5
Trial 7 finished in 2.52 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 14309.8


[I 2024-08-02 15:26:35,779] Trial 8 finished with value: 13529.19891565406 and parameters: {'num_leaves': 23, 'max_depth': 10, 'learning_rate': 0.19514142528638068, 'n_estimators': 984, 'min_child_samples': 31, 'subsample': 0.7909646784748154, 'colsample_bytree': 0.7944317914426182}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[984]	valid_0's rmse: 13529.2
Trial 8 finished in 1.53 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002522 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13305.1


[I 2024-08-02 15:26:39,598] Trial 9 finished with value: 13234.809115552218 and parameters: {'num_leaves': 80, 'max_depth': 42, 'learning_rate': 0.15411067971520254, 'n_estimators': 1072, 'min_child_samples': 10, 'subsample': 0.4731613500143968, 'colsample_bytree': 0.9719285609930957}. Best is trial 1 with value: 12467.48954665738.


Early stopping, best iteration is:
[801]	valid_0's rmse: 13234.8
Trial 9 finished in 3.82 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002401 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 14209.5
[1000]	valid_0's rmse: 12906.9


[I 2024-08-02 15:26:43,471] Trial 10 finished with value: 12483.77234582719 and parameters: {'num_leaves': 49, 'max_depth': 50, 'learning_rate': 0.05948640286387858, 'n_estimators': 1455, 'min_child_samples': 26, 'subsample': 0.6036678039187764, 'colsample_bytree': 0.5216658058415873}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1455]	valid_0's rmse: 12483.8
Trial 10 finished in 3.87 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002311 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 14061
[1000]	valid_0's rmse: 12873.6


[I 2024-08-02 15:26:47,102] Trial 11 finished with value: 12514.801060348298 and parameters: {'num_leaves': 46, 'max_depth': 50, 'learning_rate': 0.07159755228441873, 'n_estimators': 1467, 'min_child_samples': 30, 'subsample': 0.6220460229816722, 'colsample_bytree': 0.5002352171707015}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1466]	valid_0's rmse: 12514.8
Trial 11 finished in 3.63 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 15038.4
[1000]	valid_0's rmse: 13409.8


[I 2024-08-02 15:26:50,975] Trial 12 finished with value: 12888.396210419854 and parameters: {'num_leaves': 47, 'max_depth': 50, 'learning_rate': 0.04961788512974074, 'n_estimators': 1473, 'min_child_samples': 23, 'subsample': 0.6375586003140211, 'colsample_bytree': 0.5773320053766382}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1473]	valid_0's rmse: 12888.4
Trial 12 finished in 3.87 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13448.6
[1000]	valid_0's rmse: 12908.6


[I 2024-08-02 15:26:54,757] Trial 13 finished with value: 12811.7091302455 and parameters: {'num_leaves': 53, 'max_depth': 44, 'learning_rate': 0.12043545818661702, 'n_estimators': 1302, 'min_child_samples': 37, 'subsample': 0.5705882461127758, 'colsample_bytree': 0.6104627675479393}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1301]	valid_0's rmse: 12811.7
Trial 13 finished in 3.78 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002761 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 15312.7
[1000]	valid_0's rmse: 13512.5


[I 2024-08-02 15:26:57,874] Trial 14 finished with value: 13119.695326868705 and parameters: {'num_leaves': 38, 'max_depth': 45, 'learning_rate': 0.05576479282440036, 'n_estimators': 1283, 'min_child_samples': 25, 'subsample': 0.7326174945467325, 'colsample_bytree': 0.7320552989821543}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1283]	valid_0's rmse: 13119.7
Trial 14 finished in 3.12 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13214.3
[1000]	valid_0's rmse: 12716.5


[I 2024-08-02 15:27:02,061] Trial 15 finished with value: 12617.585619825404 and parameters: {'num_leaves': 58, 'max_depth': 33, 'learning_rate': 0.12495284991214384, 'n_estimators': 1384, 'min_child_samples': 36, 'subsample': 0.983555745907763, 'colsample_bytree': 0.504608908507624}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1384]	valid_0's rmse: 12617.6
Trial 15 finished in 4.19 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002965 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 14505.5
[1000]	valid_0's rmse: 13124.4


[I 2024-08-02 15:27:04,757] Trial 16 finished with value: 12981.181119216173 and parameters: {'num_leaves': 37, 'max_depth': 40, 'learning_rate': 0.07211772581181951, 'n_estimators': 1144, 'min_child_samples': 21, 'subsample': 0.538393958723097, 'colsample_bytree': 0.7363034790102884}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1144]	valid_0's rmse: 12981.2
Trial 16 finished in 2.70 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 26532
[1000]	valid_0's rmse: 21247.6


[I 2024-08-02 15:27:06,098] Trial 17 finished with value: 18698.063011639144 and parameters: {'num_leaves': 10, 'max_depth': 46, 'learning_rate': 0.04157639703387868, 'n_estimators': 1496, 'min_child_samples': 33, 'subsample': 0.6991495649719334, 'colsample_bytree': 0.6190107920245549}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1496]	valid_0's rmse: 18698.1
Trial 17 finished in 1.34 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002875 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 15992.2
[1000]	valid_0's rmse: 13958.4


[I 2024-08-02 15:27:12,007] Trial 18 finished with value: 13589.545565435048 and parameters: {'num_leaves': 79, 'max_depth': 35, 'learning_rate': 0.017985497150686945, 'n_estimators': 1173, 'min_child_samples': 16, 'subsample': 0.8288439422032393, 'colsample_bytree': 0.8615563204266106}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1173]	valid_0's rmse: 13589.5
Trial 18 finished in 5.91 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002366 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13669.5
[1000]	valid_0's rmse: 12715.2


[I 2024-08-02 15:27:16,323] Trial 19 finished with value: 12492.628689769295 and parameters: {'num_leaves': 59, 'max_depth': 47, 'learning_rate': 0.06587479870316919, 'n_estimators': 1369, 'min_child_samples': 27, 'subsample': 0.36970596327485145, 'colsample_bytree': 0.5591639734042693}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1369]	valid_0's rmse: 12492.6
Trial 19 finished in 4.32 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 14065.1


[I 2024-08-02 15:27:18,626] Trial 20 finished with value: 12990.003392189064 and parameters: {'num_leaves': 39, 'max_depth': 32, 'learning_rate': 0.0869595407146369, 'n_estimators': 930, 'min_child_samples': 41, 'subsample': 0.5751895233441883, 'colsample_bytree': 0.6986378801341351}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[930]	valid_0's rmse: 12990
Trial 20 finished in 2.30 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13923.6
[1000]	valid_0's rmse: 12857.1


[I 2024-08-02 15:27:23,007] Trial 21 finished with value: 12557.504278856839 and parameters: {'num_leaves': 59, 'max_depth': 48, 'learning_rate': 0.059854083727344354, 'n_estimators': 1374, 'min_child_samples': 28, 'subsample': 0.3062979957513685, 'colsample_bytree': 0.5597329826125071}. Best is trial 1 with value: 12467.48954665738.


Did not meet early stopping. Best iteration is:
[1374]	valid_0's rmse: 12557.5
Trial 21 finished in 4.38 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002837 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13317.1
[1000]	valid_0's rmse: 12569.7


[I 2024-08-02 15:27:27,543] Trial 22 finished with value: 12398.26688815675 and parameters: {'num_leaves': 62, 'max_depth': 41, 'learning_rate': 0.07749946781323634, 'n_estimators': 1364, 'min_child_samples': 25, 'subsample': 0.38639583620383705, 'colsample_bytree': 0.610552081175304}. Best is trial 22 with value: 12398.26688815675.


Did not meet early stopping. Best iteration is:
[1363]	valid_0's rmse: 12398.3
Trial 22 finished in 4.54 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 12993.4
[1000]	valid_0's rmse: 12592.4


[I 2024-08-02 15:27:32,986] Trial 23 finished with value: 12526.919599656294 and parameters: {'num_leaves': 75, 'max_depth': 41, 'learning_rate': 0.1007613043487156, 'n_estimators': 1391, 'min_child_samples': 33, 'subsample': 0.4426370246982536, 'colsample_bytree': 0.6161431878359356}. Best is trial 22 with value: 12398.26688815675.


Did not meet early stopping. Best iteration is:
[1391]	valid_0's rmse: 12526.9
Trial 23 finished in 5.44 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 12981.8
[1000]	valid_0's rmse: 12710.9


[I 2024-08-02 15:27:37,185] Trial 24 finished with value: 12684.76142113357 and parameters: {'num_leaves': 65, 'max_depth': 38, 'learning_rate': 0.1328841413471086, 'n_estimators': 1208, 'min_child_samples': 23, 'subsample': 0.6946475751182851, 'colsample_bytree': 0.6210805218838149}. Best is trial 22 with value: 12398.26688815675.


Did not meet early stopping. Best iteration is:
[1208]	valid_0's rmse: 12684.8
Trial 24 finished in 4.20 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13153.7
[1000]	valid_0's rmse: 12654


[I 2024-08-02 15:27:42,074] Trial 25 finished with value: 12572.178705084969 and parameters: {'num_leaves': 75, 'max_depth': 43, 'learning_rate': 0.0836994970675646, 'n_estimators': 1308, 'min_child_samples': 18, 'subsample': 0.5167045221675353, 'colsample_bytree': 0.5292363553667396}. Best is trial 22 with value: 12398.26688815675.


Did not meet early stopping. Best iteration is:
[1308]	valid_0's rmse: 12572.2
Trial 25 finished in 4.89 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002446 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 15762.8
[1000]	valid_0's rmse: 13745.3


[I 2024-08-02 15:27:46,221] Trial 26 finished with value: 13132.079058598465 and parameters: {'num_leaves': 51, 'max_depth': 39, 'learning_rate': 0.03454834406402968, 'n_estimators': 1420, 'min_child_samples': 24, 'subsample': 0.38590891598886173, 'colsample_bytree': 0.58593422711205}. Best is trial 22 with value: 12398.26688815675.


Did not meet early stopping. Best iteration is:
[1420]	valid_0's rmse: 13132.1
Trial 26 finished in 4.15 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002705 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13369.1
[1000]	valid_0's rmse: 12834.6


[I 2024-08-02 15:27:49,965] Trial 27 finished with value: 12792.063307696631 and parameters: {'num_leaves': 61, 'max_depth': 16, 'learning_rate': 0.105585458765098, 'n_estimators': 1107, 'min_child_samples': 30, 'subsample': 0.9175085097773359, 'colsample_bytree': 0.6993560533797014}. Best is trial 22 with value: 12398.26688815675.


Did not meet early stopping. Best iteration is:
[1107]	valid_0's rmse: 12792.1
Trial 27 finished in 3.74 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002639 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13711.6
[1000]	valid_0's rmse: 12921.4


[I 2024-08-02 15:27:53,842] Trial 28 finished with value: 12768.343315419974 and parameters: {'num_leaves': 53, 'max_depth': 31, 'learning_rate': 0.08092515856433569, 'n_estimators': 1265, 'min_child_samples': 21, 'subsample': 0.7863609058365051, 'colsample_bytree': 0.7718997153145051}. Best is trial 22 with value: 12398.26688815675.


Did not meet early stopping. Best iteration is:
[1265]	valid_0's rmse: 12768.3
Trial 28 finished in 3.88 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13100.7
[1000]	valid_0's rmse: 12763.9


[I 2024-08-02 15:27:58,509] Trial 29 finished with value: 12722.99761775472 and parameters: {'num_leaves': 72, 'max_depth': 36, 'learning_rate': 0.11270231872126586, 'n_estimators': 1322, 'min_child_samples': 19, 'subsample': 0.8579541163553933, 'colsample_bytree': 0.5380184075174455}. Best is trial 22 with value: 12398.26688815675.


Did not meet early stopping. Best iteration is:
[1322]	valid_0's rmse: 12723
Trial 29 finished in 4.67 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002793 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13127.9
[1000]	valid_0's rmse: 12840.9


[I 2024-08-02 15:28:02,957] Trial 30 finished with value: 12810.086642230995 and parameters: {'num_leaves': 64, 'max_depth': 47, 'learning_rate': 0.1407840577600997, 'n_estimators': 1444, 'min_child_samples': 15, 'subsample': 0.6084753135348708, 'colsample_bytree': 0.6412179679737787}. Best is trial 22 with value: 12398.26688815675.


Early stopping, best iteration is:
[1320]	valid_0's rmse: 12810.1
Trial 30 finished in 4.45 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13644.3
[1000]	valid_0's rmse: 12634.1


[I 2024-08-02 15:28:06,980] Trial 31 finished with value: 12384.460169832115 and parameters: {'num_leaves': 56, 'max_depth': 47, 'learning_rate': 0.0665325031780056, 'n_estimators': 1361, 'min_child_samples': 27, 'subsample': 0.4010024422697113, 'colsample_bytree': 0.5649823791588102}. Best is trial 31 with value: 12384.460169832115.


Did not meet early stopping. Best iteration is:
[1358]	valid_0's rmse: 12384.5
Trial 31 finished in 4.02 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002427 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 14551.3
[1000]	valid_0's rmse: 13131.5


[I 2024-08-02 15:28:10,882] Trial 32 finished with value: 12787.175375283128 and parameters: {'num_leaves': 54, 'max_depth': 43, 'learning_rate': 0.049184470040085554, 'n_estimators': 1348, 'min_child_samples': 26, 'subsample': 0.34663634137734467, 'colsample_bytree': 0.5831924188073936}. Best is trial 31 with value: 12384.460169832115.


Did not meet early stopping. Best iteration is:
[1348]	valid_0's rmse: 12787.2
Trial 32 finished in 3.90 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002470 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 14160.8
[1000]	valid_0's rmse: 13152.7


[I 2024-08-02 15:28:14,069] Trial 33 finished with value: 12847.477889596985 and parameters: {'num_leaves': 42, 'max_depth': 50, 'learning_rate': 0.09470655752949365, 'n_estimators': 1500, 'min_child_samples': 33, 'subsample': 0.4289762483212616, 'colsample_bytree': 0.5204913026362555}. Best is trial 31 with value: 12384.460169832115.


[1500]	valid_0's rmse: 12847.5
Did not meet early stopping. Best iteration is:
[1500]	valid_0's rmse: 12847.5
Trial 33 finished in 3.19 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002623 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 15010.9
[1000]	valid_0's rmse: 13290.7


[I 2024-08-02 15:28:16,302] Trial 34 finished with value: 12980.216154490072 and parameters: {'num_leaves': 32, 'max_depth': 45, 'learning_rate': 0.07282692833547261, 'n_estimators': 1233, 'min_child_samples': 28, 'subsample': 0.3502449276683011, 'colsample_bytree': 0.588335593614575}. Best is trial 31 with value: 12384.460169832115.


Did not meet early stopping. Best iteration is:
[1233]	valid_0's rmse: 12980.2
Trial 34 finished in 2.23 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002886 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 17417.8
[1000]	valid_0's rmse: 14710.7


[I 2024-08-02 15:28:20,324] Trial 35 finished with value: 13787.57315126125 and parameters: {'num_leaves': 49, 'max_depth': 48, 'learning_rate': 0.024157458213696623, 'n_estimators': 1422, 'min_child_samples': 20, 'subsample': 0.4711835234775872, 'colsample_bytree': 0.6916074237387558}. Best is trial 31 with value: 12384.460169832115.


Did not meet early stopping. Best iteration is:
[1422]	valid_0's rmse: 13787.6
Trial 35 finished in 4.02 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13287.4
[1000]	valid_0's rmse: 12452.7


[I 2024-08-02 15:28:24,687] Trial 36 finished with value: 12340.962372747368 and parameters: {'num_leaves': 72, 'max_depth': 28, 'learning_rate': 0.060507088620476795, 'n_estimators': 1188, 'min_child_samples': 24, 'subsample': 0.6692733668369313, 'colsample_bytree': 0.6441282814276477}. Best is trial 36 with value: 12340.962372747368.


Did not meet early stopping. Best iteration is:
[1188]	valid_0's rmse: 12341
Trial 36 finished in 4.36 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 14108
[1000]	valid_0's rmse: 12846.4


[I 2024-08-02 15:28:29,153] Trial 37 finished with value: 12660.158026353469 and parameters: {'num_leaves': 71, 'max_depth': 28, 'learning_rate': 0.04025446350497314, 'n_estimators': 1183, 'min_child_samples': 23, 'subsample': 0.9226675269591347, 'colsample_bytree': 0.6469932827627669}. Best is trial 36 with value: 12340.962372747368.


Did not meet early stopping. Best iteration is:
[1183]	valid_0's rmse: 12660.2
Trial 37 finished in 4.47 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 12824.7
[1000]	valid_0's rmse: 12555.2
Did not meet early stopping. Best iteration is:
[1033]	valid_0's rmse: 12551.4


[I 2024-08-02 15:28:33,265] Trial 38 finished with value: 12551.439202458709 and parameters: {'num_leaves': 75, 'max_depth': 24, 'learning_rate': 0.1102441943090654, 'n_estimators': 1033, 'min_child_samples': 13, 'subsample': 0.6783895835061355, 'colsample_bytree': 0.6703507643551413}. Best is trial 36 with value: 12340.962372747368.


Trial 38 finished in 4.11 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13546.6


[I 2024-08-02 15:28:36,567] Trial 39 finished with value: 13419.44286129791 and parameters: {'num_leaves': 68, 'max_depth': 24, 'learning_rate': 0.1802347616909698, 'n_estimators': 1129, 'min_child_samples': 29, 'subsample': 0.40333504514547497, 'colsample_bytree': 0.718638673402613}. Best is trial 36 with value: 12340.962372747368.


Early stopping, best iteration is:
[878]	valid_0's rmse: 13419.4
Trial 39 finished in 3.30 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002795 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13124.8


[I 2024-08-02 15:28:39,101] Trial 40 finished with value: 12800.962410611817 and parameters: {'num_leaves': 65, 'max_depth': 34, 'learning_rate': 0.0934036947123344, 'n_estimators': 711, 'min_child_samples': 17, 'subsample': 0.5164885519205445, 'colsample_bytree': 0.6449897319955192}. Best is trial 36 with value: 12340.962372747368.


Did not meet early stopping. Best iteration is:
[711]	valid_0's rmse: 12801
Trial 40 finished in 2.53 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13872.8
[1000]	valid_0's rmse: 12785.9


[I 2024-08-02 15:28:42,796] Trial 41 finished with value: 12573.38928935814 and parameters: {'num_leaves': 56, 'max_depth': 28, 'learning_rate': 0.06424675265945488, 'n_estimators': 1257, 'min_child_samples': 26, 'subsample': 0.3355919461401564, 'colsample_bytree': 0.55937299143062}. Best is trial 36 with value: 12340.962372747368.


Did not meet early stopping. Best iteration is:
[1257]	valid_0's rmse: 12573.4
Trial 41 finished in 3.69 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002673 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 14062.5
[1000]	valid_0's rmse: 12862.3


[I 2024-08-02 15:28:47,525] Trial 42 finished with value: 12535.651028892336 and parameters: {'num_leaves': 63, 'max_depth': 37, 'learning_rate': 0.051889342042208464, 'n_estimators': 1423, 'min_child_samples': 31, 'subsample': 0.7714832703300146, 'colsample_bytree': 0.6036756343349058}. Best is trial 36 with value: 12340.962372747368.


Did not meet early stopping. Best iteration is:
[1423]	valid_0's rmse: 12535.7
Trial 42 finished in 4.73 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002318 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13213
[1000]	valid_0's rmse: 12659.5


[I 2024-08-02 15:28:52,291] Trial 43 finished with value: 12543.199536373744 and parameters: {'num_leaves': 72, 'max_depth': 30, 'learning_rate': 0.07977132197354868, 'n_estimators': 1323, 'min_child_samples': 24, 'subsample': 0.8276035982690766, 'colsample_bytree': 0.554535245155293}. Best is trial 36 with value: 12340.962372747368.


Did not meet early stopping. Best iteration is:
[1323]	valid_0's rmse: 12543.2
Trial 43 finished in 4.77 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002733 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 12924.3
[1000]	valid_0's rmse: 12383.3


[I 2024-08-02 15:28:58,030] Trial 44 finished with value: 12263.373537879055 and parameters: {'num_leaves': 79, 'max_depth': 26, 'learning_rate': 0.07455914521078384, 'n_estimators': 1446, 'min_child_samples': 22, 'subsample': 0.43544652153306895, 'colsample_bytree': 0.631599023200065}. Best is trial 44 with value: 12263.373537879055.


Did not meet early stopping. Best iteration is:
[1446]	valid_0's rmse: 12263.4
Trial 44 finished in 5.74 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002730 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13077.1
Did not meet early stopping. Best iteration is:
[908]	valid_0's rmse: 12534.1


[I 2024-08-02 15:29:01,761] Trial 45 finished with value: 12534.130104522701 and parameters: {'num_leaves': 78, 'max_depth': 18, 'learning_rate': 0.07122994897707878, 'n_estimators': 908, 'min_child_samples': 22, 'subsample': 0.4340290482350887, 'colsample_bytree': 0.6428307306241504}. Best is trial 44 with value: 12263.373537879055.


Trial 45 finished in 3.73 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13202.4
[1000]	valid_0's rmse: 12640.1


[I 2024-08-02 15:29:06,434] Trial 46 finished with value: 12552.749522742388 and parameters: {'num_leaves': 69, 'max_depth': 21, 'learning_rate': 0.09137800942761012, 'n_estimators': 1237, 'min_child_samples': 49, 'subsample': 0.4627423542451008, 'colsample_bytree': 0.7602333452636966}. Best is trial 44 with value: 12263.373537879055.


Did not meet early stopping. Best iteration is:
[1237]	valid_0's rmse: 12552.7
Trial 46 finished in 4.67 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 13070.6
[1000]	valid_0's rmse: 12617.3


[I 2024-08-02 15:29:11,525] Trial 47 finished with value: 12542.703760075628 and parameters: {'num_leaves': 80, 'max_depth': 30, 'learning_rate': 0.0782079121720606, 'n_estimators': 1336, 'min_child_samples': 13, 'subsample': 0.4950134495715491, 'colsample_bytree': 0.68465883168935}. Best is trial 44 with value: 12263.373537879055.


Did not meet early stopping. Best iteration is:
[1336]	valid_0's rmse: 12542.7
Trial 47 finished in 5.09 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002561 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 14156
[1000]	valid_0's rmse: 12977.9
Did not meet early stopping. Best iteration is:
[1409]	valid_0's rmse: 12653


[I 2024-08-02 15:29:17,234] Trial 48 finished with value: 12653.046629387734 and parameters: {'num_leaves': 76, 'max_depth': 26, 'learning_rate': 0.04172143771147141, 'n_estimators': 1409, 'min_child_samples': 35, 'subsample': 0.41229188722947296, 'colsample_bytree': 0.6645197754639679}. Best is trial 44 with value: 12263.373537879055.


Trial 48 finished in 5.71 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002467 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 100888.355472
Training until validation scores don't improve for 15 rounds
[500]	valid_0's rmse: 12895.4
[1000]	valid_0's rmse: 12573.9


[I 2024-08-02 15:29:22,708] Trial 49 finished with value: 12525.717246476153 and parameters: {'num_leaves': 72, 'max_depth': 26, 'learning_rate': 0.11805269103023541, 'n_estimators': 1463, 'min_child_samples': 25, 'subsample': 0.5562182418998072, 'colsample_bytree': 0.7198213659234672}. Best is trial 44 with value: 12263.373537879055.


Early stopping, best iteration is:
[1419]	valid_0's rmse: 12525.7
Trial 49 finished in 5.47 seconds.
Best parameters found:  {'num_leaves': 79, 'max_depth': 26, 'learning_rate': 0.07455914521078384, 'n_estimators': 1446, 'min_child_samples': 22, 'subsample': 0.43544652153306895, 'colsample_bytree': 0.631599023200065}


In [10]:
# best_params = {
#     "num_leaves": 70,
#     "max_depth": 36,
#     "learning_rate": 0.09304350950671668,
#     "n_estimators": 1158,
#     "min_child_samples": 18,
#     "subsample": 0.579731306036922,
#     "colsample_bytree": 0.8511910376678277,
# }

In [11]:
print("Best parameters found: ", best_params)
best_params["num_threads"] = 16

# Train the model with the best parameters
best_model = LGBMRegressor(**best_params)
best_model.fit(X_train, y_train)


# Extract feature importances
feature_importances = best_model.feature_importances_

# Create a DataFrame to hold the features and their importances
importance_df = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": feature_importances}
)

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by="Importance", ascending=False)

Best parameters found:  {'num_leaves': 79, 'max_depth': 26, 'learning_rate': 0.07455914521078384, 'n_estimators': 1446, 'min_child_samples': 22, 'subsample': 0.43544652153306895, 'colsample_bytree': 0.631599023200065}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 14592, number of used features: 101
[LightGBM] [Info] Start training from score 101382.392957


In [12]:
# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE on the test set: {rmse:.4f}")

RMSE on the test set: 65581.0135


In [13]:
# save important_features to csv file
if use_test_for_validation_flag:
    importance_df.to_csv("farm_important_features_use_test_for_validation.csv", index=False)
else:
    importance_df.to_csv("farm_important_features.csv", index=False)

In [14]:
# Set the threshold for feature importance
importance_threshold = 200

# Select features with importance greater than or equal to the threshold
selected_features = importance_df[importance_df["Importance"] >= importance_threshold][
    "Feature"
].tolist()

# Separate selected features into original input features and time features
selected_input_features = [
    feature for feature in selected_features if feature not in time_features
]
selected_time_features = [
    feature for feature in selected_features if feature in time_features
]

In [15]:
# Combine the filtered features with the power column and time features
train_data_final = pd.concat(
    [
        train_data_selected["time"],
        X_train[selected_input_features],
        train_data_selected["power"],
        train_data_selected[time_features],
    ],
    axis=1,
)
test_data_final = pd.concat(
    [
        test_data_selected["time"],
        X_test[selected_input_features],
        test_data_selected["power"],
        test_data_selected[time_features],
    ],
    axis=1,
)

# Ensure the final order of columns
final_columns = ["time"] + selected_input_features + ["power"] + time_features
train_data_final = train_data_final[final_columns]
test_data_final = test_data_final[final_columns]

# Display the new shapes of train_data_final and test_data_final
print("Shape of train_data_final:", train_data_final.shape)
print("Shape of test_data_final:", test_data_final.shape)
print("Selected input features:", selected_input_features)
print(f"Train columns : {train_data_final.columns}")

Shape of train_data_final: (14592, 97)
Shape of test_data_final: (2880, 97)
Selected input features: ['lgws', 'mgws', 'altitude', 'azimuth', 'trpp', 'blh', 'mudlp', 'lcc', 'deg0l', 'mcc', 'kx', 'totalx', 'tcc', 'viwvn', 'tcsw', 'flsr', 'fg10', 'capes', 'ilspf', 'ishf', 'vimd', 'viwve', 'bld', 'degm10l', 'u100', 'p3020', 'gwd', 'ws200', 'ewss', 'mld', 'cape', 'ws10', 'ws100', 'lspf', 'hcc', 'litoti', 'i10fg', 'u200', 'u10', 'u10n', 'mlcape100', 'hwbt0', 'sund', 'msl', 'hwbt1', 'dsrp', 'nsss', 'fdir', 'mucape', 'mlcape50', 'str', 'sshf', 'v200', 'ttrc', 'tcw', 'cdir', 'v100', 'v10', 'ttr', 'slhf', 'v10n', 'par', 'strc', 'd2m', 'tsr', 't2m', 'parcs', 'ssrd', 'uvb', 'sp', 'tisr', 'ssrdc', 'ssrc', 'strd', 'ssr', 'tsrc', 'tcwv', 'mn2t', 'mx2t', 'lblt', 'sst', 'skt', 'strdc']
Train columns : Index(['time', 'lgws', 'mgws', 'altitude', 'azimuth', 'trpp', 'blh', 'mudlp',
       'lcc', 'deg0l', 'mcc', 'kx', 'totalx', 'tcc', 'viwvn', 'tcsw', 'flsr',
       'fg10', 'capes', 'ilspf', 'ishf', 'vimd',

In [16]:
# Save the final datasets to CSV
train_data_final.to_csv(data_dir + "train_farm_97_withTime.csv", index=False)
test_data_final.to_csv(data_dir + "test_farm_97_withTime.csv", index=False)
print("Final datasets saved to CSV.")

Final datasets saved to CSV.
