In [390]:
import pandas as pd
import numpy as np  # noqa
import optuna
import time
import logging  # noqa
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# Enable logging
optuna.logging.set_verbosity(optuna.logging.INFO)


class TimingCallback:
    def __init__(self):
        self.start_time = None

    def __call__(self, study, trial):
        if self.start_time is None:
            self.start_time = time.time()
        else:
            elapsed_time = time.time() - self.start_time
            print(f"Trial {trial.number} finished in {elapsed_time:.2f} seconds.")
            self.start_time = time.time()

In [391]:
# Load data
# data_dir = "/data3/lsf/Pein/Power-Prediction/data/"

train_path = "3-train_merged_55.csv"
test_path = "3-test_merged_55.csv"

train_data_selected = pd.read_csv(train_path)
test_data_selected = pd.read_csv(test_path)

# Set the ['power'] to be non-negative
train_data_selected["power"] = train_data_selected["power"].apply(
    lambda x: x if x >= 0 else 0
)
test_data_selected["power"] = test_data_selected["power"].apply(
    lambda x: x if x >= 0 else 0
)

# Convert 'time' column to datetime
train_data_selected["time"] = pd.to_datetime(train_data_selected["time"])
test_data_selected["time"] = pd.to_datetime(test_data_selected["time"])


# Define features and target
features = [
    col
    for col in train_data_selected.columns
    if col not in ["time", "lead_hour", "initial_time", "power"]
]


def generate_past_features(df, features, seq_len=1, dilation=1):
    """
    Generate past features for a given dataframe.

    Parameters:
        df (pd.DataFrame): The input dataframe.
        features (list): List of features to generate past values for.
        seq_len (int): The length of the sequence (number of past observations).
        dilation (int): The step size between past observations.

    Returns:
        pd.DataFrame: The dataframe with past features, including 'time' and 'power' columns.
    """
    df_copy = df.copy()
    for i in range(1, seq_len):
        for feature in features:
            df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)

    # Remove rows with NaN values created due to shifting
    df_copy = df_copy.dropna().reset_index(drop=True)

    return df_copy


# Define your settings
seq_len = 36  # You can change this as needed
dilation = 1  # Default is 1, adjust as needed

# Generate past features
train_data_with_past = generate_past_features(
    train_data_selected, features, seq_len=seq_len, dilation=dilation
)
test_data_with_past = generate_past_features(
    test_data_selected, features, seq_len=seq_len, dilation=dilation
)

# Ensure 'time' and 'power' columns are retained
train_data_with_past = train_data_with_past[
    ["time", "power"] + list(train_data_with_past.columns.difference(["time", "power"]))
]
test_data_with_past = test_data_with_past[
    ["time", "power"] + list(test_data_with_past.columns.difference(["time", "power"]))
]

# Define features and target
features = [
    col
    for col in train_data_with_past.columns
    if col not in ["time", "lead_hour", "initial_time", "power"]
]


  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_copy[f"{feature}_lag_{i}"] = df_copy[feature].shift(i * dilation)
  df_c

In [392]:
print(features)
print(
    f"shape of train is {train_data_with_past.shape}, shape of test is {test_data_with_past.shape}"
)

['altitude', 'altitude_lag_1', 'altitude_lag_10', 'altitude_lag_11', 'altitude_lag_12', 'altitude_lag_13', 'altitude_lag_14', 'altitude_lag_15', 'altitude_lag_16', 'altitude_lag_17', 'altitude_lag_18', 'altitude_lag_19', 'altitude_lag_2', 'altitude_lag_20', 'altitude_lag_21', 'altitude_lag_22', 'altitude_lag_23', 'altitude_lag_24', 'altitude_lag_25', 'altitude_lag_26', 'altitude_lag_27', 'altitude_lag_28', 'altitude_lag_29', 'altitude_lag_3', 'altitude_lag_30', 'altitude_lag_31', 'altitude_lag_32', 'altitude_lag_33', 'altitude_lag_34', 'altitude_lag_35', 'altitude_lag_4', 'altitude_lag_5', 'altitude_lag_6', 'altitude_lag_7', 'altitude_lag_8', 'altitude_lag_9', 'azimuth', 'azimuth_lag_1', 'azimuth_lag_10', 'azimuth_lag_11', 'azimuth_lag_12', 'azimuth_lag_13', 'azimuth_lag_14', 'azimuth_lag_15', 'azimuth_lag_16', 'azimuth_lag_17', 'azimuth_lag_18', 'azimuth_lag_19', 'azimuth_lag_2', 'azimuth_lag_20', 'azimuth_lag_21', 'azimuth_lag_22', 'azimuth_lag_23', 'azimuth_lag_24', 'azimuth_lag_25'

### Adding time features

In [393]:
# Function to add time features
def add_time_features(df):
    df["time"] = pd.to_datetime(df["time"])

    # Existing time features
    df["hour"] = df["time"].dt.hour
    df["quarter_hour"] = df["time"].dt.minute // 15

    # New time features
    df["day"] = df["time"].dt.day
    df["day_in_week"] = df["time"].dt.weekday

    # Sine and cosine transformations
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

    df["quarter_hour_sin"] = np.sin(2 * np.pi * df["quarter_hour"] / 4)
    df["quarter_hour_cos"] = np.cos(2 * np.pi * df["quarter_hour"] / 4)

    df["day_sin"] = np.sin(2 * np.pi * df["day"] / 31)
    df["day_cos"] = np.cos(2 * np.pi * df["day"] / 31)

    df["day_in_week_sin"] = np.sin(2 * np.pi * df["day_in_week"] / 7)
    df["day_in_week_cos"] = np.cos(2 * np.pi * df["day_in_week"] / 7)

    return df

In [394]:
# # Add time features to both train and test data
train_data_selected = add_time_features(train_data_with_past)
test_data_selected = add_time_features(test_data_with_past)

In [395]:
# Define features including the new time features
time_features = [
    "hour",
    # "quarter_hour",
    "day",
    # "day_in_week",
    "hour_sin",
    "hour_cos",
    # "quarter_hour_sin",
    # "quarter_hour_cos",
    "day_sin",
    "day_cos",
    # "day_in_week_sin",
    # "day_in_week_cos",
]

In [396]:
all_features = features + time_features
y_train = train_data_selected["power"]
y_test = test_data_selected["power"]
train_data_selected = train_data_selected[all_features]
test_data_selected = test_data_selected[all_features]

In [397]:
print(
    f"shape of train is {train_data_selected.shape}, shape of test is {test_data_selected.shape}"
)

shape of train is (14557, 1878), shape of test is (2845, 1878)


In [398]:
print(f"features: {len(features)}, time_features: {len(time_features)}")

features: 1872, time_features: 6


In [399]:
# Extract the new features and target
X_train = train_data_selected
X_test = test_data_selected
print(f"Shape of X_train: {X_train.shape}, Shape of y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}, Shape of y_test: {y_test.shape}")


Shape of X_train: (14557, 1878), Shape of y_train: (14557,)
Shape of X_test: (2845, 1878), Shape of y_test: (2845,)


In [400]:
from sklearn.model_selection import train_test_split


def objective(
    trial,
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    use_test_for_validation_flag=False,
    use_sklearn_split=False,
):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 10, 80),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "n_estimators": trial.suggest_int("n_estimators", 500, 1500),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "num_threads": 16,
        "seed": 42,
    }

    model = LGBMRegressor(**params)

    if use_test_for_validation_flag and X_test is not None and y_test is not None:
        X_val_split, y_val_split = X_test, y_test
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val_split, y_val_split)],
            eval_metric="rmse",
            callbacks=[early_stopping(stopping_rounds=15), log_evaluation(period=500)],
        )
        preds = model.predict(X_val_split)
    else:
        if use_sklearn_split:
            # Use sklearn's train_test_split
            X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
                X_train, y_train, test_size=0.2, random_state=42
            )
        else:
            # Calculate the split index
            split_index = int(0.8 * len(X_train))

            # Split the data manually
            X_train_split, X_val_split = X_train[:split_index], X_train[split_index:]
            y_train_split, y_val_split = y_train[:split_index], y_train[split_index:]

        model.fit(
            X_train_split,
            y_train_split,
            eval_set=[(X_val_split, y_val_split)],
            eval_metric="rmse",
            callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=500)],
        )
        preds = model.predict(X_val_split)

    # Calculate RMSE
    rmse = root_mean_squared_error(y_val_split, preds)
    return rmse

In [401]:
# Create the study and optimize
use_test_for_validation_flag = False
study = optuna.create_study(direction="minimize")
study.optimize(
    lambda trial: objective(
        trial,
        X_train,
        y_train,
        X_test,
        y_test,
        use_test_for_validation_flag=use_test_for_validation_flag,
    ),
    n_trials=50,
    callbacks=[TimingCallback()],
)

# Get the best parameters
best_params = study.best_params
print("Best parameters found: ", best_params)

[I 2024-08-18 22:45:40,540] A new study created in memory with name: no-name-55d4572a-eda3-4213-834e-b555846044fc


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.085291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:45:45,857] Trial 0 finished with value: 65466.098426212026 and parameters: {'num_leaves': 61, 'max_depth': 26, 'learning_rate': 0.1361168462408431, 'n_estimators': 693, 'min_child_samples': 26, 'subsample': 0.5081834831604161, 'colsample_bytree': 0.6074269705949685}. Best is trial 0 with value: 65466.098426212026.


Early stopping, best iteration is:
[23]	valid_0's rmse: 65466.1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.081209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:45:53,326] Trial 1 finished with value: 65850.62002424612 and parameters: {'num_leaves': 49, 'max_depth': 27, 'learning_rate': 0.07053447454989857, 'n_estimators': 1187, 'min_child_samples': 15, 'subsample': 0.48868160862864607, 'colsample_bytree': 0.7154483442352819}. Best is trial 0 with value: 65466.098426212026.


Early stopping, best iteration is:
[100]	valid_0's rmse: 65850.6
Trial 1 finished in 7.47 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064920 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:45:58,501] Trial 2 finished with value: 67741.14848509518 and parameters: {'num_leaves': 32, 'max_depth': 20, 'learning_rate': 0.14828664628381372, 'n_estimators': 1258, 'min_child_samples': 43, 'subsample': 0.3544365746615826, 'colsample_bytree': 0.9722122872177378}. Best is trial 0 with value: 65466.098426212026.


Early stopping, best iteration is:
[52]	valid_0's rmse: 67741.1
Trial 2 finished in 5.17 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:46:03,607] Trial 3 finished with value: 69413.77473824291 and parameters: {'num_leaves': 65, 'max_depth': 48, 'learning_rate': 0.14475149742378648, 'n_estimators': 1316, 'min_child_samples': 29, 'subsample': 0.6472911020047186, 'colsample_bytree': 0.527817836255505}. Best is trial 0 with value: 65466.098426212026.


Early stopping, best iteration is:
[35]	valid_0's rmse: 69413.8
Trial 3 finished in 5.11 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.080912 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:46:08,108] Trial 4 finished with value: 68575.60610414064 and parameters: {'num_leaves': 20, 'max_depth': 33, 'learning_rate': 0.192973141359864, 'n_estimators': 855, 'min_child_samples': 31, 'subsample': 0.4891102852602557, 'colsample_bytree': 0.7030873620197977}. Best is trial 0 with value: 65466.098426212026.


Early stopping, best iteration is:
[140]	valid_0's rmse: 68575.6
Trial 4 finished in 4.50 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:46:11,653] Trial 5 finished with value: 67627.37125189771 and parameters: {'num_leaves': 14, 'max_depth': 36, 'learning_rate': 0.11952399143054435, 'n_estimators': 752, 'min_child_samples': 47, 'subsample': 0.8317550604822177, 'colsample_bytree': 0.6729861247095569}. Best is trial 0 with value: 65466.098426212026.


Early stopping, best iteration is:
[114]	valid_0's rmse: 67627.4
Trial 5 finished in 3.55 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:46:28,030] Trial 6 finished with value: 67384.2485344614 and parameters: {'num_leaves': 69, 'max_depth': 12, 'learning_rate': 0.021939098322558172, 'n_estimators': 610, 'min_child_samples': 29, 'subsample': 0.564431212910235, 'colsample_bytree': 0.7838472779726571}. Best is trial 0 with value: 65466.098426212026.


Early stopping, best iteration is:
[277]	valid_0's rmse: 67384.2
Trial 6 finished in 16.38 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.085407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:46:38,281] Trial 7 finished with value: 67832.85386041777 and parameters: {'num_leaves': 61, 'max_depth': 25, 'learning_rate': 0.0759180715623811, 'n_estimators': 1356, 'min_child_samples': 21, 'subsample': 0.8349824401541994, 'colsample_bytree': 0.7931540591246597}. Best is trial 0 with value: 65466.098426212026.


Early stopping, best iteration is:
[136]	valid_0's rmse: 67832.9
Trial 7 finished in 10.25 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:46:41,728] Trial 8 finished with value: 68262.75618936299 and parameters: {'num_leaves': 23, 'max_depth': 20, 'learning_rate': 0.1474377714324692, 'n_estimators': 1060, 'min_child_samples': 38, 'subsample': 0.9205350714069169, 'colsample_bytree': 0.556400331814866}. Best is trial 0 with value: 65466.098426212026.


Early stopping, best iteration is:
[64]	valid_0's rmse: 68262.8
Trial 8 finished in 3.45 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:46:44,932] Trial 9 finished with value: 69419.15051401244 and parameters: {'num_leaves': 17, 'max_depth': 22, 'learning_rate': 0.13622487879483408, 'n_estimators': 999, 'min_child_samples': 41, 'subsample': 0.7483970271785232, 'colsample_bytree': 0.8411682338025199}. Best is trial 0 with value: 65466.098426212026.


Early stopping, best iteration is:
[28]	valid_0's rmse: 69419.2
Trial 9 finished in 3.20 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:46:49,197] Trial 10 finished with value: 69162.22729387012 and parameters: {'num_leaves': 48, 'max_depth': 42, 'learning_rate': 0.19778632733357757, 'n_estimators': 502, 'min_child_samples': 12, 'subsample': 0.3034766451666265, 'colsample_bytree': 0.6023426140302033}. Best is trial 0 with value: 65466.098426212026.


Early stopping, best iteration is:
[14]	valid_0's rmse: 69162.2
Trial 10 finished in 4.26 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063433 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:46:56,019] Trial 11 finished with value: 65459.4133356878 and parameters: {'num_leaves': 49, 'max_depth': 30, 'learning_rate': 0.07297368356113065, 'n_estimators': 1113, 'min_child_samples': 10, 'subsample': 0.4518093784606565, 'colsample_bytree': 0.6502770948193778}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[84]	valid_0's rmse: 65459.4
Trial 11 finished in 6.82 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:47:03,826] Trial 12 finished with value: 66875.22373113403 and parameters: {'num_leaves': 77, 'max_depth': 37, 'learning_rate': 0.08755575045177333, 'n_estimators': 1477, 'min_child_samples': 21, 'subsample': 0.4030963889536029, 'colsample_bytree': 0.643582730334096}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[61]	valid_0's rmse: 66875.2
Trial 12 finished in 7.81 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052001 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:47:10,304] Trial 13 finished with value: 66288.45053160578 and parameters: {'num_leaves': 36, 'max_depth': 14, 'learning_rate': 0.039715939601360745, 'n_estimators': 899, 'min_child_samples': 20, 'subsample': 0.5971187961205624, 'colsample_bytree': 0.591598385707456}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[146]	valid_0's rmse: 66288.5
Trial 13 finished in 6.48 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:47:16,603] Trial 14 finished with value: 67434.21615432603 and parameters: {'num_leaves': 56, 'max_depth': 30, 'learning_rate': 0.10602387163790247, 'n_estimators': 1112, 'min_child_samples': 10, 'subsample': 0.4657184462692718, 'colsample_bytree': 0.6339470074429042}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[59]	valid_0's rmse: 67434.2
Trial 14 finished in 6.30 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050395 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:47:21,636] Trial 15 finished with value: 67459.56358027367 and parameters: {'num_leaves': 39, 'max_depth': 39, 'learning_rate': 0.06290434561307437, 'n_estimators': 739, 'min_child_samples': 35, 'subsample': 0.5600391093512934, 'colsample_bytree': 0.5215452363725186}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[89]	valid_0's rmse: 67459.6
Trial 15 finished in 5.03 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066673 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:47:30,041] Trial 16 finished with value: 68890.99044535562 and parameters: {'num_leaves': 80, 'max_depth': 29, 'learning_rate': 0.1750679785085123, 'n_estimators': 946, 'min_child_samples': 25, 'subsample': 0.41500804114544004, 'colsample_bytree': 0.8817149554822887}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[42]	valid_0's rmse: 68891
Trial 16 finished in 8.41 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067628 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:47:38,736] Trial 17 finished with value: 68147.83071123093 and parameters: {'num_leaves': 55, 'max_depth': 44, 'learning_rate': 0.04501977443634596, 'n_estimators': 752, 'min_child_samples': 17, 'subsample': 0.7058861784763907, 'colsample_bytree': 0.719651876934456}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[109]	valid_0's rmse: 68147.8
Trial 17 finished in 8.69 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055450 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:47:45,264] Trial 18 finished with value: 67651.68647669864 and parameters: {'num_leaves': 71, 'max_depth': 17, 'learning_rate': 0.09328951881240873, 'n_estimators': 602, 'min_child_samples': 25, 'subsample': 0.9959010594179536, 'colsample_bytree': 0.589560822874004}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[56]	valid_0's rmse: 67651.7
Trial 18 finished in 6.53 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:47:52,660] Trial 19 finished with value: 68866.36916854311 and parameters: {'num_leaves': 44, 'max_depth': 32, 'learning_rate': 0.1203894291835816, 'n_estimators': 1119, 'min_child_samples': 50, 'subsample': 0.5119405155015915, 'colsample_bytree': 0.659981566834018}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[173]	valid_0's rmse: 68866.4
Trial 19 finished in 7.40 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.048154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:47:59,079] Trial 20 finished with value: 69648.9604499647 and parameters: {'num_leaves': 56, 'max_depth': 24, 'learning_rate': 0.1736194897018815, 'n_estimators': 860, 'min_child_samples': 15, 'subsample': 0.6396282662645508, 'colsample_bytree': 0.5006727183353178}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[128]	valid_0's rmse: 69649
Trial 20 finished in 6.42 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.111712 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:48:05,262] Trial 21 finished with value: 65900.9122541324 and parameters: {'num_leaves': 48, 'max_depth': 27, 'learning_rate': 0.062091782110197044, 'n_estimators': 1202, 'min_child_samples': 14, 'subsample': 0.44266523754724657, 'colsample_bytree': 0.7122748214955599}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[53]	valid_0's rmse: 65900.9
Trial 21 finished in 6.18 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068720 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:48:13,676] Trial 22 finished with value: 65654.97653009945 and parameters: {'num_leaves': 50, 'max_depth': 26, 'learning_rate': 0.06773931186208057, 'n_estimators': 1183, 'min_child_samples': 10, 'subsample': 0.5172479134257488, 'colsample_bytree': 0.7546018411922621}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[121]	valid_0's rmse: 65655
Trial 22 finished in 8.41 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:48:24,344] Trial 23 finished with value: 67789.07324819162 and parameters: {'num_leaves': 61, 'max_depth': 17, 'learning_rate': 0.04678350362408646, 'n_estimators': 1051, 'min_child_samples': 12, 'subsample': 0.35273671334488443, 'colsample_bytree': 0.7564919303059028}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[144]	valid_0's rmse: 67789.1
Trial 23 finished in 10.67 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064584 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds
[500]	valid_0's rmse: 67591.3
[1000]	valid_0's rmse: 67109.2


[I 2024-08-18 22:48:54,446] Trial 24 finished with value: 67064.82127960178 and parameters: {'num_leaves': 30, 'max_depth': 33, 'learning_rate': 0.01100605610690996, 'n_estimators': 1368, 'min_child_samples': 10, 'subsample': 0.5580046500011522, 'colsample_bytree': 0.8627419557777128}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[1153]	valid_0's rmse: 67064.8
Trial 24 finished in 30.10 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069872 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:49:00,328] Trial 25 finished with value: 66616.72779033607 and parameters: {'num_leaves': 53, 'max_depth': 24, 'learning_rate': 0.1025126951228665, 'n_estimators': 1181, 'min_child_samples': 18, 'subsample': 0.5309622307108373, 'colsample_bytree': 0.6214419616286769}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[55]	valid_0's rmse: 66616.7
Trial 25 finished in 5.88 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052444 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:49:05,196] Trial 26 finished with value: 67553.79756686451 and parameters: {'num_leaves': 41, 'max_depth': 29, 'learning_rate': 0.12384504904447828, 'n_estimators': 1435, 'min_child_samples': 24, 'subsample': 0.6102161121887101, 'colsample_bytree': 0.5644298014677335}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[69]	valid_0's rmse: 67553.8
Trial 26 finished in 4.87 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067982 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:49:13,661] Trial 27 finished with value: 68158.91659138288 and parameters: {'num_leaves': 62, 'max_depth': 36, 'learning_rate': 0.0815546664623456, 'n_estimators': 974, 'min_child_samples': 34, 'subsample': 0.6951884509067245, 'colsample_bytree': 0.8204324544188326}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[81]	valid_0's rmse: 68158.9
Trial 27 finished in 8.47 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:49:21,664] Trial 28 finished with value: 65809.7807088139 and parameters: {'num_leaves': 71, 'max_depth': 20, 'learning_rate': 0.10248456594362704, 'n_estimators': 1265, 'min_child_samples': 17, 'subsample': 0.3851648570294126, 'colsample_bytree': 0.9052922862920998}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[30]	valid_0's rmse: 65809.8
Trial 28 finished in 8.00 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.062982 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:49:29,119] Trial 29 finished with value: 67011.05928949668 and parameters: {'num_leaves': 52, 'max_depth': 27, 'learning_rate': 0.06593577416139215, 'n_estimators': 1169, 'min_child_samples': 13, 'subsample': 0.4537569347283547, 'colsample_bytree': 0.6883752477798109}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[86]	valid_0's rmse: 67011.1
Trial 29 finished in 7.45 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066823 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:49:40,526] Trial 30 finished with value: 67085.4739670082 and parameters: {'num_leaves': 44, 'max_depth': 27, 'learning_rate': 0.032005635980564995, 'n_estimators': 1072, 'min_child_samples': 23, 'subsample': 0.5120747595902727, 'colsample_bytree': 0.7350576221172269}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[248]	valid_0's rmse: 67085.5
Trial 30 finished in 11.41 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:49:51,252] Trial 31 finished with value: 65563.21826193623 and parameters: {'num_leaves': 73, 'max_depth': 21, 'learning_rate': 0.09925180568540952, 'n_estimators': 1254, 'min_child_samples': 17, 'subsample': 0.3953535991252081, 'colsample_bytree': 0.948206579068672}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[79]	valid_0's rmse: 65563.2
Trial 31 finished in 10.73 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:50:05,220] Trial 32 finished with value: 67022.34774504078 and parameters: {'num_leaves': 67, 'max_depth': 24, 'learning_rate': 0.0559429445336214, 'n_estimators': 1245, 'min_child_samples': 15, 'subsample': 0.30729820928073437, 'colsample_bytree': 0.9848128999763406}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[147]	valid_0's rmse: 67022.3
Trial 32 finished in 13.97 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065305 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:50:15,354] Trial 33 finished with value: 65921.18380702297 and parameters: {'num_leaves': 75, 'max_depth': 17, 'learning_rate': 0.07767294974908735, 'n_estimators': 1310, 'min_child_samples': 10, 'subsample': 0.35723938356915497, 'colsample_bytree': 0.9151098868650449}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[60]	valid_0's rmse: 65921.2
Trial 33 finished in 10.13 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:50:23,046] Trial 34 finished with value: 68267.73152726596 and parameters: {'num_leaves': 60, 'max_depth': 31, 'learning_rate': 0.13042846720246898, 'n_estimators': 1141, 'min_child_samples': 18, 'subsample': 0.43648026135823487, 'colsample_bytree': 0.9411045985117141}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[44]	valid_0's rmse: 68267.7
Trial 34 finished in 7.69 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065751 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:50:31,662] Trial 35 finished with value: 68045.56461897939 and parameters: {'num_leaves': 65, 'max_depth': 22, 'learning_rate': 0.10910436831052048, 'n_estimators': 1241, 'min_child_samples': 28, 'subsample': 0.49551442919165584, 'colsample_bytree': 0.763241697421487}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[97]	valid_0's rmse: 68045.6
Trial 35 finished in 8.62 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067346 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:50:39,227] Trial 36 finished with value: 66861.42608829284 and parameters: {'num_leaves': 29, 'max_depth': 21, 'learning_rate': 0.16255379510031878, 'n_estimators': 1396, 'min_child_samples': 16, 'subsample': 0.3443159645740303, 'colsample_bytree': 0.9457823266134335}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[181]	valid_0's rmse: 66861.4
Trial 36 finished in 7.57 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:50:45,147] Trial 37 finished with value: 66573.21822272603 and parameters: {'num_leaves': 50, 'max_depth': 34, 'learning_rate': 0.0923481495884107, 'n_estimators': 1298, 'min_child_samples': 27, 'subsample': 0.47454353343007716, 'colsample_bytree': 0.6724620488797869}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[59]	valid_0's rmse: 66573.2
Trial 37 finished in 5.92 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:50:48,180] Trial 38 finished with value: 66577.19865081883 and parameters: {'num_leaves': 10, 'max_depth': 11, 'learning_rate': 0.15833199097549433, 'n_estimators': 1030, 'min_child_samples': 12, 'subsample': 0.38305673429250275, 'colsample_bytree': 0.9992908850919945}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[32]	valid_0's rmse: 66577.2
Trial 38 finished in 3.03 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051065 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:50:56,350] Trial 39 finished with value: 65846.10305775091 and parameters: {'num_leaves': 58, 'max_depth': 26, 'learning_rate': 0.13970925403482748, 'n_estimators': 689, 'min_child_samples': 31, 'subsample': 0.582498241771537, 'colsample_bytree': 0.5610486618423869}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[182]	valid_0's rmse: 65846.1
Trial 39 finished in 8.17 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068663 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:51:02,793] Trial 40 finished with value: 66621.95887507637 and parameters: {'num_leaves': 65, 'max_depth': 29, 'learning_rate': 0.11148556463727519, 'n_estimators': 826, 'min_child_samples': 22, 'subsample': 0.5374342560692402, 'colsample_bytree': 0.7915196138460581}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[26]	valid_0's rmse: 66622
Trial 40 finished in 6.44 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067908 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:51:10,539] Trial 41 finished with value: 67420.78870828972 and parameters: {'num_leaves': 72, 'max_depth': 18, 'learning_rate': 0.09682030618276238, 'n_estimators': 1236, 'min_child_samples': 19, 'subsample': 0.39258007321972943, 'colsample_bytree': 0.8966613971795455}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[27]	valid_0's rmse: 67420.8
Trial 41 finished in 7.75 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:51:20,270] Trial 42 finished with value: 67535.4406887213 and parameters: {'num_leaves': 74, 'max_depth': 14, 'learning_rate': 0.07447348686794536, 'n_estimators': 1285, 'min_child_samples': 16, 'subsample': 0.4308110002062678, 'colsample_bytree': 0.9449344223597312}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[63]	valid_0's rmse: 67535.4
Trial 42 finished in 9.73 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:51:32,563] Trial 43 finished with value: 67256.72914120845 and parameters: {'num_leaves': 79, 'max_depth': 20, 'learning_rate': 0.08343734179434187, 'n_estimators': 1349, 'min_child_samples': 13, 'subsample': 0.37826585363862336, 'colsample_bytree': 0.9216114461624333}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[102]	valid_0's rmse: 67256.7
Trial 43 finished in 12.29 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066370 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:51:40,488] Trial 44 finished with value: 66588.86951024954 and parameters: {'num_leaves': 70, 'max_depth': 20, 'learning_rate': 0.13007564184279705, 'n_estimators': 1205, 'min_child_samples': 20, 'subsample': 0.3282067505985533, 'colsample_bytree': 0.8358654952649135}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[47]	valid_0's rmse: 66588.9
Trial 44 finished in 7.93 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:51:49,619] Trial 45 finished with value: 68459.63046649474 and parameters: {'num_leaves': 68, 'max_depth': 23, 'learning_rate': 0.11463266203544686, 'n_estimators': 1118, 'min_child_samples': 33, 'subsample': 0.47681027594381514, 'colsample_bytree': 0.9638209662643493}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[63]	valid_0's rmse: 68459.6
Trial 45 finished in 9.13 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:51:55,391] Trial 46 finished with value: 68245.18493406706 and parameters: {'num_leaves': 63, 'max_depth': 14, 'learning_rate': 0.09870478676166348, 'n_estimators': 513, 'min_child_samples': 38, 'subsample': 0.4109384067047198, 'colsample_bytree': 0.6173775795488161}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[50]	valid_0's rmse: 68245.2
Trial 46 finished in 5.77 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073645 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:52:07,389] Trial 47 finished with value: 66395.8680705369 and parameters: {'num_leaves': 75, 'max_depth': 49, 'learning_rate': 0.05428231714670221, 'n_estimators': 1079, 'min_child_samples': 11, 'subsample': 0.8126210256592509, 'colsample_bytree': 0.8657023817914052}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[104]	valid_0's rmse: 66395.9
Trial 47 finished in 12.00 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:52:12,967] Trial 48 finished with value: 67555.80330552178 and parameters: {'num_leaves': 47, 'max_depth': 19, 'learning_rate': 0.0906547709875118, 'n_estimators': 1277, 'min_child_samples': 14, 'subsample': 0.6201457700855204, 'colsample_bytree': 0.6457477587215775}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[52]	valid_0's rmse: 67555.8
Trial 48 finished in 5.58 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066441 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477466
[LightGBM] [Info] Number of data points in the train set: 11645, number of used features: 1878
[LightGBM] [Info] Start training from score 105788.593173
Training until validation scores don't improve for 100 rounds


[I 2024-08-18 22:52:22,187] Trial 49 finished with value: 67490.91903301542 and parameters: {'num_leaves': 59, 'max_depth': 26, 'learning_rate': 0.0713294213626146, 'n_estimators': 1153, 'min_child_samples': 18, 'subsample': 0.4967251788678449, 'colsample_bytree': 0.8092330290286219}. Best is trial 11 with value: 65459.4133356878.


Early stopping, best iteration is:
[101]	valid_0's rmse: 67490.9
Trial 49 finished in 9.22 seconds.
Best parameters found:  {'num_leaves': 49, 'max_depth': 30, 'learning_rate': 0.07297368356113065, 'n_estimators': 1113, 'min_child_samples': 10, 'subsample': 0.4518093784606565, 'colsample_bytree': 0.6502770948193778}


In [402]:
# best_params = {
#     "num_leaves": 70,
#     "max_depth": 36,
#     "learning_rate": 0.09304350950671668,
#     "n_estimators": 1158,
#     "min_child_samples": 18,
#     "subsample": 0.579731306036922,
#     "colsample_bytree": 0.8511910376678277,
# }

In [403]:
print("Best parameters found: ", best_params)
best_params["num_threads"] = 16

# Train the model with the best parameters
best_model = LGBMRegressor(**best_params)
best_model.fit(X_train, y_train)


# Extract feature importances
feature_importances = best_model.feature_importances_

# Create a DataFrame to hold the features and their importances
importance_df = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": feature_importances}
)

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by="Importance", ascending=False)

Best parameters found:  {'num_leaves': 49, 'max_depth': 30, 'learning_rate': 0.07297368356113065, 'n_estimators': 1113, 'min_child_samples': 10, 'subsample': 0.4518093784606565, 'colsample_bytree': 0.6502770948193778}


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477510
[LightGBM] [Info] Number of data points in the train set: 14557, number of used features: 1878
[LightGBM] [Info] Start training from score 101346.615130


In [404]:
# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE on the test set: {rmse:.4f}")

RMSE on the test set: 63809.0179


In [407]:
# save important_features to csv file
if use_test_for_validation_flag:
    importance_df.to_csv(
        "farm_important_features_use_test_for_validation.csv", index=False
    )
else:
    importance_df.to_csv("farm_important_features.csv", index=False)

In [408]:
feature_importance_df = importance_df
top_K = 5


# Function to extract the base feature name
def extract_base_feature(feature_name):
    if "_lag_" in feature_name:
        return feature_name.split("_lag_")[0]
    return feature_name


# Group features by their base name and sort within each group
grouped_features = feature_importance_df.groupby(
    feature_importance_df["Feature"].apply(extract_base_feature)
)

top_features = []
mean_importances = []

for base_feature, group in grouped_features:
    # Sort the group by importance and select the top_K
    top_group = group.sort_values(by="Importance", ascending=False).head(top_K)

    # Calculate the mean importance for this group
    mean_importance = top_group["Importance"].mean()

    # Add the group to the list with their mean importance
    for _, row in top_group.iterrows():
        top_features.append((row["Feature"], row["Importance"]))

    # Store the base feature and mean importance for sorting later
    mean_importances.append((base_feature, mean_importance))

# Sort the groups by mean importance
sorted_mean_importances = sorted(mean_importances, key=lambda x: x[1], reverse=True)

# Create the final sorted dataframe
final_features = []
for base_feature, _ in sorted_mean_importances:
    for feature, importance in top_features:
        if feature.startswith(base_feature):
            final_features.append((feature, importance))

final_df = pd.DataFrame(final_features, columns=["Feature", "Importance"])

# Display the final dataframe
final_df.to_csv("farm_top_features.csv", index=False)


In [None]:
# Set the threshold for feature importance
importance_threshold = 0

# Select features with importance greater than or equal to the threshold
selected_features = importance_df[importance_df["Importance"] >= importance_threshold][
    "Feature"
].tolist()

# Separate selected features into original input features and time features
selected_input_features = [
    feature for feature in selected_features if feature not in time_features
]
selected_time_features = [
    feature for feature in selected_features if feature in time_features
]

In [None]:
# Combine the filtered features with the power column and time features
train_data_final = pd.concat(
    [
        train_data_selected["time"],
        X_train[selected_input_features],
        train_data_selected["power"],
        train_data_selected[time_features],
    ],
    axis=1,
)
test_data_final = pd.concat(
    [
        test_data_selected["time"],
        X_test[selected_input_features],
        test_data_selected["power"],
        test_data_selected[time_features],
    ],
    axis=1,
)

# Ensure the final order of columns
final_columns = ["time"] + selected_input_features + ["power"] + time_features
train_data_final = train_data_final[final_columns]
test_data_final = test_data_final[final_columns]

# Display the new shapes of train_data_final and test_data_final
print("Shape of train_data_final:", train_data_final.shape)
print("Shape of test_data_final:", test_data_final.shape)
print("Selected input features:", selected_input_features)
print(f"Train columns : {train_data_final.columns}")

In [None]:
# Save the final datasets to CSV
file_name_common = '66_withTime'
train_data_final.to_csv( f"4-train_{file_name_common}.csv", index=False)
test_data_final.to_csv( f"4-test_{file_name_common}.csv", index=False)
print("Final datasets saved to CSV.")