In [13]:
import pandas as pd
import numpy as np  # noqa
import optuna
import time
import logging  # noqa
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# Enable logging
optuna.logging.set_verbosity(optuna.logging.INFO)


class TimingCallback:
    def __init__(self):
        self.start_time = None

    def __call__(self, study, trial):
        if self.start_time is None:
            self.start_time = time.time()
        else:
            elapsed_time = time.time() - self.start_time
            print(f"Trial {trial.number} finished in {elapsed_time:.2f} seconds.")
            self.start_time = time.time()

In [14]:
# Load data
data_dir = "/data3/lsf/Pein/Power-Prediction/data/"
file_name_common = 'farm_92'
train_data_selected = pd.read_csv(data_dir + f"train_{file_name_common}.csv")
test_data_selected = pd.read_csv(data_dir + f"test_{file_name_common}.csv")

# Define features and target
features = [
    col
    for col in train_data_selected.columns
    if col not in ["time", "lead_hour", "power"]
]
X_train = train_data_selected[features]
y_train = train_data_selected["power"]
X_test = test_data_selected[features]
y_test = test_data_selected["power"]

In [15]:
print(f'Shape of X_train: {X_train.shape}, Shape of y_train: {y_train.shape}')
print(f'Shape of X_test: {X_test.shape}, Shape of y_test: {y_test.shape}')

Shape of X_train: (14592, 89), Shape of y_train: (14592,)
Shape of X_test: (2880, 89), Shape of y_test: (2880,)


### Adding time features

In [16]:
# Convert 'time' column to datetime
train_data_selected["time"] = pd.to_datetime(train_data_selected["time"])
test_data_selected["time"] = pd.to_datetime(test_data_selected["time"])

In [17]:
# Function to add time features
def add_time_features(df):
    df["time"] = pd.to_datetime(df["time"])

    # Existing time features
    df["hour"] = df["time"].dt.hour
    df["quarter_hour"] = df["time"].dt.minute // 15

    # New time features
    df["day"] = df["time"].dt.day
    df["day_in_week"] = df["time"].dt.weekday

    # Sine and cosine transformations
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

    df["quarter_hour_sin"] = np.sin(2 * np.pi * df["quarter_hour"] / 4)
    df["quarter_hour_cos"] = np.cos(2 * np.pi * df["quarter_hour"] / 4)

    df["day_sin"] = np.sin(2 * np.pi * df["day"] / 31)
    df["day_cos"] = np.cos(2 * np.pi * df["day"] / 31)

    df["day_in_week_sin"] = np.sin(2 * np.pi * df["day_in_week"] / 7)
    df["day_in_week_cos"] = np.cos(2 * np.pi * df["day_in_week"] / 7)

    return df

In [18]:
# # Add time features to both train and test data
train_data_selected = add_time_features(train_data_selected)
test_data_selected = add_time_features(test_data_selected)

In [19]:
# Define features including the new time features
time_features = [
    "hour",
    "quarter_hour",
    "day",
    "day_in_week",
    "hour_sin",
    "hour_cos",
    "quarter_hour_sin",
    "quarter_hour_cos",
    "day_sin",
    "day_cos",
    "day_in_week_sin",
    "day_in_week_cos",
]

In [20]:
all_features = features  + time_features

X_train = train_data_selected[all_features]
X_test = test_data_selected[all_features]

In [21]:
def objective(
    trial,
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    use_test_for_validation_flag=False,
):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 10, 80),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "n_estimators": trial.suggest_int("n_estimators", 500, 1500),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "num_threads": 16,
        "seed": 42,
    }

    model = LGBMRegressor(**params)

    if use_test_for_validation_flag and X_test is not None and y_test is not None:
        X_val_split, y_val_split = X_test, y_test
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val_split, y_val_split)],
            eval_metric="rmse",
            callbacks=[early_stopping(stopping_rounds=15), log_evaluation(period=500)],
        )
        preds = model.predict(X_val_split)
    else:
        # Calculate the split index
        split_index = int(0.8 * len(X_train))

        # Split the data
        X_train_split, X_val_split = X_train[:split_index], X_train[split_index:]
        y_train_split, y_val_split = y_train[:split_index], y_train[split_index:]
        
        model.fit(
            X_train_split,
            y_train_split,
            eval_set=[(X_val_split, y_val_split)],
            eval_metric="rmse",
            callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=500)],
        )
        preds = model.predict(X_val_split)

    # Calculate RMSE
    rmse = root_mean_squared_error(y_val_split, preds)
    return rmse

In [22]:
# Create the study and optimize
use_test_for_validation_flag = False
study = optuna.create_study(direction="minimize")
study.optimize(
    lambda trial: objective(
        trial,
        X_train,
        y_train,
        X_test,
        y_test,
        use_test_for_validation_flag=use_test_for_validation_flag,
    ),
    n_trials=30,
    callbacks=[TimingCallback()],
)

# Get the best parameters
best_params = study.best_params
print("Best parameters found: ", best_params)

[I 2024-08-07 14:20:18,239] A new study created in memory with name: no-name-2f35a737-4a6f-4d14-a18c-01581519b1c8


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005700 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:18,581] Trial 0 finished with value: 361.3208677782303 and parameters: {'num_leaves': 28, 'max_depth': 15, 'learning_rate': 0.16552636426472944, 'n_estimators': 808, 'min_child_samples': 43, 'subsample': 0.5659593422415234, 'colsample_bytree': 0.8550790981732623}. Best is trial 0 with value: 361.3208677782303.


Early stopping, best iteration is:
[18]	valid_0's rmse: 361.321
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:18,991] Trial 1 finished with value: 359.613092031948 and parameters: {'num_leaves': 38, 'max_depth': 15, 'learning_rate': 0.17991063786833297, 'n_estimators': 1262, 'min_child_samples': 34, 'subsample': 0.7348078640655231, 'colsample_bytree': 0.8873377717742184}. Best is trial 1 with value: 359.613092031948.


Early stopping, best iteration is:
[21]	valid_0's rmse: 359.613
Trial 1 finished in 0.41 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:20,069] Trial 2 finished with value: 356.87173104820914 and parameters: {'num_leaves': 78, 'max_depth': 35, 'learning_rate': 0.04732641308299843, 'n_estimators': 743, 'min_child_samples': 35, 'subsample': 0.9579247897655769, 'colsample_bytree': 0.9623900013559523}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[84]	valid_0's rmse: 356.872
Trial 2 finished in 1.08 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003137 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:20,599] Trial 3 finished with value: 366.9324462668337 and parameters: {'num_leaves': 37, 'max_depth': 46, 'learning_rate': 0.05289563007637222, 'n_estimators': 1236, 'min_child_samples': 40, 'subsample': 0.40207546941973255, 'colsample_bytree': 0.8197961282539361}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[57]	valid_0's rmse: 366.932
Trial 3 finished in 0.53 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:20,918] Trial 4 finished with value: 370.79632151867736 and parameters: {'num_leaves': 28, 'max_depth': 38, 'learning_rate': 0.14198272367539663, 'n_estimators': 725, 'min_child_samples': 29, 'subsample': 0.6467655723248791, 'colsample_bytree': 0.7785551502763874}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[20]	valid_0's rmse: 370.796
Trial 4 finished in 0.32 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:21,457] Trial 5 finished with value: 365.4959032882582 and parameters: {'num_leaves': 51, 'max_depth': 28, 'learning_rate': 0.1291341814606853, 'n_estimators': 1116, 'min_child_samples': 17, 'subsample': 0.8554255271337983, 'colsample_bytree': 0.93775375539685}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[30]	valid_0's rmse: 365.496
Trial 5 finished in 0.54 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:21,683] Trial 6 finished with value: 362.48589452098787 and parameters: {'num_leaves': 15, 'max_depth': 17, 'learning_rate': 0.14715316058427122, 'n_estimators': 921, 'min_child_samples': 48, 'subsample': 0.8677863465068953, 'colsample_bytree': 0.54276995345514}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[34]	valid_0's rmse: 362.486
Trial 6 finished in 0.23 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002729 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:22,751] Trial 7 finished with value: 358.84029197393596 and parameters: {'num_leaves': 74, 'max_depth': 16, 'learning_rate': 0.024895390419359835, 'n_estimators': 904, 'min_child_samples': 47, 'subsample': 0.3422377807076081, 'colsample_bytree': 0.6086988912903137}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[107]	valid_0's rmse: 358.84
Trial 7 finished in 1.07 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:23,591] Trial 8 finished with value: 361.25546530029527 and parameters: {'num_leaves': 79, 'max_depth': 27, 'learning_rate': 0.12856457762844486, 'n_estimators': 822, 'min_child_samples': 26, 'subsample': 0.8110657023130203, 'colsample_bytree': 0.760711251561369}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[61]	valid_0's rmse: 361.255
Trial 8 finished in 0.84 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002441 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:24,083] Trial 9 finished with value: 361.2347561573442 and parameters: {'num_leaves': 54, 'max_depth': 25, 'learning_rate': 0.10409986865436618, 'n_estimators': 815, 'min_child_samples': 10, 'subsample': 0.9760659033368986, 'colsample_bytree': 0.5115715465484687}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[36]	valid_0's rmse: 361.235
Trial 9 finished in 0.49 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:25,039] Trial 10 finished with value: 366.36027000419364 and parameters: {'num_leaves': 65, 'max_depth': 39, 'learning_rate': 0.07618799560862825, 'n_estimators': 526, 'min_child_samples': 21, 'subsample': 0.9987157320287122, 'colsample_bytree': 0.9984890489739019}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[97]	valid_0's rmse: 366.36
Trial 10 finished in 0.96 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:27,318] Trial 11 finished with value: 361.24143982179515 and parameters: {'num_leaves': 78, 'max_depth': 35, 'learning_rate': 0.010148711298342886, 'n_estimators': 581, 'min_child_samples': 50, 'subsample': 0.3151842235962813, 'colsample_bytree': 0.6404620338962632}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[379]	valid_0's rmse: 361.241
Trial 11 finished in 2.28 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002634 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:28,198] Trial 12 finished with value: 364.7601132608837 and parameters: {'num_leaves': 66, 'max_depth': 10, 'learning_rate': 0.026796048340675734, 'n_estimators': 1031, 'min_child_samples': 36, 'subsample': 0.47017295454810515, 'colsample_bytree': 0.668670804362171}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[101]	valid_0's rmse: 364.76
Trial 12 finished in 0.88 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:28,921] Trial 13 finished with value: 365.7372479396751 and parameters: {'num_leaves': 67, 'max_depth': 49, 'learning_rate': 0.06288610001069915, 'n_estimators': 1484, 'min_child_samples': 44, 'subsample': 0.5356358470365424, 'colsample_bytree': 0.6688190584592907}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[46]	valid_0's rmse: 365.737
Trial 13 finished in 0.72 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002324 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:29,940] Trial 14 finished with value: 361.7573271822551 and parameters: {'num_leaves': 72, 'max_depth': 33, 'learning_rate': 0.034438031005016065, 'n_estimators': 667, 'min_child_samples': 36, 'subsample': 0.6678519476014424, 'colsample_bytree': 0.5967710437544302}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[113]	valid_0's rmse: 361.757
Trial 14 finished in 1.02 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:30,591] Trial 15 finished with value: 373.7785457614279 and parameters: {'num_leaves': 58, 'max_depth': 21, 'learning_rate': 0.07812331654536908, 'n_estimators': 938, 'min_child_samples': 40, 'subsample': 0.3165122158726478, 'colsample_bytree': 0.7359132164140239}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[50]	valid_0's rmse: 373.779
Trial 15 finished in 0.65 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:31,378] Trial 16 finished with value: 363.6063331439195 and parameters: {'num_leaves': 79, 'max_depth': 43, 'learning_rate': 0.09583362877288779, 'n_estimators': 1099, 'min_child_samples': 31, 'subsample': 0.44550292169764394, 'colsample_bytree': 0.973360127213602}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[30]	valid_0's rmse: 363.606
Trial 16 finished in 0.79 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002361 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:32,414] Trial 17 finished with value: 361.17055472217567 and parameters: {'num_leaves': 59, 'max_depth': 21, 'learning_rate': 0.04362733997772725, 'n_estimators': 627, 'min_child_samples': 46, 'subsample': 0.761216013076758, 'colsample_bytree': 0.594786523240673}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[172]	valid_0's rmse: 361.171
Trial 17 finished in 1.04 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002828 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:34,214] Trial 18 finished with value: 360.0855169479112 and parameters: {'num_leaves': 72, 'max_depth': 32, 'learning_rate': 0.017184101658983434, 'n_estimators': 733, 'min_child_samples': 23, 'subsample': 0.9146115744604509, 'colsample_bytree': 0.7290841853016669}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[254]	valid_0's rmse: 360.086
Trial 18 finished in 1.80 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003030 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:34,865] Trial 19 finished with value: 358.8624453204303 and parameters: {'num_leaves': 45, 'max_depth': 11, 'learning_rate': 0.06785831364793997, 'n_estimators': 898, 'min_child_samples': 40, 'subsample': 0.5858787994783083, 'colsample_bytree': 0.8914022995045465}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[81]	valid_0's rmse: 358.862
Trial 19 finished in 0.65 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002861 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:35,829] Trial 20 finished with value: 363.6627134308896 and parameters: {'num_leaves': 73, 'max_depth': 23, 'learning_rate': 0.03852940201584988, 'n_estimators': 1227, 'min_child_samples': 32, 'subsample': 0.3905400180261214, 'colsample_bytree': 0.691372029914296}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[86]	valid_0's rmse: 363.663
Trial 20 finished in 0.96 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002904 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:36,163] Trial 21 finished with value: 357.3230002361739 and parameters: {'num_leaves': 14, 'max_depth': 11, 'learning_rate': 0.06782396022375077, 'n_estimators': 904, 'min_child_samples': 40, 'subsample': 0.5736581520422552, 'colsample_bytree': 0.9148208843310814}. Best is trial 2 with value: 356.87173104820914.


Early stopping, best iteration is:
[63]	valid_0's rmse: 357.323
Trial 21 finished in 0.33 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002798 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:36,529] Trial 22 finished with value: 344.1361996159678 and parameters: {'num_leaves': 15, 'max_depth': 18, 'learning_rate': 0.08952512928506817, 'n_estimators': 996, 'min_child_samples': 38, 'subsample': 0.4880352084515938, 'colsample_bytree': 0.9424165955548771}. Best is trial 22 with value: 344.1361996159678.


Early stopping, best iteration is:
[80]	valid_0's rmse: 344.136
Trial 22 finished in 0.37 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:36,841] Trial 23 finished with value: 363.10431238597226 and parameters: {'num_leaves': 12, 'max_depth': 10, 'learning_rate': 0.08979721961102416, 'n_estimators': 1005, 'min_child_samples': 39, 'subsample': 0.49958607176488834, 'colsample_bytree': 0.9501055289758689}. Best is trial 22 with value: 344.1361996159678.


Early stopping, best iteration is:
[60]	valid_0's rmse: 363.104
Trial 23 finished in 0.31 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:37,204] Trial 24 finished with value: 351.50682240543796 and parameters: {'num_leaves': 20, 'max_depth': 19, 'learning_rate': 0.1148027897945388, 'n_estimators': 1114, 'min_child_samples': 36, 'subsample': 0.6317099035278214, 'colsample_bytree': 0.9172099517897333}. Best is trial 22 with value: 344.1361996159678.


Early stopping, best iteration is:
[45]	valid_0's rmse: 351.507
Trial 24 finished in 0.36 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002718 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:37,612] Trial 25 finished with value: 341.7043715533523 and parameters: {'num_leaves': 21, 'max_depth': 21, 'learning_rate': 0.1170591971593041, 'n_estimators': 1350, 'min_child_samples': 35, 'subsample': 0.6506927490892368, 'colsample_bytree': 0.8103864882602075}. Best is trial 25 with value: 341.7043715533523.


Early stopping, best iteration is:
[77]	valid_0's rmse: 341.704
Trial 25 finished in 0.41 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002913 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:38,076] Trial 26 finished with value: 358.3933191239015 and parameters: {'num_leaves': 21, 'max_depth': 19, 'learning_rate': 0.1168419190905329, 'n_estimators': 1406, 'min_child_samples': 28, 'subsample': 0.6479681696648903, 'colsample_bytree': 0.8238819309641573}. Best is trial 25 with value: 341.7043715533523.


Early stopping, best iteration is:
[116]	valid_0's rmse: 358.393
Trial 26 finished in 0.46 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002713 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:38,407] Trial 27 finished with value: 354.57101922117715 and parameters: {'num_leaves': 21, 'max_depth': 26, 'learning_rate': 0.11862924721168484, 'n_estimators': 1342, 'min_child_samples': 33, 'subsample': 0.7046799257757459, 'colsample_bytree': 0.8491064004527067}. Best is trial 25 with value: 341.7043715533523.


Early stopping, best iteration is:
[25]	valid_0's rmse: 354.571
Trial 27 finished in 0.33 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002726 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:38,775] Trial 28 finished with value: 350.73729426434807 and parameters: {'num_leaves': 20, 'max_depth': 19, 'learning_rate': 0.09281879787338747, 'n_estimators': 1155, 'min_child_samples': 37, 'subsample': 0.6122687197116748, 'colsample_bytree': 0.7966872430791309}. Best is trial 25 with value: 341.7043715533523.


Early stopping, best iteration is:
[52]	valid_0's rmse: 350.737
Trial 28 finished in 0.37 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22878
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 101
[LightGBM] [Info] Start training from score 564.183276
Training until validation scores don't improve for 100 rounds


[I 2024-08-07 14:20:39,167] Trial 29 finished with value: 370.86154589971875 and parameters: {'num_leaves': 28, 'max_depth': 13, 'learning_rate': 0.1614890618665271, 'n_estimators': 1323, 'min_child_samples': 43, 'subsample': 0.5365135472026565, 'colsample_bytree': 0.7897437546217961}. Best is trial 25 with value: 341.7043715533523.


Early stopping, best iteration is:
[35]	valid_0's rmse: 370.862
Trial 29 finished in 0.39 seconds.
Best parameters found:  {'num_leaves': 21, 'max_depth': 21, 'learning_rate': 0.1170591971593041, 'n_estimators': 1350, 'min_child_samples': 35, 'subsample': 0.6506927490892368, 'colsample_bytree': 0.8103864882602075}


In [23]:
# best_params = {
#     "num_leaves": 70,
#     "max_depth": 36,
#     "learning_rate": 0.09304350950671668,
#     "n_estimators": 1158,
#     "min_child_samples": 18,
#     "subsample": 0.579731306036922,
#     "colsample_bytree": 0.8511910376678277,
# }

In [24]:
print("Best parameters found: ", best_params)
best_params["num_threads"] = 16

# Train the model with the best parameters
best_model = LGBMRegressor(**best_params)
best_model.fit(X_train, y_train)


# Extract feature importances
feature_importances = best_model.feature_importances_

# Create a DataFrame to hold the features and their importances
importance_df = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": feature_importances}
)

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by="Importance", ascending=False)

Best parameters found:  {'num_leaves': 21, 'max_depth': 21, 'learning_rate': 0.1170591971593041, 'n_estimators': 1350, 'min_child_samples': 35, 'subsample': 0.6506927490892368, 'colsample_bytree': 0.8103864882602075}


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002841 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22879
[LightGBM] [Info] Number of data points in the train set: 14592, number of used features: 101
[LightGBM] [Info] Start training from score 540.706096


In [25]:
# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE on the test set: {rmse:.4f}")

RMSE on the test set: 335.8639


In [26]:
# save important_features to csv file
if use_test_for_validation_flag:
    importance_df.to_csv("farm_important_features_use_test_for_validation.csv", index=False)
else:
    importance_df.to_csv("farm_important_features.csv", index=False)

In [27]:
# Set the threshold for feature importance
importance_threshold = 54

# Select features with importance greater than or equal to the threshold
selected_features = importance_df[importance_df["Importance"] >= importance_threshold][
    "Feature"
].tolist()

# Separate selected features into original input features and time features
selected_input_features = [
    feature for feature in selected_features if feature not in time_features
]
selected_time_features = [
    feature for feature in selected_features if feature in time_features
]

In [28]:
# Combine the filtered features with the power column and time features
train_data_final = pd.concat(
    [
        train_data_selected["time"],
        X_train[selected_input_features],
        train_data_selected["power"],
        train_data_selected[time_features],
    ],
    axis=1,
)
test_data_final = pd.concat(
    [
        test_data_selected["time"],
        X_test[selected_input_features],
        test_data_selected["power"],
        test_data_selected[time_features],
    ],
    axis=1,
)

# Ensure the final order of columns
final_columns = ["time"] + selected_input_features + ["power"] + time_features
train_data_final = train_data_final[final_columns]
test_data_final = test_data_final[final_columns]

# Display the new shapes of train_data_final and test_data_final
print("Shape of train_data_final:", train_data_final.shape)
print("Shape of test_data_final:", test_data_final.shape)
print("Selected input features:", selected_input_features)
print(f"Train columns : {train_data_final.columns}")

Shape of train_data_final: (14592, 98)
Shape of test_data_final: (2880, 98)
Selected input features: ['mgws', 'lgws', 'altitude', 'trpp', 'azimuth', 'deg0l', 'mudlp', 'blh', 'viwvn', 'kx', 'degm10l', 'mcc', 'gwd', 'p3020', 'totalx', 'lcc', 'mld', 'tcsw', 'sund', 'vimd', 'capes', 'bld', 'tcc', 'fg10', 'ishf', 'lspf', 'viwve', 'ewss', 'litoti', 'flsr', 'nsss', 'ilspf', 'str', 'u100', 'cape', 'hcc', 'u200', 'mlcape100', 'ws10', 'hwbt0', 'u10', 'ws200', 'msl', 'ws100', 'i10fg', 'ttrc', 'dsrp', 'mlcape50', 'u10n', 'slhf', 'hwbt1', 'ttr', 'mucape', 'fdir', 'cdir', 'strc', 'sshf', 'v200', 'v10n', 'ssrd', 'strd', 'd2m', 'v10', 'tcw', 'par', 'v100', 'uvb', 'tsr', 'tisr', 'sp', 'parcs', 't2m', 'mx2t', 'ssrc', 'tsrc', 'ssr', 'tcwv', 'mn2t', 'ssrdc', 'sst', 'skt', 'strdc', 'lblt', 'stl1']
Train columns : Index(['time', 'mgws', 'lgws', 'altitude', 'trpp', 'azimuth', 'deg0l', 'mudlp',
       'blh', 'viwvn', 'kx', 'degm10l', 'mcc', 'gwd', 'p3020', 'totalx', 'lcc',
       'mld', 'tcsw', 'sund', 'vimd'

In [29]:
# Save the final datasets to CSV
file_name_common = 'farm_98_withTime'
train_data_final.to_csv(data_dir + f"train_{file_name_common}.csv", index=False)
test_data_final.to_csv(data_dir + f"test_{file_name_common}.csv", index=False)
print("Final datasets saved to CSV.")

Final datasets saved to CSV.
