In [124]:
import pandas as pd
import numpy as np  # noqa
import optuna
import time
import logging  # noqa
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# Enable logging
optuna.logging.set_verbosity(optuna.logging.INFO)


class TimingCallback:
    def __init__(self):
        self.start_time = None

    def __call__(self, study, trial):
        if self.start_time is None:
            self.start_time = time.time()
        else:
            elapsed_time = time.time() - self.start_time
            print(f"Trial {trial.number} finished in {elapsed_time:.2f} seconds.")
            self.start_time = time.time()

In [125]:
# Load data
# data_dir = "/data/Pein/Pytorch/Wind-Power-Prediction/data/"

train_path = '3-train_merged_55.csv'
test_path = '3-test_merged_55.csv'

train_data_selected = pd.read_csv(train_path)
test_data_selected = pd.read_csv(test_path)

# Set the ['power'] to be non-negative
train_data_selected['power'] = train_data_selected['power'].apply(lambda x: x if x >= 0 else 0)
test_data_selected['power'] = test_data_selected['power'].apply(lambda x: x if x >= 0 else 0)



# Define features and target
features = [
    col
    for col in train_data_selected.columns
    if col not in ["time", "lead_hour", 'initial_time',"power"]
]
X_train = train_data_selected[features]
y_train = train_data_selected["power"]
X_test = test_data_selected[features]
y_test = test_data_selected["power"]

In [126]:
print(f'Shape of X_train: {X_train.shape}, Shape of y_train: {y_train.shape}')
print(f'Shape of X_test: {X_test.shape}, Shape of y_test: {y_test.shape}')

Shape of X_train: (14592, 52), Shape of y_train: (14592,)
Shape of X_test: (2880, 52), Shape of y_test: (2880,)


### Adding time features

In [127]:
# Convert 'time' column to datetime
train_data_selected["time"] = pd.to_datetime(train_data_selected["time"])
test_data_selected["time"] = pd.to_datetime(test_data_selected["time"])

In [128]:
# Function to add time features
def add_time_features(df):
    df["time"] = pd.to_datetime(df["time"])

    # Existing time features
    df["hour"] = df["time"].dt.hour
    df["quarter_hour"] = df["time"].dt.minute // 15

    # New time features
    df["day"] = df["time"].dt.day
    df["day_in_week"] = df["time"].dt.weekday

    # Sine and cosine transformations
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

    df["quarter_hour_sin"] = np.sin(2 * np.pi * df["quarter_hour"] / 4)
    df["quarter_hour_cos"] = np.cos(2 * np.pi * df["quarter_hour"] / 4)

    df["day_sin"] = np.sin(2 * np.pi * df["day"] / 31)
    df["day_cos"] = np.cos(2 * np.pi * df["day"] / 31)

    df["day_in_week_sin"] = np.sin(2 * np.pi * df["day_in_week"] / 7)
    df["day_in_week_cos"] = np.cos(2 * np.pi * df["day_in_week"] / 7)

    return df

In [129]:
# # Add time features to both train and test data
train_data_selected = add_time_features(train_data_selected)
test_data_selected = add_time_features(test_data_selected)

In [130]:
# Define features including the new time features
time_features = [
    "hour",
    "quarter_hour",
    "day",
    # "day_in_week",
    "hour_sin",
    "hour_cos",
    "quarter_hour_sin",
    "quarter_hour_cos",
    "day_sin",
    "day_cos",
    # "day_in_week_sin",
    # "day_in_week_cos",
]

In [131]:
all_features = features  + time_features

X_train = train_data_selected[all_features]
X_test = test_data_selected[all_features]

In [132]:
from sklearn.model_selection import train_test_split
def objective(
    trial,
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    use_test_for_validation_flag=False,
    use_sklearn_split = False,
):
    
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 10, 80),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "n_estimators": trial.suggest_int("n_estimators", 500, 1500),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "num_threads": 16,
        "seed": 42,
    }

    model = LGBMRegressor(**params)

    if use_test_for_validation_flag and X_test is not None and y_test is not None:
        X_val_split, y_val_split = X_test, y_test
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val_split, y_val_split)],
            eval_metric="rmse",
            callbacks=[early_stopping(stopping_rounds=15), log_evaluation(period=500)],
        )
        preds = model.predict(X_val_split)
    else:
        
        if use_sklearn_split:
            # Use sklearn's train_test_split
            X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
                X_train, y_train, test_size=0.2, random_state=42)
        else:
            # Calculate the split index
            split_index = int(0.8 * len(X_train))

            # Split the data manually
            X_train_split, X_val_split = X_train[:split_index], X_train[split_index:]
            y_train_split, y_val_split = y_train[:split_index], y_train[split_index:]
        
        model.fit(
            X_train_split,
            y_train_split,
            eval_set=[(X_val_split, y_val_split)],
            eval_metric="rmse",
            callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=500)],
        )
        preds = model.predict(X_val_split)

    # Calculate RMSE
    rmse = root_mean_squared_error(y_val_split, preds)
    return rmse

In [133]:
# Create the study and optimize
use_test_for_validation_flag = False
study = optuna.create_study(direction="minimize")
study.optimize(
    lambda trial: objective(
        trial,
        X_train,
        y_train,
        X_test,
        y_test,
        use_test_for_validation_flag=use_test_for_validation_flag,
    ),
    n_trials=30,
    callbacks=[TimingCallback()],
)

# Get the best parameters
best_params = study.best_params
print("Best parameters found: ", best_params)

[I 2024-08-15 17:25:46,935] A new study created in memory with name: no-name-182b8974-5f90-409c-9360-b84f35d3a171


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002925 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:47,515] Trial 0 finished with value: 66672.32036769795 and parameters: {'num_leaves': 15, 'max_depth': 23, 'learning_rate': 0.02492148539094784, 'n_estimators': 650, 'min_child_samples': 16, 'subsample': 0.7865766454128991, 'colsample_bytree': 0.8479795769134237}. Best is trial 0 with value: 66672.32036769795.


Early stopping, best iteration is:
[199]	valid_0's rmse: 66672.3
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:47,941] Trial 1 finished with value: 67016.25081615792 and parameters: {'num_leaves': 18, 'max_depth': 19, 'learning_rate': 0.03813624414734297, 'n_estimators': 1144, 'min_child_samples': 34, 'subsample': 0.8300798499868236, 'colsample_bytree': 0.6099124850313824}. Best is trial 0 with value: 66672.32036769795.


Early stopping, best iteration is:
[162]	valid_0's rmse: 67016.3
Trial 1 finished in 0.43 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001599 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:48,618] Trial 2 finished with value: 68023.32035367792 and parameters: {'num_leaves': 64, 'max_depth': 25, 'learning_rate': 0.05188382970501199, 'n_estimators': 973, 'min_child_samples': 38, 'subsample': 0.33944024751559815, 'colsample_bytree': 0.8582559004267389}. Best is trial 0 with value: 66672.32036769795.


Early stopping, best iteration is:
[100]	valid_0's rmse: 68023.3
Trial 2 finished in 0.68 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001669 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:48,917] Trial 3 finished with value: 68784.9410242906 and parameters: {'num_leaves': 40, 'max_depth': 42, 'learning_rate': 0.13433431374324697, 'n_estimators': 1371, 'min_child_samples': 46, 'subsample': 0.8692597354266967, 'colsample_bytree': 0.7912792597139707}. Best is trial 0 with value: 66672.32036769795.


Early stopping, best iteration is:
[31]	valid_0's rmse: 68784.9
Trial 3 finished in 0.30 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:49,232] Trial 4 finished with value: 67824.26479661587 and parameters: {'num_leaves': 37, 'max_depth': 37, 'learning_rate': 0.09883749714331959, 'n_estimators': 801, 'min_child_samples': 13, 'subsample': 0.7502629169448335, 'colsample_bytree': 0.7060143511699986}. Best is trial 0 with value: 66672.32036769795.


Early stopping, best iteration is:
[41]	valid_0's rmse: 67824.3
Trial 4 finished in 0.31 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001485 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:49,766] Trial 5 finished with value: 67337.9682548772 and parameters: {'num_leaves': 54, 'max_depth': 48, 'learning_rate': 0.07700192204482569, 'n_estimators': 1494, 'min_child_samples': 36, 'subsample': 0.8162637604894329, 'colsample_bytree': 0.9956034868588317}. Best is trial 0 with value: 66672.32036769795.


Early stopping, best iteration is:
[74]	valid_0's rmse: 67338
Trial 5 finished in 0.53 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:50,051] Trial 6 finished with value: 67312.61983334826 and parameters: {'num_leaves': 16, 'max_depth': 34, 'learning_rate': 0.05438476311398182, 'n_estimators': 914, 'min_child_samples': 18, 'subsample': 0.44068392697408054, 'colsample_bytree': 0.8923778469751158}. Best is trial 0 with value: 66672.32036769795.
[I 2024-08-15 17:25:50,219] Trial 7 finished with value: 67265.10477500617 and parameters: {'num_leaves': 15, 'max_depth': 28, 'learning_rate': 0.1850782116676188, 'n_estimators': 716, 'min_child_samples': 17, 'subsample': 0.5212540975106517, 'colsample_bytree': 0.5870951206837318}. Best is trial 0 with value: 66672.32036769795.


Early stopping, best iteration is:
[113]	valid_0's rmse: 67312.6
Trial 6 finished in 0.28 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001813 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's rmse: 67265.1
Trial 7 finished in 0.17 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001651 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improv

[I 2024-08-15 17:25:50,692] Trial 8 finished with value: 68594.74168030538 and parameters: {'num_leaves': 74, 'max_depth': 19, 'learning_rate': 0.13617140008168402, 'n_estimators': 1418, 'min_child_samples': 50, 'subsample': 0.7161320240200888, 'colsample_bytree': 0.6731146245336079}. Best is trial 0 with value: 66672.32036769795.


Early stopping, best iteration is:
[29]	valid_0's rmse: 68594.7
Trial 8 finished in 0.47 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001626 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:51,041] Trial 9 finished with value: 66126.73272823748 and parameters: {'num_leaves': 45, 'max_depth': 23, 'learning_rate': 0.17801774348406232, 'n_estimators': 1445, 'min_child_samples': 47, 'subsample': 0.7293072402595091, 'colsample_bytree': 0.9898948059484636}. Best is trial 9 with value: 66126.73272823748.


Early stopping, best iteration is:
[23]	valid_0's rmse: 66126.7
Trial 9 finished in 0.35 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:51,378] Trial 10 finished with value: 69306.63487728058 and parameters: {'num_leaves': 31, 'max_depth': 11, 'learning_rate': 0.191670012768715, 'n_estimators': 1251, 'min_child_samples': 28, 'subsample': 0.9996113805242436, 'colsample_bytree': 0.9654904513977842}. Best is trial 9 with value: 66126.73272823748.


Early stopping, best iteration is:
[14]	valid_0's rmse: 69306.6
Trial 10 finished in 0.34 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:53,154] Trial 11 finished with value: 68307.77210256713 and parameters: {'num_leaves': 53, 'max_depth': 21, 'learning_rate': 0.010874323270268021, 'n_estimators': 518, 'min_child_samples': 25, 'subsample': 0.5608954161002693, 'colsample_bytree': 0.8819106546609187}. Best is trial 9 with value: 66126.73272823748.


[500]	valid_0's rmse: 68436.5
Did not meet early stopping. Best iteration is:
[450]	valid_0's rmse: 68307.8
Trial 11 finished in 1.78 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001778 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:53,464] Trial 12 finished with value: 66777.97493932008 and parameters: {'num_leaves': 27, 'max_depth': 11, 'learning_rate': 0.13677308109654157, 'n_estimators': 510, 'min_child_samples': 43, 'subsample': 0.6443768795595819, 'colsample_bytree': 0.7904252527352098}. Best is trial 9 with value: 66126.73272823748.


Early stopping, best iteration is:
[32]	valid_0's rmse: 66778
Trial 12 finished in 0.31 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001596 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:53,845] Trial 13 finished with value: 67499.25083921826 and parameters: {'num_leaves': 50, 'max_depth': 25, 'learning_rate': 0.16599509924524186, 'n_estimators': 1127, 'min_child_samples': 22, 'subsample': 0.9378948338037529, 'colsample_bytree': 0.5009649316701343}. Best is trial 9 with value: 66126.73272823748.


Early stopping, best iteration is:
[32]	valid_0's rmse: 67499.3
Trial 13 finished in 0.38 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001767 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:54,465] Trial 14 finished with value: 70169.97552620983 and parameters: {'num_leaves': 78, 'max_depth': 32, 'learning_rate': 0.10790048114023813, 'n_estimators': 687, 'min_child_samples': 31, 'subsample': 0.6776827813450419, 'colsample_bytree': 0.9335020425646819}. Best is trial 9 with value: 66126.73272823748.


Early stopping, best iteration is:
[41]	valid_0's rmse: 70170
Trial 14 finished in 0.62 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds
[500]	valid_0's rmse: 66580.1


[I 2024-08-15 17:25:55,224] Trial 15 finished with value: 65739.98221690734 and parameters: {'num_leaves': 10, 'max_depth': 16, 'learning_rate': 0.012909538157342448, 'n_estimators': 1082, 'min_child_samples': 12, 'subsample': 0.6140953107725337, 'colsample_bytree': 0.8300534305565738}. Best is trial 15 with value: 65739.98221690734.


Early stopping, best iteration is:
[728]	valid_0's rmse: 65740
Trial 15 finished in 0.76 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001875 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:55,574] Trial 16 finished with value: 67665.54326742933 and parameters: {'num_leaves': 27, 'max_depth': 14, 'learning_rate': 0.16528372292003557, 'n_estimators': 1236, 'min_child_samples': 11, 'subsample': 0.6427294235693647, 'colsample_bytree': 0.7829327872381497}. Best is trial 15 with value: 65739.98221690734.


Early stopping, best iteration is:
[34]	valid_0's rmse: 67665.5
Trial 16 finished in 0.35 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:56,313] Trial 17 finished with value: 69071.41258423016 and parameters: {'num_leaves': 65, 'max_depth': 15, 'learning_rate': 0.08445330223368193, 'n_estimators': 1106, 'min_child_samples': 41, 'subsample': 0.5290470444393731, 'colsample_bytree': 0.9290153265415981}. Best is trial 15 with value: 65739.98221690734.


Early stopping, best iteration is:
[95]	valid_0's rmse: 69071.4
Trial 17 finished in 0.74 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:56,722] Trial 18 finished with value: 68029.88608087871 and parameters: {'num_leaves': 44, 'max_depth': 16, 'learning_rate': 0.11926907734405161, 'n_estimators': 1302, 'min_child_samples': 22, 'subsample': 0.5763689205758367, 'colsample_bytree': 0.9992270880430033}. Best is trial 15 with value: 65739.98221690734.


Early stopping, best iteration is:
[33]	valid_0's rmse: 68029.9
Trial 18 finished in 0.41 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001719 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:57,236] Trial 19 finished with value: 68041.4345312645 and parameters: {'num_leaves': 66, 'max_depth': 28, 'learning_rate': 0.15603884765359305, 'n_estimators': 893, 'min_child_samples': 49, 'subsample': 0.43936322978833287, 'colsample_bytree': 0.7144099366319887}. Best is trial 15 with value: 65739.98221690734.


Early stopping, best iteration is:
[36]	valid_0's rmse: 68041.4
Trial 19 finished in 0.51 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001732 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:57,612] Trial 20 finished with value: 67533.45769330811 and parameters: {'num_leaves': 34, 'max_depth': 40, 'learning_rate': 0.07335655939255928, 'n_estimators': 1048, 'min_child_samples': 31, 'subsample': 0.6033057990895455, 'colsample_bytree': 0.8283248955244971}. Best is trial 15 with value: 65739.98221690734.


Early stopping, best iteration is:
[51]	valid_0's rmse: 67533.5
Trial 20 finished in 0.38 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:58,218] Trial 21 finished with value: 65934.3336045431 and parameters: {'num_leaves': 11, 'max_depth': 22, 'learning_rate': 0.011663946804831295, 'n_estimators': 621, 'min_child_samples': 15, 'subsample': 0.7516236368382061, 'colsample_bytree': 0.8426340570876922}. Best is trial 15 with value: 65739.98221690734.


[500]	valid_0's rmse: 66595.4
Did not meet early stopping. Best iteration is:
[589]	valid_0's rmse: 65934.3
Trial 21 finished in 0.61 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001641 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds
[500]	valid_0's rmse: 65434.8


[I 2024-08-15 17:25:59,063] Trial 22 finished with value: 65145.628618067436 and parameters: {'num_leaves': 11, 'max_depth': 18, 'learning_rate': 0.019114033699704894, 'n_estimators': 1485, 'min_child_samples': 11, 'subsample': 0.7222384863256942, 'colsample_bytree': 0.9213403246085562}. Best is trial 22 with value: 65145.628618067436.


Early stopping, best iteration is:
[653]	valid_0's rmse: 65145.6
Trial 22 finished in 0.84 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002359 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:25:59,819] Trial 23 finished with value: 65380.53357340086 and parameters: {'num_leaves': 10, 'max_depth': 17, 'learning_rate': 0.01397772176500813, 'n_estimators': 784, 'min_child_samples': 12, 'subsample': 0.8750592317426404, 'colsample_bytree': 0.9069662874673612}. Best is trial 22 with value: 65145.628618067436.


[500]	valid_0's rmse: 65411.2
Early stopping, best iteration is:
[511]	valid_0's rmse: 65380.5
Trial 23 finished in 0.76 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:26:00,468] Trial 24 finished with value: 65070.400147098524 and parameters: {'num_leaves': 23, 'max_depth': 17, 'learning_rate': 0.032836943700487606, 'n_estimators': 791, 'min_child_samples': 10, 'subsample': 0.9166068377635276, 'colsample_bytree': 0.9172201388711717}. Best is trial 24 with value: 65070.400147098524.


Early stopping, best iteration is:
[259]	valid_0's rmse: 65070.4
Trial 24 finished in 0.65 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:26:00,905] Trial 25 finished with value: 67878.93919828579 and parameters: {'num_leaves': 22, 'max_depth': 10, 'learning_rate': 0.03538533516585298, 'n_estimators': 798, 'min_child_samples': 20, 'subsample': 0.8839365379133461, 'colsample_bytree': 0.9238931531282645}. Best is trial 24 with value: 65070.400147098524.


Early stopping, best iteration is:
[150]	valid_0's rmse: 67878.9
Trial 25 finished in 0.44 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001564 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:26:01,341] Trial 26 finished with value: 66889.00344512996 and parameters: {'num_leaves': 22, 'max_depth': 19, 'learning_rate': 0.05440573411956387, 'n_estimators': 789, 'min_child_samples': 10, 'subsample': 0.9374130179763374, 'colsample_bytree': 0.9102133952373954}. Best is trial 24 with value: 65070.400147098524.


Early stopping, best iteration is:
[154]	valid_0's rmse: 66889
Trial 26 finished in 0.44 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001669 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:26:01,808] Trial 27 finished with value: 67024.0342190091 and parameters: {'num_leaves': 22, 'max_depth': 13, 'learning_rate': 0.030300532784675087, 'n_estimators': 856, 'min_child_samples': 14, 'subsample': 0.9537195569905171, 'colsample_bytree': 0.9433299670590505}. Best is trial 24 with value: 65070.400147098524.


Early stopping, best iteration is:
[166]	valid_0's rmse: 67024
Trial 27 finished in 0.47 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:26:02,091] Trial 28 finished with value: 66633.85305085807 and parameters: {'num_leaves': 10, 'max_depth': 18, 'learning_rate': 0.04635807529669439, 'n_estimators': 943, 'min_child_samples': 19, 'subsample': 0.8931459791884895, 'colsample_bytree': 0.8820843070045483}. Best is trial 24 with value: 65070.400147098524.


Early stopping, best iteration is:
[146]	valid_0's rmse: 66633.9
Trial 28 finished in 0.28 seconds.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001793 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13422
[LightGBM] [Info] Number of data points in the train set: 11673, number of used features: 61
[LightGBM] [Info] Start training from score 105784.364306
Training until validation scores don't improve for 100 rounds


[I 2024-08-15 17:26:02,700] Trial 29 finished with value: 66004.01667130472 and parameters: {'num_leaves': 18, 'max_depth': 26, 'learning_rate': 0.02609676978967953, 'n_estimators': 583, 'min_child_samples': 10, 'subsample': 0.8018952318719619, 'colsample_bytree': 0.9576584468751066}. Best is trial 24 with value: 65070.400147098524.


Early stopping, best iteration is:
[232]	valid_0's rmse: 66004
Trial 29 finished in 0.61 seconds.
Best parameters found:  {'num_leaves': 23, 'max_depth': 17, 'learning_rate': 0.032836943700487606, 'n_estimators': 791, 'min_child_samples': 10, 'subsample': 0.9166068377635276, 'colsample_bytree': 0.9172201388711717}


In [134]:
# best_params = {
#     "num_leaves": 70,
#     "max_depth": 36,
#     "learning_rate": 0.09304350950671668,
#     "n_estimators": 1158,
#     "min_child_samples": 18,
#     "subsample": 0.579731306036922,
#     "colsample_bytree": 0.8511910376678277,
# }

In [135]:
print("Best parameters found: ", best_params)
best_params["num_threads"] = 16

# Train the model with the best parameters
best_model = LGBMRegressor(**best_params)
best_model.fit(X_train, y_train)


# Extract feature importances
feature_importances = best_model.feature_importances_

# Create a DataFrame to hold the features and their importances
importance_df = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": feature_importances}
)

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by="Importance", ascending=False)

Best parameters found:  {'num_leaves': 23, 'max_depth': 17, 'learning_rate': 0.032836943700487606, 'n_estimators': 791, 'min_child_samples': 10, 'subsample': 0.9166068377635276, 'colsample_bytree': 0.9172201388711717}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002393 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13423
[LightGBM] [Info] Number of data points in the train set: 14592, number of used features: 61
[LightGBM] [Info] Start training from score 101382.392957


In [136]:
# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE on the test set: {rmse:.4f}")

RMSE on the test set: 61308.7057


In [None]:
# save important_features to csv file
if use_test_for_validation_flag:
    importance_df.to_csv("farm_important_features_use_test_for_validation.csv", index=False)
else:
    importance_df.to_csv("farm_important_features.csv", index=False)

In [None]:
# Set the threshold for feature importance
importance_threshold = 0

# Select features with importance greater than or equal to the threshold
selected_features = importance_df[importance_df["Importance"] >= importance_threshold][
    "Feature"
].tolist()

# Separate selected features into original input features and time features
selected_input_features = [
    feature for feature in selected_features if feature not in time_features
]
selected_time_features = [
    feature for feature in selected_features if feature in time_features
]

In [None]:
# Combine the filtered features with the power column and time features
train_data_final = pd.concat(
    [
        train_data_selected["time"],
        X_train[selected_input_features],
        train_data_selected["power"],
        train_data_selected[time_features],
    ],
    axis=1,
)
test_data_final = pd.concat(
    [
        test_data_selected["time"],
        X_test[selected_input_features],
        test_data_selected["power"],
        test_data_selected[time_features],
    ],
    axis=1,
)

# Ensure the final order of columns
final_columns = ["time"] + selected_input_features + ["power"] + time_features
train_data_final = train_data_final[final_columns]
test_data_final = test_data_final[final_columns]

# Display the new shapes of train_data_final and test_data_final
print("Shape of train_data_final:", train_data_final.shape)
print("Shape of test_data_final:", test_data_final.shape)
print("Selected input features:", selected_input_features)
print(f"Train columns : {train_data_final.columns}")

In [None]:
# Save the final datasets to CSV
file_name_common = '66_withTime'
train_data_final.to_csv( f"4-train_{file_name_common}.csv", index=False)
test_data_final.to_csv( f"4-test_{file_name_common}.csv", index=False)
print("Final datasets saved to CSV.")