In [None]:
import pandas as pd
import numpy as np  # noqa
import optuna
import time
import logging  # noqa
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# Set Optuna's logger verbosity to INFO so its INFO-level messages are emitted
optuna.logging.set_verbosity(optuna.logging.INFO)


class TimingCallback:
    """Optuna study callback that prints the wall time spent on each finished trial.

    The reported duration is the time elapsed since the previous callback
    invocation (or, for the first trial, since this object was constructed),
    so create the instance right before calling ``study.optimize``.

    NOTE(review): this interval only approximates a trial's duration when
    trials run one after another; confirm before using it with parallel trials.
    """

    def __init__(self):
        # Start the clock immediately so the very first finished trial is
        # reported too (previously the first call only armed the timer and
        # printed nothing).
        self.start_time = time.perf_counter()

    def __call__(self, study, trial):
        """Print the elapsed time for ``trial`` and restart the clock.

        Args:
            study: The study being optimized (unused).
            trial: The trial that just finished; only ``trial.number`` is read.
        """
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments.
        now = time.perf_counter()
        elapsed_time = now - self.start_time
        print(f"Trial {trial.number} finished in {elapsed_time:.2f} seconds.")
        self.start_time = now

In [None]:
data_dir = "/data3/lsf/Pein/Power-Prediction/data/"

# Read the train and test CSVs from the same directory
train_data_selected, test_data_selected = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ("train_farm_66.csv", "test_farm_66.csv")
)

# Every column except the timestamp, the lead hour and the target is a model input
non_feature_columns = ("time", "lead_hour", "power")
features = [
    name
    for name in train_data_selected.columns
    if name not in non_feature_columns
]

# Split each frame into inputs and the `power` target
X_train, y_train = train_data_selected[features], train_data_selected["power"]
X_test, y_test = test_data_selected[features], test_data_selected["power"]

### Adding time features

In [None]:
# Parse the 'time' column of both frames into datetimes
for selected_frame in (train_data_selected, test_data_selected):
    selected_frame["time"] = pd.to_datetime(selected_frame["time"])

In [None]:
# Function to add time features
def add_time_features(df):
    """Return a copy of ``df`` extended with calendar and cyclical time features.

    The input frame is left untouched, so the function can be re-run safely
    and can be given a slice of another frame without mutating it; callers
    must use the returned frame.

    Args:
        df: DataFrame with a ``time`` column that ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame containing the original columns (with ``time``
        converted to datetime) followed by ``hour``, ``quarter_hour``,
        ``day``, ``day_in_week`` and a ``<name>_sin`` / ``<name>_cos`` pair
        for each of those four columns.
    """
    # Work on a copy so the caller's frame is never modified in place
    df = df.copy()
    df["time"] = pd.to_datetime(df["time"])

    # Integer calendar components
    df["hour"] = df["time"].dt.hour
    df["quarter_hour"] = df["time"].dt.minute // 15
    df["day"] = df["time"].dt.day
    df["day_in_week"] = df["time"].dt.weekday

    # Divisor used to map each component onto one turn of the unit circle
    cycle_lengths = {"hour": 24, "quarter_hour": 4, "day": 31, "day_in_week": 7}
    for column, period in cycle_lengths.items():
        angle = 2 * np.pi * df[column] / period
        df[f"{column}_sin"] = np.sin(angle)
        df[f"{column}_cos"] = np.cos(angle)

    return df

In [None]:
# Derive the time features for the train frame first, then the test frame
train_data_selected, test_data_selected = (
    add_time_features(selected_frame)
    for selected_frame in (train_data_selected, test_data_selected)
)

In [None]:
# Names of the time-derived columns, grouped by kind
time_features = [
    # raw calendar components
    *("hour", "quarter_hour", "day", "day_in_week"),
    # sine/cosine pair for each component, in the same order
    *("hour_sin", "hour_cos"),
    *("quarter_hour_sin", "quarter_hour_cos"),
    *("day_sin", "day_cos"),
    *("day_in_week_sin", "day_in_week_cos"),
]

In [None]:
# Model inputs: the original feature columns followed by the time features
all_features = [*features, *time_features]

# Rebuild the design matrices so they include the time features
X_train, X_test = (
    selected_frame[all_features]
    for selected_frame in (train_data_selected, test_data_selected)
)

In [None]:
def objective(
    trial,
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    use_test_for_validation_flag=False,
):
    """Optuna objective: fit a LightGBM regressor with sampled hyperparameters.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features.
        y_train: Training target.
        X_test: Optional features used for validation when
            ``use_test_for_validation_flag`` is true.
        y_test: Optional target matching ``X_test``.
        use_test_for_validation_flag: If true and both ``X_test`` and
            ``y_test`` are given, the model is fit on all of the training
            data and validated on the test data. Otherwise 20% of the
            training data is held out (``random_state=42``) for validation.

    Returns:
        RMSE of the fitted model's predictions on the validation set.
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 10, 80),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "n_estimators": trial.suggest_int("n_estimators", 500, 1500),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        # NOTE(review): LightGBM documents row subsampling as active only when
        # a bagging frequency (`subsample_freq`) > 0 is set, which is not done
        # here -- confirm whether tuning `subsample` has any effect.
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "num_threads": 16,
        "seed": 42,
    }

    # Choose the fit/validation data once; the training call below is shared.
    validate_on_test = (
        use_test_for_validation_flag and X_test is not None and y_test is not None
    )
    if validate_on_test:
        X_fit, y_fit = X_train, y_train
        X_val_split, y_val_split = X_test, y_test
    else:
        if use_test_for_validation_flag:
            # Test-set validation was requested but cannot be honoured; keep
            # the historical fallback, but make it visible instead of silent.
            import warnings  # local import keeps this block self-contained

            warnings.warn(
                "use_test_for_validation_flag is set but X_test/y_test is missing; "
                "falling back to a 20% hold-out split of the training data.",
                stacklevel=2,
            )
        X_fit, X_val_split, y_fit, y_val_split = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )

    model = LGBMRegressor(**params)
    model.fit(
        X_fit,
        y_fit,
        eval_set=[(X_val_split, y_val_split)],
        eval_metric="rmse",
        # Stop after 15 rounds without improvement on the validation set;
        # log the evaluation metric every 500 rounds.
        callbacks=[early_stopping(stopping_rounds=15), log_evaluation(period=500)],
    )
    preds = model.predict(X_val_split)

    # Calculate RMSE
    return root_mean_squared_error(y_val_split, preds)

In [None]:
# Create the study and optimize
use_test_for_validation_flag = False

# Seed the sampler so repeated runs propose the same hyperparameter sequence.
# TPESampler is the sampler Optuna uses when none is given, so only the seed
# is new here.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(
    lambda trial: objective(
        trial,
        X_train,
        y_train,
        X_test,
        y_test,
        use_test_for_validation_flag=use_test_for_validation_flag,
    ),
    n_trials=50,
    # A fresh callback per run: it prints the elapsed time after each trial
    callbacks=[TimingCallback()],
)

# Get the best parameters
best_params = study.best_params
print("Best parameters found: ", best_params)

In [None]:
# best_params = {
#     "num_leaves": 70,
#     "max_depth": 36,
#     "learning_rate": 0.09304350950671668,
#     "n_estimators": 1158,
#     "min_child_samples": 18,
#     "subsample": 0.579731306036922,
#     "colsample_bytree": 0.8511910376678277,
# }

In [None]:
print("Best parameters found: ", best_params)

# Merge the tuned values with the fixed settings used inside `objective`
# (thread count and seed) without mutating `best_params`, so the refit uses
# the same seed the tuned configuration was scored with.
final_params = {**best_params, "num_threads": 16, "seed": 42}

# Train the model with the best parameters on the full training set
best_model = LGBMRegressor(**final_params)
best_model.fit(X_train, y_train)


# Extract feature importances
feature_importances = best_model.feature_importances_

# Create a DataFrame to hold the features and their importances,
# sorted from most to least important
importance_df = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": feature_importances}
).sort_values(by="Importance", ascending=False)

In [None]:
# Predict on the test set with the model fitted on the full training data
# NOTE: when use_test_for_validation_flag is True the hyperparameters were
# selected against this same test set, so the RMSE below is not an
# independent estimate.
y_pred = best_model.predict(X_test)

# Calculate RMSE of the predictions against the test-set `power` target
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE on the test set: {rmse:.4f}")

In [None]:
# Write the importance table to a CSV whose name records which validation
# scheme was used during tuning
importance_csv = (
    "farm_important_features_use_test_for_validation.csv"
    if use_test_for_validation_flag
    else "farm_important_features.csv"
)
importance_df.to_csv(importance_csv, index=False)

In [None]:
# Minimum importance a feature needs in order to be kept
importance_threshold = 90

# Features whose importance is greater than or equal to the threshold
meets_threshold = importance_df["Importance"] >= importance_threshold
selected_features = importance_df.loc[meets_threshold, "Feature"].tolist()

# Route each selected feature to the time-feature list or the input-feature
# list, preserving the importance ordering within each list
selected_input_features = []
selected_time_features = []
for feature in selected_features:
    bucket = (
        selected_time_features if feature in time_features else selected_input_features
    )
    bucket.append(feature)

In [None]:
# Final column layout: timestamp, kept input features, target, then all time features
final_columns = ["time", *selected_input_features, "power", *time_features]


def _assemble_final_frame(source_frame, feature_frame):
    """Join the timestamp, kept inputs, target and time features column-wise."""
    pieces = [
        source_frame["time"],
        feature_frame[selected_input_features],
        source_frame["power"],
        source_frame[time_features],
    ]
    # Re-select so the columns follow `final_columns` exactly
    return pd.concat(pieces, axis=1)[final_columns]


train_data_final = _assemble_final_frame(train_data_selected, X_train)
test_data_final = _assemble_final_frame(test_data_selected, X_test)

# Report the resulting shapes and the columns that were kept
print("Shape of train_data_final:", train_data_final.shape)
print("Shape of test_data_final:", test_data_final.shape)
print("Selected input features:", selected_input_features)
print(f"Train columns : {train_data_final.columns}")

In [None]:
# Write both final frames next to the input data, without the index column
# NOTE(review): the inputs were read from *_farm_66.csv but the outputs are
# named *_farm_76_withTime.csv -- confirm the 66/76 difference is intended.
for final_frame, csv_name in (
    (train_data_final, "train_farm_76_withTime.csv"),
    (test_data_final, "test_farm_76_withTime.csv"),
):
    final_frame.to_csv(data_dir + csv_name, index=False)
print("Final datasets saved to CSV.")