In [1]:
import pandas as pd
import numpy as np
import optuna
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler

# Load data (for local execution)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Save test uid for final submission
test_uid = test["uid"]

# Preprocess 'day' column
def preprocess_day(df):
    df['day'] = df['day'].map({'Friday': 0, 'Saturday': 1}).fillna(2)
    return df

train = preprocess_day(train)
test = preprocess_day(test)

# Feature Engineering
def create_new_features(df):
    df["vapour_efficiency"] = df["vapour_pressure"] / (df["vapour_temperature"] + 1e-6)
    df["water_flow_efficiency"] = df["feed_water_motion"] / (df["faucet_hole"] + 1e-6)
    df["vapour_pressure_x_temperature"] = df["vapour_pressure"] * df["vapour_temperature"]
    df["feed_water_motion_x_faucet_hole"] = df["feed_water_motion"] * df["faucet_hole"]
    return df

train = create_new_features(train)
test = create_new_features(test)

# Fill missing values with median
numerical_cols = train.select_dtypes(include=[np.number]).columns.drop('output_electricity_generation')
train[numerical_cols] = train[numerical_cols].fillna(train[numerical_cols].median())
test[numerical_cols] = test[numerical_cols].fillna(test[numerical_cols].median())

# Log transformation for skewed features
skewed_cols = ["vapour_pressure", "vapour_temperature", "feed_water_motion", "faucet_hole"]
for col in skewed_cols:
    train[col] = np.log1p(train[col])
    test[col] = np.log1p(test[col])

# Cyclical encoding for time-based features
for df in [train, test]:
    df["sin_hour"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["cos_hour"] = np.cos(2 * np.pi * df["hour"] / 24)
    df["sin_minute"] = np.sin(2 * np.pi * df["minute"] / 60)
    df["cos_minute"] = np.cos(2 * np.pi * df["minute"] / 60)

# Drop original time columns
train.drop(columns=["hour", "minute"], inplace=True)
test.drop(columns=["hour", "minute"], inplace=True)

# Define target variable
target = "output_electricity_generation"
X = train.drop(columns=[target, "uid"])
y = train[target]
X_test = test.drop(columns=["uid"])

# Standardize numerical features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Preprocessing completed.")

# Hyperparameter tuning with Optuna
def tune_lgbm(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000, step=100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300),
    }
    model = LGBMRegressor(**params, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, preds))

study_lgbm = optuna.create_study(direction="minimize")
study_lgbm.optimize(tune_lgbm, n_trials=20)

best_lgbm_params = study_lgbm.best_params
print(f"Best LightGBM Params: {best_lgbm_params}")

# Train best LightGBM model
lightgbm_model = LGBMRegressor(**best_lgbm_params, random_state=42)
lightgbm_model.fit(X_train, y_train)

# Train CatBoost with Optuna
def tune_catboost(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 500, 2000, step=500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 4, 12),
    }
    model = CatBoostRegressor(**params, verbose=0, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, preds))

study_catboost = optuna.create_study(direction="minimize")
study_catboost.optimize(tune_catboost, n_trials=20)

best_catboost_params = study_catboost.best_params
print(f"Best CatBoost Params: {best_catboost_params}")

# Train best CatBoost model
catboost_model = CatBoostRegressor(**best_catboost_params, verbose=0, random_state=42)
catboost_model.fit(X_train, y_train)

# RandomForest model
rf_model = RandomForestRegressor(n_estimators=200, max_depth=20, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Bagging model
bagging_model = BaggingRegressor(
    estimator=RandomForestRegressor(n_estimators=100, max_depth=12, random_state=42),
    n_estimators=10,
    random_state=42,
    n_jobs=-1
)
bagging_model.fit(X_train, y_train)

# Stacking model
stacking_model = StackingRegressor(
    estimators=[
        ('bagging', bagging_model),
        ('catboost', catboost_model),
        ('rf', rf_model),
        ('lightgbm', lightgbm_model)
    ],
    final_estimator=RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    n_jobs=-1
)
stacking_model.fit(X_train, y_train)

# Predict & Evaluate
val_preds = stacking_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f"Validation RMSE: {rmse:.5f}")

# Predict on test data
test_preds = stacking_model.predict(X_test)

# Save submission with uid
submission = pd.DataFrame({"uid": test_uid, "output_electricity_generation": test_preds})
submission.to_csv("submission_stacking.csv", index=False)

print("Submission file 'submission_stacking.csv' saved successfully.")


[I 2025-02-08 23:02:08,345] A new study created in memory with name: no-name-c5525392-ad59-4d35-84df-467c75de4760


Preprocessing completed.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:09,916] Trial 0 finished with value: 3.26934187295432 and parameters: {'n_estimators': 800, 'learning_rate': 0.05310438141292969, 'max_depth': 9, 'num_leaves': 82}. Best is trial 0 with value: 3.26934187295432.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001390 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:10,911] Trial 1 finished with value: 3.7697370296263837 and parameters: {'n_estimators': 1000, 'learning_rate': 0.05352694304808862, 'max_depth': 5, 'num_leaves': 206}. Best is trial 0 with value: 3.26934187295432.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:11,844] Trial 2 finished with value: 5.321718908877159 and parameters: {'n_estimators': 600, 'learning_rate': 0.011793727752882647, 'max_depth': 6, 'num_leaves': 52}. Best is trial 0 with value: 3.26934187295432.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:12,287] Trial 3 finished with value: 7.895848327877802 and parameters: {'n_estimators': 500, 'learning_rate': 0.010177524200239707, 'max_depth': 4, 'num_leaves': 54}. Best is trial 0 with value: 3.26934187295432.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:13,344] Trial 4 finished with value: 4.339711410514116 and parameters: {'n_estimators': 700, 'learning_rate': 0.02113052026309358, 'max_depth': 6, 'num_leaves': 127}. Best is trial 0 with value: 3.26934187295432.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001535 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:16,335] Trial 5 finished with value: 3.45121244792371 and parameters: {'n_estimators': 900, 'learning_rate': 0.0183524959366317, 'max_depth': 8, 'num_leaves': 129}. Best is trial 0 with value: 3.26934187295432.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:16,686] Trial 6 finished with value: 7.084154589922612 and parameters: {'n_estimators': 300, 'learning_rate': 0.0627064517113243, 'max_depth': 3, 'num_leaves': 104}. Best is trial 0 with value: 3.26934187295432.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002823 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:18,013] Trial 7 finished with value: 5.199897604595132 and parameters: {'n_estimators': 500, 'learning_rate': 0.014432307395960936, 'max_depth': 6, 'num_leaves': 231}. Best is trial 0 with value: 3.26934187295432.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:20,405] Trial 8 finished with value: 3.944446542157666 and parameters: {'n_estimators': 800, 'learning_rate': 0.011833483663688759, 'max_depth': 9, 'num_leaves': 54}. Best is trial 0 with value: 3.26934187295432.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002404 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:23,269] Trial 9 finished with value: 4.056520567916753 and parameters: {'n_estimators': 600, 'learning_rate': 0.012197645334218634, 'max_depth': 8, 'num_leaves': 154}. Best is trial 0 with value: 3.26934187295432.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002639 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:24,485] Trial 10 finished with value: 3.60367343428193 and parameters: {'n_estimators': 200, 'learning_rate': 0.09903643948017143, 'max_depth': 12, 'num_leaves': 283}. Best is trial 0 with value: 3.26934187295432.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:28,168] Trial 11 finished with value: 3.1617306914279464 and parameters: {'n_estimators': 1000, 'learning_rate': 0.030694837408056935, 'max_depth': 10, 'num_leaves': 106}. Best is trial 11 with value: 3.1617306914279464.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002180 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:31,516] Trial 12 finished with value: 3.5504630763535507 and parameters: {'n_estimators': 1000, 'learning_rate': 0.036071115195491936, 'max_depth': 11, 'num_leaves': 88}. Best is trial 11 with value: 3.1617306914279464.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:32,815] Trial 13 finished with value: 3.82683415235946 and parameters: {'n_estimators': 800, 'learning_rate': 0.03280854017262973, 'max_depth': 10, 'num_leaves': 20}. Best is trial 11 with value: 3.1617306914279464.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006843 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:36,474] Trial 14 finished with value: 3.483288224765628 and parameters: {'n_estimators': 900, 'learning_rate': 0.05308473758391269, 'max_depth': 10, 'num_leaves': 191}. Best is trial 11 with value: 3.1617306914279464.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:41,025] Trial 15 finished with value: 3.6623890001727406 and parameters: {'n_estimators': 800, 'learning_rate': 0.025010362022107366, 'max_depth': 12, 'num_leaves': 164}. Best is trial 11 with value: 3.1617306914279464.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003595 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:44,563] Trial 16 finished with value: 3.382947180247932 and parameters: {'n_estimators': 1000, 'learning_rate': 0.04135408111568061, 'max_depth': 9, 'num_leaves': 98}. Best is trial 11 with value: 3.1617306914279464.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:47,148] Trial 17 finished with value: 3.472952874498083 and parameters: {'n_estimators': 900, 'learning_rate': 0.08411288059484197, 'max_depth': 10, 'num_leaves': 76}. Best is trial 11 with value: 3.1617306914279464.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001904 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:49,410] Trial 18 finished with value: 3.524551012706339 and parameters: {'n_estimators': 700, 'learning_rate': 0.026603666645336137, 'max_depth': 7, 'num_leaves': 141}. Best is trial 11 with value: 3.1617306914279464.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002385 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:50,935] Trial 19 finished with value: 3.4657653212703905 and parameters: {'n_estimators': 900, 'learning_rate': 0.04209530309589328, 'max_depth': 9, 'num_leaves': 23}. Best is trial 11 with value: 3.1617306914279464.


Best LightGBM Params: {'n_estimators': 1000, 'learning_rate': 0.030694837408056935, 'max_depth': 10, 'num_leaves': 106}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001933 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3450
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 18
[LightGBM] [Info] Start training from score 832.300201


[I 2025-02-08 23:02:54,010] A new study created in memory with name: no-name-4b1f7c54-527d-44f1-a6e8-40c88c639528
[I 2025-02-08 23:03:05,426] Trial 0 finished with value: 3.7242531963244536 and parameters: {'iterations': 1500, 'learning_rate': 0.050288345822171406, 'depth': 6}. Best is trial 0 with value: 3.7242531963244536.
[I 2025-02-08 23:03:10,379] Trial 1 finished with value: 4.922183703774461 and parameters: {'iterations': 1000, 'learning_rate': 0.05203191545147163, 'depth': 4}. Best is trial 0 with value: 3.7242531963244536.
[I 2025-02-08 23:03:16,154] Trial 2 finished with value: 4.245100867081052 and parameters: {'iterations': 1000, 'learning_rate': 0.05705722607828336, 'depth': 5}. Best is trial 0 with value: 3.7242531963244536.
[I 2025-02-08 23:08:24,605] Trial 3 finished with value: 5.509250654038724 and parameters: {'iterations': 2000, 'learning_rate': 0.018897960175695838, 'depth': 12}. Best is trial 0 with value: 3.7242531963244536.
[I 2025-02-08 23:08:31,959] Trial 4 fi

Best CatBoost Params: {'iterations': 1500, 'learning_rate': 0.0988919543279248, 'depth': 7}
Validation RMSE: 2.86964
Submission file 'submission_stacking.csv' saved successfully.
