In [1]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd
train_df_dir = "train.csv"
test_df_dir = "test.csv"

In [5]:
train_df = pd.read_csv(train_df_dir)
test_df = pd.read_csv(test_df_dir)


In [7]:
test_uid = test_df["uid"]

train_df.drop("uid", axis=1, inplace=True)
test_df.drop("uid", axis=1, inplace=True)

day_map = {
    "Friday": 0,
    "Saturday": 1,
}

train_df["day"] = train_df["day"].map(day_map)
test_df["day"] = test_df["day"].map(day_map)

train_df.drop("minute", axis=1, inplace=True)
test_df.drop("minute", axis=1, inplace=True)

print(train_df.isna().sum())

day                              5479
hour                             5613
C_motion                         5517
feed_water_motion                5597
faucet_hole                      5566
vapour_pressure                  5479
vapour_enthalpy                  5437
vapour_pressure_at_division      5477
vapour_motion                    5477
feed_water_enth                  5496
vapour_temperature               5517
output_electricity_generation       0
dtype: int64


In [9]:
from sklearn.neighbors import KNeighborsRegressor


def KNN_Imputer(df):
    for i in df.columns:
        if df[i].isna().sum() == 0:
            continue

        temp_df = df.copy()
        for x in df.columns:
            if x == i:
                continue
            temp_df[x].fillna(value=temp_df[x].mean(), inplace=True)

        col = i
        other_cols = [x for x in df.columns if x != col]
        X = temp_df[other_cols][df[col].notna()]
        y = temp_df[col][df[col].notna()]

        neigh = KNeighborsRegressor(n_neighbors=3)
        neigh.fit(np.array(X), np.array(y))

        print(f"Imputing {i}")
        for i, j in enumerate(df[col].isnull()):
            if j == True:
                df[col].iloc[i] = neigh.predict(temp_df[other_cols].iloc[i : i + 1, :])[
                    0
                ]
    return df


train_df = KNN_Imputer(train_df)
test_df = KNN_Imputer(test_df)

print(train_df.isna().sum())

Imputing day
Imputing hour
Imputing C_motion
Imputing feed_water_motion
Imputing faucet_hole
Imputing vapour_pressure
Imputing vapour_enthalpy
Imputing vapour_pressure_at_division
Imputing vapour_motion
Imputing feed_water_enth
Imputing vapour_temperature
Imputing day
Imputing hour
Imputing C_motion
Imputing feed_water_motion
Imputing faucet_hole
Imputing vapour_pressure
Imputing vapour_enthalpy
Imputing vapour_pressure_at_division
Imputing vapour_motion
Imputing feed_water_enth
Imputing vapour_temperature
day                              0
hour                             0
C_motion                         0
feed_water_motion                0
faucet_hole                      0
vapour_pressure                  0
vapour_enthalpy                  0
vapour_pressure_at_division      0
vapour_motion                    0
feed_water_enth                  0
vapour_temperature               0
output_electricity_generation    0
dtype: int64


In [11]:
from sklearn.model_selection import train_test_split

X = train_df.drop("output_electricity_generation", axis=1)
y = train_df["output_electricity_generation"]

X["vapour_pressure_per_temp"] = X["vapour_pressure"] / X["vapour_temperature"]
test_df["vapour_pressure_per_temp"] = (
    test_df["vapour_pressure"] / test_df["vapour_temperature"]
)

X["vapour_enthalpy_per_temp"] = X["vapour_enthalpy"] / X["vapour_temperature"]
test_df["vapour_enthalpy_per_temp"] = (
    test_df["vapour_enthalpy"] / test_df["vapour_temperature"]
)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train["hour_sin"] = np.sin(2 * np.pi * X_train["hour"] / 23.0)
X_train["hour_cos"] = np.cos(2 * np.pi * X_train["hour"] / 23.0)

X_test["hour_sin"] = np.sin(2 * np.pi * X_test["hour"] / 23.0)
X_test["hour_cos"] = np.cos(2 * np.pi * X_test["hour"] / 23.0)

test_df["hour_sin"] = np.sin(2 * np.pi * test_df["hour"] / 23.0)
test_df["hour_cos"] = np.cos(2 * np.pi * test_df["hour"] / 23.0)

In [17]:
import optuna
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


def objective(trial):
    param = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    model = lgb.train(param, lgb_train, valid_sets=[lgb_train, lgb_eval])

    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return rmse


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

lgb_params = study.best_params
lgb_params["objective"] = "regression"
lgb_params["metric"] = "rmse"
lgb_params["verbosity"] = -1
lgb_params["boosting_type"] = "gbdt"

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

lgb_model = lgb.train(lgb_params, lgb_train, valid_sets=[lgb_train, lgb_eval])

lgb_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
lgb_rmse = np.sqrt(mean_squared_error(y_test, lgb_pred))
print(f"LightGBM RMSE: {lgb_rmse}\n")

[I 2025-03-08 23:45:03,414] A new study created in memory with name: no-name-775bf1d2-04ac-468e-9d64-c29378f1c5a6
[I 2025-03-08 23:45:04,224] Trial 0 finished with value: 1.0012692448386569 and parameters: {'lambda_l1': 5.789940319717543e-07, 'lambda_l2': 0.00014909020055718186, 'num_leaves': 233, 'feature_fraction': 0.7289139942704017, 'bagging_fraction': 0.42061577568864245, 'bagging_freq': 3, 'min_child_samples': 25}. Best is trial 0 with value: 1.0012692448386569.
[I 2025-03-08 23:45:04,775] Trial 1 finished with value: 1.3001504646488515 and parameters: {'lambda_l1': 8.983293063948936e-07, 'lambda_l2': 2.9125163093981784e-07, 'num_leaves': 214, 'feature_fraction': 0.7798267379903163, 'bagging_fraction': 0.520701422195395, 'bagging_freq': 2, 'min_child_samples': 82}. Best is trial 0 with value: 1.0012692448386569.
[I 2025-03-08 23:45:05,312] Trial 2 finished with value: 1.008626353504314 and parameters: {'lambda_l1': 3.513720643653179e-08, 'lambda_l2': 1.1661002814819067e-06, 'num_

LightGBM RMSE: 0.8670963493613207



In [15]:
pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?2

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor

estimators = [
    ("lgb", lgb.LGBMRegressor(**lgb_params)),
    ("rf", RandomForestRegressor(max_depth=32, random_state=1)),
]

stacker = StackingRegressor(
    estimators=estimators, final_estimator=RandomForestRegressor()
)
stacker.fit(X_train, y_train)

stacker_pred = stacker.predict(X_test)

stacker_rmse = np.sqrt(mean_squared_error(y_test, stacker_pred))

print(f"Stacker RMSE: {stacker_rmse}\n")