In [None]:
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
from hyperopt.pyll import scope
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings('ignore')

In [None]:
# Set seed and max evaluations
SEED = 10
MAX_EVALS = 30
np.random.seed(SEED)

# Load data
# The CSV location can be overridden through the GAS_DATA_PATH environment variable so the
# notebook is runnable on machines other than the author's. The default is the original path.
DATA_PATH = os.environ.get(
    'GAS_DATA_PATH',
    'C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv',
)
df = pd.read_csv(DATA_PATH)
features = ['Transaction Hash_len', 'Original_len', 'signature_len', 'From_len', 'To_len',
            'sender_len', 'paymaster_len', 'Txn Fee', 'logIndex', 'actualGasCost',
            'actualGasUsed', 'nonce', 'success', 'Blockno', 'DateTime_ts']
TARGET = 'Gas Used'
# Every model input and the target are cast to float before splitting.
X = df[features].astype(float)
y = df[TARGET].astype(float)

# Split 80/20 for tuning
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

# MinMaxScaler
# Both scalers are fitted on the training split only and then applied to the validation
# split, so no validation statistics leak into the scaling.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
# Wrap the scaled arrays back into DataFrames to keep column names and the original index.
X_train_scaled = pd.DataFrame(scaler_X.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_val_scaled = pd.DataFrame(scaler_X.transform(X_val), columns=X.columns, index=X_val.index)
# The scaler expects 2-D input; reshape to a column and flatten the result back to 1-D.
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_val_scaled = scaler_y.transform(y_val.values.reshape(-1, 1)).ravel()

# Hyperopt search space for LightGBM.
# "reg_params" mixes tuned dimensions (hp.* nodes, each with a unique "lgb_*" label) with
# settings that are held constant for every trial; "fit_params" is an empty placeholder.
space = dict(
    reg_params=dict(
        # --- tuned dimensions ---
        learning_rate=hp.uniform("lgb_lr", 0.01, 0.2),
        num_leaves=scope.int(hp.quniform("lgb_nl", 31, 511, 1)),
        # Picked from a fixed list instead of a range.
        max_depth=scope.int(hp.choice("lgb_md", [-1, 2, 5, 10])),
        min_data_in_leaf=scope.int(hp.quniform("lgb_mdl", 20, 200, 1)),
        lambda_l1=hp.uniform("lgb_l1", 1.0, 10.0),
        lambda_l2=hp.uniform("lgb_l2", 1.0, 10.0),
        min_gain_to_split=hp.uniform("lgb_mgs", 0.0, 2.0),
        feature_fraction=hp.uniform("lgb_ff", 0.6, 1.0),
        bagging_fraction=hp.uniform("lgb_bf", 0.6, 1.0),
        bagging_freq=scope.int(hp.quniform("lgb_bfreq", 0, 5, 1)),
        max_bin=scope.int(hp.quniform("lgb_mb", 63, 255, 1)),
        # --- constants shared by every trial ---
        objective="regression",
        metric="rmse",
        seed=SEED,
        verbosity=-1,
        # GPU is requested first; the objective wrapper retries on CPU if training fails.
        device="gpu",
        gpu_platform_id=0,
        gpu_device_id=0,
    ),
    fit_params=dict(),
)

class HPOptimiser:
    """Hyperopt objective wrapper that trains LightGBM on a fixed train/validation split.

    Parameters
    ----------
    Xtr, Xval : DataFrame-like with a ``columns`` attribute
        Training and validation features.
    ytr, yval : array-like
        Training and validation targets.
    num_boost_round : int, default 1000
        Upper bound on boosting rounds for every trial.
    early_stopping_rounds : int, default 10
        Training stops once the validation metric has not improved for this many rounds.
    """

    # Parameters that LightGBM needs as integers; sampled values are cast before training.
    INT_PARAMS = ("num_leaves", "max_depth", "min_data_in_leaf", "bagging_freq", "max_bin")

    def __init__(self, Xtr, Xval, ytr, yval, num_boost_round=1000, early_stopping_rounds=10):
        self.Xtr, self.Xval, self.ytr, self.yval = Xtr, Xval, ytr, yval
        self.num_boost_round = num_boost_round
        self.early_stopping_rounds = early_stopping_rounds
        # Ensures the GPU->CPU fallback notice is printed once, not once per trial.
        self._fallback_reported = False

    def lgb_reg(self, p):
        """Objective passed to ``fmin``: evaluate one sampled configuration.

        ``p`` is one sample of the search space; only ``p["reg_params"]`` is read and it is
        copied, so the caller's dict is never modified. Returns the dict produced by
        ``_train_eval``.

        Training is first attempted with the requested device. If that fails and the
        device was not already CPU, the trial is retried once on CPU. A failure on CPU is
        re-raised immediately, because retrying with identical settings cannot succeed and
        would only hide the real error.
        """
        rp = p["reg_params"].copy()
        for k in self.INT_PARAMS:
            if k in rp:
                rp[k] = int(rp[k])
        try:
            return self._train_eval(rp)  # requested device (GPU by default in this notebook)
        except Exception as exc:
            if rp.get("device", "cpu") == "cpu":
                raise
            if not getattr(self, "_fallback_reported", False):
                print(f"[HPOptimiser] GPU training failed ({type(exc).__name__}: {exc}); "
                      "falling back to CPU.")
                self._fallback_reported = True
            rp["device"] = "cpu"  # Fallback CPU
            return self._train_eval(rp)

    def _train_eval(self, rp):
        """Train one model with parameters ``rp`` and score it on the validation split.

        Datasets are rebuilt on every call, so each attempt starts from fresh objects.
        Returns ``{"loss": rmse, "status": STATUS_OK}`` where ``rmse`` is computed between
        ``self.yval`` and the predictions at the best early-stopped iteration, i.e. in
        whatever units ``self.yval`` is expressed in.
        """
        params = rp.copy()
        train_ds = lgb.Dataset(self.Xtr, label=self.ytr, feature_name=list(self.Xtr.columns))
        valid_ds = lgb.Dataset(self.Xval, label=self.yval, reference=train_ds)
        model = lgb.train(
            params, train_ds, num_boost_round=self.num_boost_round,
            valid_sets=[valid_ds], valid_names=['val'],
            callbacks=[lgb.early_stopping(stopping_rounds=self.early_stopping_rounds, verbose=False)]
        )
        pred = model.predict(self.Xval, num_iteration=model.best_iteration)
        rmse = float(np.sqrt(mean_squared_error(self.yval, pred)))
        return {"loss": rmse, "status": STATUS_OK}

# Run Hyperopt (scaled)
# The objective trains on the scaled training split and is scored on the scaled validation split.
trials = Trials()
optim = HPOptimiser(X_train_scaled, X_val_scaled, y_train_scaled, y_val_scaled)
best_raw = fmin(
    fn=optim.lgb_reg,
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
    rstate=np.random.default_rng(SEED),
)
# fmin returns values keyed by hp label; space_eval maps them back onto the full space dict.
best = space_eval(space, best_raw)
# Make sure the integer-valued LightGBM parameters are plain Python ints.
best["reg_params"].update(
    (name, int(best["reg_params"][name]))
    for name in ("num_leaves", "max_depth", "min_data_in_leaf", "bagging_freq", "max_bin")
)
print("Best RMSE (scaled):", trials.best_trial["result"]["loss"])
print("Best params:", best["reg_params"])

# K-fold evaluation with optimized parameters (scaled), followed by the final fit.
CV_FOLDS = 10
NUM_BOOST_ROUND = 1000
# NOTE(review): tuning and the final fit stop after 10 stagnant rounds while CV waits 200.
# Both values are kept as they were; confirm the mismatch is intentional, since it means
# the CV models are not trained the same way as the tuned/final model.
CV_STOPPING_ROUNDS = 200
FINAL_STOPPING_ROUNDS = 10


def fit_with_cpu_fallback(params, X_fit, y_fit, X_eval, y_eval, stopping_rounds,
                          num_boost_round=NUM_BOOST_ROUND):
    """Train a LightGBM booster with early stopping, retrying once on CPU if needed.

    Parameters
    ----------
    params : dict
        LightGBM parameters; copied, never modified.
    X_fit, y_fit : training features (DataFrame) and targets.
    X_eval, y_eval : features and targets used for early stopping.
    stopping_rounds : int
        Early-stopping patience.
    num_boost_round : int
        Upper bound on boosting rounds.

    Returns
    -------
    (booster, used_params) : the trained model and the parameters that actually worked
        (``device`` is ``"cpu"`` if the fallback was taken).

    Raises
    ------
    Whatever ``lgb.train`` raised, if the failing attempt was already on CPU.
    """
    def _attempt(attempt_params):
        # Datasets are rebuilt per attempt so a retry never reuses objects that a failed
        # attempt may already have touched.
        fit_ds = lgb.Dataset(X_fit, label=y_fit, feature_name=list(X_fit.columns))
        eval_ds = lgb.Dataset(X_eval, label=y_eval, reference=fit_ds)
        return lgb.train(
            attempt_params, fit_ds, num_boost_round=num_boost_round,
            valid_sets=[eval_ds], valid_names=['val'],
            callbacks=[lgb.early_stopping(stopping_rounds=stopping_rounds, verbose=False)]
        )

    used = dict(params)
    try:
        return _attempt(used), used
    except Exception as exc:
        if used.get("device", "cpu") == "cpu":
            raise  # nothing to fall back to; surface the real error
        print(f"GPU training failed ({type(exc).__name__}: {exc}); retrying on CPU.")
        used["device"] = "cpu"
        return _attempt(used), used


# NOTE: each fold's held-out part is also the early-stopping set, so the per-fold scores
# are measured on data that influenced the number of boosting rounds.
kf = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=SEED)
cv_params = best["reg_params"].copy()
metrics_per_fold = []
for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train_scaled), 1):
    X_tr, X_va = X_train_scaled.iloc[tr_idx], X_train_scaled.iloc[va_idx]
    y_tr, y_va = y_train_scaled[tr_idx], y_train_scaled[va_idx]
    # Carry the working device forward so an unavailable GPU is only probed once.
    model, cv_params = fit_with_cpu_fallback(cv_params, X_tr, y_tr, X_va, y_va, CV_STOPPING_ROUNDS)
    pred = model.predict(X_va, num_iteration=model.best_iteration)
    mse = mean_squared_error(y_va, pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_va, pred)
    r2 = r2_score(y_va, pred)
    metrics_per_fold.append({'fold': fold, 'RMSE': rmse, 'MAE': mae, 'R^2': r2, 'MSE': mse})
    print(f'Fold {fold} metrics (scaled): RMSE: {rmse:.6f} | MAE: {mae:.6f} | R^2: {r2:.6f} | MSE: {mse:.6f}')

# Aggregate and display K-fold metrics
metrics_df = pd.DataFrame(metrics_per_fold)
print(f'\n=== {CV_FOLDS}-fold CV metrics (scaled) ===')
print(metrics_df.round(6))
# Mean and spread across folds.
cv_summary = metrics_df[['RMSE', 'MAE', 'R^2', 'MSE']].agg(['mean', 'std'])
print(f'\n=== {CV_FOLDS}-fold CV summary (scaled) ===')
print(cv_summary.round(6))

# Train final model (scaled)
# NOTE: the validation split below was already used to pick the hyperparameters and is
# used again for early stopping, so these "final" metrics are optimistic; there is no
# untouched test set in this notebook.
rp = best["reg_params"].copy()
num_round = NUM_BOOST_ROUND
final, rp = fit_with_cpu_fallback(
    rp, X_train_scaled, y_train_scaled, X_val_scaled, y_val_scaled,
    FINAL_STOPPING_ROUNDS, num_boost_round=num_round,
)

pred = final.predict(X_val_scaled, num_iteration=final.best_iteration)
val_mse = mean_squared_error(y_val_scaled, pred)
val_rmse = np.sqrt(val_mse)
val_mae = mean_absolute_error(y_val_scaled, pred)
val_r2 = r2_score(y_val_scaled, pred)
print(f"\nFinal metrics (scaled): RMSE: {val_rmse:.6f} | MAE: {val_mae:.6f} | R^2: {val_r2:.6f} | MSE: {val_mse:.6f} |")

# Same predictions mapped back through the target scaler, so errors read in the units of
# the original target column.
pred_orig = scaler_y.inverse_transform(pred.reshape(-1, 1)).ravel()
val_rmse_orig = np.sqrt(mean_squared_error(y_val, pred_orig))
val_mae_orig = mean_absolute_error(y_val, pred_orig)
print(f"Final metrics (original units): RMSE: {val_rmse_orig:.6f} | MAE: {val_mae_orig:.6f}")

100%|██████████| 30/30 [09:39<00:00, 19.31s/trial, best loss: 0.007600191993309882]
Raw best hyperopt params: {'bagging_fraction': np.float64(0.9711428019901442), 'bagging_freq': np.float64(3.0), 'feature_fraction': np.float64(0.7758107849942487), 'lambda_l1': np.float64(0.0002956434439069393), 'lambda_l2': np.float64(0.008192413045405612), 'learning_rate': np.float64(0.055704635679263516), 'max_bin': np.float64(114.0), 'max_depth': np.int64(2), 'min_data_in_leaf': np.float64(21.0), 'min_gain_to_split': np.float64(0.000415433449785654), 'num_leaves': np.float64(165.0)}
Best LightGBM params: {'objective': 'regression', 'metric': 'rmse', 'boosting_type': 'gbdt', 'feature_pre_filter': False, 'verbosity': -1, 'seed': 10, 'learning_rate': np.float64(0.055704635679263516), 'num_leaves': 165, 'max_depth': 12, 'min_data_in_leaf': 21, 'feature_fraction': np.float64(0.7758107849942487), 'bagging_fraction': np.float64(0.9711428019901442), 'bagging_freq': 3, 'lambda_l1': np.float64(0.0002956434439