In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from collections import OrderedDict

In [None]:
# Global experiment settings.
SEED, MAX_EVALS = 10, 50  # shared random seed, number of Hyperopt trials
np.random.seed(SEED)      # seed NumPy's legacy global RNG

# ---------- Helpers ----------
def rmse(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y_true`` as a built-in float."""
    squared_error = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(squared_error))

def cast_ints(p):
    """Return a copy of ``p`` whose integer-valued LightGBM parameters are cast with ``int``.

    Only the keys listed below are touched, and only when present; every other
    entry is carried over unchanged. The input mapping is not modified.
    """
    integer_keys = ("num_leaves", "max_depth", "min_data_in_leaf", "bagging_freq", "max_bin")
    converted = p.copy()
    converted.update(
        {name: int(converted[name]) for name in integer_keys if name in converted}
    )
    return converted

def gpu_flag():
    """Build the keyword that requests GPU training from LightGBM (GPU-ONLY).

    The key is ``"device"`` when the major version parsed from ``lgb.__version__``
    is 4 or higher and ``"device_type"`` otherwise. If the version cannot be read
    or parsed, major version 4 is assumed.
    """
    try:
        major_version = int(lgb.__version__.split('.')[0])
    except Exception:
        major_version = 4
    param_name = "device" if major_version >= 4 else "device_type"
    return {param_name: "gpu"}

def map_lgbm_params(p):
    """Return a copy of ``p`` with Hyperopt regularization keys renamed for LGBMRegressor.

    ``lambda_l1`` becomes ``reg_alpha`` and ``lambda_l2`` becomes ``reg_lambda``;
    all other entries are left as they are. The input mapping is not modified.
    """
    renames = (("lambda_l1", "reg_alpha"), ("lambda_l2", "reg_lambda"))
    mapped = p.copy()
    for native_name, sklearn_name in renames:
        if native_name in mapped:
            mapped[sklearn_name] = mapped.pop(native_name)
    return mapped

# ---------- 1) Load data ----------
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv('C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv', sep=',')

# Predictor columns, in the order they are passed to the model.
features = [
    'Transaction Hash_len',
    'Original_len',
    'signature_len',
    'From_len',
    'To_len',
    'sender_len',
    'paymaster_len',
    'Txn Fee',
    'logIndex',
    'actualGasCost',
    'actualGasUsed',
    'nonce',
    'success',
    'Blockno',
    'DateTime_ts',
]

# Predictors and target ('Gas Used') are both cast to float.
X = df[features].astype(float)
y = df['Gas Used'].astype(float)

# ---------- 2) Split 80/20 for tuning ----------
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=SEED,
)

# ---------- 3) MinMaxScaler (fit on train for tuning stage) ----------
# Both scalers learn their ranges from the training rows only; the validation
# rows are then transformed with those same ranges.
scaler_X = MinMaxScaler().fit(X_train)
scaler_y = MinMaxScaler().fit(y_train.values.reshape(-1, 1))

X_train_s = pd.DataFrame(scaler_X.transform(X_train), index=X_train.index, columns=X_train.columns)
X_val_s = pd.DataFrame(scaler_X.transform(X_val), index=X_val.index, columns=X_val.columns)
y_train_s = scaler_y.transform(y_train.values.reshape(-1, 1)).ravel()
y_val_s = scaler_y.transform(y_val.values.reshape(-1, 1)).ravel()

# ---------- 4) Hyperopt search space (GPU-safe max_bin ≤ 255) ----------
# Log-uniform bounds shared by the two regularization strengths.
_reg_low, _reg_high = np.log(1e-4), np.log(10.0)

space = dict(
    # learning rate, sampled on a log scale
    learning_rate=hp.loguniform("learning_rate", np.log(0.01), np.log(0.2)),
    # tree shape (quantized to whole numbers; cast to int by cast_ints)
    num_leaves=hp.quniform("num_leaves", 15, 255, 1),
    max_depth=hp.quniform("max_depth", -1, 16, 1),  # -1 = no limit
    min_data_in_leaf=hp.quniform("min_data_in_leaf", 10, 500, 1),
    # column / row subsampling
    feature_fraction=hp.uniform("feature_fraction", 0.6, 1.0),
    bagging_fraction=hp.uniform("bagging_fraction", 0.6, 1.0),
    bagging_freq=hp.quniform("bagging_freq", 0, 10, 1),
    # regularization
    lambda_l1=hp.loguniform("lambda_l1", _reg_low, _reg_high),
    lambda_l2=hp.loguniform("lambda_l2", _reg_low, _reg_high),
    min_split_gain=hp.uniform("min_split_gain", 0.0, 1.0),
    extra_trees=hp.choice("extra_trees", [False, True]),
    # histogram bins
    max_bin=hp.quniform("max_bin", 63, 255, 1),   # GPU limit
)

# ---------- 5) Objective (GPU-only, scaled RMSE) ----------
def objective(params):
    """Hyperopt objective: fit one GPU LightGBM regressor and score it on the hold-out split.

    Parameters
    ----------
    params : dict
        One sample drawn from ``space``. Integer-valued entries are cast with
        ``cast_ints`` and the L1/L2 keys are renamed with ``map_lgbm_params``
        before being forwarded to the estimator.

    Returns
    -------
    dict
        ``loss``: RMSE between ``y_val_s`` and the predictions for ``X_val_s``
        (both on the min-max-scaled target); ``status``: ``STATUS_OK``;
        ``best_iteration``: the fitted model's ``best_iteration_`` attribute,
        or ``None`` if the attribute is missing.
    """
    params = cast_ints(params)
    params["max_bin"] = min(255, params.get("max_bin", 255))
    params = map_lgbm_params(params)

    # The notebook only does ``import lightgbm as lgb``; a bare ``LGBMRegressor``
    # is undefined on a fresh kernel, so reference the class through the module.
    model = lgb.LGBMRegressor(
        n_estimators=10000,
        boosting_type="gbdt",
        objective="regression",
        random_state=SEED,
        # GPU-ONLY flag (v3/v4 compatible)
        **gpu_flag(),
        # forward tuned params:
        **params
    )
    model.fit(
        X_train_s, y_train_s,
        eval_set=[(X_val_s, y_val_s)],
        eval_metric="rmse",
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=0)  # silence
        ]
    )
    pred_val_s = model.predict(X_val_s)
    return {"loss": rmse(y_val_s, pred_val_s), "status": STATUS_OK,
            "best_iteration": getattr(model, "best_iteration_", None)}

# ---------- 6) Run Hyperopt ----------
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
    rstate=np.random.default_rng(SEED),
)

# Post-process the raw result so it can be passed to LGBMRegressor.
best = cast_ints(best)
best.update(max_bin=min(255, best.get("max_bin", 255)))  # (GPU-safe)
if "extra_trees" in best:
    # fmin reports an hp.choice parameter as 0/1 here; convert it to a bool.
    best["extra_trees"] = bool(best["extra_trees"])
# Rename lambda_l1/lambda_l2 to the reg_alpha/reg_lambda names used by LGBMRegressor.
best = map_lgbm_params(best)
print("Best hyperparams (scaled-RMSE objective):", best)

# ---------- 7) Final 10-fold CV (GPU-only, scaled-only) ----------
# Estimator kwargs: fixed settings, then the GPU flag, then the tuned hyperparameters.
base_final = dict(
    n_estimators=10000,
    boosting_type="gbdt",
    objective="regression",
    random_state=SEED,
    **gpu_flag(),
    **best,
)

kf = KFold(n_splits=10, shuffle=True, random_state=SEED)

# Accumulators filled by the fold loop.
fold_rows = []
rmse_s_list = []
mae_s_list = []
r2_s_list = []
best_iters = []
feat_gain = np.zeros(X.shape[1], dtype=float)  # one slot per feature column

# Fit and score one model per fold; all metrics are computed on the min-max-scaled target.
for fold, (tr_idx, va_idx) in enumerate(kf.split(X), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    # --- per-fold scalers, fitted on the training part only (no leakage)
    scX = MinMaxScaler(); scY = MinMaxScaler()
    X_tr_s = pd.DataFrame(scX.fit_transform(X_tr), columns=X.columns, index=X_tr.index)
    X_va_s = pd.DataFrame(scX.transform(X_va),    columns=X.columns, index=X_va.index)
    y_tr_s = scY.fit_transform(y_tr.values.reshape(-1,1)).ravel()
    y_va_s = scY.transform(y_va.values.reshape(-1,1)).ravel()

    # Only ``lightgbm as lgb`` is imported, so a bare ``LGBMRegressor`` raises
    # NameError on a fresh kernel; reference the class through the module.
    model = lgb.LGBMRegressor(**base_final)
    # NOTE(review): the validation fold drives early stopping and is also the fold
    # being scored, so the reported metrics may be optimistic.
    model.fit(
        X_tr_s, y_tr_s,
        eval_set=[(X_va_s, y_va_s)],
        eval_metric="rmse",
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=0)
        ]
    )

    bi = getattr(model, "best_iteration_", base_final["n_estimators"])
    best_iters.append(bi)
    pred_va_s = model.predict(X_va_s)

    mse_val  = float(mean_squared_error(y_va_s, pred_va_s))
    rmse_val = float(np.sqrt(mse_val))
    mae_val  = float(mean_absolute_error(y_va_s, pred_va_s))
    r2_val   = float(r2_score(y_va_s, pred_va_s))

    # Append the metrics computed above (the original referenced the undefined
    # names rmse_s / mae_s / r2_s here, which raised NameError on the first fold).
    rmse_s_list.append(rmse_val); mae_s_list.append(mae_val); r2_s_list.append(r2_val)
    print(f"{ {'fold': fold, 'MSE': round(mse_val, 10), 'RMSE': round(rmse_val, 8), 'MAE': round(mae_val, 8), 'R^2': round(r2_val, 6)} }")

    # -- store this fold's row for the summary table
    fold_rows.append(OrderedDict([
        ("fold", fold),
        ("MSE",  mse_val),
        ("RMSE", rmse_val),
        ("MAE",  mae_val),
        ("R^2",  r2_val),
    ]))

# Build the per-fold table once, after every fold has been collected.
df_folds = pd.DataFrame(fold_rows)
# Round for display only; df_folds keeps full precision.
print("\nKết quả từng fold (scaled):")
print(df_folds.round({"MSE": 10, "RMSE": 8, "MAE": 8, "R^2": 6}).to_string(index=False))

 22%|██▏       | 11/50 [00:11<00:35,  1.09trial/s, best loss: 0.011410109905839143]