In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
from hyperopt.pyll import scope


In [None]:

# ---------------------------------------------------------------------------
# Configuration: one seed drives NumPy, the train/val split, the model and TPE.
# ---------------------------------------------------------------------------
SEED = 10
MAX_EVALS = 30
np.random.seed(SEED)

# ---------------------------------------------------------------------------
# Data: every listed feature column is cast to float; 'Gas Used' is the target.
# ---------------------------------------------------------------------------
features = [
    'Transaction Hash_len', 'Original_len', 'signature_len', 'From_len', 'To_len',
    'sender_len', 'paymaster_len', 'Txn Fee', 'logIndex', 'actualGasCost',
    'actualGasUsed', 'nonce', 'success', 'Blockno', 'DateTime_ts',
]
df = pd.read_csv('C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv')
X = df[features].astype(float)
y = df['Gas Used'].astype(float)

# 80 % of the rows are used for training, 20 % are held out for tuning.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)


def _as_frame(values, like):
    """Wrap a scaled array in a DataFrame that keeps X's columns and `like`'s index."""
    return pd.DataFrame(values, columns=X.columns, index=like.index)


# Min-max scaling: both scalers are fitted on the training split only and
# then applied to the validation split, so no validation statistics leak in.
scaler_X = MinMaxScaler().fit(X_train)
scaler_y = MinMaxScaler().fit(y_train.values.reshape(-1, 1))

X_train_scaled = _as_frame(scaler_X.transform(X_train), X_train)
X_val_scaled = _as_frame(scaler_X.transform(X_val), X_val)
y_train_scaled = scaler_y.transform(y_train.values.reshape(-1, 1)).ravel()
y_val_scaled = scaler_y.transform(y_val.values.reshape(-1, 1)).ravel()

# ---------------------------------------------------------------------------
# Hyperopt search space for the gradient-boosting regressor.
# max_depth uses -1 as a sentinel that the objective converts to None.
# ---------------------------------------------------------------------------
reg_space = {
    "n_estimators": scope.int(hp.quniform("gbdt_ne", 200, 1200, 50)),
    "max_depth": scope.int(hp.choice("gbdt_md", [-1, 2, 5, 10])),
    "min_samples_split": scope.int(hp.quniform("gbdt_mss", 2, 20, 1)),
    "min_samples_leaf": scope.int(hp.quniform("gbdt_msl", 1, 10, 1)),
    "learning_rate": hp.uniform("gbdt_lr", 0.01, 0.2),
    "subsample": hp.uniform("gbdt_ss", 0.6, 1.0),
    "max_features": hp.choice("gbdt_mf", ["sqrt", "log2", 0.3, 0.5, 0.7, 0.9]),
    "random_state": SEED,
}
space = {"reg_params": reg_space, "fit_params": {}}

class HPOptimiser:
    """Hyperopt objective wrapper bound to one fixed train/validation split.

    The instance stores the four arrays it is given and exposes
    :meth:`gbdt_reg`, which can be passed to ``hyperopt.fmin`` as ``fn``.
    """

    # Hyperparameters that are coerced to Python ints before model construction.
    _INT_PARAMS = ("n_estimators", "max_depth", "min_samples_split", "min_samples_leaf")

    def __init__(self, Xtr, Xval, ytr, yval):
        """Store the training features/targets and the validation features/targets."""
        self.Xtr, self.Xval, self.ytr, self.yval = Xtr, Xval, ytr, yval

    def gbdt_reg(self, p):
        """Train a GradientBoostingRegressor for one sampled configuration.

        Parameters
        ----------
        p : dict
            Must contain ``"reg_params"`` (constructor keyword arguments,
            including every key in ``_INT_PARAMS``). May contain
            ``"fit_params"`` (keyword arguments forwarded to ``fit``); a
            missing or empty entry means ``fit`` is called with no extras.

        Returns
        -------
        dict
            ``{"loss": <validation RMSE as float>, "status": STATUS_OK}``.
            The RMSE is computed on whatever scale ``yval`` was supplied in.

        Raises
        ------
        KeyError
            If ``"reg_params"`` or one of the ``_INT_PARAMS`` keys is absent.
        """
        # Work on a copy so the caller's sampled dictionary is left untouched.
        rp = p["reg_params"].copy()
        for k in self._INT_PARAMS:
            if k == "max_depth" and rp[k] == -1:
                # -1 is the search-space sentinel for "no depth limit".
                rp[k] = None
            else:
                rp[k] = int(rp[k])

        # The search space carries a "fit_params" entry; forward it instead of
        # silently dropping it so that options placed there take effect.
        fit_kwargs = p.get("fit_params") or {}

        model = GradientBoostingRegressor(**rp)
        model.fit(self.Xtr, self.ytr, **fit_kwargs)
        pred = model.predict(self.Xval)
        rmse = float(np.sqrt(mean_squared_error(self.yval, pred)))  # RMSE on scaled data
        return {"loss": rmse, "status": STATUS_OK}

# Run Hyperopt (scaled): minimise validation RMSE over MAX_EVALS TPE trials.
trials = Trials()
optim = HPOptimiser(X_train_scaled, X_val_scaled, y_train_scaled, y_val_scaled)
best_raw = fmin(fn=optim.gbdt_reg, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials=trials, rstate=np.random.default_rng(SEED))
# space_eval turns fmin's raw result back into the concrete parameter values.
best = space_eval(space, best_raw)
# Apply the same coercion the objective uses: ints everywhere, and the -1
# max_depth sentinel becomes None.
for k in ["n_estimators", "max_depth", "min_samples_split", "min_samples_leaf"]:
    best["reg_params"][k] = int(best["reg_params"][k]) if k != "max_depth" or best["reg_params"][k] != -1 else None
print("Best RMSE (scaled):", trials.best_trial["result"]["loss"])
print("Best params:", best["reg_params"])

# K-fold evaluation with optimized parameters (scaled)
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
metrics_per_fold = []
for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train), 1):
    # Split raw data
    X_tr_raw, X_va_raw = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_tr_raw, y_va_raw = y_train.iloc[tr_idx], y_train.iloc[va_idx]

    # Fit scalers on the training fold only, then apply them to the held-out fold.
    # NOTE: this rebinds scaler_X / scaler_y, so after the loop they refer to
    # the last fold's scalers rather than the ones fitted on the full X_train.
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_tr = scaler_X.fit_transform(X_tr_raw)
    X_va = scaler_X.transform(X_va_raw)
    y_tr = scaler_y.fit_transform(y_tr_raw.values.reshape(-1, 1)).ravel()
    y_va = scaler_y.transform(y_va_raw.values.reshape(-1, 1)).ravel()

    # Train and predict
    model = GradientBoostingRegressor(**best["reg_params"])
    model.fit(X_tr, y_tr)
    pred = model.predict(X_va)
    mse = mean_squared_error(y_va, pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_va, pred)
    r2 = r2_score(y_va, pred)
    metrics_per_fold.append({'fold': fold, 'RMSE': rmse, 'MAE': mae, 'R^2': r2, 'MSE': mse})
    print(f'Fold {fold} metrics (scaled): RMSE: {rmse:.6f} | MAE: {mae:.6f} | R^2: {r2:.6f} | MSE: {mse:.6f}')

# Aggregate the per-fold records so the cell reports a cross-validated
# estimate instead of leaving metrics_per_fold unused.
cv_metrics = pd.DataFrame(metrics_per_fold).set_index('fold')
cv_summary = cv_metrics.agg(['mean', 'std'])
print('CV summary (scaled): ' + ' | '.join(
    f"{name}: {cv_summary.loc['mean', name]:.6f} +/- {cv_summary.loc['std', name]:.6f}"
    for name in cv_metrics.columns
))



  0%|          | 0/30 [00:12<?, ?trial/s, best loss=?]


KeyboardInterrupt: 