In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from hyperopt import hp, fmin, tpe, STATUS_OK, STATUS_FAIL, Trials, space_eval
import pandas as pd
from hyperopt.pyll import scope
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the augmented transaction dataset and preview its first rows.
df = pd.read_csv(
    'C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv',
    sep=',',
)
df.head()


Unnamed: 0,Transaction Hash_len,Original_len,signature_len,From_len,To_len,sender_len,paymaster_len,Txn Fee,Gas Used,logIndex,actualGasCost,actualGasUsed,nonce,success,Blockno,DateTime_ts
0,32,964,4,20,20,20,20,0.022033,398741,245,2.19e+16,397164,0.0,1,17066994,1681740540
1,32,868,4,20,20,20,20,0.004362,87702,231,4280000000000000.0,86113,1.0,1,17067000,1681740600
2,32,868,4,20,20,20,20,0.003971,87714,273,3900000000000000.0,86125,2.0,1,17067009,1681740720
3,32,1188,4,20,20,20,20,0.008673,161702,429,8650000000000000.0,161161,3.0,1,17078992,1681887000
4,32,1188,4,20,20,20,20,0.006445,127502,393,6420000000000000.0,126973,4.0,1,17079029,1681887420


In [None]:
# ================== 0) SEED ==================
# Single seed for NumPy's global RNG and for the hold-out split below.
SEED = 42
np.random.seed(SEED)

# ================== 1) LOAD & PREP ==================
df = pd.read_csv(
    r'C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv'
)

# Predictor columns fed to the model.
# NOTE(review): in the preview rows `actualGasUsed` is within about 2% of the
# target `Gas Used`; confirm that using it (and the fee/cost columns) as
# features is intended and not target leakage.
features = [
    'Transaction Hash_len',
    'Original_len',
    'signature_len',
    'From_len',
    'To_len',
    'sender_len',
    'paymaster_len',
    'Txn Fee',
    'logIndex',
    'actualGasCost',
    'actualGasUsed',
    'nonce',
    'success',
    'Blockno',
    'DateTime_ts',
]
target = 'Gas Used'

X_raw = df[features].copy()
y_raw = df[target].astype(float).copy()

# 80/20 hold-out split used by the Hyperopt search.
X_train, X_val, y_train, y_val = train_test_split(
    X_raw,
    y_raw,
    test_size=0.2,
    random_state=SEED,
)

# Min-max scalers learn their ranges from the training part only and are then
# applied unchanged to the validation part. The target is reshaped to a single
# column for the scaler and flattened back to 1-D afterwards.
x_scaler = MinMaxScaler().fit(X_train)
y_scaler = MinMaxScaler().fit(y_train.to_numpy().reshape(-1, 1))

X_train_s = x_scaler.transform(X_train)
X_val_s = x_scaler.transform(X_val)
y_train_s = y_scaler.transform(y_train.to_numpy().reshape(-1, 1)).ravel()
y_val_s = y_scaler.transform(y_val.to_numpy().reshape(-1, 1)).ravel()

# ----- 1) Define the search space and the fixed parameters -----
# Hyperopt search space for CatBoost. Integer-valued parameters are drawn with
# quniform and cast through scope.int; learning_rate and l2_leaf_reg are
# sampled log-uniformly between the bounds given to np.log.
# NOTE(review): CatBoost documents min_data_in_leaf as applying only to the
# Depthwise/Lossguide grow policies, and no grow_policy is set in this
# notebook — confirm this dimension is accepted and actually has an effect.
catboost_hyperopt_space = dict(
    iterations=scope.int(hp.quniform("iterations", 1000, 50000, 500)),
    learning_rate=hp.loguniform("learning_rate", np.log(1e-3), np.log(3e-1)),
    depth=scope.int(hp.quniform("depth", 4, 10, 1)),
    l2_leaf_reg=hp.loguniform("l2_leaf_reg", np.log(1e-2), np.log(1e2)),
    min_data_in_leaf=scope.int(hp.quniform("min_data_in_leaf", 1, 2000, 1)),
    random_strength=hp.uniform("random_strength", 0.0, 2.0),
)

# Constructor arguments merged into every CatBoostRegressor built from the
# search space: RMSE metric, fixed seed, GPU device "0", silent logging.
catboost_fixed_params = dict(
    eval_metric="RMSE",
    random_seed=SEED,
    task_type="GPU",
    devices="0",
    logging_level="Silent",
)

# Keyword arguments forwarded to model.fit() during the Hyperopt search.
catboost_fit_params = dict(
    early_stopping_rounds=100,
    verbose=False,
    use_best_model=True,
)

# ----- 2) Simplified HPOptimiser -----
class HPOptimiser:
    """Hyperopt driver that tunes a CatBoostRegressor on one fixed hold-out split.

    The methods read the module-level ``catboost_fixed_params``,
    ``catboost_fit_params`` and ``SEED``.
    """

    def __init__(self, X_train, X_val, y_train, y_val):
        """Store the training and validation features/targets used by every trial."""
        self.X_train, self.X_val = X_train, X_val
        self.y_train, self.y_val = y_train, y_val

    def objective(self, hyperparams):
        """Fit one candidate configuration and score it on the validation split.

        Args:
            hyperparams: dict of sampled CatBoost parameters; entries in
                ``catboost_fixed_params`` override it on key collisions.

        Returns:
            A Hyperopt result dict holding the validation RMSE under ``'loss'``,
            ``STATUS_OK`` under ``'status'`` and the fitted model under ``'model'``.
        """
        regressor = CatBoostRegressor(**{**hyperparams, **catboost_fixed_params})
        # The validation split serves as eval_set for fitting and is also the
        # data the reported loss is computed on.
        holdout = (self.X_val, self.y_val)
        regressor.fit(self.X_train, self.y_train, eval_set=holdout, **catboost_fit_params)
        holdout_mse = mean_squared_error(self.y_val, regressor.predict(self.X_val))
        return {
            'loss': float(np.sqrt(holdout_mse)),
            'status': STATUS_OK,
            'model': regressor,
        }

    def run(self, space, max_evals=80):
        """Run a TPE search over ``space`` for ``max_evals`` trials.

        Returns:
            ``(best, trials)``: the value returned by ``fmin`` and the ``Trials``
            object that recorded every evaluation.
        """
        history = Trials()
        best_point = fmin(
            fn=self.objective,
            space=space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=history,
            rstate=np.random.default_rng(SEED),
        )
        return best_point, history

# ====== 3) RUN HYPEROPT ON THE SCALED DATA ======
print("Bắt đầu tối ưu hóa Hyperopt...")
opt = HPOptimiser(X_train_s, X_val_s, y_train_s, y_val_s)
best_indices, trials = opt.run(catboost_hyperopt_space, max_evals=80)

# Map fmin's result back through the search space (this applies the integer
# casts), then overlay the fixed CatBoost settings to get the final parameters.
opt_catboost_params = {
    **space_eval(catboost_hyperopt_space, best_indices),
    **catboost_fixed_params,
}

# The reported loss is the RMSE on the min-max-scaled target.
print("\n=== TỐI ƯU HOÀN TẤT ===")
print("Best scaled RMSE:", trials.best_trial["result"]["loss"])
print("Các tham số tối ưu (opt_catboost_params):")
print(opt_catboost_params)

# ====== 4) K-FOLD CROSS-VALIDATION WITH THE TUNED PARAMETERS ======
N_FOLDS = 10
early_stop_rounds = 100

# Shuffled K-fold split, seeded with the notebook-wide SEED.
k_folds = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
fold_rmses, fold_mses, fold_maes, fold_r2s = [], [], [], []

# Drop rsm if it is still present in the params (original author's note:
# not supported with GPU + RMSE).
run_params = dict(opt_catboost_params)
run_params.pop("rsm", None)

# Split the unscaled frames defined in the prep step (X_raw / y_raw). The
# previous version referenced `X` and `y`, which no cell defines, so the loop
# raised NameError on a fresh kernel.
for fold, (train_idx, val_idx) in enumerate(k_folds.split(X_raw, y_raw), 1):
    X_tr, X_va = X_raw.iloc[train_idx].copy(), X_raw.iloc[val_idx].copy()
    y_tr = y_raw.iloc[train_idx].astype(float).values.reshape(-1, 1)
    y_va = y_raw.iloc[val_idx].astype(float).values.reshape(-1, 1)

    # Scalers are re-fitted on each fold's training part only, then applied to
    # the fold's validation part.
    xs, ys = MinMaxScaler(), MinMaxScaler()
    X_tr_s = xs.fit_transform(X_tr).astype(np.float32)
    X_va_s = xs.transform(X_va).astype(np.float32)
    y_tr_s = ys.fit_transform(y_tr).ravel().astype(np.float32)
    y_va_s = ys.transform(y_va).ravel().astype(np.float32)

    # The validation fold is used both as eval_set (early stopping / best
    # iteration) and for the metrics below, so the scores are measured on the
    # same data that selected the iteration count.
    model = CatBoostRegressor(**run_params)
    model.fit(X_tr_s, y_tr_s,
              eval_set=(X_va_s, y_va_s),
              use_best_model=True,
              early_stopping_rounds=early_stop_rounds,
              verbose=False)

    # All metrics are computed on the min-max-scaled target, not in gas units.
    pred = model.predict(X_va_s).ravel().astype(np.float32)
    mse  = mean_squared_error(y_va_s, pred)
    rmse = float(np.sqrt(mse))
    mae  = mean_absolute_error(y_va_s, pred)
    r2   = r2_score(y_va_s, pred)

    fold_mses.append(mse)
    fold_rmses.append(rmse)
    fold_maes.append(mae)
    fold_r2s.append(r2)
    print(f'[Fold {fold:2d}] RMSE: {rmse:.6f} | MSE: {mse:.6f} | MAE: {mae:.6f} | R2: {r2:.6f}')



Before scaling:
X_train: (87492, 15) 
y_train: (87492,) 
X_val: (21873, 15), 
y_val: (21873,)

After scaling:
X_train_scaled: (87492, 15) 
y_train_scaled: (87492,) 
X_val_scaled: (21873, 15), 
y_val_scaled: (21873,)

Check scaling X_train (min/max):
Transaction Hash_len    0.0
Original_len            0.0
signature_len           0.0
From_len                0.0
To_len                  0.0
sender_len              0.0
paymaster_len           0.0
Txn Fee                 0.0
logIndex                0.0
actualGasCost           0.0
actualGasUsed           0.0
nonce                   0.0
success                 0.0
Blockno                 0.0
DateTime_ts             0.0
dtype: float64
Transaction Hash_len    0.0
Original_len            1.0
signature_len           0.0
From_len                0.0
To_len                  0.0
sender_len              0.0
paymaster_len           0.0
Txn Fee                 1.0
logIndex                1.0
actualGasCost           1.0
actualGasUsed           1.0
nonce  