In [8]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from hyperopt import hp, fmin, tpe, STATUS_OK, STATUS_FAIL, Trials, space_eval
import pandas as pd
from hyperopt.pyll import scope
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

In [9]:
df = pd.read_csv('C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv', sep=',')
df.head()


Unnamed: 0,Transaction Hash_len,Original_len,signature_len,From_len,To_len,sender_len,paymaster_len,Txn Fee,Gas Used,logIndex,actualGasCost,actualGasUsed,nonce,success,Blockno,DateTime_ts
0,32,964,4,20,20,20,20,0.022033,398741,245,2.19e+16,397164,0.0,1,17066994,1681740540
1,32,868,4,20,20,20,20,0.004362,87702,231,4280000000000000.0,86113,1.0,1,17067000,1681740600
2,32,868,4,20,20,20,20,0.003971,87714,273,3900000000000000.0,86125,2.0,1,17067009,1681740720
3,32,1188,4,20,20,20,20,0.008673,161702,429,8650000000000000.0,161161,3.0,1,17078992,1681887000
4,32,1188,4,20,20,20,20,0.006445,127502,393,6420000000000000.0,126973,4.0,1,17079029,1681887420


In [None]:
# ===== 1) Load data =====
df = pd.read_csv('C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv')

features = [
    'Transaction Hash_len', 'Original_len', 'signature_len',
    'From_len', 'To_len', 'sender_len', 'paymaster_len',
    'Txn Fee', 'logIndex', 'actualGasCost',
    'actualGasUsed', 'nonce', 'success', 'Blockno', 'DateTime_ts'
]
X = df[features].reset_index(drop=True)
y = df['Gas Used'].astype(float).reset_index(drop=True)

# (Tùy chọn) log-transform target nếu skew mạnh
USE_LOG_TARGET = False
y_trainable = np.log1p(y) if USE_LOG_TARGET else y.copy()

# ===== 2) Search space =====
def base_space():
    return {
        'depth': hp.quniform('depth', 4, 10, 1),
        'l2_leaf_reg': hp.loguniform('l2_leaf_reg', np.log(1e-2), np.log(1e2)),
        'iterations': hp.quniform('iterations', 400, 2000, 50),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
        'bootstrap_type': hp.choice('bootstrap_type', ['Bayesian', 'Bernoulli']),
        'subsample': hp.uniform('subsample', 0.6, 1.0),
        'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 1.0),
        'random_strength': hp.uniform('random_strength', 0.0, 5.0),
        'rsm': hp.uniform('rsm', 0.6, 1.0),
        'leaf_estimation_iterations': hp.quniform('leaf_estimation_iterations', 1, 20, 1)
    }

def trust_space(center, delta=0.25):
    def clamp(v, lo, hi):
        return float(max(lo, min(hi, v)))
    return {
        'depth': hp.quniform(
            'depth',
            max(4, int(center['depth']) - 1),
            min(10, int(center['depth']) + 1), 1
        ),
        'l2_leaf_reg': hp.loguniform(
            'l2_leaf_reg',
            np.log(clamp(center['l2_leaf_reg'] * (1 - delta), 1e-2, 1e2)),
            np.log(clamp(center['l2_leaf_reg'] * (1 + delta), 1e-2, 1e2))
        ),
        'iterations': hp.quniform(
            'iterations',
            clamp(center['iterations'] - 300, 200, 3000),
            clamp(center['iterations'] + 300, 200, 3000), 50
        ),
        'learning_rate': hp.loguniform(
            'learning_rate',
            np.log(clamp(center['learning_rate'] * (1 - delta), 0.005, 0.3)),
            np.log(clamp(center['learning_rate'] * (1 + delta), 0.005, 0.3))
        ),
        'bootstrap_type': hp.choice('bootstrap_type', ['Bayesian', 'Bernoulli']),
        'subsample': hp.uniform(
            'subsample',
            clamp(center.get('subsample', 0.8) - delta, 0.5, 1.0),
            clamp(center.get('subsample', 0.8) + delta, 0.5, 1.0)
        ),
        'bagging_temperature': hp.uniform(
            'bagging_temperature',
            clamp(center.get('bagging_temperature', 0.5) - delta, 0.0, 1.0),
            clamp(center.get('bagging_temperature', 0.5) + delta, 0.0, 1.0)
        ),
        'random_strength': hp.uniform(
            'random_strength',
            clamp(center.get('random_strength', 1.0) - 2 * delta, 0.0, 10.0),
            clamp(center.get('random_strength', 1.0) + 2 * delta, 0.0, 10.0)
        ),
        'rsm': hp.uniform(
            'rsm',
            clamp(center.get('rsm', 0.8) - delta, 0.5, 1.0),
            clamp(center.get('rsm', 0.8) + delta, 0.5, 1.0)
        ),
        'leaf_estimation_iterations': hp.quniform(
            'leaf_estimation_iterations',
            max(1, int(center.get('leaf_estimation_iterations', 10)) - 5),
            min(32, int(center.get('leaf_estimation_iterations', 10)) + 5), 1
        )
    }

# ===== 3) Cross-validation objective =====
kf = KFold(n_splits=10, shuffle=True, random_state=42)

def cv_metrics(params):
    p = params.copy()
    p['depth'] = int(p['depth'])
    p['iterations'] = int(p['iterations'])
    p['leaf_estimation_iterations'] = int(p['leaf_estimation_iterations'])
    p['subsample'] = float(p['subsample'])
    p['rsm'] = float(p['rsm'])

    rmse_list, mse_list, mae_list, r2_list = [], [], [], []

    for tr_idx, va_idx in kf.split(X):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y_trainable.iloc[tr_idx], y_trainable.iloc[va_idx]

        kwargs = dict(
            loss_function='RMSE',
            eval_metric='R2',
            iterations=p['iterations'],
            learning_rate=float(p['learning_rate']),
            depth=p['depth'],
            l2_leaf_reg=float(p['l2_leaf_reg']),
            random_strength=float(p['random_strength']),
            rsm=float(p['rsm']),
            leaf_estimation_iterations=p['leaf_estimation_iterations'],
            random_seed=42,
            verbose=False,
            allow_writing_files=False
        )
        if p['bootstrap_type'] == 'Bayesian':
            kwargs.update(bootstrap_type='Bayesian',
                          bagging_temperature=float(p['bagging_temperature']))
        else:
            kwargs.update(bootstrap_type='Bernoulli',
                          subsample=float(p['subsample']))

        model = CatBoostRegressor(**kwargs)
        model.fit(X_tr, y_tr, eval_set=(X_va, y_va),
                  use_best_model=True, early_stopping_rounds=200)

        preds = model.predict(X_va)
        if USE_LOG_TARGET:
            y_true = np.expm1(y_va.values)
            y_pred = np.expm1(preds)
        else:
            y_true = y_va.values
            y_pred = preds

        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mse  = mean_squared_error(y_true, y_pred)
        mae  = mean_absolute_error(y_true, y_pred)
        r2   = r2_score(y_true, y_pred)

        if not np.isfinite([rmse, mse, mae, r2]).all():
            return None

        rmse_list.append(rmse); mse_list.append(mse); mae_list.append(mae); r2_list.append(r2)

    return {
        'rmse': rmse_list,
        'mse' : mse_list,
        'mae' : mae_list,
        'r2'  : r2_list,
        'mean_r2': float(np.mean(r2_list))
    }

def objective(params):
    out = cv_metrics(params)
    if out is None:
        return {'loss': 1e9, 'status': STATUS_OK}
    mean_r2 = out['mean_r2']
    return {
        'loss': float(1.0 - mean_r2),  # minimize (1 - R²)
        'status': STATUS_OK,
        'metrics': out,
        'params': params
    }

# ===== 4) Bayesian Optimization + Trust Region =====
TOTAL_EVALS = 60
STEP = 20
trials = Trials()
rng = np.random.default_rng(42)

for i in range(0, TOTAL_EVALS, STEP):
    if i == 0:
        space = base_space()
    else:
        past = sorted(trials.trials, key=lambda t: t['result']['loss'])
        topk = past[: min(10, len(past))]
        center = {}
        for k in ['depth','l2_leaf_reg','iterations','learning_rate','subsample',
                  'bagging_temperature','random_strength','rsm','leaf_estimation_iterations']:
            center[k] = float(np.mean([t['result']['params'][k] for t in topk]))
        space = trust_space(center, delta=0.25)

    fmin(fn=objective, space=space, algo=tpe.suggest,
         max_evals=i + STEP, trials=trials, rstate=rng)

# ===== 5) Collect & report =====
rows = []
for t_idx, t in enumerate(trials.trials, start=1):
    m = t['result']['metrics']
    for f in range(10):
        rows.append({
            'trial': t_idx, 'fold': f+1,
            'rmse': m['rmse'][f], 'mse': m['mse'][f],
            'mae': m['mae'][f], 'r2': m['r2'][f]
        })
results_df = pd.DataFrame(rows)

print("\n📊 Mỗi trial × fold:")
print(results_df)

print("\n📈 Trung bình theo fold:")
print(results_df.groupby('fold', as_index=True).mean(numeric_only=True))

final_avg = results_df[['rmse','mse','mae','r2']].mean()
print("\n🎯 Trung bình cuối cùng (10 folds × tất cả trials):")
print(final_avg)

# ===== 6) Best trial =====
best_trial = min(trials.trials, key=lambda t: t['result']['loss'])
best_loss = best_trial['result']['loss']
best_r2 = 1.0 - best_loss
best_params = best_trial['result']['params']

print("\n🏆 Best R² (CV mean):", best_r2)
print("🔧 Best params:", best_params)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 95)