In [None]:
import os

import numpy as np
import pandas as pd
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

In [11]:
# Global seed: applied to NumPy's legacy RNG here and passed explicitly to
# train_test_split below.
SEED = 42
np.random.seed(SEED)

# Load the data.
# The CSV location can be overridden with the GAS_DATA_PATH environment
# variable so the notebook can run on other machines; when the variable is
# unset the original local path is used.
data_path = os.environ.get(
    'GAS_DATA_PATH',
    'C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv',
)
df = pd.read_csv(data_path)

# Select the feature columns and the regression target.
features = [
    'Transaction Hash_len', 'Original_len', 'signature_len',
    'From_len', 'To_len', 'sender_len', 'paymaster_len',
    'Txn Fee', 'logIndex', 'actualGasCost',
    'actualGasUsed', 'nonce', 'success', 'Blockno', 'DateTime_ts'
]
target = 'Gas Used'
# NOTE(review): 'actualGasUsed', 'actualGasCost' and 'Txn Fee' look closely
# related to the 'Gas Used' target -- confirm they are legitimately available
# at prediction time and do not leak the target.

# Fail early with a readable message if the CSV schema is not the expected one
# (same exception type pandas would raise on the column selection below).
missing_cols = [col for col in features + [target] if col not in df.columns]
if missing_cols:
    raise KeyError(f"Columns missing from {data_path}: {missing_cols}")

X = df[features].astype(float)
y = df[target].astype(float)

# Hold out 20% of the rows as a test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Min-max scaling: both scalers are fit on the training split only and then
# applied to the test split. The scaled feature frames are rebuilt from plain
# arrays, so they carry a fresh 0..n-1 index rather than the original row labels.
scaler_X = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler_X.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler_X.transform(X_test), columns=X.columns)

# The target is scaled as a single column and flattened back to 1-D arrays.
scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).flatten()

# 10-fold cross-validation over the (already scaled) training split.
# NOTE(review): scaler_X / scaler_y were fit on the whole training split before
# folding, so each validation fold contributed to the scaling constants.
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)

# Random-forest hyper-parameters, spelled out once so every fold is trained
# with exactly the same settings.
rf_params = dict(
    n_estimators=500,   # more trees for more stable estimates
    max_depth=12,       # tunable
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    bootstrap=True,
    random_state=SEED,
    n_jobs=-1,
)

# Width (max - min) of the target seen by scaler_y. With the default (0, 1)
# feature range, multiplying a scaled MAE/RMSE by this value expresses it in
# the target's original units.
y_range = float(scaler_y.data_range_[0])

# Per-fold metrics, all computed on the min-max-scaled target.
mse_scores, mae_scores, rmse_scores, r2_scores = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled), start=1):
    # Positional indexing: the feature frame and the target array are aligned
    # by row position.
    X_tr = X_train_scaled.iloc[train_idx]
    y_tr = y_train_scaled[train_idx]
    X_val = X_train_scaled.iloc[val_idx]
    y_val = y_train_scaled[val_idx]

    # A fresh forest is built for every fold.
    model = RandomForestRegressor(**rf_params)

    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_val)

    mse  = mean_squared_error(y_val, y_pred)
    mae  = mean_absolute_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    r2   = r2_score(y_val, y_pred)

    mse_scores.append(mse)
    mae_scores.append(mae)
    rmse_scores.append(rmse)
    r2_scores.append(r2)

    print(f"Fold {fold} - MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f} | R²: {r2:.6f}")

# Summary over all folds (scaled units, unchanged from the original report).
print("\n=== Kết quả trung bình (scaled) ===")
print(f"Avg MSE : {np.mean(mse_scores):.6f}")
print(f"Avg MAE : {np.mean(mae_scores):.6f}")
print(f"Avg RMSE: {np.mean(rmse_scores):.6f}")
print(f"Avg R²  : {np.mean(r2_scores):.6f}")

# Extra context: fold-to-fold spread of R², and the average errors converted
# back to the target's original units.
print("\n=== Fold spread / original units ===")
print(f"Std R²  : {np.std(r2_scores):.6f}")
print(f"Avg MAE  ({target}): {np.mean(mae_scores) * y_range:.4f}")
print(f"Avg RMSE ({target}): {np.mean(rmse_scores) * y_range:.4f}")

Fold 1 - MSE: 0.000076 | MAE: 0.001776 | RMSE: 0.008691 | R²: 0.871646
Fold 2 - MSE: 0.000041 | MAE: 0.001776 | RMSE: 0.006382 | R²: 0.933690
Fold 3 - MSE: 0.000015 | MAE: 0.001703 | RMSE: 0.003828 | R²: 0.969653
Fold 4 - MSE: 0.000017 | MAE: 0.001744 | RMSE: 0.004160 | R²: 0.964813
Fold 5 - MSE: 0.000026 | MAE: 0.001735 | RMSE: 0.005134 | R²: 0.940824
Fold 6 - MSE: 0.000014 | MAE: 0.001730 | RMSE: 0.003725 | R²: 0.966693
Fold 7 - MSE: 0.000023 | MAE: 0.001791 | RMSE: 0.004795 | R²: 0.963587
Fold 8 - MSE: 0.000013 | MAE: 0.001692 | RMSE: 0.003630 | R²: 0.972506
Fold 9 - MSE: 0.000049 | MAE: 0.001766 | RMSE: 0.006999 | R²: 0.912340
Fold 10 - MSE: 0.000027 | MAE: 0.001775 | RMSE: 0.005192 | R²: 0.948422

=== Kết quả trung bình (scaled) ===
Avg MSE : 0.000030
Avg MAE : 0.001749
Avg RMSE: 0.005254
Avg R²  : 0.944418
