In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



In [None]:
# Reproducibility: seed NumPy's global RNG once for the whole cell.
SEED = 42
np.random.seed(SEED)

# Load the data.
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
data_path = 'C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv'
df = pd.read_csv(data_path)

# Select the target and the input features.
# NOTE(review): 'Txn Fee', 'actualGasCost' and 'actualGasUsed' look closely related to the
# target 'Gas Used' — confirm they are legitimately available at prediction time.
target = 'Gas Used'
features = [
    'Transaction Hash_len',
    'Original_len',
    'signature_len',
    'From_len',
    'To_len',
    'sender_len',
    'paymaster_len',
    'Txn Fee',
    'logIndex',
    'actualGasCost',
    'actualGasUsed',
    'nonce',
    'success',
    'Blockno',
    'DateTime_ts',
]

# Cast everything to float so the scalers and the model receive numeric input.
X = df[features].astype(float)
y = df[target].astype(float)

# Train/test split (80/20), seeded so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Normalisation: both scalers learn their min/max from the training split only and are
# then applied unchanged to the test split.
scaler_X = MinMaxScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler_X.transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler_X.transform(X_test), columns=X.columns)

# The scaler expects 2-D input, so the target goes in as a single column and comes back 1-D.
scaler_y = MinMaxScaler().fit(y_train.to_numpy().reshape(-1, 1))
y_train_scaled = scaler_y.transform(y_train.to_numpy().reshape(-1, 1)).ravel()
y_test_scaled = scaler_y.transform(y_test.to_numpy().reshape(-1, 1)).ravel()

# 10-fold cross-validation on the training split only; the test split is not touched here.
# NOTE(review): X_test_scaled / y_test_scaled are not evaluated in this block — confirm a
# later cell scores the final model on them.
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)

# Hyper-parameters, spelled out explicitly.
# - `alpha` is only used by the huber/quantile losses, and `validation_fraction` / `tol`
#   only when `n_iter_no_change` is set, so those three have no effect here.
# - `random_state` pins the estimator itself, so results do not depend on how much of the
#   global NumPy RNG stream was consumed before this block ran.
gbr_params = dict(
    loss='squared_error', learning_rate=0.1, n_estimators=100, subsample=1.0,
    criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, max_depth=3,
    min_impurity_decrease=0.0, alpha=0.9, validation_fraction=0.1, tol=0.0001,
    random_state=SEED,
)

# Per-fold metrics, one entry per fold.
mse_scores = []
mae_scores = []
rmse_scores = []
r2_scores = []

# Split on the raw training features: fold indices depend only on the number of rows.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), start=1):
    # Fit the feature scaler on this fold's training rows only, so the validation rows'
    # min/max never leak into preprocessing.
    fold_scaler_X = MinMaxScaler().fit(X_train.iloc[train_idx])
    X_tr = pd.DataFrame(fold_scaler_X.transform(X_train.iloc[train_idx]), columns=X_train.columns)
    X_val = pd.DataFrame(fold_scaler_X.transform(X_train.iloc[val_idx]), columns=X_train.columns)

    # The target keeps the shared scaler_y scale so every fold's errors are reported in
    # the same unit. y_train_scaled is a NumPy array, hence positional indexing.
    y_tr = y_train_scaled[train_idx]
    y_val = y_train_scaled[val_idx]

    model = GradientBoostingRegressor(**gbr_params)
    model.fit(X_tr, y_tr)

    y_pred = model.predict(X_val)

    # All metrics are in scaled-target units, not original 'Gas Used' units.
    mse = mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_val, y_pred)

    mse_scores.append(mse)
    mae_scores.append(mae)
    rmse_scores.append(rmse)
    r2_scores.append(r2)

    print(f"Fold {fold}: MSE = {mse:.6f}, MAE = {mae:.6f}, RMSE = {rmse:.6f}, R² = {r2:.6f}")

# Unweighted mean of each metric across the folds.
print(f"Avg MSE : {np.mean(mse_scores):.6f}")
print(f"Avg MAE : {np.mean(mae_scores):.6f}")
print(f"Avg RMSE: {np.mean(rmse_scores):.6f}")
print(f"Avg R²  : {np.mean(r2_scores):.6f}")

Fold 1: MSE = 0.000011, MAE = 0.001350, RMSE = 0.003301, R² = 0.981477
Fold 2: MSE = 0.000013, MAE = 0.001343, RMSE = 0.003570, R² = 0.979252
Fold 3: MSE = 0.000010, MAE = 0.001307, RMSE = 0.003189, R² = 0.978931
Fold 4: MSE = 0.000011, MAE = 0.001358, RMSE = 0.003355, R² = 0.977123
Fold 5: MSE = 0.000009, MAE = 0.001273, RMSE = 0.003067, R² = 0.978876
Fold 6: MSE = 0.000011, MAE = 0.001347, RMSE = 0.003265, R² = 0.974410
Fold 7: MSE = 0.000009, MAE = 0.001301, RMSE = 0.003002, R² = 0.985727
Fold 8: MSE = 0.000010, MAE = 0.001349, RMSE = 0.003126, R² = 0.979612
Fold 9: MSE = 0.000009, MAE = 0.001287, RMSE = 0.002963, R² = 0.984288
Fold 10: MSE = 0.000010, MAE = 0.001342, RMSE = 0.003160, R² = 0.980898
Avg MSE : 0.000010
Avg MAE : 0.001326
Avg RMSE: 0.003200
Avg R²  : 0.980059
