In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVR

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Single source of truth for randomness: seeds NumPy's global RNG here and is
# also passed as random_state to the train/test split and the K-Fold shuffle.
SEED = 42
np.random.seed(SEED)

# ==== Load the (already preprocessed) dataset and select modelling columns ====
data_path = r'C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv'
df = pd.read_csv(data_path)

# Input columns fed to the regressor, in the order used for the design matrix.
features = [
    'Transaction Hash_len',
    'Original_len',
    'signature_len',
    'From_len',
    'To_len',
    'sender_len',
    'paymaster_len',
    'Txn Fee',
    'logIndex',
    'actualGasCost',
    'actualGasUsed',
    'nonce',
    'success',
    'Blockno',
    'DateTime_ts',
]
# Column the regressor is trained to predict.
target = 'Gas Used'


def _finite_or_zero(values):
    """Return `values` with +/-inf and NaN entries replaced by 0.0."""
    return values.replace([np.inf, -np.inf], np.nan).fillna(0.0)


# Non-numeric cells are coerced to NaN first, then every non-finite value
# (NaN, +inf, -inf) is replaced by 0.0 -- in the features and in the target.
X = _finite_or_zero(df[features].apply(pd.to_numeric, errors='coerce'))
y = _finite_or_zero(pd.to_numeric(df[target], errors='coerce'))

# 80/20 hold-out split; seeded so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# ==== K-Fold splitter + SVR estimator (metrics are reported on the SCALED target) ====
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Author's tuning notes, aimed at more stable convergence:
# - a lower C (easier to converge) and a lower epsilon (less underfitting),
# - a slightly looser tol so the solver stops earlier,
# - max_iter=-1 (no iteration cap; the solver normally stops via tol).
svr_params = dict(
    kernel='rbf',
    C=30.0,
    gamma='scale',
    epsilon=0.01,
    tol=1e-2,
    max_iter=-1,
    cache_size=2000,
    shrinking=True,
)
svr = SVR(**svr_params)

# One accumulator per metric; the cross-validation loop appends one value per fold.
rmse_list = []
mse_list = []
mae_list = []
r2_list = []

print("=== K-Fold SVR – metrics trên THANG ĐÃ SCALE ===")
for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train), start=1):
    # Positional row selection for this fold's training and validation parts.
    X_tr = X_train.iloc[tr_idx].to_numpy()
    y_tr = y_train.iloc[tr_idx].to_numpy().reshape(-1, 1)
    X_va = X_train.iloc[val_idx].to_numpy()
    y_va = y_train.iloc[val_idx].to_numpy().reshape(-1, 1)

    # Fresh scalers per fold, fitted on the training part only and then
    # applied to the validation part.
    sx = StandardScaler()
    sy = MinMaxScaler()
    X_tr_s = sx.fit_transform(X_tr)
    X_va_s = sx.transform(X_va)
    # Targets were reshaped to column vectors above; flatten back to 1-D for SVR.
    y_tr_s = sy.fit_transform(y_tr).ravel()
    y_va_s = sy.transform(y_va).ravel()

    # Refit the shared estimator on this fold and predict in scaled target space.
    svr.fit(X_tr_s, y_tr_s)
    y_pred_s = svr.predict(X_va_s)

    # All metrics compare scaled targets with scaled predictions.
    mse = mean_squared_error(y_va_s, y_pred_s)
    rmse = float(np.sqrt(mse))
    mae = mean_absolute_error(y_va_s, y_pred_s)
    r2 = r2_score(y_va_s, y_pred_s)

    mse_list.append(mse)
    rmse_list.append(rmse)
    mae_list.append(mae)
    r2_list.append(r2)
    print(f"[Fold {fold:2d}] RMSE: {rmse:.6f} | MSE: {mse:.6f} | MAE: {mae:.6f} | R2: {r2:.6f}")

# Unweighted mean of each metric across the folds (still on the scaled target).
print("\n=== Trung bình qua folds (scaled) ===")
print(f"RMSE: {np.mean(rmse_list):.6f}")
print(f"MSE : {np.mean(mse_list):.6f}")
print(f"MAE : {np.mean(mae_list):.6f}")
print(f"R2  : {np.mean(r2_list):.6f}")

=== K-Fold SVR – metrics trên THANG ĐÃ SCALE ===




[Fold  1] RMSE: 0.014716 | MSE: 0.000217 | MAE: 0.007277 | R2: 0.676593




[Fold  2] RMSE: 0.007399 | MSE: 0.000055 | MAE: 0.004187 | R2: 0.887684




[Fold  3] RMSE: 0.009597 | MSE: 0.000092 | MAE: 0.006993 | R2: 0.786309




[Fold  4] RMSE: 0.010747 | MSE: 0.000115 | MAE: 0.006650 | R2: 0.792060




[Fold  5] RMSE: 0.012525 | MSE: 0.000157 | MAE: 0.006942 | R2: 0.709878

=== Trung bình qua folds (scaled) ===
RMSE: 0.010997
MSE : 0.000127
MAE : 0.006410
R2  : 0.770505
