In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVR

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Single seed shared by NumPy's global RNG and the hold-out split below.
SEED = 42
np.random.seed(SEED)

# ==== Load the (already preprocessed) dataset and select columns ====
data_path = r'C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv'
df = pd.read_csv(data_path)

# Predictor columns.
features = [
    'Transaction Hash_len',
    'Original_len',
    'signature_len',
    'From_len',
    'To_len',
    'sender_len',
    'paymaster_len',
    'Txn Fee',
    'logIndex',
    'actualGasCost',
    'actualGasUsed',
    'nonce',
    'success',
    'Blockno',
    'DateTime_ts',
]
# Column to be predicted.
target = 'Gas Used'

# Force everything to numeric: unparseable entries become NaN, +/-inf is mapped
# to NaN as well, and every NaN is finally replaced by 0.0.
# NOTE(review): the same 0.0 fill is applied to the target, so rows with a
# missing/unparseable target are kept with a label of 0.0 - confirm this is intended.
X = (
    df[features]
    .apply(pd.to_numeric, errors='coerce')
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
)
y = (
    pd.to_numeric(df[target], errors='coerce')
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
)

# 80 % / 20 % hold-out split, reproducible through SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# ==== K-Fold + SVR (metrics are computed on the SCALED target) ====
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)

# A single estimator object; it is fitted again inside every fold.
svr = SVR(
    kernel='rbf',
    C=30.0,
    gamma='scale',
    epsilon=0.01,
    tol=1e-2,
    max_iter=-1,
    cache_size=2000,
    shrinking=True,
)

# Per-fold metric values, one entry appended per fold.
rmse_list, mse_list, mae_list, r2_list = [], [], [], []

print("=== K-Fold SVR – metrics trên THANG ĐÃ SCALE ===")
for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train), start=1):
    # Fresh scalers for each fold: fitted on the training part only, then
    # applied unchanged to the validation part.
    sx = StandardScaler()
    sy = MinMaxScaler()

    # Features.
    X_tr = X_train.iloc[tr_idx].to_numpy()
    X_va = X_train.iloc[val_idx].to_numpy()
    X_tr_s = sx.fit_transform(X_tr)
    X_va_s = sx.transform(X_va)

    # Target: the scaler needs a 2-D column, the regressor a flat vector.
    y_tr = y_train.iloc[tr_idx].to_numpy().reshape(-1, 1)
    y_va = y_train.iloc[val_idx].to_numpy().reshape(-1, 1)
    y_tr_s = sy.fit_transform(y_tr).ravel()
    y_va_s = sy.transform(y_va).ravel()

    # Fit on this fold's training part and predict its validation part.
    y_pred_s = svr.fit(X_tr_s, y_tr_s).predict(X_va_s)

    # Error metrics on the scaled target.
    mse = mean_squared_error(y_va_s, y_pred_s)
    mae = mean_absolute_error(y_va_s, y_pred_s)
    r2 = r2_score(y_va_s, y_pred_s)
    rmse = float(np.sqrt(mse))

    for bucket, value in zip(
        (mse_list, rmse_list, mae_list, r2_list),
        (mse, rmse, mae, r2),
    ):
        bucket.append(value)

    print(f"[Fold {fold:2d}] RMSE: {rmse:.6f} | MSE: {mse:.6f} | MAE: {mae:.6f} | R2: {r2:.6f}")

# Average of each metric over all folds.
print("\n=== Trung bình qua folds (scaled) ===")
print(f"RMSE: {np.mean(rmse_list):.6f}")
print(f"MSE : {np.mean(mse_list):.6f}")
print(f"MAE : {np.mean(mae_list):.6f}")
print(f"R2  : {np.mean(r2_list):.6f}")

=== K-Fold SVR – metrics trên THANG ĐÃ SCALE ===
[Fold  1] RMSE: 0.011646 | MSE: 0.000136 | MAE: 0.004135 | R2: 0.769510


In [None]:
# Repeated K-fold cross-validation of a default SVR.
#
# The K-fold split is redrawn `epochs` times, each time with a different
# shuffle seed (SEED + ep); the fold metrics of every repetition are averaged
# into one row of `cats`, and `final_mean` averages those rows.
#
# The folds are drawn from the raw `X_train` / `y_train` created by the
# hold-out split, and the scalers are fitted inside each fold on the training
# part only. The previous version read `X_train_scaled` / `y_train_scaled`,
# which no cell of this notebook defines, and scaling the full training set
# before splitting would leak validation-fold statistics into training.
epochs = 20   # number of repetitions; change to 30 if desired
k = 10        # folds per repetition
cats = np.zeros((epochs, 4))  # row ep = mean [RMSE, MSE, MAE, R2] of repetition ep

for ep in range(epochs):
    # A different seed per repetition gives a different shuffle of the rows.
    kf = KFold(n_splits=k, shuffle=True, random_state=SEED + ep)

    rmse_list, mse_list, mae_list, r2_list = [], [], [], []

    for train_idx, val_idx in kf.split(X_train):
        X_tr = X_train.iloc[train_idx].to_numpy()
        X_va = X_train.iloc[val_idx].to_numpy()
        # The target scaler expects a 2-D column.
        y_tr = y_train.iloc[train_idx].to_numpy().reshape(-1, 1)
        y_va = y_train.iloc[val_idx].to_numpy().reshape(-1, 1)

        # Per-fold scaling, same scheme as the single K-fold run earlier in
        # the notebook: standardised features, min-max scaled target.
        x_scaler, y_scaler = StandardScaler(), MinMaxScaler()
        X_tr = x_scaler.fit_transform(X_tr)
        X_va = x_scaler.transform(X_va)
        y_tr = y_scaler.fit_transform(y_tr).ravel()
        y_va = y_scaler.transform(y_va).ravel()

        # NOTE(review): default hyper-parameters, unlike the configured `svr`
        # used in the earlier K-fold cell - confirm this is intended.
        model = SVR()

        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_va)

        # Metrics on the scaled target.
        mse  = mean_squared_error(y_va, y_pred)
        rmse = np.sqrt(mse)
        mae  = mean_absolute_error(y_va, y_pred)
        r2   = r2_score(y_va, y_pred)

        rmse_list.append(rmse)
        mse_list.append(mse)
        mae_list.append(mae)
        r2_list.append(r2)

    # Store this repetition's mean metrics (averaged over its k folds).
    cats[ep, :] = [
        np.mean(rmse_list),
        np.mean(mse_list),
        np.mean(mae_list),
        np.mean(r2_list)
    ]

# Final mean over all repetitions.
final_mean = np.mean(cats, axis=0)

print(final_mean)  # one row: overall mean [RMSE, MSE, MAE, R2]
print(cats)        # table of shape (epochs, 4)

NameError: name 'np' is not defined