In [2]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from hyperopt.pyll import scope
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, train_test_split

In [4]:
# Set seed
SEED = 42
np.random.seed(SEED)

# Load dữ liệu
data_path = 'C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv'
df = pd.read_csv(data_path)

# Chọn đặc trưng và mục tiêu
features = [
    'Transaction Hash_len', 'Original_len', 'signature_len',
    'From_len', 'To_len', 'sender_len', 'paymaster_len',
    'Txn Fee', 'logIndex', 'actualGasCost',
    'actualGasUsed', 'nonce', 'success', 'Blockno', 'DateTime_ts'
]
target = 'Gas Used'

X = df[features].astype(float)
y = df[target].astype(float)

# Tách train/test (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Chuẩn hóa
scaler_X = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler_X.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler_X.transform(X_test), columns=X.columns)

scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).flatten()


kf = KFold(n_splits=10, shuffle=True, random_state=SEED)

# Đánh giá
mse_scores, mae_scores, rmse_scores, r2_scores = [], [], [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled), start=1):
    X_tr = X_train_scaled.iloc[train_idx]
    y_tr = y_train_scaled[train_idx]
    X_val = X_train_scaled.iloc[val_idx]
    y_val = y_train_scaled[val_idx]

    model = CatBoostRegressor()

    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_val)

    mse  = mean_squared_error(y_val, y_pred)
    mae  = mean_absolute_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    r2   = r2_score(y_val, y_pred)

    mse_scores.append(mse)
    mae_scores.append(mae)
    rmse_scores.append(rmse)
    r2_scores.append(r2)

    print(f"Fold {fold} - MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f} | R²: {r2:.6f}")

# Tổng kết sau tất cả folds
print("\n=== Kết quả trung bình (scaled) ===")
print(f"Avg MSE : {np.mean(mse_scores):.6f}")
print(f"Avg MAE : {np.mean(mae_scores):.6f}")
print(f"Avg RMSE: {np.mean(rmse_scores):.6f}")
print(f"Avg R²  : {np.mean(r2_scores):.6f}")

Learning rate set to 0.081617
0:	learn: 0.0212078	total: 6.88ms	remaining: 6.87s
1:	learn: 0.0198380	total: 12.4ms	remaining: 6.21s
2:	learn: 0.0185902	total: 18.2ms	remaining: 6.04s
3:	learn: 0.0174461	total: 24.6ms	remaining: 6.12s
4:	learn: 0.0163772	total: 30.6ms	remaining: 6.08s
5:	learn: 0.0153896	total: 36.1ms	remaining: 5.99s
6:	learn: 0.0144758	total: 41.4ms	remaining: 5.87s
7:	learn: 0.0136652	total: 46.6ms	remaining: 5.77s
8:	learn: 0.0129022	total: 52.5ms	remaining: 5.78s
9:	learn: 0.0121860	total: 57.9ms	remaining: 5.73s
10:	learn: 0.0115614	total: 64.1ms	remaining: 5.76s
11:	learn: 0.0109858	total: 69.7ms	remaining: 5.74s
12:	learn: 0.0104855	total: 75.2ms	remaining: 5.71s
13:	learn: 0.0099853	total: 80.9ms	remaining: 5.7s
14:	learn: 0.0095256	total: 86.1ms	remaining: 5.65s
15:	learn: 0.0091019	total: 91.9ms	remaining: 5.65s
16:	learn: 0.0087324	total: 97ms	remaining: 5.61s
17:	learn: 0.0084279	total: 103ms	remaining: 5.61s
18:	learn: 0.0081247	total: 108ms	remaining: 5.5

In [None]:
# ---------------- Cấu hình ----------------
SEED   = 42
epochs = 20   # đổi 30 nếu muốn
k      = 10

# ---------------- Load dữ liệu ----------------
data_path = r'C:/Users/Multiplexon/Desktop/data/2/total_selected_augmented.csv'
df = pd.read_csv(data_path)

features = [
    'Transaction Hash_len', 'Original_len', 'signature_len',
    'From_len', 'To_len', 'sender_len', 'paymaster_len',
    'Txn Fee', 'logIndex', 'actualGasCost',
    'actualGasUsed', 'nonce', 'success', 'Blockno', 'DateTime_ts'
]
target = 'Gas Used'

X = df[features].astype(float)
y = df[target].astype(float)

# ---------------- Train/Test split & MinMax scale (toàn train) ----------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

scaler_X = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler_X.fit_transform(X_train), columns=X.columns)
X_test_scaled  = pd.DataFrame(scaler_X.transform(X_test),  columns=X.columns)

scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_test_scaled  = scaler_y.transform(y_test.values.reshape(-1, 1)).ravel()

# ---------------- Repeated KFold ----------------
cats = np.zeros((epochs, 4))  # mỗi hàng: [RMSE, MSE, MAE, R2] (trên thang scaled)

for ep in range(epochs):
    kf = KFold(n_splits=k, shuffle=True, random_state=SEED + ep)

    rmse_list, mse_list, mae_list, r2_list = [], [], [], []

    for train_idx, val_idx in kf.split(X_train_scaled):
        # Nếu X_train_scaled là ndarray: dùng [...], ở đây là DataFrame nên dùng .iloc
        X_tr = X_train_scaled.iloc[train_idx]
        X_va = X_train_scaled.iloc[val_idx]
        y_tr = y_train_scaled[train_idx]
        y_va = y_train_scaled[val_idx]

        # --- CatBoost ---
        model = CatBoostRegressor()
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_va)

        # === Metrics trên THANG MinMax-scaled của y ===
        mse  = mean_squared_error(y_va, y_pred)
        rmse = np.sqrt(mse)
        mae  = mean_absolute_error(y_va, y_pred)
        r2   = r2_score(y_va, y_pred)

        rmse_list.append(rmse)
        mse_list.append(mse)
        mae_list.append(mae)
        r2_list.append(r2)

    # lưu mean metrics của epoch này (qua 10 folds)
    cats[ep, :] = [
        np.mean(rmse_list),
        np.mean(mse_list),
        np.mean(mae_list),
        np.mean(r2_list)
    ]

# ---------------- Trung bình qua tất cả epochs ----------------
final_mean = np.mean(cats, axis=0)

print("Final mean on MinMax-scaled y [RMSE, MSE, MAE, R2]:")
print(final_mean)  # 1 dòng mean cuối cùng
print("\nPer-epoch metrics (rows) on MinMax-scaled y [RMSE, MSE, MAE, R2]:")
print(cats)       

KeyboardInterrupt: 