In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV

In [5]:
# ================== 0) SEED ==================
SEED = 42
np.random.seed(SEED)

# ================== 1) LOAD & PREP ==================
data_path = 'C:/Users/Multiplexon/Desktop/data/d6/total 06.csv'
df = pd.read_csv(data_path)

features = ['Original_len','Txn Fee','logIndex','actualGasCost',
            'Blockno','DateTime_ts','nonce']
target = 'Gas Used'

X = df[features].astype(float)
y = df[target].astype(float)

# ================== 2) HOLD-OUT (80/20 split) ==================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# ================== 3) MinMaxScaler cho X & y (fit trên TRAIN) ==================
xs = MinMaxScaler()
ys = MinMaxScaler()

X_train_prepared = xs.fit_transform(X_train)
X_test_prepared  = xs.transform(X_test)

y_train_prepared = ys.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_test_prepared  = ys.transform(y_test.values.reshape(-1, 1)).ravel()

# ================== 4) RandomizedSearchCV (CatBoost) ==================
param_distributions = {
    "n_estimators": [50, 100, 150],       # giảm số cây
    "max_depth": [3, 5, 7],               # giới hạn độ sâu
    "min_samples_split": [2, 5, 10],}

model = RandomForestRegressor()

ran_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=10,            # số tổ hợp thử (giảm để tiết kiệm RAM/CPU)
    cv=3,                 # k-fold cross validation
    n_jobs=1,             # chạy tuần tự để tránh copy dữ liệu
    random_state=42
)

ran_search.fit(X_train_prepared, y_train_prepared)

best_model   = ran_search.best_estimator_
best_params  = ran_search.best_params_
best_cv_rmse = np.sqrt(-ran_search.best_score_)   # RMSE trên THANG ĐÃ SCALE (y đã scale)

print("Best params:", best_params)
print(f"Best CV RMSE (scaled y): {best_cv_rmse:.6f}")

Best params: {'n_estimators': 50, 'min_samples_split': 5, 'max_depth': 7}
Best CV RMSE (scaled y): nan


  best_cv_rmse = np.sqrt(-ran_search.best_score_)   # RMSE trên THANG ĐÃ SCALE (y đã scale)


In [8]:
# ==== 0. Seed & Load Data ====
SEED = 42
np.random.seed(SEED)

data_path = 'C:/Users/Multiplexon/Desktop/data/d6/total 06.csv'
df = pd.read_csv(data_path)

# ==== 1. Feature Engineering & Cleaning ====
features = ['Original_len', 'Txn Fee', 'logIndex', 'actualGasCost',
            'Blockno', 'DateTime_ts', 'nonce']
target = 'Gas Used'

X = df[features].apply(pd.to_numeric, errors='coerce').replace([np.inf, -np.inf], np.nan).fillna(0.0)
y = pd.to_numeric(df[target], errors='coerce').replace([np.inf, -np.inf], np.nan).fillna(0.0)

# ==== 2. Train/Test Split ====
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# ==== 3. K-Fold CV ====
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)

mse_list, rmse_list, mae_list, r2_list = [], [], [], []
all_true, all_pred = [], []

print("=== K-Fold RandomForest – Metrics trên dữ liệu đã SCALE ===")
for fold, (tr_idx, test_idx) in enumerate(kf.split(X_train), start=1):
    X_tr, X_te = X_train.iloc[tr_idx], X_train.iloc[test_idx]
    y_tr, y_te = y_train.iloc[tr_idx], y_train.iloc[test_idx]

    # Scale theo từng fold
    sx, sy = MinMaxScaler(), MinMaxScaler()
    X_tr_s, X_te_s = sx.fit_transform(X_tr), sx.transform(X_te)
    y_tr_s = sy.fit_transform(y_tr.values.reshape(-1, 1)).ravel()
    y_te_s = sy.transform(y_te.values.reshape(-1, 1)).ravel()

    # Model
    model = RandomForestRegressor(n_estimators= 50, min_samples_split= 5, max_depth= 7)
    model.fit(X_tr_s, y_tr_s)

    # Predict
    y_pred_s = model.predict(X_te_s)
    # Metrics
    mse  = mean_squared_error(y_te_s, y_pred_s)
    rmse = np.sqrt(mse)
    mae  = mean_absolute_error(y_te_s, y_pred_s)
    r2   = r2_score(y_te_s, y_pred_s)

    mse_list.append(mse); rmse_list.append(rmse)
    mae_list.append(mae); r2_list.append(r2)

    print(f"[Fold {fold:2d}] RMSE: {rmse:.6f} | MSE: {mse:.6f} | MAE: {mae:.6f} | R²: {r2:.6f}")

# ==== 4. Trung bình ====
print("\n=== Trung bình qua 10 folds (trên dữ liệu scaled) ===")
print(f"Mean RMSE: {np.mean(rmse_list):.6f}")
print(f"Mean MSE : {np.mean(mse_list):.6f}")
print(f"Mean MAE : {np.mean(mae_list):.6f}")
print(f"Mean R²  : {np.mean(r2_list):.6f}")


=== K-Fold RandomForest – Metrics trên dữ liệu đã SCALE ===
[Fold  1] RMSE: 0.007490 | MSE: 0.000056 | MAE: 0.004713 | R²: 0.640198
[Fold  2] RMSE: 0.007384 | MSE: 0.000055 | MAE: 0.004784 | R²: 0.593876
[Fold  3] RMSE: 0.008309 | MSE: 0.000069 | MAE: 0.004795 | R²: 0.578311
[Fold  4] RMSE: 0.007292 | MSE: 0.000053 | MAE: 0.004723 | R²: 0.635813
[Fold  5] RMSE: 0.007768 | MSE: 0.000060 | MAE: 0.004790 | R²: 0.611712
[Fold  6] RMSE: 0.007751 | MSE: 0.000060 | MAE: 0.004793 | R²: 0.586716
[Fold  7] RMSE: 0.007541 | MSE: 0.000057 | MAE: 0.004762 | R²: 0.632989
[Fold  8] RMSE: 0.007644 | MSE: 0.000058 | MAE: 0.004798 | R²: 0.613726
[Fold  9] RMSE: 0.013848 | MSE: 0.000192 | MAE: 0.007249 | R²: 0.563734
[Fold 10] RMSE: 0.008417 | MSE: 0.000071 | MAE: 0.004863 | R²: 0.551840

=== Trung bình qua 10 folds (trên dữ liệu scaled) ===
Mean RMSE: 0.008345
Mean MSE : 0.000073
Mean MAE : 0.005027
Mean R²  : 0.600891


In [None]:
#actual and predicted scatter plot
# ================== 4) SCATTER PLOT with Gradient Color ==================
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt

# ================== 0) SEED ==================
SEED = 42
np.random.seed(SEED)
# ================== 4) TRAIN CATBOOST (SCALED) ==================
cb = RandomForestRegressor(n_estimators= 50, min_samples_split= 5, max_depth= 7)

cb.fit(X_tr_s, y_tr_s)

# ================== 5) PREDICT & SCATTER (SCALED) ==================
y_val_pred_s = cb.predict(X_te_s)

fig, ax = plt.subplots(figsize=(7, 6))
ax.scatter(y_te_s, y_val_pred_s, s=40, alpha=0.8,
           color="lightskyblue", edgecolors="k")

# Đường y = x theo giới hạn hiện tại của trục (không dùng 'lims')
x0, x1 = ax.get_xlim()
y0, y1 = ax.get_ylim()
ax.plot([min(x0, y0), max(x1, y1)], [min(x0, y0), max(x1, y1)],
        "--", lw=1, color="red")

ax.set_title("RandomForest: Predictions vs Actual (scaled)")
ax.set_xlabel("Actual (scaled)")
ax.set_ylabel("Predicted (scaled)")
plt.show()

In [None]:
# ================== 0) SEED ==================
SEED = 42
np.random.seed(SEED)

data_path = 'C:/Users/Multiplexon/Desktop/data/d6/total 06.csv'
df = pd.read_csv(data_path)

features = [
  'Original_len',
'Txn Fee','logIndex','actualGasCost',
    'Blockno','DateTime_ts', 'nonce'
]
target = 'Gas Used'


X = df[features].astype(float)
y = df[target].astype(float)

# Hold-out chỉ để cố định tập làm việc (không dùng test ở dưới)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# ================== 2) 20 EPOCHS × 10-FOLD ==================
epochs = 20
rd = np.zeros((epochs, 4), dtype=np.float64)   # [RMSE, MSE, MAE, R2]

for ep in range(epochs):
    # shuffle train cho epoch này
    idx = np.random.permutation(len(X_train))
    X_ep = X_train.iloc[idx].reset_index(drop=True)
    y_ep = y_train.iloc[idx].reset_index(drop=True)

    kf=KFold(n_splits=10, shuffle=True, random_state=SEED)

    rmse_scores, mse_scores, mae_scores, r2_scores = [], [], [], []

    for train_idx, test_idx in kf.split(X_ep):
        X_tr_raw, X_te_raw = X_ep.iloc[train_idx], X_ep.iloc[test_idx]
        y_tr_raw, y_te_raw = y_ep.iloc[train_idx], y_ep.iloc[test_idx]

        # scale TRONG fold (fit trên train → transform train/val)
        xs, ys = MinMaxScaler(), MinMaxScaler()
        X_tr_s = xs.fit_transform(X_tr_raw)
        X_te_s = xs.transform(X_te_raw)
        y_tr_s = ys.fit_transform(y_tr_raw.values.reshape(-1,1)).ravel()
        y_te_s = ys.transform(y_te_raw.values.reshape(-1,1)).ravel()

        # XGBoost (params mặc định, thêm random_state cho tái lập)
        model = RandomForestRegressor(n_estimators= 50, min_samples_split= 5, max_depth= 7)
        model.fit(X_tr_s, y_tr_s)

        # metrics trên dữ liệu ĐÃ SCALE
        y_pred_s = model.predict(X_te_s)
        mse  = mean_squared_error(y_te_s, y_pred_s)
        rmse = float(np.sqrt(mse))
        mae  = mean_absolute_error(y_te_s, y_pred_s)
        r2   = r2_score(y_te_s, y_pred_s)

        rmse_scores.append(rmse); mse_scores.append(mse)
        mae_scores.append(mae);   r2_scores.append(r2)

    rd[ep, :] = [np.mean(rmse_scores), np.mean(mse_scores),
                 np.mean(mae_scores),  np.mean(r2_scores)]

# ================== 3) OUTPUT ==================
np.set_printoptions(precision=8, suppress=False)
print("array(" + np.array2string(rd, separator=', ', prefix='array(') + ")")

final_mean = rd.mean(axis=0)
final_std  = rd.std(axis=0, ddof=1)
print("\nMean over 20 runs (RMSE, MSE, MAE, R2):", final_mean)
print("Std  over 20 runs (RMSE, MSE, MAE, R2):", final_std)

array([[5.67964007e-03, 3.46317858e-05, 2.46593151e-03, 8.16124729e-01],
       [5.72042276e-03, 3.47386513e-05, 2.46334153e-03, 8.12880949e-01],
       [5.80794680e-03, 3.59481111e-05, 2.46801020e-03, 8.06720216e-01],
       [5.72054757e-03, 3.48521359e-05, 2.46803097e-03, 8.12515742e-01],
       [5.74047821e-03, 3.52217645e-05, 2.46726671e-03, 8.11827942e-01],
       [5.66452299e-03, 3.40937819e-05, 2.46657741e-03, 8.16701276e-01],
       [5.76204349e-03, 3.53116981e-05, 2.46619172e-03, 8.09072607e-01],
       [5.72659562e-03, 3.49828716e-05, 2.46478312e-03, 8.12900461e-01],
       [5.69565009e-03, 3.43590542e-05, 2.46513520e-03, 8.14206279e-01],
       [5.75092388e-03, 3.57421829e-05, 2.46676805e-03, 8.12382370e-01],
       [5.71157117e-03, 3.52913526e-05, 2.47053244e-03, 8.14804592e-01],
       [5.74076890e-03, 3.50525489e-05, 2.46809564e-03, 8.12042681e-01],
       [5.69691999e-03, 3.47073506e-05, 2.46682919e-03, 8.14659069e-01],
       [5.73037167e-03, 3.48616889e-05, 2.46409276e