1.要加三个约束条件让模型自己找规律
A.bde/bdfe
B.energy:空间复杂性
C.C原子的 复杂性： 指标：周边原子数、平均bde

# 简单添加三个条件的软约束

In [7]:
import os
import pickle
import warnings

import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split

from torch_geometric.data import Data, DataLoader as PyGDataLoader
from torch_geometric.nn import GINEConv, global_mean_pool, BatchNorm
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.utils import from_networkx

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold

import optuna  # <-- For hyperparameter optimization

# Dash imports for showing results in a dashboard
import dash
from dash import dcc
from dash import html
import dash_table

warnings.filterwarnings('ignore')

# =============================================================================
#                             DATASET & PREP
# =============================================================================

class MolDataset(Dataset):
    """
    A custom dataset that:
      - Reads external factors from CSV
      - Loads the corresponding pickle for the molecule's graph
      - Converts it into a PyG Data object
    """
    def __init__(
        self,
        raw_dataframe: pd.DataFrame,
        nx_graph_dict: dict,
        *,
        component_col: str,
        global_state_cols: list[str],
        label_col: str,
        transform=None
    ):
        """
        Args:
            raw_dataframe: The input dataframe containing molecule info.
            nx_graph_dict: Dictionary mapping component names to networkx graphs.
            component_col: Column name for the component.
            global_state_cols: List of columns representing external factors.
            label_col: Column name for the regression target.
            transform: Any transform to apply to each PyG Data object.
        """
        self.raw_dataframe = raw_dataframe
        self.nx_graph_dict = nx_graph_dict
        self.component_col = [component_col] if isinstance(component_col, str) else component_col
        self.global_state_cols = global_state_cols
        self.label_col = [label_col] if isinstance(label_col, str) else label_col
        self.transform = transform

        required_cols = set(self.global_state_cols + self.label_col + self.component_col)
        for col in required_cols:
            if col not in self.raw_dataframe.columns:
                raise ValueError(f"Missing column in DataFrame: '{col}'")

    def __len__(self):
        return len(self.raw_dataframe)

    def __getitem__(self, idx):
        row = self.raw_dataframe.iloc[idx]

        # 1. Load the molecule graph
        component_name = row[self.component_col[0]]  # e.g. "C23"
        pyg_data = self.nx_graph_dict[component_name]

        # 2. Prepare the external factors
        externals = torch.tensor(row[self.global_state_cols].values.astype(float), dtype=torch.float)
        externals = externals.unsqueeze(0)

        # 3. Prepare the label (regression target)
        label = torch.tensor([row[self.label_col][0]], dtype=torch.float)

        # 4. Attach externals & label to the Data object
        pyg_data.externals = externals  # shape [1, external_in_dim]
        pyg_data.y = label  # shape [1]

        if self.transform:
            pyg_data = self.transform(pyg_data)

        return pyg_data


def networkx_to_pyg(nx_graph):
    """
    Convert a networkx graph to a torch_geometric.data.Data object.
    This is a basic template; adjust for your actual node/edge features.
    """
    # Sort nodes to ensure consistent ordering
    node_mapping = {node: i for i, node in enumerate(nx_graph.nodes())}

    x_list = []
    edge_index_list = []
    edge_attr_list = []

    # Node features
    for node in nx_graph.nodes(data=True):
        original_id = node[0]
        attrs = node[1]
        symbol = attrs.get("symbol", "C")
        symbol_id = 0 if symbol == "C" else 1 if symbol == "H" else 2
        # Here, x_list could be expanded to include 'energy' if your data provides it
        x_list.append([symbol_id])

    # Edge features
    for u, v, edge_attrs in nx_graph.edges(data=True):
        u_idx = node_mapping[u]
        v_idx = node_mapping[v]
        edge_index_list.append((u_idx, v_idx))
        bde_pred = edge_attrs.get("bde_pred", 0.0) or 0.0
        bdfe_pred = edge_attrs.get("bdfe_pred", 0.0) or 0.0
        edge_attr_list.append([bde_pred, bdfe_pred])

    x = torch.tensor(x_list, dtype=torch.float)
    edge_index = torch.tensor(edge_index_list, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_attr_list, dtype=torch.float)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    return data


# =============================================================================
#                     BASE GNN MODEL (WITH DIM MATCH)
# =============================================================================

class GINE_Regression(nn.Module):
    """
    A GNN for regression using GINEConv layers + edge attributes,
    where all layers have the same hidden_dim (no dimension mismatch).
    """
    def __init__(
        self,
        node_in_dim: int,
        edge_in_dim: int,
        external_in_dim: int,
        hidden_dim: int = 128,
        num_layers: int = 3,
        dropout: float = 0.1
    ):
        super().__init__()

        # Encode edges from edge_in_dim to hidden_dim
        self.edge_encoder = nn.Sequential(
            nn.Linear(edge_in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

        # Encode nodes from node_in_dim to hidden_dim
        self.node_encoder = nn.Linear(node_in_dim, hidden_dim)

        # Multiple GINEConv layers & corresponding BatchNorm
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()

        for _ in range(num_layers):
            net = nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim)
            )
            conv = GINEConv(nn=net)
            self.convs.append(conv)
            self.bns.append(BatchNorm(hidden_dim))

        self.dropout = nn.Dropout(p=dropout)

        # Process external factors
        self.externals_mlp = nn.Sequential(
            nn.Linear(external_in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, hidden_dim)
        )

        # Final regression
        self.final_regressor = nn.Sequential(
            nn.Linear(hidden_dim + hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch

        # Encode
        x = self.node_encoder(x)
        edge_emb = self.edge_encoder(edge_attr)

        # Pass through GINEConv layers
        for conv, bn in zip(self.convs, self.bns):
            x = conv(x, edge_index, edge_emb)
            x = bn(x)
            x = F.relu(x)
            x = self.dropout(x)

        # Global pooling
        graph_emb = global_mean_pool(x, batch)

        # Process external factors
        ext_emb = self.externals_mlp(data.externals)

        # Combine & regress
        combined = torch.cat([graph_emb, ext_emb], dim=-1)
        out = self.final_regressor(combined).squeeze(-1)
        return out


# =============================================================================
#                   SOFT CONSTRAINTS (HELPER FUNCTIONS)
# =============================================================================

def compute_c_neighbors(batch_data):
    """
    Compute the total (or average) number of carbon neighbors across all nodes in the batch.
    We'll do a simple approach:
      - For each node, check how many edges connect to a node with symbol_id == 0 (C).
      - Then average or sum across the graph.
    This is just an example "soft" aggregator.
    """
    edge_index = batch_data.edge_index
    symbol_ids = batch_data.x[:, 0]

    c_neighbor_count = torch.zeros_like(symbol_ids)
    num_edges = edge_index.size(1)

    for e in range(num_edges):
        src = edge_index[0, e]
        tgt = edge_index[1, e]
        # If the target is carbon
        if symbol_ids[tgt] == 0:  # 0 => 'C'
            c_neighbor_count[src] += 1
        # If the source is carbon
        if symbol_ids[src] == 0:
            c_neighbor_count[tgt] += 1

    avg_c_neighbors = c_neighbor_count.mean()
    return avg_c_neighbors


def compute_bde_bdfe(batch_data):
    """
    Compute an aggregated BDE/BDFe value for the entire graph.
    We'll just take the mean of bde_pred and bdfe_pred across edges as an example.
    """
    edge_attrs = batch_data.edge_attr  # shape [E, 2] => columns: [bde_pred, bdfe_pred]
    if edge_attrs.size(0) == 0:
        return 0.0, 0.0
    bde_mean = edge_attrs[:, 0].mean()
    bdfe_mean = edge_attrs[:, 1].mean()
    return bde_mean, bdfe_mean


def compute_atom_energy(batch_data):
    """
    If you store 'energy' in node features or in a separate attribute,
    here's where you'd aggregate it. We only have 'symbol_id' for now.
    So let's do a dummy approach: sum the symbol IDs as a stand-in "energy".
    """
    energy = batch_data.x[:, 0].sum()  # Just a placeholder
    return energy


def compute_soft_constraints(batch_data, preds):
    """
    Returns the three soft constraint losses:
      - BDE_BDFe loss
      - Energy loss
      - Num C neighbors loss
    Each is defined as mean(pred * aggregated_value).
    """
    # 1) BDE and BDFe
    bde_mean, bdfe_mean = compute_bde_bdfe(batch_data)
    aggregated_bde = bde_mean + bdfe_mean
    loss_bde = (preds * aggregated_bde).mean()

    # 2) Atom energy
    aggregated_energy = compute_atom_energy(batch_data)
    loss_energy = (preds * aggregated_energy).mean()

    # 3) Number of C neighbors
    avg_c_neighbors = compute_c_neighbors(batch_data)
    loss_c = (preds * avg_c_neighbors).mean()

    return loss_bde, loss_energy, loss_c


# =============================================================================
#                   TRAIN/VALID/EVALUATION UTILS (WITH SOFT CONSTRAINTS)
# =============================================================================

def train_one_epoch_with_constraints(
    model, loader, optimizer, device,
    base_criterion, lambda_bde=0.001, lambda_energy=0.001, lambda_c=0.001
):
    """
    Train for one epoch, combining MSE (or other base_criterion) + additional soft constraints.
    """
    model.train()
    total_loss = 0.0
    count = 0

    for batch_data in loader:
        batch_data = batch_data.to(device)
        optimizer.zero_grad()
        preds = model(batch_data)
        y = batch_data.y.to(device).view(-1)

        # Base regression loss
        base_loss = base_criterion(preds, y)

        # Soft constraints
        loss_bde, loss_energy, loss_c = compute_soft_constraints(batch_data, preds)
        total_soft_loss = lambda_bde * loss_bde + lambda_energy * loss_energy + lambda_c * loss_c

        # Final combined loss
        loss = base_loss + total_soft_loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * batch_data.num_graphs
        count += batch_data.num_graphs

    return total_loss / count if count > 0 else 0.0


def validate_with_constraints(model, loader, device, base_criterion,
                              lambda_bde=0.001, lambda_energy=0.001, lambda_c=0.001):
    """
    Validate for one epoch, combining MSE (or other base_criterion) + additional soft constraints.
    """
    model.eval()
    total_loss = 0.0
    count = 0

    with torch.no_grad():
        for batch_data in loader:
            batch_data = batch_data.to(device)
            preds = model(batch_data)
            y = batch_data.y.to(device).view(-1)

            # Base regression loss
            base_loss = base_criterion(preds, y)

            # Soft constraints
            loss_bde, loss_energy, loss_c = compute_soft_constraints(batch_data, preds)
            total_soft_loss = lambda_bde * loss_bde + lambda_energy * loss_energy + lambda_c * loss_c

            loss = base_loss + total_soft_loss

            total_loss += loss.item() * batch_data.num_graphs
            count += batch_data.num_graphs

    return total_loss / count if count > 0 else 0.0


def evaluate_model(model, loader, device):
    """
    Evaluate the model on a dataset loader and compute R² and RMSE.
    """
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            preds = model(batch)
            y_true.append(batch.y.cpu())
            y_pred.append(preds.cpu())

    y_true = torch.cat(y_true).numpy().squeeze()
    y_pred = torch.cat(y_pred).numpy().squeeze()

    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    print(f"R²: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")

    return r2, rmse


# =============================================================================
#                             DATA PREPARATION
# =============================================================================

env_file = r"F:\2025 energing\PYTHON\GNN_chemicalENV-main\GNN molecules\graph_pickles\dataset02.xlsx"
data = pd.read_excel(env_file, engine='openpyxl').dropna(subset=['degradation_rate'])
data['seawater'] = data['seawater'].map({'art': 1, 'sea': 0})

folder_path = r"F:\2025 energing\PYTHON\GNN_chemicalENV-main\GNN molecules\graph_pickles\molecules"
graph_pickles = [f for f in os.listdir(folder_path) if f.endswith(".pkl")]

base_dir = r"F:\2025 energing\PYTHON\GNN_chemicalENV-main\GNN molecules\graph_pickles\molecules"
if os.path.exists(base_dir):
    print("Directory exists:", base_dir)
    print("Files in directory:", os.listdir(base_dir))
else:
    print(f"Error: Directory {base_dir} does not exist!")

compounds = data.component.unique()
graphs_dict = {}
for compound, graph_pickle in zip(compounds, graph_pickles):
    with open(os.path.join(base_dir, graph_pickle), 'rb') as f:
        graph = pickle.load(f)
        graphs_dict[compound] = networkx_to_pyg(graph)

dataset = MolDataset(
    raw_dataframe=data,
    nx_graph_dict=graphs_dict,
    component_col="component",
    global_state_cols=["temperature", "concentration", "time", "seawater"],
    label_col="degradation_rate",
    transform=None
)

# =============================================================================
#                      OPTUNA HYPERPARAM OPT + CROSS-VALIDATION
# =============================================================================

from torch_geometric.data import DataLoader as PyGDataLoader


def objective(trial):
    """
    Optuna objective function:
      - Sample hyperparams
      - Do a 5-fold cross validation
      - Return avg validation loss
    """

    # Hyperparameters to tune:
    hidden_dim = trial.suggest_int("hidden_dim", 8, 64, step=8)  # Example range
    num_layers = trial.suggest_int("num_layers", 2, 6)
    dropout = trial.suggest_float("dropout", 0.0, 0.5, step=0.1)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)

    # Soft constraint lambdas:
    lambda_bde = trial.suggest_float("lambda_bde", 1e-4, 1e-1, log=True)
    lambda_energy = trial.suggest_float("lambda_energy", 1e-4, 1e-1, log=True)
    lambda_c = trial.suggest_float("lambda_c", 1e-4, 1e-1, log=True)

    # We'll do 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    fold_val_losses = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
        train_subset = torch.utils.data.Subset(dataset, train_idx)
        val_subset = torch.utils.data.Subset(dataset, val_idx)

        train_loader = PyGDataLoader(train_subset, batch_size=16, shuffle=True)
        val_loader = PyGDataLoader(val_subset, batch_size=16, shuffle=False)

        model = GINE_Regression(
            node_in_dim=1,
            edge_in_dim=2,
            external_in_dim=4,
            hidden_dim=hidden_dim,
            num_layers=num_layers,
            dropout=dropout
        ).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        base_criterion = torch.nn.MSELoss()

        # We'll do fewer epochs for faster hyperparam search
        num_epochs = 100

        for epoch in range(num_epochs):
            train_loss = train_one_epoch_with_constraints(
                model,
                train_loader,
                optimizer,
                device,
                base_criterion,
                lambda_bde=lambda_bde,
                lambda_energy=lambda_energy,
                lambda_c=lambda_c
            )
            val_loss = validate_with_constraints(
                model,
                val_loader,
                device,
                base_criterion,
                lambda_bde=lambda_bde,
                lambda_energy=lambda_energy,
                lambda_c=lambda_c
            )

        fold_val_losses.append(val_loss)

    # Return the average of the validation losses
    avg_val_loss = float(np.mean(fold_val_losses))
    return avg_val_loss


# Create Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

print("\n=== Optuna Study Results ===")
print(f"Best trial number: {study.best_trial.number}")
print(f"Best trial value (loss): {study.best_trial.value:.4f}")
print("Best hyperparameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

# =============================================================================
#          RETRAIN FINAL MODEL WITH BEST HYPERPARAMS + REPORT TEST METRICS
# =============================================================================

best_params = study.best_params
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

hidden_dim = best_params["hidden_dim"]
num_layers = best_params["num_layers"]
dropout = best_params["dropout"]
lr = best_params["lr"]
weight_decay = best_params["weight_decay"]
lambda_bde = best_params["lambda_bde"]
lambda_energy = best_params["lambda_energy"]
lambda_c = best_params["lambda_c"]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
    print(f"\n--- Retrain Fold {fold + 1} with Best Hyperparams ---")

    train_subset = torch.utils.data.Subset(dataset, train_idx)
    val_subset = torch.utils.data.Subset(dataset, val_idx)

    train_loader = PyGDataLoader(train_subset, batch_size=16, shuffle=True)
    val_loader = PyGDataLoader(val_subset, batch_size=16, shuffle=False)

    model = GINE_Regression(
        node_in_dim=1,
        edge_in_dim=2,
        external_in_dim=4,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        dropout=dropout
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    base_criterion = nn.MSELoss()

    num_epochs = 300

    for epoch in range(1, num_epochs + 1):
        train_loss = train_one_epoch_with_constraints(
            model,
            train_loader,
            optimizer,
            device,
            base_criterion,
            lambda_bde=lambda_bde,
            lambda_energy=lambda_energy,
            lambda_c=lambda_c
        )
        val_loss = validate_with_constraints(
            model,
            val_loader,
            device,
            base_criterion,
            lambda_bde=lambda_bde,
            lambda_energy=lambda_energy,
            lambda_c=lambda_c
        )

        if epoch % 50 == 0:
            print(f"[Fold {fold + 1} Epoch {epoch}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    print(f"Evaluating fold {fold + 1} ...")
    r2, rmse = evaluate_model(model, val_loader, device)
    fold_results.append({"fold": fold + 1, "r2": r2, "rmse": rmse})

r2_scores = [res["r2"] for res in fold_results]
rmse_scores = [res["rmse"] for res in fold_results]

print("\n--- Final Cross-Validation Summary with Best Hyperparams ---")
for res in fold_results:
    print(f"Fold {res['fold']}: R² = {res['r2']:.4f}, RMSE = {res['rmse']:.4f}")

print(f"\nAverage R²: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")
print(f"Average RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")


# =============================================================================
#                           DASHBOARD FOR RESULTS
# =============================================================================

# Convert fold_results to a DataFrame for easy display
fold_results_df = pd.DataFrame(fold_results)

app = dash.Dash(__name__)

app.layout = html.Div([
    html.H1("Optuna + GNN Training Results Dashboard"),
    html.H2("Best Hyperparameters Found by Optuna"),
    html.Ul([
        html.Li(f"{key}: {value}") for key, value in best_params.items()
    ]),
    html.P(f"Best trial value (loss): {study.best_trial.value:.4f}"),
    html.Br(),
    html.H2("Cross-Validation Results with Best Hyperparams"),
    dash_table.DataTable(
        id='fold-results-table',
        columns=[{"name": i, "id": i} for i in fold_results_df.columns],
        data=fold_results_df.to_dict('records'),
        style_table={'overflowX': 'auto'},
        style_cell={'textAlign': 'left'}
    ),
    html.Br(),
    html.Div([
        html.H3("Averaged Metrics Across Folds:"),
        html.P(f"Average R²: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}"),
        html.P(f"Average RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
    ])
])

if __name__ == '__main__':
    # Run the Dash app
    # Note: The rest of the model training has already been done above.
    app.run_server(debug=False)


[I 2025-03-12 12:47:17,633] A new study created in memory with name: no-name-d6eb2740-ca21-45f3-8d68-96c667e5e568


Directory exists: F:\2025 energing\PYTHON\GNN_chemicalENV-main\GNN molecules\graph_pickles\molecules
Files in directory: ['gpickle_graph_0.pkl', 'gpickle_graph_1.pkl', 'gpickle_graph_10.pkl', 'gpickle_graph_11.pkl', 'gpickle_graph_12.pkl', 'gpickle_graph_13.pkl', 'gpickle_graph_14.pkl', 'gpickle_graph_15.pkl', 'gpickle_graph_16.pkl', 'gpickle_graph_17.pkl', 'gpickle_graph_18.pkl', 'gpickle_graph_19.pkl', 'gpickle_graph_2.pkl', 'gpickle_graph_3.pkl', 'gpickle_graph_4.pkl', 'gpickle_graph_5.pkl', 'gpickle_graph_6.pkl', 'gpickle_graph_7.pkl', 'gpickle_graph_8.pkl', 'gpickle_graph_9.pkl']


[I 2025-03-12 13:25:50,279] Trial 0 finished with value: 0.6498154129034478 and parameters: {'hidden_dim': 8, 'num_layers': 3, 'dropout': 0.4, 'lr': 0.00022245941537414965, 'weight_decay': 3.247044171584706e-05, 'lambda_bde': 0.004616586913109393, 'lambda_energy': 0.0010743133480093142, 'lambda_c': 0.0002746876713893252}. Best is trial 0 with value: 0.6498154129034478.
[I 2025-03-12 14:14:25,463] Trial 1 finished with value: 0.7361006607135934 and parameters: {'hidden_dim': 64, 'num_layers': 6, 'dropout': 0.4, 'lr': 0.004962370369068733, 'weight_decay': 0.0001343485824061221, 'lambda_bde': 0.010702283356320775, 'lambda_energy': 0.00022674793505244787, 'lambda_c': 0.007047858328958299}. Best is trial 0 with value: 0.6498154129034478.
[I 2025-03-12 15:00:12,111] Trial 2 finished with value: 0.3035213933231613 and parameters: {'hidden_dim': 64, 'num_layers': 6, 'dropout': 0.5, 'lr': 0.0005408077716102977, 'weight_decay': 6.766651693564072e-05, 'lambda_bde': 0.00016876468231366942, 'lambda


=== Optuna Study Results ===
Best trial number: 19
Best trial value (loss): -324.5518
Best hyperparameters:
  hidden_dim: 40
  num_layers: 5
  dropout: 0.2
  lr: 0.00196480286848805
  weight_decay: 2.0267095851164693e-05
  lambda_bde: 0.005450074237182427
  lambda_energy: 0.09902377512485237
  lambda_c: 0.0018969098937381469

--- Retrain Fold 1 with Best Hyperparams ---
[Fold 1 Epoch 50] Train Loss: -330.9276 | Val Loss: -324.9432
[Fold 1 Epoch 100] Train Loss: -330.6546 | Val Loss: -324.9934
[Fold 1 Epoch 150] Train Loss: -332.2515 | Val Loss: -325.6322
[Fold 1 Epoch 200] Train Loss: -331.6856 | Val Loss: -322.0673
[Fold 1 Epoch 250] Train Loss: -332.2506 | Val Loss: -325.8296
[Fold 1 Epoch 300] Train Loss: -332.4581 | Val Loss: -325.4078
Evaluating fold 1 ...
R²: -5785.0156
RMSE: 18.0308

--- Retrain Fold 2 with Best Hyperparams ---
[Fold 2 Epoch 50] Train Loss: -330.6627 | Val Loss: -323.6939
[Fold 2 Epoch 100] Train Loss: -330.7971 | Val Loss: -325.9724
[Fold 2 Epoch 150] Train Lo

---

### OPTUNA

In [None]:
import os
import pickle
import pandas as pd
import numpy as np

import networkx as nx

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch_geometric.data import Data, DataLoader as PyGDataLoader
from torch_geometric.utils import from_networkx
from torch_geometric.nn import GINEConv, global_mean_pool, BatchNorm
from torch_geometric.nn.conv.gcn_conv import gcn_norm

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

# =============================================================================
#                             DATASET & PREP
# =============================================================================

class MolDataset(Dataset):
    """
    A custom dataset that:
      - Reads external factors from CSV
      - Loads the corresponding pickle for the molecule's graph
      - Converts it into a PyG Data object
    """
    def __init__(self,
                 raw_dataframe: pd.DataFrame,
                 nx_graph_dict: dict,
                 *,
                 component_col: str,
                 global_state_cols: list[str],
                 label_col: str,
                 transform=None):
        """
        Args:
            raw_dataframe: The input dataframe containing molecule info.
            nx_graph_dict: Dictionary mapping component names to networkx graphs.
            component_col: Column name for the component.
            global_state_cols: List of columns representing external factors.
            label_col: Column name for the regression target.
            transform: Any transform to apply to each PyG Data object.
        """
        self.raw_dataframe = raw_dataframe
        self.nx_graph_dict = nx_graph_dict
        self.component_col = [component_col] if type(component_col) is str else component_col
        self.global_state_cols = global_state_cols
        self.label_col = [label_col] if type(label_col) is str else label_col
        self.transform = transform
        
        required_cols = set(self.global_state_cols + self.label_col + self.component_col)
        for col in required_cols:
            if col not in self.raw_dataframe.columns:
                raise ValueError(f"Missing column in DataFrame: '{col}'")

    def __len__(self):
        return len(self.raw_dataframe)

    def __getitem__(self, idx):
        row = self.raw_dataframe.iloc[idx]
        
        # 1. Load the molecule graph
        component_name = row[self.component_col[0]]  # e.g. "C23"
        pyg_data = self.nx_graph_dict[component_name]

        # 2. Prepare the external factors
        externals = torch.tensor(row[self.global_state_cols].values.astype(float), dtype=torch.float)
        externals = externals.unsqueeze(0)

        # 3. Prepare the label (regression target)
        label = torch.tensor([row[self.label_col][0]], dtype=torch.float)

        # 4. Attach externals & label to the Data object
        pyg_data.externals = externals  # shape [1, external_in_dim]
        pyg_data.y = label  # shape [1]

        if self.transform:
            pyg_data = self.transform(pyg_data)

        return pyg_data


def networkx_to_pyg(nx_graph):
    """
    Convert a networkx graph to a torch_geometric.data.Data object.
    This is a basic template; adjust for your actual node/edge features.
    """
    # Sort nodes to ensure consistent ordering
    node_mapping = {node: i for i, node in enumerate(nx_graph.nodes())}

    x_list = []
    edge_index_list = []
    edge_attr_list = []

    # Node features
    for node in nx_graph.nodes(data=True):
        original_id = node[0]
        attrs = node[1]
        symbol = attrs.get("symbol", "C")
        symbol_id = 0 if symbol == "C" else 1 if symbol == "H" else 2
        x_list.append([symbol_id])

    # Edge features
    for u, v, edge_attrs in nx_graph.edges(data=True):
        u_idx = node_mapping[u]
        v_idx = node_mapping[v]
        edge_index_list.append((u_idx, v_idx))
        bde_pred = edge_attrs.get("bde_pred", 0.0) or 0.0
        bdfe_pred = edge_attrs.get("bdfe_pred", 0.0) or 0.0
        edge_attr_list.append([bde_pred, bdfe_pred])

    x = torch.tensor(x_list, dtype=torch.float)
    edge_index = torch.tensor(edge_index_list, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_attr_list, dtype=torch.float)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    return data


# =============================================================================
#                     BASE GNN MODEL (WITH DIM MATCH)
# =============================================================================

class GINE_Regression(nn.Module):
    """
    A GNN for regression using GINEConv layers + edge attributes,
    where all layers have the same hidden_dim (no dimension mismatch).
    """
    def __init__(self,
                 node_in_dim: int,
                 edge_in_dim: int,
                 external_in_dim: int,
                 hidden_dim: int = 128,
                 num_layers: int = 3,
                 dropout: float = 0.1):
        super().__init__()
        
        # Encode edges from edge_in_dim to hidden_dim
        self.edge_encoder = nn.Sequential(
            nn.Linear(edge_in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )
        
        # Encode nodes from node_in_dim to hidden_dim
        self.node_encoder = nn.Linear(node_in_dim, hidden_dim)
        
        # Multiple GINEConv layers & corresponding BatchNorm
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()
        
        for _ in range(num_layers):
            net = nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim)
            )
            conv = GINEConv(nn=net)
            self.convs.append(conv)
            self.bns.append(BatchNorm(hidden_dim))

        self.dropout = nn.Dropout(p=dropout)

        # Process external factors
        self.externals_mlp = nn.Sequential(
            nn.Linear(external_in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, hidden_dim)
        )

        # Final regression
        self.final_regressor = nn.Sequential(
            nn.Linear(hidden_dim + hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch

        # Encode
        x = self.node_encoder(x)
        edge_emb = self.edge_encoder(edge_attr)
        
        # Pass through GINEConv layers
        for conv, bn in zip(self.convs, self.bns):
            x = conv(x, edge_index, edge_emb)
            x = bn(x)
            x = F.relu(x)
            x = self.dropout(x)

        # Global pooling
        graph_emb = global_mean_pool(x, batch)

        # Process external factors
        ext_emb = self.externals_mlp(data.externals)

        # Combine & regress
        combined = torch.cat([graph_emb, ext_emb], dim=-1)
        out = self.final_regressor(combined).squeeze(-1)
        return out


# =============================================================================
#                   TRAIN/VALID/EVALUATION UTILS
# =============================================================================

def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    count = 0
    for batch_data in loader:
        batch_data = batch_data.to(device)
        optimizer.zero_grad()
        preds = model(batch_data)
        y = batch_data.y.to(device).view(-1)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * batch_data.num_graphs
        count += batch_data.num_graphs
    return total_loss / count if count > 0 else 0.0


def validate(model, loader, criterion, device):
    model.eval()
    total_loss = 0.0
    count = 0
    with torch.no_grad():
        for batch_data in loader:
            batch_data = batch_data.to(device)
            preds = model(batch_data)
            y = batch_data.y.to(device).view(-1)
            loss = criterion(preds, y)
            total_loss += loss.item() * batch_data.num_graphs
            count += batch_data.num_graphs
    return total_loss / count if count > 0 else 0.0


def evaluate_model(model, loader, device):
    """
    Evaluate the model on a dataset loader and compute R² and RMSE.
    """
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            preds = model(batch)
            y_true.append(batch.y.cpu())
            y_pred.append(preds.cpu())

    y_true = torch.cat(y_true).numpy().squeeze()
    y_pred = torch.cat(y_pred).numpy().squeeze()

    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    print(f"R²: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")

    return r2, rmse


# =============================================================================
#                             DATA PREPARATION
# =============================================================================

env_file = r"numberGNN molecules\graph_pickles\dataset02.xlsx"
data = pd.read_excel(env_file, engine='openpyxl').dropna(subset=['degradation_rate'])
data['seawater'] = data['seawater'].map({'art': 1, 'sea': 0})

folder_path = r"F:\2025 energing\PYTHON\GNN_chemicalENV-main\GNN molecules\graph_pickles\molecules"
graph_pickles = [f for f in os.listdir(folder_path) if f.endswith(".pkl")]

base_dir = r"F:\2025 energing\PYTHON\GNN_chemicalENV-main\GNN molecules\graph_pickles\molecules"
if os.path.exists(base_dir):
    print("Directory exists:", base_dir)
    print("Files in directory:", os.listdir(base_dir))
else:
    print(f"Error: Directory {base_dir} does not exist!")

compounds = data.component.unique()
graphs_dict = {}
for compound, graph_pickle in zip(compounds, graph_pickles):
    with open(os.path.join(base_dir, graph_pickle), 'rb') as f:
        graph = pickle.load(f)
        graphs_dict[compound] = networkx_to_pyg(graph)

dataset = MolDataset(
    raw_dataframe=data,
    nx_graph_dict=graphs_dict,
    component_col="component",
    global_state_cols=["temperature", "concentration", "time", "seawater"],
    label_col="degradation_rate",
    transform=None
)

# =============================================================================
#                      CROSS-VALIDATION (FIXED MODEL)
# =============================================================================
from torch_geometric.data import DataLoader as PyGDataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
k = 5  # number of folds
from sklearn.model_selection import KFold
kf = KFold(n_splits=k, shuffle=True, random_state=42)

fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
    print(f"\n--- Fold {fold + 1} ---")

    train_subset = torch.utils.data.Subset(dataset, train_idx)
    val_subset = torch.utils.data.Subset(dataset, val_idx)

    train_loader = PyGDataLoader(train_subset, batch_size=16, shuffle=True)
    val_loader = PyGDataLoader(val_subset, batch_size=16, shuffle=False)

    model = GINE_Regression(
        node_in_dim=1,
        edge_in_dim=2,
        external_in_dim=4,
        hidden_dim=16,  # Example hidden_dim
        num_layers=5,
        dropout=0.1
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    criterion = torch.nn.MSELoss()

    num_epochs = 500
    for epoch in range(1, num_epochs + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = validate(model, val_loader, criterion, device)

        if epoch % 10 == 0:
            print(f"[Fold {fold + 1} Epoch {epoch}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    print(f"Evaluating fold {fold + 1} ...")
    r2, rmse = evaluate_model(model, val_loader, device)
    fold_results.append({"fold": fold + 1, "r2": r2, "rmse": rmse})

r2_scores = [res["r2"] for res in fold_results]
rmse_scores = [res["rmse"] for res in fold_results]

print("\n--- Cross-Validation Summary ---")
for res in fold_results:
    print(f"Fold {res['fold']}: R² = {res['r2']:.4f}, RMSE = {res['rmse']:.4f}")

print(f"\nAverage R²: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")
print(f"Average RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")


# =============================================================================
#             IMPROVED MODEL WITH TRAPEZOID DIMENSIONS & PROJECTIONS
# =============================================================================
import optuna
import optuna.visualization as vis

def build_trapezoid_dims(num_layers):
    """
    Build a list of hidden dimensions in a trapezoid manner:
    up to the first 4 layers: [128, 64, 32, 16]
    if num_layers > 4, extend with 16 for extra layers.
    """
    base_dims = [128, 64, 32, 16]
    if num_layers > 4:
        extra_layers = num_layers - 4
        return base_dims + [16] * extra_layers
    else:
        return base_dims[:num_layers]


class GINE_RegressionTrapezoid(nn.Module):
    """
    A GINEConv-based regression model that uses a list of hidden dimensions
    to build layers with decreasing size (trapezoid architecture),
    ensuring dimension consistency with projection layers between convs.
    """
    def __init__(self,
                 node_in_dim: int,
                 edge_in_dim: int,
                 external_in_dim: int,
                 hidden_dims: list,
                 dropout: float = 0.1):
        super().__init__()

        # For the first layer, encode edges to hidden_dims[0], and encode nodes as well
        self.initial_edge_encoder = nn.Linear(edge_in_dim, hidden_dims[0])
        self.initial_node_encoder = nn.Linear(node_in_dim, hidden_dims[0])

        # We'll build each GINEConv to transform dimension: hidden_dims[i] -> hidden_dims[i].
        # After each conv i, if i < len(hidden_dims)-1, we project to hidden_dims[i+1].
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()
        self.projections = nn.ModuleList()  # for node features
        self.edge_projections = nn.ModuleList()  # for edge features

        for i in range(len(hidden_dims)):
            # GINEConv's internal MLP
            net = nn.Sequential(
                nn.Linear(hidden_dims[i], hidden_dims[i]),
                nn.ReLU(),
                nn.Linear(hidden_dims[i], hidden_dims[i])
            )
            conv = GINEConv(nn=net)
            self.convs.append(conv)
            self.bns.append(BatchNorm(hidden_dims[i]))

            # If there's a next layer, we need a projection from hidden_dims[i] -> hidden_dims[i+1]
            if i < len(hidden_dims) - 1:
                self.projections.append(nn.Linear(hidden_dims[i], hidden_dims[i+1]))
                self.edge_projections.append(nn.Linear(hidden_dims[i], hidden_dims[i+1]))
            else:
                # No projection needed for the last layer
                self.projections.append(None)
                self.edge_projections.append(None)

        self.dropout_layer = nn.Dropout(p=dropout)

        # We'll process external factors to the final dimension
        final_dim = hidden_dims[-1]
        self.externals_mlp = nn.Sequential(
            nn.Linear(external_in_dim, final_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(final_dim, final_dim)
        )

        # Final regression: combine final node features + final external features
        self.final_regressor = nn.Sequential(
            nn.Linear(final_dim + final_dim, final_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(final_dim, 1)
        )

    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch

        # 1) Encode node/edge to hidden_dims[0]
        x = self.initial_node_encoder(x)
        edge_emb = self.initial_edge_encoder(edge_attr)

        # 2) Pass through each GINEConv
        for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
            # GINEConv forward
            x = conv(x, edge_index, edge_emb)
            x = bn(x)
            x = F.relu(x)
            x = self.dropout_layer(x)

            # If there's a next layer, project node features and edge_emb to hidden_dims[i+1]
            if i < len(self.projections) - 1 and self.projections[i] is not None:
                x = self.projections[i](x)
                edge_emb = self.edge_projections[i](edge_emb)

        # 3) Global mean pooling
        graph_emb = global_mean_pool(x, batch)

        # 4) Process external factors into final dimension
        ext_emb = self.externals_mlp(data.externals)

        # 5) Concatenate and final regression
        combined = torch.cat([graph_emb, ext_emb], dim=-1)
        out = self.final_regressor(combined).squeeze(-1)
        return out


# =============================================================================
#                          OPTUNA OBJECTIVE FUNCTION
# =============================================================================

def objective(trial):
    """
    Objective function for Optuna.
    Performs k-fold cross validation using GINE_RegressionTrapezoid,
    returning negative average R². We log RMSE as a user attribute.
    """
    # Hyperparameter search space
    #lr = trial.suggest_categorical("learning_rate", [1e-3, 3e-3, 1e-4, 3e-4])
    lr = trial.suggest_float("learning_rate", 1e-4, 5e-4, log=True)

    dropout = trial.suggest_categorical("dropout", [0.1, 0.5])
    num_layers = trial.suggest_int("num_layers", 2, 6)

    hidden_dims = build_trapezoid_dims(num_layers)
    num_epochs = 100  # Could also be a hyperparameter

    # Local 5-fold for CV
    kf_local = KFold(n_splits=5, shuffle=True, random_state=42)
    r2_scores = []
    rmse_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(kf_local.split(dataset)):
        train_subset = torch.utils.data.Subset(dataset, train_idx)
        val_subset = torch.utils.data.Subset(dataset, val_idx)

        train_loader = PyGDataLoader(train_subset, batch_size=16, shuffle=True)
        val_loader = PyGDataLoader(val_subset, batch_size=16, shuffle=False)

        model = GINE_RegressionTrapezoid(
            node_in_dim=1,
            edge_in_dim=2,
            external_in_dim=4,
            hidden_dims=hidden_dims,
            dropout=dropout
        ).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
        criterion = nn.MSELoss()

        for epoch in range(1, num_epochs + 1):
            train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)

            # Report intermediate values (optional)
            if epoch % 10 == 0:
                trial.report(val_loss, step=epoch)
                if trial.should_prune():
                    raise optuna.TrialPruned()

        # Evaluate R² and RMSE for this fold
        model.eval()
        y_true_fold, y_pred_fold = [], []
        with torch.no_grad():
            for batch_data in val_loader:
                batch_data = batch_data.to(device)
                preds = model(batch_data)
                y_true_fold.append(batch_data.y.cpu())
                y_pred_fold.append(preds.cpu())

        y_true_fold = torch.cat(y_true_fold).numpy().squeeze()
        y_pred_fold = torch.cat(y_pred_fold).numpy().squeeze()

        fold_r2 = r2_score(y_true_fold, y_pred_fold)
        fold_rmse = np.sqrt(mean_squared_error(y_true_fold, y_pred_fold))

        r2_scores.append(fold_r2)
        rmse_scores.append(fold_rmse)

    avg_r2 = np.mean(r2_scores)
    avg_rmse = np.mean(rmse_scores)

    # Log RMSE as user attribute
    trial.set_user_attr("avg_rmse", float(avg_rmse))

    # Return negative R² because Optuna minimizes the objective
    return -avg_r2


# =============================================================================
#                           OPTUNA STUDY & DASHBOARD
# =============================================================================
if __name__ == "__main__":
    study = optuna.create_study(
        storage="sqlite:///gnn_mix_op05.sqlite3",
        study_name="GNN-mixed model01",
        direction="minimize",
        load_if_exists=True
    )

    study.optimize(objective, n_trials=100, show_progress_bar=True)

    print("\n================= Optuna Study Results =================")
    best_trial = study.best_trial
    print(f"Best Trial Value (Negative R²): {best_trial.value}")
    print("Best Hyperparameters:")
    for key, val in best_trial.params.items():
        print(f"  {key}: {val}")
    print(f"User Attrs (e.g., RMSE): {best_trial.user_attrs}")

    try:
        fig1 = vis.plot_optimization_history(study)
        fig1.show()
    except Exception as e:
        print(f"Could not generate optimization history plot: {e}")

    try:
        fig2 = vis.plot_param_importances(study)
        fig2.show()
    except Exception as e:
        print(f"Could not generate hyperparameter importance plot: {e}")

    try:
        fig3 = vis.plot_intermediate_values(study)
        fig3.show()
    except Exception as e:
        print(f"Could not generate intermediate values plot: {e}")

    print("\n================= End of Optuna Tuning =================")


Directory exists: F:\2025 energing\PYTHON\GNN_chemicalENV-main\GNN molecules\graph_pickles\molecules
Files in directory: ['gpickle_graph_0.pkl', 'gpickle_graph_1.pkl', 'gpickle_graph_10.pkl', 'gpickle_graph_11.pkl', 'gpickle_graph_12.pkl', 'gpickle_graph_13.pkl', 'gpickle_graph_14.pkl', 'gpickle_graph_15.pkl', 'gpickle_graph_16.pkl', 'gpickle_graph_17.pkl', 'gpickle_graph_18.pkl', 'gpickle_graph_19.pkl', 'gpickle_graph_2.pkl', 'gpickle_graph_3.pkl', 'gpickle_graph_4.pkl', 'gpickle_graph_5.pkl', 'gpickle_graph_6.pkl', 'gpickle_graph_7.pkl', 'gpickle_graph_8.pkl', 'gpickle_graph_9.pkl']

--- Fold 1 ---
[Fold 1 Epoch 10] Train Loss: 0.1393 | Val Loss: 0.0680
[Fold 1 Epoch 20] Train Loss: 0.0879 | Val Loss: 0.0614
[Fold 1 Epoch 30] Train Loss: 0.0824 | Val Loss: 0.0537
[Fold 1 Epoch 40] Train Loss: 0.0880 | Val Loss: 0.0546
[Fold 1 Epoch 50] Train Loss: 0.0699 | Val Loss: 0.0505
[Fold 1 Epoch 60] Train Loss: 0.0712 | Val Loss: 0.0493
[Fold 1 Epoch 70] Train Loss: 0.0664 | Val Loss: 0.0496


[I 2025-03-07 09:39:16,992] A new study created in RDB with name: GNN-mixed model01


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-03-07 09:52:28,448] Trial 0 finished with value: 0.3789858102798462 and parameters: {'learning_rate': 0.00024832838082315255, 'dropout': 0.5, 'num_layers': 5}. Best is trial 0 with value: 0.3789858102798462.
[I 2025-03-07 10:06:22,990] Trial 1 finished with value: -0.039005601406097413 and parameters: {'learning_rate': 0.00019602220882739989, 'dropout': 0.1, 'num_layers': 6}. Best is trial 1 with value: -0.039005601406097413.
[I 2025-03-07 10:16:29,003] Trial 2 finished with value: 0.5399153232574463 and parameters: {'learning_rate': 0.00016255138420160465, 'dropout': 0.5, 'num_layers': 2}. Best is trial 1 with value: -0.039005601406097413.
[I 2025-03-07 10:28:41,602] Trial 3 finished with value: -0.06721740961074829 and parameters: {'learning_rate': 0.0004243064104699774, 'dropout': 0.1, 'num_layers': 3}. Best is trial 3 with value: -0.06721740961074829.
[I 2025-03-07 10:41:29,270] Trial 4 finished with value: 0.03246970176696777 and parameters: {'learning_rate': 0.00037713955

---
---

###     Cross Validation

In [20]:
from sklearn.model_selection import KFold  #  k 折交叉验证

# ---------------------
# k-Fold Cross Validation
# ---------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
k = 5  # number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42) # shuffle = True 参数 随机打乱

fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)): # 循环 kf.split的折叠 enumerate函数添加计数
                                                                # fold 计数器 迭代次数
    print(f"\n--- Fold {fold + 1} ---")
    
    # Create train and validation subsets
    train_subset = torch.utils.data.Subset(dataset, train_idx) # 子集：训练集
    val_subset = torch.utils.data.Subset(dataset, val_idx)
    
    train_loader = PyGDataLoader(train_subset, batch_size=16, shuffle=True) # PyG 数据加载
    val_loader = PyGDataLoader(val_subset, batch_size=16, shuffle=False)
    
    # Initialize a new instance of the model, optimizer, and loss function for each fold
    model = GINE_Regression(
        node_in_dim=1,
        edge_in_dim=2,
        external_in_dim=4,
        hidden_dim=16,
        num_layers=5,
        dropout=0.1
    ).to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    criterion = torch.nn.MSELoss()
    
    num_epochs = 500
    for epoch in range(1, num_epochs + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = validate(model, val_loader, criterion, device)
        
        if epoch % 10 == 0:
            print(f"[Fold {fold + 1} Epoch {epoch}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    
    print(f"Evaluating fold {fold + 1} ...")
    r2, rmse = evaluate_model(model, val_loader, device)
    fold_results.append({"fold": fold + 1, "r2": r2, "rmse": rmse})

# ---------------------
# Summary of Cross-Validation Results
# ---------------------
r2_scores = [res["r2"] for res in fold_results]
rmse_scores = [res["rmse"] for res in fold_results]

print("\n--- Cross-Validation Summary ---")
for res in fold_results:
    print(f"Fold {res['fold']}: R² = {res['r2']:.4f}, RMSE = {res['rmse']:.4f}")

print(f"\nAverage R²: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")
print(f"Average RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")


--- Fold 1 ---
[Fold 1 Epoch 10] Train Loss: 0.1210 | Val Loss: 0.0644
[Fold 1 Epoch 20] Train Loss: 0.0977 | Val Loss: 0.0594
[Fold 1 Epoch 30] Train Loss: 0.0972 | Val Loss: 0.0560
[Fold 1 Epoch 40] Train Loss: 0.0762 | Val Loss: 0.0503
[Fold 1 Epoch 50] Train Loss: 0.0821 | Val Loss: 0.0633
[Fold 1 Epoch 60] Train Loss: 0.0713 | Val Loss: 0.0479
[Fold 1 Epoch 70] Train Loss: 0.0706 | Val Loss: 0.0467
[Fold 1 Epoch 80] Train Loss: 0.0635 | Val Loss: 0.0454
[Fold 1 Epoch 90] Train Loss: 0.0611 | Val Loss: 0.0453
[Fold 1 Epoch 100] Train Loss: 0.0603 | Val Loss: 0.0442
[Fold 1 Epoch 110] Train Loss: 0.0595 | Val Loss: 0.0426
[Fold 1 Epoch 120] Train Loss: 0.0574 | Val Loss: 0.0488
[Fold 1 Epoch 130] Train Loss: 0.0613 | Val Loss: 0.0419
[Fold 1 Epoch 140] Train Loss: 0.0658 | Val Loss: 0.0567
[Fold 1 Epoch 150] Train Loss: 0.0585 | Val Loss: 0.0405
[Fold 1 Epoch 160] Train Loss: 0.0558 | Val Loss: 0.0384
[Fold 1 Epoch 170] Train Loss: 0.0468 | Val Loss: 0.0374
[Fold 1 Epoch 180] Train