In [5]:
import os
import pickle
import pandas as pd
import numpy as np

import networkx as nx

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split

from torch_geometric.data import Data, DataLoader as PyGDataLoader
from torch_geometric.utils import from_networkx
from torch_geometric.nn import GCNConv, GINEConv, global_mean_pool, BatchNorm
from torch_geometric.nn.conv.gcn_conv import gcn_norm

from sklearn.metrics import r2_score, mean_squared_error



In [6]:
import warnings
warnings.filterwarnings('ignore')


In [7]:
class MolDataset(Dataset):
    """
    A custom dataset that:
      - Reads external factors from CSV
      - Loads the corresponding pickle for the molecule's graph
      - Converts it into a PyG Data object
    """
    def __init__(self,
                 raw_dataframe: pd.DataFrame,
                 nx_graph_dict: dict,
                 *,
                 component_col: str,
                 global_state_cols: list[str],
                 label_col: str,
                 transform=None):
        """
        Args:

        """
        self.raw_dataframe = raw_dataframe
        self.nx_graph_dict = nx_graph_dict
        self.component_col = [component_col] if type(component_col) is str else component_col
        self.global_state_cols = global_state_cols
        self.label_col = [label_col] if type(label_col) is str else label_col
        self.transform = transform
        
        required_cols = set(self.global_state_cols + self.label_col + self.component_col)
        for col in required_cols:
            if col not in self.raw_dataframe.columns:
                raise ValueError(f"Missing column in DataFrame: '{col}'")


    def __len__(self):
        return len(self.raw_dataframe)

    def __getitem__(self, idx):
        row = self.raw_dataframe.iloc[idx]
        
        # 1. Load the molecule graph
        component_name = row[self.component_col[0]]  # e.g. "C23"
        pyg_data = self.nx_graph_dict[component_name]

        # 3. Prepare the external factors
        #    Convert selected columns into a float tensor
        externals = torch.tensor(row[self.global_state_cols].values.astype(float), dtype=torch.float)
        externals = externals.unsqueeze(0)

        # 4. Prepare the label (regression target)
        label = torch.tensor([row[self.label_col][0]], dtype=torch.float)

        # 5. Attach externals & label to the Data object for use in the model
        #    (We can store them in Data object attributes if you like)
        pyg_data.externals = externals  # 1D vector of external factors
        pyg_data.y = label  # shape [1]

        if self.transform:
            pyg_data = self.transform(pyg_data)

        return pyg_data


---
### 1. Standard

In [26]:
from sklearn.preprocessing import StandardScaler
import torch

class MolDataset(Dataset):
    def __init__(self,
                 raw_dataframe: pd.DataFrame,
                 nx_graph_dict: dict,
                 *,                             # 所有在 * 之后定义的参数（component_col, global_state_cols, label_col, transform）必须以关键字（即 参数名=值）的方式传递，而不能用位置参数（positional arguments）。
                 component_col: str,
                 global_state_cols: list[str],
                 label_col: str,
                 transform=None):
        
        self.raw_dataframe = raw_dataframe
        self.nx_graph_dict = nx_graph_dict
        self.component_col = [component_col] if type(component_col) is str else component_col
        self.global_state_cols = global_state_cols
        self.label_col = [label_col] if type(label_col) is str else label_col
        self.transform = transform
        
        required_cols = set(self.global_state_cols + self.label_col + self.component_col)
        for col in required_cols:
            if col not in self.raw_dataframe.columns:
                raise ValueError(f"Missing column in DataFrame: '{col}'")

        # Standardization 
        # StandardScaler (类 将数据 转换为均值为 0，标准差为 1 的标准正态分布)
        self.node_scaler = StandardScaler() # 创建 StandardScaler 对象 
        self.edge_scaler = StandardScaler()
        self.env_scaler = StandardScaler()

        # Collect all node and edge features for standardization
        node_features = []
        edge_features = []
        env_features = []

        for _, row in self.raw_dataframe.iterrows():
            component_name = row[self.component_col[0]]
            pyg_data = self.nx_graph_dict[component_name]

            if pyg_data.x is not None:
                node_features.append(pyg_data.x.numpy())

            if pyg_data.edge_attr is not None:
                edge_features.append(pyg_data.edge_attr.numpy())

            env_features.append(row[self.global_state_cols].values.astype(float))

        # Fit scalers
        if node_features:
            all_node_features = np.vstack(node_features)   #将所有节点特征合并成一个大矩阵
            self.node_scaler.fit(all_node_features)        # 计算均值和标准差

        if edge_features:
            all_edge_features = np.vstack(edge_features)
            self.edge_scaler.fit(all_edge_features)

        all_env_features = np.vstack(env_features)
        self.env_scaler.fit(all_env_features)

    def __len__(self):
        return len(self.raw_dataframe)

    def __getitem__(self, idx):
        row = self.raw_dataframe.iloc[idx]
        component_name = row[self.component_col[0]]
        pyg_data = self.nx_graph_dict[component_name]

        # Standardize node features
        if pyg_data.x is not None:
            pyg_data.x = torch.tensor(self.node_scaler.transform(pyg_data.x.numpy()), dtype=torch.float) #对 新的数据 进行标准化转换

        # Standardize edge features
        if pyg_data.edge_attr is not None:
            pyg_data.edge_attr = torch.tensor(self.edge_scaler.transform(pyg_data.edge_attr.numpy()), dtype=torch.float)

        # Standardize environmental data
        externals = row[self.global_state_cols].values.astype(float)
        externals = torch.tensor(self.env_scaler.transform([externals])[0], dtype=torch.float).unsqueeze(0)
        pyg_data.externals = externals  

        # Prepare label
        label = torch.tensor([row[self.label_col][0]], dtype=torch.float)
        pyg_data.y = label  

        if self.transform:
            pyg_data = self.transform(pyg_data)

        return pyg_data


In [8]:

def networkx_to_pyg(nx_graph):
    """
    Convert a networkx graph to a torch_geometric.data.Data object.
    This is a basic template; adjust for your actual node/edge features.
    """
    # Sort nodes to ensure consistent ordering
    # e.g. node 0, node 1, ...
    # In some networkx graphs, node labels might be strings. We’ll map them to integers.
    node_mapping = {node: i for i, node in enumerate(nx_graph.nodes())}

    # Build lists for PyG
    x_list = []
    edge_index_list = []
    edge_attr_list = []

    for node in nx_graph.nodes(data=True):
        original_id = node[0]
        attrs = node[1]
        # Example: 'symbol' might be in attrs, etc.
        # For demonstration, let's store only "symbol" as a simple categorical embedding
        # You might do something more sophisticated (e.g., one-hot) for real usage
        symbol = attrs.get("symbol", "C")
        # Convert symbol to a simple ID (C=0, H=1, etc.) or some vector
        # We'll do a naive approach here:
        symbol_id = 0 if symbol == "C" else 1 if symbol == "H" else 2
        
        x_list.append([symbol_id])

    for u, v, edge_attrs in nx_graph.edges(data=True):
        u_idx = node_mapping[u]
        v_idx = node_mapping[v]
        edge_index_list.append((u_idx, v_idx))
        # Possibly store bond features: "bond_index", "bde_pred", etc.
        bde_pred = edge_attrs.get("bde_pred", 0.0)
        if bde_pred is None:
            bde_pred = 0.0
        bdfe_pred = edge_attrs.get("bdfe_pred", 0.0)
        if bdfe_pred is None:
            bdfe_pred = 0.0
        edge_attr_list.append([bde_pred, bdfe_pred])
    
    # Convert to torch tensors
    x = torch.tensor(x_list, dtype=torch.float)  # shape [num_nodes, num_node_features]
    edge_index = torch.tensor(edge_index_list, dtype=torch.long).t().contiguous()  # shape [2, num_edges]
    edge_attr = torch.tensor(edge_attr_list, dtype=torch.float)  # shape [num_edges, edge_feat_dim]

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    return data


In [9]:
class GINE_Regression(nn.Module):
    def __init__(self,
                 node_in_dim: int,
                 edge_in_dim: int,
                 external_in_dim: int,
                 hidden_dim: int = 128,
                 num_layers: int = 3,
                 dropout: float = 0.1):
        """
        A more 'realistic' GNN for regression, using GINEConv layers + edge attributes.
        
        Args:
            node_in_dim (int): Dim of node features (e.g. 1 or 3).
            edge_in_dim (int): Dim of edge features (e.g. 2 for [bde_pred, bdfe_pred]).
            external_in_dim (int): Dim of external factor features (e.g. 6).
            hidden_dim (int): Hidden embedding size for GNN layers.
            num_layers (int): Number of GNN layers.
            dropout (float): Dropout probability.
        """
        super().__init__()
        
        # A learnable linear transform for edge features (required by GINEConv's "nn" argument):
        # Typically GINEConv uses a small MLP to incorporate edge_attr into the message.
        self.edge_encoder = nn.Sequential(
            nn.Linear(edge_in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )
        
        # A learnable linear transform for node features:
        self.node_encoder = nn.Linear(node_in_dim, hidden_dim)
        
        # Create multiple GINEConv layers
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()
        
        for _ in range(num_layers):
            # GINEConv requires an MLP for node update:
            # We'll use a simple 2-layer MLP
            net = nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim)
            )
            conv = GINEConv(nn=net)
            self.convs.append(conv)
            self.bns.append(BatchNorm(hidden_dim))  # batch norm for stability

        self.dropout = nn.Dropout(p=dropout)

        # An MLP to process external factors
        self.externals_mlp = nn.Sequential(
            nn.Linear(external_in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, hidden_dim)
        )

        # Final regression MLP after pooling + external embedding
        self.final_regressor = nn.Sequential(
            nn.Linear(hidden_dim + hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, data):
        """
        Args:
            data: PyG Data object, expected fields:
                - x: Node features [num_nodes, node_in_dim]
                - edge_index: [2, num_edges]
                - edge_attr: [num_edges, edge_in_dim]
                - batch: [num_nodes] mapping each node to a graph ID
                - externals: [batch_size, external_in_dim]
        Returns:
            A tensor of shape [batch_size], the predicted regression value.
        """
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        
        # 1) Encode node features and edge features
        x = self.node_encoder(x)                 # [num_nodes, hidden_dim]
        edge_emb = self.edge_encoder(edge_attr)  # [num_edges, hidden_dim]
        
        # 2) Pass through multiple GINEConv layers
        for conv, bn in zip(self.convs, self.bns):
            x = conv(x, edge_index, edge_emb)
            x = bn(x)
            x = F.relu(x)
            x = self.dropout(x)

        # 3) Global pooling to get graph embedding
        graph_emb = global_mean_pool(x, batch)  # [batch_size, hidden_dim]

        # 4) Process external factors
        ext_emb = self.externals_mlp(data.externals)  # [batch_size, hidden_dim]

        # 5) Combine + final regression
        combined = torch.cat([graph_emb, ext_emb], dim=-1)  # [batch_size, hidden_dim * 2]
        out = self.final_regressor(combined).squeeze(-1)    # [batch_size]
        return out


In [10]:
def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    count = 0
    for batch_data in loader:
        batch_data = batch_data.to(device)
        optimizer.zero_grad()
        preds = model(batch_data)               # [batch_size]
        y = batch_data.y.to(device).view(-1)    # [batch_size]
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * batch_data.num_graphs
        count += batch_data.num_graphs
    return total_loss / count if count > 0 else 0.0

def validate(model, loader, criterion, device):
    model.eval()
    total_loss = 0.0
    count = 0
    with torch.no_grad():
        for batch_data in loader:
            batch_data = batch_data.to(device)
            preds = model(batch_data)
            y = batch_data.y.to(device).view(-1)
            loss = criterion(preds, y)
            total_loss += loss.item() * batch_data.num_graphs
            count += batch_data.num_graphs
    return total_loss / count if count > 0 else 0.0


In [11]:


def evaluate_model(model, loader, device):
    """
    Evaluate the model on a dataset loader and compute R² and RMSE.

    Args:
        model (nn.Module): The trained GNN model.
        loader (DataLoader): The PyG DataLoader for the evaluation dataset.
        device (torch.device): The device to run on.
    
    Returns:
        r2 (float): Coefficient of determination.
        rmse (float): Root Mean Squared Error.
    """
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            preds = model(batch)
            y_true.append(batch.y.cpu())
            y_pred.append(preds.cpu())

    # If your labels are stored as tensors with an extra dimension, use .squeeze() if needed.
    y_true = torch.cat(y_true).numpy().squeeze()
    y_pred = torch.cat(y_pred).numpy().squeeze()

    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    print(f"R²: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")

    return r2, rmse


In [12]:
env_file = r"C:\Users\80710\OneDrive - Imperial College London\2025 engineering\GNN molecules\graph_pickles\dataset02.xlsx"

data = pd.read_excel(env_file, engine='openpyxl').dropna(subset=['degradation_rate'])
data['seawater'] = data['seawater'].map({'art': 1, 'sea': 0})

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1023 entries, 0 to 1039
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   data number       1023 non-null   float64
 1   temperature       1023 non-null   float64
 2   seawater          1023 non-null   int64  
 3   concentration     1023 non-null   int64  
 4   time              1023 non-null   int64  
 5   component         1023 non-null   object 
 6   BDE               1023 non-null   float64
 7   BDFE              1023 non-null   float64
 8   energy            1023 non-null   float64
 9   degradation_rate  1023 non-null   float64
dtypes: float64(6), int64(3), object(1)
memory usage: 87.9+ KB


In [14]:
folder_path = r"C:\Users\80710\OneDrive - Imperial College London\2025 engineering\GNN molecules\graph_pickles\molecules"
graph_pickles = [f for f in os.listdir(folder_path) if f.endswith(".pkl")]

#graph_pickles = [f for f in os.listdir('./molecules') if f.endswith('.pkl')]

In [16]:
compounds = data.component.unique()
graphs_dict = {}

for compound, graph_pickle in zip(compounds, graph_pickles):
    #with open(f'./molecules/{graph_pickle}', 'rb') as f:
    with open(os.path.join(base_dir, graph_pickle), 'rb') as f:

        graph = pickle.load(f)
        graphs_dict[compound] = networkx_to_pyg(graph)


In [2]:
import os

base_dir = r"C:\Users\80710\OneDrive - Imperial College London\2025 engineering\GNN molecules\graph_pickles\molecules"

if os.path.exists(base_dir):
    print("Directory exists:", base_dir)
    print("Files in directory:", os.listdir(base_dir))
else:
    print(f"Error: Directory {base_dir} does not exist!")


Directory exists: C:\Users\80710\OneDrive - Imperial College London\2025 engineering\GNN molecules\graph_pickles\molecules
Files in directory: ['gpickle_graph_0.pkl', 'gpickle_graph_1.pkl', 'gpickle_graph_10.pkl', 'gpickle_graph_11.pkl', 'gpickle_graph_12.pkl', 'gpickle_graph_13.pkl', 'gpickle_graph_14.pkl', 'gpickle_graph_15.pkl', 'gpickle_graph_16.pkl', 'gpickle_graph_17.pkl', 'gpickle_graph_18.pkl', 'gpickle_graph_19.pkl', 'gpickle_graph_2.pkl', 'gpickle_graph_3.pkl', 'gpickle_graph_4.pkl', 'gpickle_graph_5.pkl', 'gpickle_graph_6.pkl', 'gpickle_graph_7.pkl', 'gpickle_graph_8.pkl', 'gpickle_graph_9.pkl']


In [17]:

dataset = MolDataset(
    raw_dataframe=data,
    nx_graph_dict=graphs_dict,
    component_col="component",
    global_state_cols=["temperature", "concentration", "time", "seawater"],
    label_col="degradation_rate",
    transform=None
)

# Simple random split for demonstration
train_size = int(0.8 * len(dataset))  # 80% train
val_size   = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = PyGDataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader   = PyGDataLoader(val_dataset, batch_size=16, shuffle=False)



In [27]:

# -----------------------------------
# 2) Instantiate model + optimizer
# -----------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [28]:
model = GINE_Regression(
    node_in_dim=1,
    edge_in_dim=2,
    external_in_dim=4,
    hidden_dim=16,
    num_layers=5,
    dropout=0.1
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = torch.nn.MSELoss()


In [29]:

# -----------------------------------
# 3) Training Loop
# -----------------------------------
num_epochs = 500
for epoch in range(1, num_epochs+1):
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss = validate(model, val_loader, criterion, device)

    if epoch % 10 == 0:
        print(f"[Epoch {epoch}] train_loss: {train_loss:.4f}, val_loss: {val_loss:.4f}")

# Optionally, test or save the model
# torch.save(model.state_dict(), "trained_gine_model.pt")

[Epoch 10] train_loss: 0.1542, val_loss: 0.1230
[Epoch 20] train_loss: 0.0964, val_loss: 0.0923
[Epoch 30] train_loss: 0.0786, val_loss: 0.0773
[Epoch 40] train_loss: 0.0649, val_loss: 0.0789
[Epoch 50] train_loss: 0.0861, val_loss: 0.0951
[Epoch 60] train_loss: 0.0773, val_loss: 0.0853
[Epoch 70] train_loss: 0.0637, val_loss: 0.0708
[Epoch 80] train_loss: 0.0594, val_loss: 0.0673
[Epoch 90] train_loss: 0.0753, val_loss: 0.0748
[Epoch 100] train_loss: 0.0635, val_loss: 0.0658
[Epoch 110] train_loss: 0.0563, val_loss: 0.0679
[Epoch 120] train_loss: 0.0619, val_loss: 0.0622
[Epoch 130] train_loss: 0.0590, val_loss: 0.0613
[Epoch 140] train_loss: 0.0492, val_loss: 0.0648
[Epoch 150] train_loss: 0.0531, val_loss: 0.0642
[Epoch 160] train_loss: 0.0530, val_loss: 0.0569
[Epoch 170] train_loss: 0.0530, val_loss: 0.0608
[Epoch 180] train_loss: 0.0544, val_loss: 0.0643
[Epoch 190] train_loss: 0.0423, val_loss: 0.0642
[Epoch 200] train_loss: 0.0433, val_loss: 0.0546
[Epoch 210] train_loss: 0.044

In [30]:

# Example usage after training:
r2, rmse = evaluate_model(model, val_loader, device)

R²: 0.5281
RMSE: 0.2013
