## 1. Load the environmental variable

In [14]:
import pandas as pd

# Load the environmental data from Excel
env_file = r"C:\Users\80710\OneDrive - Imperial College London\2025 engineering\GNN molecules\graph_pickles\dataset02.xlsx"
env_df = pd.read_excel(env_file, engine='openpyxl')

# Select only the relevant columns for the environment
env_columns = ["temperature", "seawater", "time", "component","concentration", "degradation_rate"]

# Ensure all columns exist in the dataset
env_var = env_df[env_columns].copy()

# Convert categorical "seawater" to numerical (if needed)
env_var["seawater"] = env_var["seawater"].map({"sea": 1, "art": 0})  # Map "sea" → 1, "art" → 0

# Drop rows with missing values
env_var = env_var.dropna().reset_index(drop=True)

# Check if it matches the number of graphs
print(f"Loaded {len(env_var)} environment rows")
print(env_var.head())


Loaded 1023 environment rows
   temperature  seawater  time component  concentration  degradation_rate
0         35.6         1    30       C23             70          0.670914
1         35.6         1    30       C24             70          0.680071
2         35.6         1    30       C25             70          0.655230
3         35.6         1    30       C26             70          0.625193
4         35.6         1    30      C28a             70          0.605853


### compounds names

In [4]:
# Get unique compounds (should be 20 total)
compounds = env_var["component"].unique()
print(compounds)


['C23' 'C24' 'C25' 'C26' 'C28a' 'C28b' 'C29a' 'C29b' 'Ts' 'Tm' 'C29' 'C30'
 'H31S' 'H31R' 'H32S' 'H32R' 'H33S' 'H33R' 'H34S' 'H34R']


### 2. Load molecules graph datas from pickle

In [None]:
import os
import pickle
import networkx as nx


folder_path = r"C:\Users\80710\OneDrive - Imperial College London\2025 engineering\GNN molecules\graph_pickles"
pkl_files = [f for f in os.listdir(folder_path) if f.endswith(".pkl")]

graphs_dict = {}

for filename in pkl_files:
    file_path = os.path.join(folder_path, filename)
    
    # Extract the compound name from the filename, e.g., "C23.pkl" -> "C23"
    compound_name = os.path.splitext(filename)[0]

    # Load the pickle file
    with open(file_path, "rb") as file:
        graph = pickle.load(file)
    
    # Store the graph in the dictionary under the compound name
    graphs_dict[compound_name] = graph


print("Loaded graphs for compounds:", list(graphs_dict.keys()))

Loaded graphs for compounds: ['C23', 'C24', 'C25', 'C26', 'C28a', 'C28b', 'C29', 'C29a', 'C29b', 'C30', 'H31R', 'H31S', 'H32R', 'H32S', 'H33R', 'H33S', 'H34R', 'H34S', 'Tm', 'Ts']


### 转换为PyG

In [None]:
from torch_geometric.utils import from_networkx

pyg_graphs_dict = {}
printprintpp
for compound_name, nx_graph in graphs_dict.items():
    # Convert the NetworkX graph to a PyG Data object
    data = from_networkx(nx_graph)
    pyg_graphs_dict[compound_name] = data

print("Converted graphs:", pyg_graphs_dict.keys())

Converted graphs: dict_keys(['C23', 'C24', 'C25', 'C26', 'C28a', 'C28b', 'C29', 'C29a', 'C29b', 'C30', 'H31R', 'H31S', 'H32R', 'H32S', 'H33R', 'H33S', 'H34R', 'H34S', 'Tm', 'Ts'])


In [11]:
pyg_graphs_dict["C23"]

Data(edge_index=[2, 88], symbol=[42], rdkit_idx=[42], bond_index=[88], bde_pred=[88], bdfe_pred=[88], mol=<rdkit.Chem.rdchem.Mol object at 0x000001F83B641DD0>, num_nodes=42)

---

In [15]:
print(len(pyg_graphs_dict))
print(len(env_df)) 

20
1040


---

## 3. Use environmental variables as Transformer Positional Encoding connect

simply use nx graph is enough

### 环境变量的PyG

In [None]:
import torch
from torch_geometric.data import Data

# Specify which environment columns you want as features
env_columns = ["temperature", "seawater", "time", "concentration"]

data_list = []

for idx, row in env_df.iterrows():
    compound_N = row["component"]

    if compound_N in pyg_graphs_dict:
        # Get the base PyG graph
        base_data = pyg_graphs_dict[compound_N] #应该是用graph去匹配compounds吧？晕了已经

        # 这出问题了
        data = Data(
            x=base_data.x.clone(),
            edge_index=base_data.edge_index.clone(),
            edge_attr=base_data.edge_attr.clone() if base_data.edge_attr is not None else None
        )

        # Create a tensor for environment features
        env_feats = torch.tensor([row[col] for col in env_columns], dtype=torch.float) #提取一整行，转换tensor
        data.env = env_feats  # Attach env features

        # Create a tensor for the target (degradation rate)
        y = torch.tensor([row["degradation_rate"]], dtype=torch.float)
        data.y = y

        data_list.append(data)
    else:
        # If a compound isn't found in pyg_graphs_dict, you can skip or log a warning
        print(f"Warning: Compound {compound_name} not in pyg_graphs_dict.")

print(f"Constructed {len(data_list)} (graph + env) samples.")


SyntaxError: invalid syntax (715384983.py, line 14)

### 3.1  建立Build DataLoader
PYG的Data对象代表图形，图形在节点和边缘的数量上有所不同。默认的DataLoader试图像张量一样堆叠它们，是不行滴。
 Custom Collate？

### 3.2 建立 env + molecules graph层 → 建立GNN
由于PYG将多个图批量分为一个大图结构，因此我们需要data.batch来确保每个节点都会收到其相应图的正确环境嵌入
1.PYG将多个图形组合在一起→来自不同图形的节点被连接
2.data.batch tracks nodes属于哪个图。
3.我们使用env_pos_emb[data.batch]添加每个节点的正确环境效果
4.这确保每个节点都会获取其相应的环境编码

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import TransformerConv, global_mean_pool

class EnvGraphTransformer(nn.Module):
    def __init__(self, 
                 in_node_dim,      # dimension of node features
                 in_env_dim,       # dimension of environment features
                 hidden_dim=64, 
                 num_layers=2, 
                 out_dim=1):
        super().__init__()
        
        # Project environment to same dimension as node embeddings for addition
        self.env_proj = nn.Linear(in_env_dim, hidden_dim)
        
        # If your node features are not 'hidden_dim', project them first
        self.node_proj = nn.Linear(in_node_dim, hidden_dim)

        # Build a stack of TransformerConv layers
        self.transformer_convs = nn.ModuleList()
        for _ in range(num_layers):
            conv = TransformerConv(
                hidden_dim, hidden_dim,
                heads=4,   # multi-head
                dropout=0.1,
                edge_dim=None  # if you have edge features, set dimension here
            )
            self.transformer_convs.append(conv)
        
        # Final linear to map from hidden_dim -> out_dim (e.g. 1 for regression)
        self.final_fc = nn.Linear(hidden_dim, out_dim)

    def forward(self, data):
        # data.x: node features, shape [num_nodes_in_batch, in_node_dim] 
        # data.env: environment, shape [batch_size, in_env_dim] 
        # data.edge_index: shape [2, E]
        # data.batch: shape [num_nodes_in_batch], which graph each node belongs to

        x, edge_index, batch = data.x, data.edge_index, data.batch
        
        # 1) environment projection -> shape [batch_size, hidden_dim]
        #    We'll broadcast to nodes in each graph
        env_emb = self.env_proj(data.env)  # [batch_size, hidden_dim]

        # 2) node projection -> shape [num_nodes_in_batch, hidden_dim]
        x = self.node_proj(x)

        # 3) add environment embedding to each node
        #    broadcast env_emb -> shape [num_nodes_in_batch, hidden_dim]
        x = x + env_emb[batch]  # adds environment positional encoding to each node

        # 4) pass through TransformerConv layers
        for conv in self.transformer_convs:
            x = conv(x, edge_index)
            x = F.relu(x)

        # 5) global pooling to get a single graph-level embedding
        graph_emb = global_mean_pool(x, batch)

        # 6) final regression
        out = self.final_fc(graph_emb)  # shape [batch_size, 1]
        return out


### 3.3 MSE/RMSE/R2 检验 → 测试model

In [23]:
from torch_geometric.loader import DataLoader

# Suppose your data_list has each item with data.x, data.env, data.y
# We do a random shuffle + batch them
loader = DataLoader(data_list, batch_size=16, shuffle=True)

# The dimension of node features:
in_node_dim = data_list[0].x.shape[1]
# The dimension of environment features, e.g. "env_columns = 4"
in_env_dim = len(env_columns)

model = EnvGraphTransformer(in_node_dim, in_env_dim, hidden_dim=64)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# If you have a GPU:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)

num_epochs = 10

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch_data in loader:
        # batch_data = batch_data.to(device)  # if using GPU
        optimizer.zero_grad()

        # Forward pass
        pred = model(batch_data).squeeze(dim=-1)  # shape [batch_size]
        target = batch_data.y.squeeze(dim=-1)     # shape [batch_size]

        loss = loss_fn(pred, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1:02d}, Loss: {avg_loss:.4f}")


ValueError: num_samples should be a positive integer value, but got num_samples=0

---

## 4. fixed code

---